1 /* 2 * Copyright (c) 1997, 2017, Oracle and/or its affiliates. All rights reserved. 3 * Copyright (c) 2012, 2017, SAP SE. All rights reserved. 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. 9 * 10 * This code is distributed in the hope that it will be useful, but WITHOUT 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 * version 2 for more details (a copy is included in the LICENSE file that 14 * accompanied this code). 15 * 16 * You should have received a copy of the GNU General Public License version 17 * 2 along with this work; if not, write to the Free Software Foundation, 18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 19 * 20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 21 * or visit www.oracle.com if you need additional information or have any 22 * questions. 
23 * 24 */ 25 26 #include "precompiled.hpp" 27 #include "asm/macroAssembler.inline.hpp" 28 #include "compiler/disassembler.hpp" 29 #include "gc/shared/cardTableModRefBS.hpp" 30 #include "gc/shared/collectedHeap.inline.hpp" 31 #include "interpreter/interpreter.hpp" 32 #include "memory/resourceArea.hpp" 33 #include "nativeInst_ppc.hpp" 34 #include "prims/methodHandles.hpp" 35 #include "runtime/biasedLocking.hpp" 36 #include "runtime/icache.hpp" 37 #include "runtime/interfaceSupport.hpp" 38 #include "runtime/objectMonitor.hpp" 39 #include "runtime/os.hpp" 40 #include "runtime/sharedRuntime.hpp" 41 #include "runtime/stubRoutines.hpp" 42 #include "utilities/macros.hpp" 43 #if INCLUDE_ALL_GCS 44 #include "gc/g1/g1CollectedHeap.inline.hpp" 45 #include "gc/g1/g1SATBCardTableModRefBS.hpp" 46 #include "gc/g1/heapRegion.hpp" 47 #endif // INCLUDE_ALL_GCS 48 #ifdef COMPILER2 49 #include "opto/intrinsicnode.hpp" 50 #endif 51 52 #ifdef PRODUCT 53 #define BLOCK_COMMENT(str) // nothing 54 #else 55 #define BLOCK_COMMENT(str) block_comment(str) 56 #endif 57 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 58 59 #ifdef ASSERT 60 // On RISC, there's no benefit to verifying instruction boundaries. 
bool AbstractAssembler::pd_check_instruction_mark() { return false; }
#endif

// Load doubleword from a + si31, where si31 is a non-negative offset of up
// to 31 bits. Emits one ld if the offset fits in 16 bits, else addis + ld.
// Does NOT check that d and a are distinct (addis clobbers d before the ld
// reads it via d, so d == a is intentionally permitted on the short path only
// by the caller's discretion).
void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) {
  assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range");
  if (Assembler::is_simm(si31, 16)) {
    ld(d, si31, a);
    // Optional nop keeps the emitted sequence a fixed size (2 instructions).
    if (emit_filler_nop) nop();
  } else {
    const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31);
    const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31);
    addis(d, a, hi);
    ld(d, lo, d);
  }
}

// As ld_largeoffset_unchecked, but asserts that d and a are distinct registers.
void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) {
  assert_different_registers(d, a);
  ld_largeoffset_unchecked(d, si31, a, emit_filler_nop);
}

// Load a value of size_in_bytes (8/4/2/1) from base + offs into dst,
// sign-extending when is_signed is set.
void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base,
                                      size_t size_in_bytes, bool is_signed) {
  switch (size_in_bytes) {
  case 8: ld(dst, offs, base); break;
  case 4: is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break;
  case 2: is_signed ? lha(dst, offs, base) : lhz(dst, offs, base); break;
  case 1: lbz(dst, offs, base); if (is_signed) extsb(dst, dst); break; // lba doesn't exist :(
  default: ShouldNotReachHere();
  }
}

// Store the low size_in_bytes (8/4/2/1) bytes of dst to base + offs.
void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base,
                                       size_t size_in_bytes) {
  switch (size_in_bytes) {
  case 8: std(dst, offs, base); break;
  case 4: stw(dst, offs, base); break;
  case 2: sth(dst, offs, base); break;
  case 1: stb(dst, offs, base); break;
  default: ShouldNotReachHere();
  }
}

// Align the code position to (modulus + rem) by emitting nops, but emit
// nothing if more than max bytes of padding would be required.
void MacroAssembler::align(int modulus, int max, int rem) {
  int padding = (rem + modulus - (offset() % modulus)) % modulus;
  if (padding > max) return;
  // One nop per 4 bytes of padding.
  for (int c = (padding >> 2); c > 0; --c) { nop(); }
}

// Issue instructions that calculate given TOC from global TOC.
// Emit an addis/addi pair computing addr relative to the global TOC register
// R29_TOC into dst. hi16/lo16 select which halves of the sequence to emit;
// emit_dummy_addr emits a placeholder offset (-128) to be patched later.
void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16,
                                                       bool add_relocation, bool emit_dummy_addr) {
  int offset = -1;
  if (emit_dummy_addr) {
    offset = -128; // dummy address
  } else if (addr != (address)(intptr_t)-1) {
    offset = MacroAssembler::offset_to_global_toc(addr);
  }

  if (hi16) {
    addis(dst, R29_TOC, MacroAssembler::largeoffset_si16_si16_hi(offset));
  }
  if (lo16) {
    if (add_relocation) {
      // Relocate at the addi to avoid confusion with a load from the method's TOC.
      relocate(internal_word_Relocation::spec(addr));
    }
    addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset));
  }
}

// Patch an addis/addi pair (emitted by calculate_address_from_global_toc) at
// address a so that it computes addr. `a` points at the addi; the matching
// addis is searched backwards, not below `bound`. Returns the byte distance
// from the addis to addr (used as the relocation's offset).
int MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) {
  const int offset = MacroAssembler::offset_to_global_toc(addr);

  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the addi,
  // and the addi reads and writes the same register dst.
  const int dst = inv_rt_field(inst2);
  assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");

  // Now, find the preceding addis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  while (inst1_addr >= bound) {
    inst1 = *(int *) inst1_addr;
    if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
      // Stop, found the addis which writes dst.
      break;
    }
    inst1_addr -= BytesPerInstWord;
  }

  assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
  set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset));
  set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset));
  return (int)((intptr_t)addr - (intptr_t)inst1_addr);
}

// Inverse of the above: decode the address computed by an addis/addi pair.
// `a` points at the addi; the matching addis is searched backwards, not below
// `bound`. Returns (address)-1 for the special -1 offset encoding.
address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) {
  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the addi,
  // and the addi reads and writes the same register dst.
  const int dst = inv_rt_field(inst2);
  assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");

  // Now, find the preceding addis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  while (inst1_addr >= bound) {
    inst1 = *(int *) inst1_addr;
    if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
      // stop, found the addis which writes dst
      break;
    }
    inst1_addr -= BytesPerInstWord;
  }

  assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");

  int offset = (get_imm(inst1_addr, 0) << 16) + get_imm(inst2_addr, 0);
  // -1 is a special case
  if (offset == -1) {
    return (address)(intptr_t)-1;
  } else {
    return global_toc() + offset;
  }
}

#ifdef _LP64
// Patch compressed oops or klass constants.
// Assembler sequence is
// 1) compressed oops:
//    lis  rx = const.hi
//    ori rx = rx | const.lo
// 2) compressed klass:
//    lis rx = const.hi
//    clrldi rx = rx & 0xFFFFffff // clearMS32b, optional
//    ori rx = rx | const.lo
// Clrldi will be passed by.
// Patch the lis/ori immediates of a narrow-oop materialization at `a` (the
// ori) to encode `data`. The matching lis is searched backwards, not below
// `bound`. Returns the byte distance from lis to ori.
int MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) {
  assert(UseCompressedOops, "Should only patch compressed oops");

  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the ori,
  // and the ori reads and writes the same register dst.
  const int dst = inv_rta_field(inst2);
  assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
  // Now, find the preceding addis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  bool inst1_found = false;
  while (inst1_addr >= bound) {
    inst1 = *(int *)inst1_addr;
    if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
    inst1_addr -= BytesPerInstWord;
  }
  assert(inst1_found, "inst is not lis");

  int xc = (data >> 16) & 0xffff;
  int xd = (data >> 0) & 0xffff;

  set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo
  set_imm((int *)inst2_addr, (xd)); // unsigned int
  return (int)((intptr_t)inst2_addr - (intptr_t)inst1_addr);
}

// Get compressed oop or klass constant.
// Inverse of patch_set_narrow_oop: decode the narrow oop from the lis/ori
// pair whose ori is at `a`; the lis is searched backwards, not below `bound`.
narrowOop MacroAssembler::get_narrow_oop(address a, address bound) {
  assert(UseCompressedOops, "Should only patch compressed oops");

  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the ori,
  // and the ori reads and writes the same register dst.
  const int dst = inv_rta_field(inst2);
  assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
  // Now, find the preceding lis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  bool inst1_found = false;

  while (inst1_addr >= bound) {
    inst1 = *(int *) inst1_addr;
    if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break;}
    inst1_addr -= BytesPerInstWord;
  }
  assert(inst1_found, "inst is not lis");

  uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff));
  uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16);

  return (int) (xl | xh);
}
#endif // _LP64

// Allocate a constant-pool entry for a.value() and emit a load of it from
// the method's TOC (register `toc`) into dst.
// Returns true if successful.
bool MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a,
                                               Register toc, bool fixed_size) {
  int toc_offset = 0;
  // Use RelocationHolder::none for the constant pool entry, otherwise
  // we will end up with a failing NativeCall::verify(x) where x is
  // the address of the constant pool entry.
  // FIXME: We should insert relocation information for oops at the constant
  // pool entries instead of inserting it at the loads; patching of a constant
  // pool entry should be less expensive.
  address const_address = address_constant((address)a.value(), RelocationHolder::none);
  if (const_address == NULL) { return false; } // allocation failure
  // Relocate at the pc of the load.
  relocate(a.rspec());
  toc_offset = (int)(const_address - code()->consts()->start());
  ld_largeoffset_unchecked(dst, toc_offset, toc, fixed_size);
  return true;
}

// Recognize the start of a load_const_from_method_toc sequence: either a
// plain ld or the addis of an addis/ld pair.
bool MacroAssembler::is_load_const_from_method_toc_at(address a) {
  const address inst1_addr = a;
  const int inst1 = *(int *)inst1_addr;

  // The relocation points to the ld or the addis.
  return (is_ld(inst1)) ||
         (is_addis(inst1) && inv_ra_field(inst1) != 0);
}

// Decode the TOC offset from a load_const_from_method_toc sequence at `a`
// (a single ld, or addis followed later by the ld that reads/writes dst).
int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) {
  assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc");

  const address inst1_addr = a;
  const int inst1 = *(int *)inst1_addr;

  if (is_ld(inst1)) {
    return inv_d1_field(inst1);
  } else if (is_addis(inst1)) {
    const int dst = inv_rt_field(inst1);

    // Now, find the succeeding ld which reads and writes to dst.
    address inst2_addr = inst1_addr + BytesPerInstWord;
    int inst2 = 0;
    while (true) {
      inst2 = *(int *) inst2_addr;
      if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) {
        // Stop, found the ld which reads and writes dst.
        break;
      }
      inst2_addr += BytesPerInstWord;
    }
    return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2);
  }
  ShouldNotReachHere();
  return 0;
}

// Get the constant from a `load_const' sequence.
// Reassembles the 64-bit value from the four 16-bit immediates; the two
// sequence variants differ in instruction order (ori vs. lis second).
long MacroAssembler::get_const(address a) {
  assert(is_load_const_at(a), "not a load of a constant");
  const int *p = (const int*) a;
  unsigned long x = (((unsigned long) (get_imm(a,0) & 0xffff)) << 48);
  if (is_ori(*(p+1))) {
    x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 32);
    x |= (((unsigned long) (get_imm(a,3) & 0xffff)) << 16);
    x |= (((unsigned long) (get_imm(a,4) & 0xffff)));
  } else if (is_lis(*(p+1))) {
    x |= (((unsigned long) (get_imm(a,2) & 0xffff)) << 32);
    x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 16);
    x |= (((unsigned long) (get_imm(a,3) & 0xffff)));
  } else {
    ShouldNotReachHere();
    return (long) 0;
  }
  return (long) x;
}

// Patch the 64 bit constant of a `load_const' sequence. This is a low
// level procedure. It neither flushes the instruction cache nor is it
// mt safe.
// Overwrite the four 16-bit immediates of a `load_const' sequence with the
// halves of x; mirrors the two decoding variants in get_const above.
void MacroAssembler::patch_const(address a, long x) {
  assert(is_load_const_at(a), "not a load of a constant");
  int *p = (int*) a;
  if (is_ori(*(p+1))) {
    set_imm(0 + p, (x >> 48) & 0xffff);
    set_imm(1 + p, (x >> 32) & 0xffff);
    set_imm(3 + p, (x >> 16) & 0xffff);
    set_imm(4 + p, x & 0xffff);
  } else if (is_lis(*(p+1))) {
    set_imm(0 + p, (x >> 48) & 0xffff);
    set_imm(2 + p, (x >> 32) & 0xffff);
    set_imm(1 + p, (x >> 16) & 0xffff);
    set_imm(3 + p, x & 0xffff);
  } else {
    ShouldNotReachHere();
  }
}

// Create an AddressLiteral for obj with a freshly allocated metadata index.
AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) {
  assert(oop_recorder() != NULL, "this assembler needs a Recorder");
  int index = oop_recorder()->allocate_metadata_index(obj);
  RelocationHolder rspec = metadata_Relocation::spec(index);
  return AddressLiteral((address)obj, rspec);
}

// Create an AddressLiteral for obj using its existing metadata index.
AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) {
  assert(oop_recorder() != NULL, "this assembler needs a Recorder");
  int index = oop_recorder()->find_index(obj);
  RelocationHolder rspec = metadata_Relocation::spec(index);
  return AddressLiteral((address)obj, rspec);
}

// Create an AddressLiteral for obj with a freshly allocated oop index.
AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) {
  assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int oop_index = oop_recorder()->allocate_oop_index(obj);
  return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
}

// Create an AddressLiteral for obj using its existing oop index.
AddressLiteral MacroAssembler::constant_oop_address(jobject obj) {
  assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int oop_index = oop_recorder()->find_index(obj);
  return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
}

// Return *delayed_value_addr + offset as a constant if already available,
// otherwise emit code that loads it at runtime into tmp.
RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
                                                      Register tmp, int offset) {
  intptr_t value = *delayed_value_addr;
  if (value != 0) {
    return RegisterOrConstant(value + offset);
  }

  // Load indirectly to solve generation ordering problem.
  // static address, no relocation
  int simm16_offset = load_const_optimized(tmp, delayed_value_addr, noreg, true);
  ld(tmp, simm16_offset, tmp); // must be aligned ((xa & 3) == 0)

  if (offset != 0) {
    addi(tmp, tmp, offset);
  }

  return RegisterOrConstant(tmp);
}

#ifndef PRODUCT
void MacroAssembler::pd_print_patched_instruction(address branch) {
  Unimplemented(); // TODO: PPC port
}
#endif // ndef PRODUCT

// Conditional far branch for destinations encodable in 24+2 bits.
void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) {

  // If requested by flag optimize, relocate the bc_far as a
  // runtime_call and prepare for optimizing it when the code gets
  // relocated.
  if (optimize == bc_far_optimize_on_relocate) {
    relocate(relocInfo::runtime_call_type);
  }

  // variant 2:
  //
  //    b!cxx SKIP
  //    bxx   DEST
  //  SKIP:
  //

  // Invert the condition (and branch hint) so the short branch skips the far one.
  const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
                                                opposite_bcond(inv_boint_bcond(boint)));

  // We emit two branches.
  // First, a conditional branch which jumps around the far branch.
  const address not_taken_pc = pc() + 2 * BytesPerInstWord;
  const address bc_pc = pc();
  bc(opposite_boint, biint, not_taken_pc);

  const int bc_instr = *(int*)bc_pc;
  assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition");
  assert(opposite_boint == inv_bo_field(bc_instr), "postcondition");
  assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))),
                                     opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))),
         "postcondition");
  assert(biint == inv_bi_field(bc_instr), "postcondition");

  // Second, an unconditional far branch which jumps to dest.
  // Note: target(dest) remembers the current pc (see CodeSection::target)
  //       and returns the current pc if the label is not bound yet; when
  //       the label gets bound, the unconditional far branch will be patched.
  const address target_pc = target(dest);
  const address b_pc = pc();
  b(target_pc);

  assert(not_taken_pc == pc(), "postcondition");
  assert(dest.is_bound() || target_pc == b_pc, "postcondition");
}

// 1 or 2 instructions
// Emit a short bc when the bound destination is in range, else a bc_far
// marked for optimization on relocation.
void MacroAssembler::bc_far_optimized(int boint, int biint, Label& dest) {
  if (dest.is_bound() && is_within_range_of_bcxx(target(dest), pc())) {
    bc(boint, biint, dest);
  } else {
    bc_far(boint, biint, dest, MacroAssembler::bc_far_optimize_on_relocate);
  }
}

// True if any of the three bc_far code patterns starts at instruction_addr.
bool MacroAssembler::is_bc_far_at(address instruction_addr) {
  return is_bc_far_variant1_at(instruction_addr) ||
         is_bc_far_variant2_at(instruction_addr) ||
         is_bc_far_variant3_at(instruction_addr);
}

// Decode the destination of a bc_far sequence at instruction_addr,
// handling all three emitted variants.
address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) {
  if (is_bc_far_variant1_at(instruction_addr)) {
    const address instruction_1_addr = instruction_addr;
    const int instruction_1 = *(int*)instruction_1_addr;
    return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr);
  } else if (is_bc_far_variant2_at(instruction_addr)) {
    const address instruction_2_addr = instruction_addr + 4;
    return bxx_destination(instruction_2_addr);
  } else if (is_bc_far_variant3_at(instruction_addr)) {
    // Variant 3 is two nops; it falls through to the next instruction.
    return instruction_addr + 8;
  }
  // variant 4 ???
  ShouldNotReachHere();
  return NULL;
}

// Re-target the 2-instruction bc_far sequence at instruction_addr to dest,
// choosing the tightest variant that can reach it, and flush the icache.
// NOTE(review): not mt-safe for concurrently executing code — TODO confirm
// callers patch only at safepoints or before publication.
void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) {

  if (is_bc_far_variant3_at(instruction_addr)) {
    // variant 3, far cond branch to the next instruction, already patched to nops:
    //
    //    nop
    //    endgroup
    //  SKIP/DEST:
    //
    return;
  }

  // first, extract boint and biint from the current branch
  int boint = 0;
  int biint = 0;

  ResourceMark rm;
  const int code_size = 2 * BytesPerInstWord;
  CodeBuffer buf(instruction_addr, code_size);
  MacroAssembler masm(&buf);
  if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) {
    // Far branch to next instruction: Optimize it by patching nops (produce variant 3).
    masm.nop();
    masm.endgroup();
  } else {
    if (is_bc_far_variant1_at(instruction_addr)) {
      // variant 1, the 1st instruction contains the destination address:
      //
      //    bcxx  DEST
      //    nop
      //
      const int instruction_1 = *(int*)(instruction_addr);
      boint = inv_bo_field(instruction_1);
      biint = inv_bi_field(instruction_1);
    } else if (is_bc_far_variant2_at(instruction_addr)) {
      // variant 2, the 2nd instruction contains the destination address:
      //
      //    b!cxx SKIP
      //    bxx   DEST
      //  SKIP:
      //
      // Recover the original condition by inverting the skip branch's BO field.
      const int instruction_1 = *(int*)(instruction_addr);
      boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))),
                                 opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1))));
      biint = inv_bi_field(instruction_1);
    } else {
      // variant 4???
      ShouldNotReachHere();
    }

    // second, set the new branch destination and optimize the code
    if (dest != instruction_addr + 4 && // the bc_far is still unbound!
        masm.is_within_range_of_bcxx(dest, instruction_addr)) {
      // variant 1:
      //
      //    bcxx  DEST
      //    nop
      //
      masm.bc(boint, biint, dest);
      masm.nop();
    } else {
      // variant 2:
      //
      //    b!cxx SKIP
      //    bxx   DEST
      //  SKIP:
      //
      const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
                                                    opposite_bcond(inv_boint_bcond(boint)));
      const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord;
      masm.bc(opposite_boint, biint, not_taken_pc);
      masm.b(dest);
    }
  }
  ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
}

// Emit a NOT mt-safe patchable 64 bit absolute call/jump.
// Always occupies bxx64_patchable_size bytes (7 instructions), either as a
// pc-relative b/bl padded with nops (variant 2) or as a TOC-relative address
// computation + mtctr + bctr[l] (variant 1b).
void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) {
  // get current pc
  uint64_t start_pc = (uint64_t) pc();

  const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last
  const address pc_of_b  = (address) (start_pc + (0*BytesPerInstWord)); // b is first

  // relocate here
  if (rt != relocInfo::none) {
    relocate(rt);
  }

  if ( ReoptimizeCallSequences &&
       (( link && is_within_range_of_b(dest, pc_of_bl)) ||
        (!link && is_within_range_of_b(dest, pc_of_b)))) {
    // variant 2:
    // Emit an optimized, pc-relative call/jump.

    if (link) {
      // some padding
      nop();
      nop();
      nop();
      nop();
      nop();
      nop();

      // do the call
      assert(pc() == pc_of_bl, "just checking");
      bl(dest, relocInfo::none);
    } else {
      // do the jump
      assert(pc() == pc_of_b, "just checking");
      b(dest, relocInfo::none);

      // some padding
      nop();
      nop();
      nop();
      nop();
      nop();
      nop();
    }

    // Assert that we can identify the emitted call/jump.
    assert(is_bxx64_patchable_variant2_at((address)start_pc, link),
           "can't identify emitted call");
  } else {
    // variant 1:
    mr(R0, R11);  // spill R11 -> R0.

    // Load the destination address into CTR,
    // calculate destination relative to global toc.
    calculate_address_from_global_toc(R11, dest, true, true, false);

    mtctr(R11);
    mr(R11, R0);  // spill R11 <- R0.
    nop();

    // do the call/jump
    if (link) {
      bctrl();
    } else{
      bctr();
    }
    // Assert that we can identify the emitted call/jump.
    assert(is_bxx64_patchable_variant1b_at((address)start_pc, link),
           "can't identify emitted call");
  }

  // Assert that we can identify the emitted call/jump.
  assert(is_bxx64_patchable_at((address)start_pc, link),
         "can't identify emitted call");
  assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest,
         "wrong encoding of dest address");
}

// Identify a bxx64_patchable instruction.
bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) {
  return is_bxx64_patchable_variant1b_at(instruction_addr, link)
    //|| is_bxx64_patchable_variant1_at(instruction_addr, link)
      || is_bxx64_patchable_variant2_at(instruction_addr, link);
}

// Does the call64_patchable instruction use a pc-relative encoding of
// the call destination?
bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) {
  // variant 2 is pc-relative
  return is_bxx64_patchable_variant2_at(instruction_addr, link);
}

// Identify variant 1.
bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) {
  unsigned int* instr = (unsigned int*) instruction_addr;
  return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
      && is_mtctr(instr[5]) // mtctr
      && is_load_const_at(instruction_addr);
}

// Identify variant 1b: load destination relative to global toc.
bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) {
  unsigned int* instr = (unsigned int*) instruction_addr;
  return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
      && is_mtctr(instr[3]) // mtctr
      && is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr);
}

// Identify variant 2.
bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) {
  unsigned int* instr = (unsigned int*) instruction_addr;
  if (link) {
    return is_bl (instr[6])  // bl dest is last
        && is_nop(instr[0])  // nop
        && is_nop(instr[1])  // nop
        && is_nop(instr[2])  // nop
        && is_nop(instr[3])  // nop
        && is_nop(instr[4])  // nop
        && is_nop(instr[5]); // nop
  } else {
    return is_b  (instr[0])  // b dest is first
        && is_nop(instr[1])  // nop
        && is_nop(instr[2])  // nop
        && is_nop(instr[3])  // nop
        && is_nop(instr[4])  // nop
        && is_nop(instr[5])  // nop
        && is_nop(instr[6]); // nop
  }
}

// Set dest address of a bxx64_patchable instruction.
// Re-emits the whole 7-instruction sequence in place and flushes the icache.
void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) {
  ResourceMark rm;
  int code_size = MacroAssembler::bxx64_patchable_size;
  CodeBuffer buf(instruction_addr, code_size);
  MacroAssembler masm(&buf);
  masm.bxx64_patchable(dest, relocInfo::none, link);
  ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
}

// Get dest address of a bxx64_patchable instruction.
address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) {
  if (is_bxx64_patchable_variant1_at(instruction_addr, link)) {
    return (address) (unsigned long) get_const(instruction_addr);
  } else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) {
    unsigned int* instr = (unsigned int*) instruction_addr;
    if (link) {
      const int instr_idx = 6; // bl is last
      int branchoffset = branch_destination(instr[instr_idx], 0);
      return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
    } else {
      const int instr_idx = 0; // b is first
      int branchoffset = branch_destination(instr[instr_idx], 0);
      return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
    }
  // Load dest relative to global toc.
  } else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) {
    return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord,
                                                               instruction_addr);
  } else {
    ShouldNotReachHere();
    return NULL;
  }
}

// Uses ordering which corresponds to ABI:
//    _savegpr0_14:  std  r14,-144(r1)
//    _savegpr0_15:  std  r15,-136(r1)
//    _savegpr0_16:  std  r16,-128(r1)
// Store the non-volatile GPRs R14-R31 and FPRs F14-F31 at dst + offset,
// 8 bytes apart in ascending register order.
void MacroAssembler::save_nonvolatile_gprs(Register dst, int offset) {
  std(R14, offset, dst);   offset += 8;
  std(R15, offset, dst);   offset += 8;
  std(R16, offset, dst);   offset += 8;
  std(R17, offset, dst);   offset += 8;
  std(R18, offset, dst);   offset += 8;
  std(R19, offset, dst);   offset += 8;
  std(R20, offset, dst);   offset += 8;
  std(R21, offset, dst);   offset += 8;
  std(R22, offset, dst);   offset += 8;
  std(R23, offset, dst);   offset += 8;
  std(R24, offset, dst);   offset += 8;
  std(R25, offset, dst);   offset += 8;
  std(R26, offset, dst);   offset += 8;
  std(R27, offset, dst);   offset += 8;
  std(R28, offset, dst);   offset += 8;
  std(R29, offset, dst);   offset += 8;
  std(R30, offset, dst);   offset += 8;
  std(R31, offset, dst);   offset += 8;

  stfd(F14, offset, dst);   offset += 8;
  stfd(F15, offset, dst);   offset += 8;
  stfd(F16, offset, dst);   offset += 8;
  stfd(F17, offset, dst);   offset += 8;
  stfd(F18, offset, dst);   offset += 8;
  stfd(F19, offset, dst);   offset += 8;
  stfd(F20, offset, dst);   offset += 8;
  stfd(F21, offset, dst);   offset += 8;
  stfd(F22, offset, dst);   offset += 8;
  stfd(F23, offset, dst);   offset += 8;
  stfd(F24, offset, dst);   offset += 8;
  stfd(F25, offset, dst);   offset += 8;
  stfd(F26, offset, dst);   offset += 8;
  stfd(F27, offset, dst);   offset += 8;
  stfd(F28, offset, dst);   offset += 8;
  stfd(F29, offset, dst);   offset += 8;
  stfd(F30, offset, dst);   offset += 8;
  stfd(F31, offset, dst);
}

// Uses ordering which corresponds to ABI:
//    _restgpr0_14:  ld   r14,-144(r1)
//    _restgpr0_15:  ld   r15,-136(r1)
//    _restgpr0_16:  ld   r16,-128(r1)
// Inverse of save_nonvolatile_gprs: reload R14-R31 and F14-F31 from src + offset.
void MacroAssembler::restore_nonvolatile_gprs(Register src, int offset) {
  ld(R14, offset, src);   offset += 8;
  ld(R15, offset, src);   offset += 8;
  ld(R16, offset, src);   offset += 8;
  ld(R17, offset, src);   offset += 8;
  ld(R18, offset, src);   offset += 8;
  ld(R19, offset, src);   offset += 8;
  ld(R20, offset, src);   offset += 8;
  ld(R21, offset, src);   offset += 8;
  ld(R22, offset, src);   offset += 8;
  ld(R23, offset, src);   offset += 8;
  ld(R24, offset, src);   offset += 8;
  ld(R25, offset, src);   offset += 8;
  ld(R26, offset, src);   offset += 8;
  ld(R27, offset, src);   offset += 8;
  ld(R28, offset, src);   offset += 8;
  ld(R29, offset, src);   offset += 8;
  ld(R30, offset, src);   offset += 8;
  ld(R31, offset, src);   offset += 8;

  // FP registers
  lfd(F14, offset, src);   offset += 8;
  lfd(F15, offset, src);   offset += 8;
  lfd(F16, offset, src);   offset += 8;
  lfd(F17, offset, src);   offset += 8;
  lfd(F18, offset, src);   offset += 8;
  lfd(F19, offset, src);   offset += 8;
  lfd(F20, offset, src);   offset += 8;
  lfd(F21, offset, src);   offset += 8;
  lfd(F22, offset, src);   offset += 8;
  lfd(F23, offset, src);   offset += 8;
  lfd(F24, offset, src);   offset += 8;
  lfd(F25, offset, src);   offset += 8;
  lfd(F26, offset, src);   offset += 8;
  lfd(F27, offset, src);   offset += 8;
  lfd(F28, offset, src);   offset += 8;
  lfd(F29, offset, src);   offset += 8;
  lfd(F30, offset, src);   offset += 8;
  lfd(F31, offset, src);
}

// For verify_oops.
// Store the volatile GPRs R2-R12 and FPRs F0-F13 at dst + offset.
void MacroAssembler::save_volatile_gprs(Register dst, int offset) {
  std(R2,  offset, dst);   offset += 8;
  std(R3,  offset, dst);   offset += 8;
  std(R4,  offset, dst);   offset += 8;
  std(R5,  offset, dst);   offset += 8;
  std(R6,  offset, dst);   offset += 8;
  std(R7,  offset, dst);   offset += 8;
  std(R8,  offset, dst);   offset += 8;
  std(R9,  offset, dst);   offset += 8;
  std(R10, offset, dst);   offset += 8;
  std(R11, offset, dst);   offset += 8;
  std(R12, offset, dst);   offset += 8;

  stfd(F0, offset, dst);   offset += 8;
  stfd(F1, offset, dst);   offset += 8;
  stfd(F2, offset, dst);   offset += 8;
  stfd(F3, offset, dst);   offset += 8;
  stfd(F4, offset, dst);   offset += 8;
  stfd(F5, offset, dst);   offset += 8;
  stfd(F6, offset, dst);   offset += 8;
  stfd(F7, offset, dst);   offset += 8;
  stfd(F8, offset, dst);   offset += 8;
  stfd(F9, offset, dst);   offset += 8;
  stfd(F10, offset, dst);   offset += 8;
  stfd(F11, offset, dst);   offset += 8;
  stfd(F12, offset, dst);   offset += 8;
  stfd(F13, offset, dst);
}

// For verify_oops.
// Reload the volatile GPRs R2..R12 and FPRs F0..F13 from the 8-byte
// slots written by save_volatile_gprs() (same layout, same offsets).
void MacroAssembler::restore_volatile_gprs(Register src, int offset) {
  ld(R2,  offset, src);   offset += 8;
  ld(R3,  offset, src);   offset += 8;
  ld(R4,  offset, src);   offset += 8;
  ld(R5,  offset, src);   offset += 8;
  ld(R6,  offset, src);   offset += 8;
  ld(R7,  offset, src);   offset += 8;
  ld(R8,  offset, src);   offset += 8;
  ld(R9,  offset, src);   offset += 8;
  ld(R10, offset, src);   offset += 8;
  ld(R11, offset, src);   offset += 8;
  ld(R12, offset, src);   offset += 8;

  lfd(F0, offset, src);   offset += 8;
  lfd(F1, offset, src);   offset += 8;
  lfd(F2, offset, src);   offset += 8;
  lfd(F3, offset, src);   offset += 8;
  lfd(F4, offset, src);   offset += 8;
  lfd(F5, offset, src);   offset += 8;
  lfd(F6, offset, src);   offset += 8;
  lfd(F7, offset, src);   offset += 8;
  lfd(F8, offset, src);   offset += 8;
  lfd(F9, offset, src);   offset += 8;
  lfd(F10, offset, src);  offset += 8;
  lfd(F11, offset, src);  offset += 8;
  lfd(F12, offset, src);  offset += 8;
  lfd(F13, offset, src);
}

// Save LR and CR into the ABI-defined save slots of the current frame.
// Clobbers tmp; tmp contains LR on exit (callers rely on this).
void MacroAssembler::save_LR_CR(Register tmp) {
  mfcr(tmp);
  std(tmp, _abi(cr), R1_SP);
  mflr(tmp);
  std(tmp, _abi(lr), R1_SP);
  // Tmp must contain lr on exit! (see return_addr and prolog in ppc64.ad)
}

// Restore LR and CR from the ABI save slots of the current frame. Clobbers tmp.
void MacroAssembler::restore_LR_CR(Register tmp) {
  assert(tmp != R1_SP, "must be distinct");
  ld(tmp, _abi(lr), R1_SP);
  mtlr(tmp);
  ld(tmp, _abi(cr), R1_SP);
  mtcr(tmp);
}

// Get the current PC into 'result' using a bl/mflr sequence.
// Destroys LR. Returns the address that ends up in LR (the pc after the bl).
address MacroAssembler::get_PC_trash_LR(Register result) {
  Label L;
  bl(L);
  bind(L);
  address lr_pc = pc();
  mflr(result);
  return lr_pc;
}

// Resize the current frame by 'offset' bytes (register form), keeping the
// back link intact. offset must be aligned to frame::alignment_in_bytes.
void MacroAssembler::resize_frame(Register offset, Register tmp) {
#ifdef ASSERT
  assert_different_registers(offset, tmp, R1_SP);
  andi_(tmp, offset, frame::alignment_in_bytes-1);
  asm_assert_eq("resize_frame: unaligned", 0x204);
#endif

  // tmp <- *(SP)
  ld(tmp, _abi(callers_sp), R1_SP);
  // addr <- SP + offset;
  // *(addr) <- tmp;
  // SP <- addr
  stdux(tmp, R1_SP, offset);
}

// Resize the current frame by a constant 'offset' (must fit in simm16),
// keeping the back link intact.
void MacroAssembler::resize_frame(int offset, Register tmp) {
  assert(is_simm(offset, 16), "too big an offset");
  assert_different_registers(tmp, R1_SP);
  assert((offset & (frame::alignment_in_bytes-1))==0, "resize_frame: unaligned");
  // tmp <- *(SP)
  ld(tmp, _abi(callers_sp), R1_SP);
  // addr <- SP + offset;
  // *(addr) <- tmp;
  // SP <- addr
  stdu(tmp, offset, R1_SP);
}

// Set SP to the absolute address 'addr', keeping the back link intact.
void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) {
  // (addr == tmp1) || (addr == tmp2) is allowed here!
  assert(tmp1 != tmp2, "must be distinct");

  // compute offset w.r.t. current stack pointer
  // tmp_1 <- addr - SP (!)
  subf(tmp1, R1_SP, addr);

  // atomically update SP keeping back link.
  resize_frame(tmp1/* offset */, tmp2/* tmp */);
}

// Push a frame of size 'bytes' (register form). Writes the old SP as back
// link into the new frame (stdux does store + update in one instruction).
void MacroAssembler::push_frame(Register bytes, Register tmp) {
#ifdef ASSERT
  assert(bytes != R0, "r0 not allowed here");
  andi_(R0, bytes, frame::alignment_in_bytes-1);
  asm_assert_eq("push_frame(Reg, Reg): unaligned", 0x203);
#endif
  neg(tmp, bytes);
  stdux(R1_SP, R1_SP, tmp);
}

// Push a frame of size `bytes'. The size is rounded up to the frame
// alignment; tmp is only clobbered if the (negated) size does not fit simm16.
void MacroAssembler::push_frame(unsigned int bytes, Register tmp) {
  long offset = align_addr(bytes, frame::alignment_in_bytes);
  if (is_simm(-offset, 16)) {
    stdu(R1_SP, -offset, R1_SP);
  } else {
    load_const_optimized(tmp, -offset);
    stdux(R1_SP, R1_SP, tmp);
  }
}

// Push a frame of size `bytes' plus abi_reg_args on top.
void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) {
  push_frame(bytes + frame::abi_reg_args_size, tmp);
}

// Set up a new C frame with a spill area for non-volatile GPRs and
// additional space for local variables.
void MacroAssembler::push_frame_reg_args_nonvolatiles(unsigned int bytes,
                                                      Register tmp) {
  push_frame(bytes + frame::abi_reg_args_size + frame::spill_nonvolatiles_size, tmp);
}

// Pop current C frame.
void MacroAssembler::pop_frame() {
  ld(R1_SP, _abi(callers_sp), R1_SP);
}

#if defined(ABI_ELFv2)
// ELFv2: branch (or branch-and-link) to the code address in
// r_function_entry via CTR. R12 must hold the entry address at the
// callee's global entry point per the ELFv2 ABI, hence the mr to R12.
address MacroAssembler::branch_to(Register r_function_entry, bool and_link) {
  // TODO(asmundak): make sure the caller uses R12 as function descriptor
  // most of the times.
  if (R12 != r_function_entry) {
    mr(R12, r_function_entry);
  }
  mtctr(R12);
  // Do a call or a branch.
  if (and_link) {
    bctrl();
  } else {
    bctr();
  }
  _last_calls_return_pc = pc();

  return _last_calls_return_pc;
}

// Call a C function via a function descriptor and use full C
// calling conventions. Updates and returns _last_calls_return_pc.
address MacroAssembler::call_c(Register r_function_entry) {
  return branch_to(r_function_entry, /*and_link=*/true);
}

// For tail calls: only branch, don't link, so callee returns to caller of this function.
address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) {
  return branch_to(r_function_entry, /*and_link=*/false);
}

// ELFv2: call a C function at an immediate address. The address is
// materialized into R12 (as required by branch_to), clobbering R0.
address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) {
  load_const(R12, function_entry, R0);
  return branch_to(R12, /*and_link=*/true);
}

#else
// Generic version of a call to C function via a function descriptor
// with variable support for C calling conventions (TOC, ENV, etc.).
// Updates and returns _last_calls_return_pc.
//
// Loads the callee's entry point (and optionally TOC and environment
// pointer) from the ELFv1 function descriptor, then branches via CTR.
// Clobbers R0; may clobber R2_TOC and R11 depending on the flags.
address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call,
                                  bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) {
  // we emit standard ptrgl glue code here
  assert((function_descriptor != R0), "function_descriptor cannot be R0");

  // retrieve necessary entries from the function descriptor
  ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor);
  mtctr(R0);

  if (load_toc_of_callee) {
    ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor);
  }
  if (load_env_of_callee) {
    ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor);
  } else if (load_toc_of_callee) {
    // No environment requested: clear R11 so the callee does not see a
    // stale environment pointer.
    li(R11, 0);
  }

  // do a call or a branch
  if (and_link) {
    bctrl();
  } else {
    bctr();
  }
  _last_calls_return_pc = pc();

  return _last_calls_return_pc;
}

// Call a C function via a function descriptor and use full C calling
// conventions.
// We don't use the TOC in generated code, so there is no need to save
// and restore its value.
address MacroAssembler::call_c(Register fd) {
  return branch_to(fd, /*and_link=*/true,
                       /*save toc=*/false,
                       /*restore toc=*/false,
                       /*load toc=*/true,
                       /*load env=*/true);
}

// Tail-call variant: branch without linking so the callee returns to our caller.
address MacroAssembler::call_c_and_return_to_caller(Register fd) {
  return branch_to(fd, /*and_link=*/false,
                       /*save toc=*/false,
                       /*restore toc=*/false,
                       /*load toc=*/true,
                       /*load env=*/true);
}

// ELFv1: call a C function given its function descriptor. For "friend"
// functions (see FunctionDescriptor) an optimizable direct call is emitted;
// otherwise the descriptor is dereferenced at runtime via branch_to().
// Clobbers R0 and R11 on the descriptor path.
address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) {
  if (rt != relocInfo::none) {
    // this call needs to be relocatable
    if (!ReoptimizeCallSequences
        || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
        || fd == NULL   // support code-size estimation
        || !fd->is_friend_function()
        || fd->entry() == NULL) {
      // it's not a friend function as defined by class FunctionDescriptor,
      // so do a full call-c here.
      load_const(R11, (address)fd, R0);

      bool has_env = (fd != NULL && fd->env() != NULL);
      return branch_to(R11, /*and_link=*/true,
                            /*save toc=*/false,
                            /*restore toc=*/false,
                            /*load toc=*/true,
                            /*load env=*/has_env);
    } else {
      // It's a friend function. Load the entry point and don't care about
      // toc and env. Use an optimizable call instruction, but ensure the
      // same code-size as in the case of a non-friend function.
      nop();
      nop();
      nop();
      bl64_patchable(fd->entry(), rt);
      _last_calls_return_pc = pc();
      return _last_calls_return_pc;
    }
  } else {
    // This call does not need to be relocatable, do more aggressive
    // optimizations.
    if (!ReoptimizeCallSequences
        || !fd->is_friend_function()) {
      // It's not a friend function as defined by class FunctionDescriptor,
      // so do a full call-c here.
      load_const(R11, (address)fd, R0);
      return branch_to(R11, /*and_link=*/true,
                            /*save toc=*/false,
                            /*restore toc=*/false,
                            /*load toc=*/true,
                            /*load env=*/true);
    } else {
      // it's a friend function, load the entry point and don't care about
      // toc and env.
      address dest = fd->entry();
      // Use a short relative bl if the target is in range, otherwise a
      // patchable 64-bit call sequence.
      if (is_within_range_of_b(dest, pc())) {
        bl(dest);
      } else {
        bl64_patchable(dest, rt);
      }
      _last_calls_return_pc = pc();
      return _last_calls_return_pc;
    }
  }
}

// Call a C function. All constants needed reside in TOC.
//
// Read the address to call from the TOC.
// Read env from TOC, if fd specifies an env.
// Read new TOC from TOC.
//
// Returns NULL if a needed TOC constant could not be allocated
// (load_const_from_method_toc failed); otherwise the return pc of the call.
address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd,
                                         relocInfo::relocType rt, Register toc) {
  if (!ReoptimizeCallSequences
      || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
      || !fd->is_friend_function()) {
    // It's not a friend function as defined by class FunctionDescriptor,
    // so do a full call-c here.
    assert(fd->entry() != NULL, "function must be linked");

    AddressLiteral fd_entry(fd->entry());
    bool success = load_const_from_method_toc(R11, fd_entry, toc, /*fixed_size*/ true);
    mtctr(R11);
    if (fd->env() == NULL) {
      li(R11, 0);
      // Nop keeps the code size fixed regardless of whether an env is loaded.
      nop();
    } else {
      AddressLiteral fd_env(fd->env());
      success = success && load_const_from_method_toc(R11, fd_env, toc, /*fixed_size*/ true);
    }
    AddressLiteral fd_toc(fd->toc());
    // Set R2_TOC (load from toc)
    success = success && load_const_from_method_toc(R2_TOC, fd_toc, toc, /*fixed_size*/ true);
    bctrl();
    _last_calls_return_pc = pc();
    if (!success) { return NULL; }
  } else {
    // It's a friend function, load the entry point and don't care about
    // toc and env. Use an optimizable call instruction, but ensure the
    // same code-size as in the case of a non-friend function.
    nop();
    bl64_patchable(fd->entry(), rt);
    _last_calls_return_pc = pc();
  }
  return _last_calls_return_pc;
}
#endif // ABI_ELFv2

// Common code for all call_VM variants: record the last Java frame, call
// entry_point with the current thread as first argument, and fetch the
// oop result (if any) from the thread afterwards.
// Exception checking is intentionally not done here (see ShouldNotReachHere).
void MacroAssembler::call_VM_base(Register oop_result,
                                  Register last_java_sp,
                                  address entry_point,
                                  bool check_exceptions) {
  BLOCK_COMMENT("call_VM {");
  // Determine last_java_sp register.
  if (!last_java_sp->is_valid()) {
    last_java_sp = R1_SP;
  }
  set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1);

  // ARG1 must hold thread address.
  mr(R3_ARG1, R16_thread);
#if defined(ABI_ELFv2)
  address return_pc = call_c(entry_point, relocInfo::none);
#else
  address return_pc = call_c((FunctionDescriptor*)entry_point, relocInfo::none);
#endif

  reset_last_Java_frame();

  // Check for pending exceptions.
  if (check_exceptions) {
    // We don't check for exceptions here.
    ShouldNotReachHere();
  }

  // Get oop result if there is one and reset the value in the thread.
  if (oop_result->is_valid()) {
    get_vm_result(oop_result);
  }

  _last_calls_return_pc = return_pc;
  BLOCK_COMMENT("} call_VM");
}

// Call a leaf VM function (no Java frame bookkeeping, no thread argument).
void MacroAssembler::call_VM_leaf_base(address entry_point) {
  BLOCK_COMMENT("call_VM_leaf {");
#if defined(ABI_ELFv2)
  call_c(entry_point, relocInfo::none);
#else
  call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, entry_point), relocInfo::none);
#endif
  BLOCK_COMMENT("} call_VM_leaf");
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions) {
  call_VM_base(oop_result, noreg, entry_point, check_exceptions);
}

// call_VM with one Java argument. Arguments start at R4_ARG2 because
// R3_ARG1 is reserved for the thread (set by call_VM_base).
void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread.
  mr_if_needed(R4_ARG2, arg_1);
  call_VM(oop_result, entry_point, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread
  mr_if_needed(R4_ARG2, arg_1);
  assert(arg_2 != R4_ARG2, "smashed argument");
  mr_if_needed(R5_ARG3, arg_2);
  call_VM(oop_result, entry_point, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread
  mr_if_needed(R4_ARG2, arg_1);
  assert(arg_2 != R4_ARG2, "smashed argument");
  mr_if_needed(R5_ARG3, arg_2);
  mr_if_needed(R6_ARG4, arg_3);
  call_VM(oop_result, entry_point, check_exceptions);
}

void MacroAssembler::call_VM_leaf(address entry_point) {
  call_VM_leaf_base(entry_point);
}

// Leaf-call variants: arguments start at R3_ARG1 (no thread argument here).
void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) {
  mr_if_needed(R3_ARG1, arg_1);
  call_VM_leaf(entry_point);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) {
  mr_if_needed(R3_ARG1, arg_1);
  assert(arg_2 != R3_ARG1, "smashed argument");
  mr_if_needed(R4_ARG2, arg_2);
  call_VM_leaf(entry_point);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) {
  mr_if_needed(R3_ARG1, arg_1);
  assert(arg_2 != R3_ARG1, "smashed argument");
  mr_if_needed(R4_ARG2, arg_2);
  assert(arg_3 != R3_ARG1 && arg_3 != R4_ARG2, "smashed argument");
  mr_if_needed(R5_ARG3, arg_3);
  call_VM_leaf(entry_point);
}

// Check whether instruction is a read access to the polling page
// which was emitted by load_from_polling_page(..).
// If ucontext is given, also verifies that the effective address is the
// safepoint polling page; *polling_address_ptr receives the address (or NULL).
bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext,
                                               address* polling_address_ptr) {
  if (!is_ld(instruction))
    return false; // It's not a ld. Fail.

  int rt = inv_rt_field(instruction);
  int ra = inv_ra_field(instruction);
  int ds = inv_ds_field(instruction);
  if (!(ds == 0 && ra != 0 && rt == 0)) {
    return false; // It's not a ld(r0, X, ra). Fail.
  }

  if (!ucontext) {
    // Set polling address.
    if (polling_address_ptr != NULL) {
      *polling_address_ptr = NULL;
    }
    return true; // No ucontext given. Can't check value of ra. Assume true.
  }

#ifdef LINUX
  // Ucontext given. Check that register ra contains the address of
  // the safepoint polling page.
  ucontext_t* uc = (ucontext_t*) ucontext;
  // Set polling address.
  address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds;
  if (polling_address_ptr != NULL) {
    *polling_address_ptr = addr;
  }
  return os::is_poll_address(addr);
#else
  // Not on Linux, ucontext must be NULL.
  ShouldNotReachHere();
  return false;
#endif
}

// Check whether instruction is a write access to the memory serialization
// page of the given thread (stw/stwu/stwx/stwux forms), resolving the
// effective address from the ucontext. Linux only.
bool MacroAssembler::is_memory_serialization(int instruction, JavaThread* thread, void* ucontext) {
#ifdef LINUX
  ucontext_t* uc = (ucontext_t*) ucontext;

  if (is_stwx(instruction) || is_stwux(instruction)) {
    int ra = inv_ra_field(instruction);
    int rb = inv_rb_field(instruction);

    // look up content of ra and rb in ucontext
    address ra_val=(address)uc->uc_mcontext.regs->gpr[ra];
    long rb_val=(long)uc->uc_mcontext.regs->gpr[rb];
    return os::is_memory_serialize_page(thread, ra_val+rb_val);
  } else if (is_stw(instruction) || is_stwu(instruction)) {
    int ra = inv_ra_field(instruction);
    int d1 = inv_d1_field(instruction);

    // look up content of ra in ucontext
    address ra_val=(address)uc->uc_mcontext.regs->gpr[ra];
    return os::is_memory_serialize_page(thread, ra_val+d1);
  } else {
    return false;
  }
#else
  // workaround not needed on !LINUX :-)
  ShouldNotCallThis();
  return false;
#endif
}

void MacroAssembler::bang_stack_with_offset(int offset) {
  // When increasing the stack, the old stack pointer will be written
  // to the new top of stack according to the PPC64 abi.
  // Therefore, stack banging is not necessary when increasing
  // the stack by <= os::vm_page_size() bytes.
  // When increasing the stack by a larger amount, this method is
  // called repeatedly to bang the intermediate pages.

  // Stack grows down, caller passes positive offset.
  assert(offset > 0, "must bang with positive offset");

  long stdoffset = -offset;

  if (is_simm(stdoffset, 16)) {
    // Signed 16 bit offset, a simple std is ok.
    if (UseLoadInstructionsForStackBangingPPC64) {
      ld(R0, (int)(signed short)stdoffset, R1_SP);
    } else {
      std(R0,(int)(signed short)stdoffset, R1_SP);
    }
  } else if (is_simm(stdoffset, 31)) {
    // Offset needs an addis/ld(std) pair; split into hi/lo simm16 parts.
    const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset);
    const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset);

    Register tmp = R11;
    addis(tmp, R1_SP, hi);
    if (UseLoadInstructionsForStackBangingPPC64) {
      ld(R0, lo, tmp);
    } else {
      std(R0, lo, tmp);
    }
  } else {
    ShouldNotReachHere();
  }
}

// If instruction is a stack bang of the form
//   std    R0,    x(Ry),       (see bang_stack_with_offset())
//   stdu   R1_SP, x(R1_SP),    (see push_frame(), resize_frame())
// or stdux R1_SP, Rx, R1_SP    (see push_frame(), resize_frame())
// return the banged address. Otherwise, return 0.
address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) {
#ifdef LINUX
  ucontext_t* uc = (ucontext_t*) ucontext;
  int rs = inv_rs_field(instruction);
  int ra = inv_ra_field(instruction);
  if (   (is_ld(instruction)   && rs == 0 &&  UseLoadInstructionsForStackBangingPPC64)
      || (is_std(instruction)  && rs == 0 && !UseLoadInstructionsForStackBangingPPC64)
      || (is_stdu(instruction) && rs == 1)) {
    int ds = inv_ds_field(instruction);
    // return banged address
    return ds+(address)uc->uc_mcontext.regs->gpr[ra];
  } else if (is_stdux(instruction) && rs == 1) {
    int rb = inv_rb_field(instruction);
    address sp = (address)uc->uc_mcontext.regs->gpr[1];
    long rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
    // Only a store below SP through SP itself counts as a bang.
    return ra != 1 || rb_val >= 0 ? NULL         // not a stack bang
                                  : sp + rb_val; // banged address
  }
  return NULL; // not a stack bang
#else
  // workaround not needed on !LINUX :-)
  ShouldNotCallThis();
  return NULL;
#endif
}

// Check whether SP has grown into the reserved stack zone; if so, re-enable
// the zone via the runtime and jump to the delayed StackOverflowError stub.
// Clobbers R0.
void MacroAssembler::reserved_stack_check(Register return_pc) {
  // Test if reserved zone needs to be enabled.
  Label no_reserved_zone_enabling;

  ld_ptr(R0, JavaThread::reserved_stack_activation_offset(), R16_thread);
  cmpld(CCR0, R1_SP, R0);
  blt_predict_taken(CCR0, no_reserved_zone_enabling);

  // Enable reserved zone again, throw stack overflow exception.
  push_frame_reg_args(0, R0);
  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), R16_thread);
  pop_frame();
  mtlr(return_pc);
  load_const_optimized(R0, StubRoutines::throw_delayed_StackOverflowError_entry());
  mtctr(R0);
  bctr();

  should_not_reach_here();

  bind(no_reserved_zone_enabling);
}

// Atomic 64-bit exchange: dest_current_value <- *addr_base,
// *addr_base <- exchange_value, via ldarx/stdcx_ retry loop.
void MacroAssembler::getandsetd(Register dest_current_value, Register exchange_value, Register addr_base,
                                bool cmpxchgx_hint) {
  Label retry;
  bind(retry);
  ldarx(dest_current_value, addr_base, cmpxchgx_hint);
  stdcx_(exchange_value, addr_base);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
  } else {
    bne(                  CCR0, retry); // StXcx_ sets CCR0.
  }
}

// Atomic 64-bit fetch-and-add: dest_current_value <- *addr_base,
// *addr_base <- *addr_base + inc_value. Kills tmp.
void MacroAssembler::getandaddd(Register dest_current_value, Register inc_value, Register addr_base,
                                Register tmp, bool cmpxchgx_hint) {
  Label retry;
  bind(retry);
  ldarx(dest_current_value, addr_base, cmpxchgx_hint);
  add(tmp, dest_current_value, inc_value);
  stdcx_(tmp, addr_base);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
  } else {
    bne(                  CCR0, retry); // StXcx_ sets CCR0.
  }
}

// Word/sub-word atomic helper functions

// Temps and addr_base are killed if size < 4 and processor does not support respective instructions.
// Only signed types are supported with size < 4.
// Atomic add always kills tmp1.
void MacroAssembler::atomic_get_and_modify_generic(Register dest_current_value, Register exchange_value,
                                                   Register addr_base, Register tmp1, Register tmp2, Register tmp3,
                                                   bool cmpxchgx_hint, bool is_add, int size) {
  // Sub-word instructions are available since Power 8.
  // For older processors, instruction_type != size holds, and we
  // emulate the sub-word instructions by constructing a 4-byte value
  // that leaves the other bytes unchanged.
  const int instruction_type = VM_Version::has_lqarx() ? size : 4;

  Label retry;

  Register shift_amount = noreg,
           val32 = dest_current_value,
           modval = is_add ? tmp1 : exchange_value;

  if (instruction_type != size) {
    assert_different_registers(tmp1, tmp2, tmp3, dest_current_value, exchange_value, addr_base);
    modval = tmp1;
    shift_amount = tmp2;
    val32 = tmp3;
    // Need some preparation: Compute shift amount, align address. Note: shorts must be 2 byte aligned.
#ifdef VM_LITTLE_ENDIAN
    rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
    clrrdi(addr_base, addr_base, 2);
#else
    xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
    clrrdi(addr_base, addr_base, 2);
    rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
#endif
  }

  // atomic emulation loop
  bind(retry);

  switch (instruction_type) {
    case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
    case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
    case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
    default: ShouldNotReachHere();
  }

  if (instruction_type != size) {
    // Extract the addressed sub-word from the loaded word.
    srw(dest_current_value, val32, shift_amount);
  }

  if (is_add) { add(modval, dest_current_value, exchange_value); }

  if (instruction_type != size) {
    // Transform exchange value such that the replacement can be done by one xor instruction.
    xorr(modval, dest_current_value, is_add ? modval : exchange_value);
    clrldi(modval, modval, (size == 1) ? 56 : 48);
    slw(modval, modval, shift_amount);
    xorr(modval, val32, modval);
  }

  switch (instruction_type) {
    case 4: stwcx_(modval, addr_base); break;
    case 2: sthcx_(modval, addr_base); break;
    case 1: stbcx_(modval, addr_base); break;
    default: ShouldNotReachHere();
  }

  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
  } else {
    bne(                  CCR0, retry); // StXcx_ sets CCR0.
  }

  // l?arx zero-extends, but Java wants byte/short values sign-extended.
  if (size == 1) {
    extsb(dest_current_value, dest_current_value);
  } else if (size == 2) {
    extsh(dest_current_value, dest_current_value);
  };
}

// Temps, addr_base and exchange_value are killed if size < 4 and processor does not support respective instructions.
// Only signed types are supported with size < 4.
1534 void MacroAssembler::cmpxchg_loop_body(ConditionRegister flag, Register dest_current_value, 1535 Register compare_value, Register exchange_value, 1536 Register addr_base, Register tmp1, Register tmp2, 1537 Label &retry, Label &failed, bool cmpxchgx_hint, int size) { 1538 // Sub-word instructions are available since Power 8. 1539 // For older processors, instruction_type != size holds, and we 1540 // emulate the sub-word instructions by constructing a 4-byte value 1541 // that leaves the other bytes unchanged. 1542 const int instruction_type = VM_Version::has_lqarx() ? size : 4; 1543 1544 Register shift_amount = noreg, 1545 val32 = dest_current_value, 1546 modval = exchange_value; 1547 1548 if (instruction_type != size) { 1549 assert_different_registers(tmp1, tmp2, dest_current_value, compare_value, exchange_value, addr_base); 1550 shift_amount = tmp1; 1551 val32 = tmp2; 1552 modval = tmp2; 1553 // Need some preperation: Compute shift amount, align address. Note: shorts must be 2 byte aligned. 1554 #ifdef VM_LITTLE_ENDIAN 1555 rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8; 1556 clrrdi(addr_base, addr_base, 2); 1557 #else 1558 xori(shift_amount, addr_base, (size == 1) ? 3 : 2); 1559 clrrdi(addr_base, addr_base, 2); 1560 rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16; 1561 #endif 1562 // Transform exchange value such that the replacement can be done by one xor instruction. 1563 xorr(exchange_value, compare_value, exchange_value); 1564 clrldi(exchange_value, exchange_value, (size == 1) ? 
56 : 48); 1565 slw(exchange_value, exchange_value, shift_amount); 1566 } 1567 1568 // atomic emulation loop 1569 bind(retry); 1570 1571 switch (instruction_type) { 1572 case 4: lwarx(val32, addr_base, cmpxchgx_hint); break; 1573 case 2: lharx(val32, addr_base, cmpxchgx_hint); break; 1574 case 1: lbarx(val32, addr_base, cmpxchgx_hint); break; 1575 default: ShouldNotReachHere(); 1576 } 1577 1578 if (instruction_type != size) { 1579 srw(dest_current_value, val32, shift_amount); 1580 } 1581 if (size == 1) { 1582 extsb(dest_current_value, dest_current_value); 1583 } else if (size == 2) { 1584 extsh(dest_current_value, dest_current_value); 1585 }; 1586 1587 cmpw(flag, dest_current_value, compare_value); 1588 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1589 bne_predict_not_taken(flag, failed); 1590 } else { 1591 bne( flag, failed); 1592 } 1593 // branch to done => (flag == ne), (dest_current_value != compare_value) 1594 // fall through => (flag == eq), (dest_current_value == compare_value) 1595 1596 if (instruction_type != size) { 1597 xorr(modval, val32, exchange_value); 1598 } 1599 1600 switch (instruction_type) { 1601 case 4: stwcx_(modval, addr_base); break; 1602 case 2: sthcx_(modval, addr_base); break; 1603 case 1: stbcx_(modval, addr_base); break; 1604 default: ShouldNotReachHere(); 1605 } 1606 } 1607 1608 // CmpxchgX sets condition register to cmpX(current, compare). 1609 void MacroAssembler::cmpxchg_generic(ConditionRegister flag, Register dest_current_value, 1610 Register compare_value, Register exchange_value, 1611 Register addr_base, Register tmp1, Register tmp2, 1612 int semantics, bool cmpxchgx_hint, 1613 Register int_flag_success, bool contention_hint, bool weak, int size) { 1614 Label retry; 1615 Label failed; 1616 Label done; 1617 1618 // Save one branch if result is returned via register and 1619 // result register is different from the other ones. 
1620 bool use_result_reg = (int_flag_success != noreg); 1621 bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value && 1622 int_flag_success != exchange_value && int_flag_success != addr_base && 1623 int_flag_success != tmp1 && int_flag_success != tmp2); 1624 assert(!weak || flag == CCR0, "weak only supported with CCR0"); 1625 assert(size == 1 || size == 2 || size == 4, "unsupported"); 1626 1627 if (use_result_reg && preset_result_reg) { 1628 li(int_flag_success, 0); // preset (assume cas failed) 1629 } 1630 1631 // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM). 1632 if (contention_hint) { // Don't try to reserve if cmp fails. 1633 switch (size) { 1634 case 1: lbz(dest_current_value, 0, addr_base); extsb(dest_current_value, dest_current_value); break; 1635 case 2: lha(dest_current_value, 0, addr_base); break; 1636 case 4: lwz(dest_current_value, 0, addr_base); break; 1637 default: ShouldNotReachHere(); 1638 } 1639 cmpw(flag, dest_current_value, compare_value); 1640 bne(flag, failed); 1641 } 1642 1643 // release/fence semantics 1644 if (semantics & MemBarRel) { 1645 release(); 1646 } 1647 1648 cmpxchg_loop_body(flag, dest_current_value, compare_value, exchange_value, addr_base, tmp1, tmp2, 1649 retry, failed, cmpxchgx_hint, size); 1650 if (!weak || use_result_reg) { 1651 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1652 bne_predict_not_taken(CCR0, weak ? failed : retry); // StXcx_ sets CCR0. 1653 } else { 1654 bne( CCR0, weak ? failed : retry); // StXcx_ sets CCR0. 1655 } 1656 } 1657 // fall through => (flag == eq), (dest_current_value == compare_value), (swapped) 1658 1659 // Result in register (must do this at the end because int_flag_success can be the 1660 // same register as one above). 
1661 if (use_result_reg) { 1662 li(int_flag_success, 1); 1663 } 1664 1665 if (semantics & MemBarFenceAfter) { 1666 fence(); 1667 } else if (semantics & MemBarAcq) { 1668 isync(); 1669 } 1670 1671 if (use_result_reg && !preset_result_reg) { 1672 b(done); 1673 } 1674 1675 bind(failed); 1676 if (use_result_reg && !preset_result_reg) { 1677 li(int_flag_success, 0); 1678 } 1679 1680 bind(done); 1681 // (flag == ne) => (dest_current_value != compare_value), (!swapped) 1682 // (flag == eq) => (dest_current_value == compare_value), ( swapped) 1683 } 1684 1685 // Preforms atomic compare exchange: 1686 // if (compare_value == *addr_base) 1687 // *addr_base = exchange_value 1688 // int_flag_success = 1; 1689 // else 1690 // int_flag_success = 0; 1691 // 1692 // ConditionRegister flag = cmp(compare_value, *addr_base) 1693 // Register dest_current_value = *addr_base 1694 // Register compare_value Used to compare with value in memory 1695 // Register exchange_value Written to memory if compare_value == *addr_base 1696 // Register addr_base The memory location to compareXChange 1697 // Register int_flag_success Set to 1 if exchange_value was written to *addr_base 1698 // 1699 // To avoid the costly compare exchange the value is tested beforehand. 1700 // Several special cases exist to avoid that unnecessary information is generated. 1701 // 1702 void MacroAssembler::cmpxchgd(ConditionRegister flag, 1703 Register dest_current_value, RegisterOrConstant compare_value, Register exchange_value, 1704 Register addr_base, int semantics, bool cmpxchgx_hint, 1705 Register int_flag_success, Label* failed_ext, bool contention_hint, bool weak) { 1706 Label retry; 1707 Label failed_int; 1708 Label& failed = (failed_ext != NULL) ? *failed_ext : failed_int; 1709 Label done; 1710 1711 // Save one branch if result is returned via register and result register is different from the other ones. 
1712 bool use_result_reg = (int_flag_success!=noreg); 1713 bool preset_result_reg = (int_flag_success!=dest_current_value && int_flag_success!=compare_value.register_or_noreg() && 1714 int_flag_success!=exchange_value && int_flag_success!=addr_base); 1715 assert(!weak || flag == CCR0, "weak only supported with CCR0"); 1716 assert(int_flag_success == noreg || failed_ext == NULL, "cannot have both"); 1717 1718 if (use_result_reg && preset_result_reg) { 1719 li(int_flag_success, 0); // preset (assume cas failed) 1720 } 1721 1722 // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM). 1723 if (contention_hint) { // Don't try to reserve if cmp fails. 1724 ld(dest_current_value, 0, addr_base); 1725 cmpd(flag, compare_value, dest_current_value); 1726 bne(flag, failed); 1727 } 1728 1729 // release/fence semantics 1730 if (semantics & MemBarRel) { 1731 release(); 1732 } 1733 1734 // atomic emulation loop 1735 bind(retry); 1736 1737 ldarx(dest_current_value, addr_base, cmpxchgx_hint); 1738 cmpd(flag, compare_value, dest_current_value); 1739 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1740 bne_predict_not_taken(flag, failed); 1741 } else { 1742 bne( flag, failed); 1743 } 1744 1745 stdcx_(exchange_value, addr_base); 1746 if (!weak || use_result_reg || failed_ext) { 1747 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1748 bne_predict_not_taken(CCR0, weak ? failed : retry); // stXcx_ sets CCR0 1749 } else { 1750 bne( CCR0, weak ? 
           failed : retry); // stXcx_ sets CCR0
    }
  }

  // result in register (must do this at the end because int_flag_success can be the same register as one above)
  if (use_result_reg) {
    li(int_flag_success, 1);
  }

  // Acquire/fence semantics after the exchange, as requested by the caller.
  if (semantics & MemBarFenceAfter) {
    fence();
  } else if (semantics & MemBarAcq) {
    isync();
  }

  // If the result register was not preset to 0, jump over the failure stub below.
  if (use_result_reg && !preset_result_reg) {
    b(done);
  }

  bind(failed_int);
  if (use_result_reg && !preset_result_reg) {
    li(int_flag_success, 0); // Record the failed exchange.
  }

  bind(done);
  // (flag == ne) => (dest_current_value != compare_value), (!swapped)
  // (flag == eq) => (dest_current_value == compare_value), ( swapped)
}

// Look up the method for a megamorphic invokeinterface call.
// The target method is determined by <intf_klass, itable_index>.
// The receiver klass is in recv_klass.
// On success, the result will be in method_result, and execution falls through.
// On failure, execution transfers to the given label.
void MacroAssembler::lookup_interface_method(Register recv_klass,
                                             Register intf_klass,
                                             RegisterOrConstant itable_index,
                                             Register method_result,
                                             Register scan_temp,
                                             Register sethi_temp,
                                             Label& L_no_such_interface) {
  assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);
  assert(itable_index.is_constant() || itable_index.as_register() == method_result,
         "caller must use same register for non-constant itable index as for method");

  // Compute start of first itableOffsetEntry (which is at the end of the vtable).
  int vtable_base = in_bytes(Klass::vtable_start_offset());
  int itentry_off = itableMethodEntry::method_offset_in_bytes();
  int logMEsize   = exact_log2(itableMethodEntry::size() * wordSize); // log2 of an itable method entry, in bytes
  int scan_step   = itableOffsetEntry::size() * wordSize;             // bytes to advance per itableOffsetEntry
  int log_vte_size= exact_log2(vtableEntry::size_in_bytes());

  lwz(scan_temp, in_bytes(Klass::vtable_length_offset()), recv_klass);
  // %%% We should store the aligned, prescaled offset in the klassoop.
  // Then the next several instructions would fold away.

  // scan_temp = recv_klass + vtable_base + vtable_length * vtable_entry_size,
  // i.e. the address of the first itableOffsetEntry.
  sldi(scan_temp, scan_temp, log_vte_size);
  addi(scan_temp, scan_temp, vtable_base);
  add(scan_temp, recv_klass, scan_temp);

  // Adjust recv_klass by scaled itable_index, so we can free itable_index.
  if (itable_index.is_register()) {
    Register itable_offset = itable_index.as_register();
    sldi(itable_offset, itable_offset, logMEsize);
    if (itentry_off) addi(itable_offset, itable_offset, itentry_off);
    add(recv_klass, itable_offset, recv_klass);
  } else {
    long itable_offset = (long)itable_index.as_constant();
    load_const_optimized(sethi_temp, (itable_offset<<logMEsize)+itentry_off); // static address, no relocation
    add(recv_klass, sethi_temp, recv_klass);
  }

  // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
  //   if (scan->interface() == intf) {
  //     result = (klass + scan->offset() + itable_index);
  //   }
  // }
  Label search, found_method;

  // The search loop is emitted twice (peel == 1, then 0): the peeled copy
  // branches out on a hit, the looping copy inverts the test so that a hit
  // falls through to found_method.
  for (int peel = 1; peel >= 0; peel--) {
    // %%%% Could load both offset and interface in one ldx, if they were
    // in the opposite order. This would save a load.
    ld(method_result, itableOffsetEntry::interface_offset_in_bytes(), scan_temp);

    // Check that this entry is non-null. A null entry means that
    // the receiver class doesn't implement the interface, and wasn't the
    // same as when the caller was compiled.
    cmpd(CCR0, method_result, intf_klass);

    if (peel) {
      beq(CCR0, found_method);
    } else {
      bne(CCR0, search);
      // (invert the test to fall through to found_method...)
    }

    if (!peel) break;

    bind(search);

    // A null interface entry terminates the itable: the interface is absent.
    cmpdi(CCR0, method_result, 0);
    beq(CCR0, L_no_such_interface);
    addi(scan_temp, scan_temp, scan_step);
  }

  bind(found_method);

  // Got a hit.
  int ito_offset = itableOffsetEntry::offset_offset_in_bytes();
  lwz(scan_temp, ito_offset, scan_temp);
  ldx(method_result, scan_temp, recv_klass);
}

// virtual method calling
void MacroAssembler::lookup_virtual_method(Register recv_klass,
                                           RegisterOrConstant vtable_index,
                                           Register method_result) {

  assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg());

  const int base = in_bytes(Klass::vtable_start_offset());
  assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");

  if (vtable_index.is_register()) {
    sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord);
    add(recv_klass, vtable_index.as_register(), recv_klass);
  } else {
    addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord);
  }
  // NOTE(review): the method is loaded into R19_method rather than method_result;
  // presumably callers pass method_result == R19_method - confirm at call sites.
  ld(R19_method, base + vtableEntry::method_offset_in_bytes(), recv_klass);
}

/////////////////////////////////////////// subtype checking ////////////////////////////////////////////
void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp1_reg,
                                                   Register temp2_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   Label* L_slow_path,
                                                   RegisterOrConstant super_check_offset) {

  const Register check_cache_offset = temp1_reg;
  const Register cached_super       = temp2_reg;

  assert_different_registers(sub_klass, super_klass, check_cache_offset,
                             cached_super);

  int sco_offset = in_bytes(Klass::super_check_offset_offset());
  int sc_offset  = in_bytes(Klass::secondary_super_cache_offset());

  // A constant super_check_offset of -1 means it must be loaded from super_klass below.
  bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
  bool need_slow_path = (must_load_sco || super_check_offset.constant_or_zero() == sco_offset);

  // NULL labels mean "fall through"; redirect them to a local label.
  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1 ||
         (L_slow_path == &L_fallthrough && label_nulls <= 2 && !need_slow_path),
         "at most one NULL in the batch, usually");

  // If the pointers are equal, we are done (e.g., String[] elements).
  // This self-check enables sharing of secondary supertype arrays among
  // non-primary types such as array-of-interface. Otherwise, each such
  // type would need its own customized SSA.
  // We move this check to the front of the fast path because many
  // type checks are in fact trivially successful in this manner,
  // so we get a nicely predicted branch right at the start of the check.
  cmpd(CCR0, sub_klass, super_klass);
  beq(CCR0, *L_success);

  // Check the supertype display:
  if (must_load_sco) {
    // The super check offset is always positive...
    lwz(check_cache_offset, sco_offset, super_klass);
    super_check_offset = RegisterOrConstant(check_cache_offset);
    // super_check_offset is register.
    assert_different_registers(sub_klass, super_klass, cached_super, super_check_offset.as_register());
  }
  // The loaded value is the offset from KlassOopDesc.

  ld(cached_super, super_check_offset, sub_klass);
  cmpd(CCR0, cached_super, super_klass);

  // This check has worked decisively for primary supers.
  // Secondary supers are sought in the super_cache ('super_cache_addr').
  // (Secondary supers are interfaces and very deeply nested subtypes.)
  // This works in the same check above because of a tricky aliasing
  // between the super_cache and the primary super display elements.
  // (The 'super_check_addr' can address either, as the case requires.)
  // Note that the cache is updated below if it does not help us find
  // what we need immediately.
  // So if it was a primary super, we can just fail immediately.
  // Otherwise, it's the slow path for us (no success at this point).

// Emit a branch to 'label' only if it is a real label, not the local fallthrough.
#define FINAL_JUMP(label) if (&(label) != &L_fallthrough) { b(label); }

  if (super_check_offset.is_register()) {
    beq(CCR0, *L_success);
    // Did we probe the secondary super cache slot? Then only the slow path can decide.
    cmpwi(CCR0, super_check_offset.as_register(), sc_offset);
    if (L_failure == &L_fallthrough) {
      beq(CCR0, *L_slow_path);
    } else {
      bne(CCR0, *L_failure);
      FINAL_JUMP(*L_slow_path);
    }
  } else {
    if (super_check_offset.as_constant() == sc_offset) {
      // Need a slow path; fast failure is impossible.
      if (L_slow_path == &L_fallthrough) {
        beq(CCR0, *L_success);
      } else {
        bne(CCR0, *L_slow_path);
        FINAL_JUMP(*L_success);
      }
    } else {
      // No slow path; it's a fast decision.
      if (L_failure == &L_fallthrough) {
        beq(CCR0, *L_success);
      } else {
        bne(CCR0, *L_failure);
        FINAL_JUMP(*L_success);
      }
    }
  }

  bind(L_fallthrough);
#undef FINAL_JUMP
}

// Scan sub_klass's secondary supers array for super_klass; on a hit the
// secondary super cache is updated. The outcome is reported via L_success,
// via result_reg (0 = hit, 1 = miss), or - if neither is provided - via
// blr with CR0.eq set (see the hit path below).
void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp1_reg,
                                                   Register temp2_reg,
                                                   Label* L_success,
                                                   Register result_reg) {
  const Register array_ptr = temp1_reg; // current value from cache array
  const Register temp      = temp2_reg;

  assert_different_registers(sub_klass, super_klass, array_ptr, temp);

  int source_offset = in_bytes(Klass::secondary_supers_offset());
  int target_offset = in_bytes(Klass::secondary_super_cache_offset());

  int length_offset = Array<Klass*>::length_offset_in_bytes();
  int base_offset   = Array<Klass*>::base_offset_in_bytes();

  Label hit, loop, failure, fallthru;

  ld(array_ptr, source_offset, sub_klass);

  // TODO: PPC port: assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated.");
  lwz(temp, length_offset, array_ptr);
  cmpwi(CCR0, temp, 0);
  beq(CCR0, result_reg!=noreg ? failure : fallthru); // length 0

  mtctr(temp); // load ctr

  bind(loop);
  // Oops in table are NO MORE compressed.
  ld(temp, base_offset, array_ptr);
  cmpd(CCR0, temp, super_klass);
  beq(CCR0, hit);
  addi(array_ptr, array_ptr, BytesPerWord);
  bdnz(loop); // Iterate at most 'length' times (ctr loaded above).

  bind(failure);
  if (result_reg!=noreg) li(result_reg, 1); // load non-zero result (indicates a miss)
  b(fallthru);

  bind(hit);
  std(super_klass, target_offset, sub_klass); // save result to cache
  if (result_reg != noreg) { li(result_reg, 0); } // load zero result (indicates a hit)
  if (L_success != NULL) { b(*L_success); }
  else if (result_reg == noreg) { blr(); } // return with CR0.eq if neither label nor result reg provided

  bind(fallthru);
}

// Try fast path, then go to slow one if not successful
void MacroAssembler::check_klass_subtype(Register sub_klass,
                                         Register super_klass,
                                         Register temp1_reg,
                                         Register temp2_reg,
                                         Label& L_success) {
  Label L_failure;
  check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success, &L_failure);
  check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success);
  bind(L_failure); // Fallthru if not successful.
}

// Branch to wrong_method_type unless the receiver's MethodType equals mtype_reg.
void MacroAssembler::check_method_handle_type(Register mtype_reg, Register mh_reg,
                                              Register temp_reg,
                                              Label& wrong_method_type) {
  assert_different_registers(mtype_reg, mh_reg, temp_reg);
  // Compare method type against that of the receiver.
  load_heap_oop_not_null(temp_reg, delayed_value(java_lang_invoke_MethodHandle::type_offset_in_bytes, temp_reg), mh_reg);
  cmpd(CCR0, temp_reg, mtype_reg);
  bne(CCR0, wrong_method_type);
}

// Compute the byte offset of an interpreter stack argument slot.
// Returns a constant when arg_slot is constant; otherwise the offset is
// computed into temp_reg and that register is returned.
RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot,
                                                   Register temp_reg,
                                                   int extra_slot_offset) {
  // cf. TemplateTable::prepare_invoke(), if (load_receiver).
  int stackElementSize = Interpreter::stackElementSize;
  int offset = extra_slot_offset * stackElementSize;
  if (arg_slot.is_constant()) {
    offset += arg_slot.as_constant() * stackElementSize;
    return offset;
  } else {
    assert(temp_reg != noreg, "must specify");
    sldi(temp_reg, arg_slot.as_register(), exact_log2(stackElementSize));
    if (offset != 0)
      addi(temp_reg, temp_reg, offset);
    return temp_reg;
  }
}

// Supports temp2_reg = R0.
void MacroAssembler::biased_locking_enter(ConditionRegister cr_reg, Register obj_reg,
                                          Register mark_reg, Register temp_reg,
                                          Register temp2_reg, Label& done, Label* slow_case) {
  assert(UseBiasedLocking, "why call this otherwise?");

#ifdef ASSERT
  assert_different_registers(obj_reg, mark_reg, temp_reg, temp2_reg);
#endif

  Label cas_label;

  // Branch to done if fast path fails and no slow_case provided.
  Label *slow_case_int = (slow_case != NULL) ? slow_case : &done;

  // Biased locking
  // See whether the lock is currently biased toward our thread and
  // whether the epoch is still valid
  // Note that the runtime guarantees sufficient alignment of JavaThread
  // pointers to allow age to be placed into low bits
  assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits,
         "biased locking makes assumptions about bit layout");

  if (PrintBiasedLockingStatistics) {
    load_const(temp2_reg, (address) BiasedLocking::total_entry_count_addr(), temp_reg);
    lwzx(temp_reg, temp2_reg);
    addi(temp_reg, temp_reg, 1);
    stwx(temp_reg, temp2_reg);
  }

  // Test for the biased-lock pattern in the low bits of the mark word.
  andi(temp_reg, mark_reg, markOopDesc::biased_lock_mask_in_place);
  cmpwi(cr_reg, temp_reg, markOopDesc::biased_lock_pattern);
  bne(cr_reg, cas_label);

  load_klass(temp_reg, obj_reg);

  // temp_reg = mark_reg XOR (prototype header | thread), with the age bits masked out:
  // zero iff the object is biased to the current thread in the current epoch.
  load_const_optimized(temp2_reg, ~((int) markOopDesc::age_mask_in_place));
  ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
  orr(temp_reg, R16_thread, temp_reg);
  xorr(temp_reg, mark_reg, temp_reg);
  andr(temp_reg, temp_reg, temp2_reg);
  cmpdi(cr_reg, temp_reg, 0);
  if (PrintBiasedLockingStatistics) {
    Label l;
    bne(cr_reg, l);
    load_const(temp2_reg, (address) BiasedLocking::biased_lock_entry_count_addr());
    lwzx(mark_reg, temp2_reg);
    addi(mark_reg, mark_reg, 1);
    stwx(mark_reg, temp2_reg);
    // restore mark_reg
    ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg);
    bind(l);
  }
  beq(cr_reg, done);

  Label try_revoke_bias;
  Label try_rebias;

  // At this point we know that the header has the bias pattern and
  // that we are not the bias owner in the current epoch. We need to
  // figure out more details about the state of the header in order to
  // know what operations can be legally performed on the object's
  // header.

  // If the low three bits in the xor result aren't clear, that means
  // the prototype header is no longer biased and we have to revoke
  // the bias on this object.
  andi(temp2_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
  cmpwi(cr_reg, temp2_reg, 0);
  bne(cr_reg, try_revoke_bias);

  // Biasing is still enabled for this data type. See whether the
  // epoch of the current bias is still valid, meaning that the epoch
  // bits of the mark word are equal to the epoch bits of the
  // prototype header. (Note that the prototype header's epoch bits
  // only change at a safepoint.) If not, attempt to rebias the object
  // toward the current thread. Note that we must be absolutely sure
  // that the current epoch is invalid in order to do this because
  // otherwise the manipulations it performs on the mark word are
  // illegal.

  int shift_amount = 64 - markOopDesc::epoch_shift;
  // rotate epoch bits to right (little) end and set other bits to 0
  // [ big part | epoch | little part ] -> [ 0..0 | epoch ]
  rldicl_(temp2_reg, temp_reg, shift_amount, 64 - markOopDesc::epoch_bits);
  // branch if epoch bits are != 0, i.e. they differ, because the epoch has been incremented
  bne(CCR0, try_rebias);

  // The epoch of the current bias is still valid but we know nothing
  // about the owner; it might be set or it might be clear. Try to
  // acquire the bias of the object using an atomic operation. If this
  // fails we will go in to the runtime to revoke the object's bias.
  // Note that we first construct the presumed unbiased header so we
  // don't accidentally blow away another thread's valid bias.
  andi(mark_reg, mark_reg, (markOopDesc::biased_lock_mask_in_place |
                            markOopDesc::age_mask_in_place |
                            markOopDesc::epoch_mask_in_place));
  orr(temp_reg, R16_thread, mark_reg);

  assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

  // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg).
  cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
           /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
           /*where=*/obj_reg,
           MacroAssembler::MemBarAcq,
           MacroAssembler::cmpxchgx_hint_acquire_lock(),
           noreg, slow_case_int); // bail out if failed

  // If the biasing toward our thread failed, this means that
  // another thread succeeded in biasing it toward itself and we
  // need to revoke that bias. The revocation will occur in the
  // interpreter runtime in the slow case.
  if (PrintBiasedLockingStatistics) {
    load_const(temp2_reg, (address) BiasedLocking::anonymously_biased_lock_entry_count_addr(), temp_reg);
    lwzx(temp_reg, temp2_reg);
    addi(temp_reg, temp_reg, 1);
    stwx(temp_reg, temp2_reg);
  }
  b(done);

  bind(try_rebias);
  // At this point we know the epoch has expired, meaning that the
  // current "bias owner", if any, is actually invalid. Under these
  // circumstances _only_, we are allowed to use the current header's
  // value as the comparison value when doing the cas to acquire the
  // bias in the current epoch. In other words, we allow transfer of
  // the bias from one thread to another directly in this situation.
  load_klass(temp_reg, obj_reg);
  // New header = current thread | preserved age bits | prototype header.
  andi(temp2_reg, mark_reg, markOopDesc::age_mask_in_place);
  orr(temp2_reg, R16_thread, temp2_reg);
  ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
  orr(temp_reg, temp2_reg, temp_reg);

  assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

  cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
           /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
           /*where=*/obj_reg,
           MacroAssembler::MemBarAcq,
           MacroAssembler::cmpxchgx_hint_acquire_lock(),
           noreg, slow_case_int); // bail out if failed

  // If the biasing toward our thread failed, this means that
  // another thread succeeded in biasing it toward itself and we
  // need to revoke that bias. The revocation will occur in the
  // interpreter runtime in the slow case.
  if (PrintBiasedLockingStatistics) {
    load_const(temp2_reg, (address) BiasedLocking::rebiased_lock_entry_count_addr(), temp_reg);
    lwzx(temp_reg, temp2_reg);
    addi(temp_reg, temp_reg, 1);
    stwx(temp_reg, temp2_reg);
  }
  b(done);

  bind(try_revoke_bias);
  // The prototype mark in the klass doesn't have the bias bit set any
  // more, indicating that objects of this data type are not supposed
  // to be biased any more. We are going to try to reset the mark of
  // this object to the prototype value and fall through to the
  // CAS-based locking scheme. Note that if our CAS fails, it means
  // that another thread raced us for the privilege of revoking the
  // bias of this particular object, so it's okay to continue in the
  // normal locking code.
  load_klass(temp_reg, obj_reg);
  ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
  // Unbiased header = prototype header | preserved age bits.
  andi(temp2_reg, mark_reg, markOopDesc::age_mask_in_place);
  orr(temp_reg, temp_reg, temp2_reg);

  assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

  // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg).
  cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
           /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
           /*where=*/obj_reg,
           MacroAssembler::MemBarAcq,
           MacroAssembler::cmpxchgx_hint_acquire_lock());

  // reload markOop in mark_reg before continuing with lightweight locking
  ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg);

  // Fall through to the normal CAS-based lock, because no matter what
  // the result of the above CAS, some thread must have succeeded in
  // removing the bias bit from the object's header.
  if (PrintBiasedLockingStatistics) {
    Label l;
    bne(cr_reg, l);
    load_const(temp2_reg, (address) BiasedLocking::revoked_lock_entry_count_addr(), temp_reg);
    lwzx(temp_reg, temp2_reg);
    addi(temp_reg, temp_reg, 1);
    stwx(temp_reg, temp2_reg);
    bind(l);
  }

  bind(cas_label);
}

void MacroAssembler::biased_locking_exit (ConditionRegister cr_reg, Register mark_addr, Register temp_reg, Label& done) {
  // Check for biased locking unlock case, which is a no-op
  // Note: we do not have to check the thread ID for two reasons.
  // First, the interpreter checks for IllegalMonitorStateException at
  // a higher level. Second, if the bias was revoked while we held the
  // lock, the object could not be rebiased toward another thread, so
  // the bias bit would be clear.

  ld(temp_reg, 0, mark_addr);
  andi(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);

  // If the mark word still carries the biased-lock pattern, unlock is a no-op.
  cmpwi(cr_reg, temp_reg, markOopDesc::biased_lock_pattern);
  beq(cr_reg, done);
}

// allocation (for C1)
void MacroAssembler::eden_allocate(
  Register obj,                      // result: pointer to object after successful allocation
  Register var_size_in_bytes,        // object size in bytes if unknown at compile time; invalid otherwise
  int      con_size_in_bytes,        // object size in bytes if known at compile time
  Register t1,                       // temp register
  Register t2,                       // temp register
  Label&   slow_case                 // continuation point if fast allocation fails
) {
  // No inline eden fast path on this platform: always take the slow case.
  b(slow_case);
}

void MacroAssembler::tlab_allocate(
  Register obj,                      // result: pointer to object after successful allocation
  Register var_size_in_bytes,        // object size in bytes if unknown at compile time; invalid otherwise
  int      con_size_in_bytes,        // object size in bytes if known at compile time
  Register t1,                       // temp register
  Label&   slow_case                 // continuation point if fast allocation fails
) {
  // make sure arguments make sense
  assert_different_registers(obj, var_size_in_bytes, t1);
  assert(0 <= con_size_in_bytes && is_simm13(con_size_in_bytes), "illegal object size");
  assert((con_size_in_bytes & MinObjAlignmentInBytesMask) == 0, "object size is not multiple of alignment");

  const Register new_top = t1;
  //verify_tlab(); not implemented

  // obj = current TLAB top; R0 = TLAB end; new_top = obj + size.
  ld(obj, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
  ld(R0, in_bytes(JavaThread::tlab_end_offset()), R16_thread);
  if (var_size_in_bytes == noreg) {
    addi(new_top, obj, con_size_in_bytes);
  } else {
    add(new_top, obj, var_size_in_bytes);
  }
  // Take the slow path if the new top would exceed the TLAB end.
  cmpld(CCR0, new_top, R0);
  bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_case);

#ifdef ASSERT
  // make sure new free pointer is properly aligned
  {
    Label L;
    andi_(R0, new_top, MinObjAlignmentInBytesMask);
    beq(CCR0, L);
    stop("updated TLAB free is not properly aligned", 0x934);
    bind(L);
  }
#endif // ASSERT

  // update the tlab top pointer
  std(new_top, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
  //verify_tlab(); not implemented
}

void MacroAssembler::tlab_refill(Label& retry_tlab, Label& try_eden, Label& slow_case) {
  unimplemented("tlab_refill");
}

void MacroAssembler::incr_allocated_bytes(RegisterOrConstant size_in_bytes, Register t1, Register t2) {
  unimplemented("incr_allocated_bytes");
}

// Emit a trampoline stub whose call target is loaded from the constant pool
// at destination_toc_offset. Returns NULL if the code cache is full.
address MacroAssembler::emit_trampoline_stub(int destination_toc_offset,
                                             int insts_call_instruction_offset, Register Rtoc) {
  // Start the stub.
  address stub = start_a_stub(64);
  if (stub == NULL) { return NULL; } // CodeCache full: bail out

  // Create a trampoline stub relocation which relates this trampoline stub
  // with the call instruction at insts_call_instruction_offset in the
  // instructions code-section.
  relocate(trampoline_stub_Relocation::spec(code()->insts()->start() + insts_call_instruction_offset));
  const int stub_start_offset = offset();

  // For java_to_interp stubs we use R11_scratch1 as scratch register
  // and in call trampoline stubs we use R12_scratch2. This way we
  // can distinguish them (see is_NativeCallTrampolineStub_at()).
  Register reg_scratch = R12_scratch2;

  // Now, create the trampoline stub's code:
  // - load the TOC
  // - load the call target from the constant pool
  // - call
  if (Rtoc == noreg) {
    calculate_address_from_global_toc(reg_scratch, method_toc());
    Rtoc = reg_scratch;
  }

  ld_largeoffset_unchecked(reg_scratch, destination_toc_offset, Rtoc, false);
  mtctr(reg_scratch);
  bctr();

  const address stub_start_addr = addr_at(stub_start_offset);

  // Assert that the encoded destination_toc_offset can be identified and that it is correct.
  assert(destination_toc_offset == NativeCallTrampolineStub_at(stub_start_addr)->destination_toc_offset(),
         "encoded offset into the constant pool must match");
  // Trampoline_stub_size should be good.
  assert((uint)(offset() - stub_start_offset) <= trampoline_stub_size, "should be good size");
  assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");

  // End the stub.
  end_a_stub();
  return stub;
}

// TM on PPC64.
// Atomically increment the doubleword at [addr] by simm16 using a
// ldarx/stdcx_ retry loop. The incremented value is left in result.
void MacroAssembler::atomic_inc_ptr(Register addr, Register result, int simm16) {
  Label retry;
  bind(retry);
  ldarx(result, addr, /*hint*/ false);
  addi(result, result, simm16);
  stdcx_(result, addr);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
  } else {
    bne(                  CCR0, retry); // stXcx_ sets CCR0
  }
}

// Atomically OR uimm16 into the word at [addr] using a lwarx/stwcx_
// retry loop. The resulting value is left in result.
void MacroAssembler::atomic_ori_int(Register addr, Register result, int uimm16) {
  Label retry;
  bind(retry);
  lwarx(result, addr, /*hint*/ false);
  ori(result, result, uimm16);
  stwcx_(result, addr);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
  } else {
    bne(                  CCR0, retry); // stXcx_ sets CCR0
  }
}

#if INCLUDE_RTM_OPT

// Update rtm_counters based on abort status
// input: abort_status
//        rtm_counters (RTMLockingCounters*)
void MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters_Reg) {
  // Mapping to keep PreciseRTMLockingStatistics similar to x86.
  // x86 ppc (! means inverted, ? means not the same)
  // 0   31  Set if abort caused by XABORT instruction.
  // 1 ! 7   If set, the transaction may succeed on a retry. This bit is always clear if bit 0 is set.
  // 2   13  Set if another logical processor conflicted with a memory address that was part of the transaction that aborted.
  // 3   10  Set if an internal buffer overflowed.
  // 4  ?12  Set if a debug breakpoint was hit.
  // 5  ?32  Set if an abort occurred during execution of a nested transaction.
  const int tm_failure_bit[] = {Assembler::tm_tabort, // Note: Seems like signal handler sets this, too.
                                Assembler::tm_failure_persistent, // inverted: transient
                                Assembler::tm_trans_cf,
                                Assembler::tm_footprint_of,
                                Assembler::tm_non_trans_cf,
                                Assembler::tm_suspended};
  // Which entries of tm_failure_bit have inverted meaning relative to x86.
  const bool tm_failure_inv[] = {false, true, false, false, false, false};
  assert(sizeof(tm_failure_bit)/sizeof(int) == RTMLockingCounters::ABORT_STATUS_LIMIT, "adapt mapping!");

  const Register addr_Reg = R0;
  // Keep track of offset to where rtm_counters_Reg had pointed to.
  int counters_offs = RTMLockingCounters::abort_count_offset();
  addi(addr_Reg, rtm_counters_Reg, counters_offs);
  const Register temp_Reg = rtm_counters_Reg;

  // Bump the total abort count.
  //atomic_inc_ptr(addr_Reg, temp_Reg); We don't increment atomically
  ldx(temp_Reg, addr_Reg);
  addi(temp_Reg, temp_Reg, 1);
  stdx(temp_Reg, addr_Reg);

  if (PrintPreciseRTMLockingStatistics) {
    int counters_offs_delta = RTMLockingCounters::abortX_count_offset() - counters_offs;

    //mftexasr(abort_status); done by caller
    for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
      counters_offs += counters_offs_delta;
      li(temp_Reg, counters_offs_delta); // can't use addi with R0
      add(addr_Reg, addr_Reg, temp_Reg); // point to next counter
      counters_offs_delta = sizeof(uintx);

      Label check_abort;
      // Rotate failure bit i of the abort status into the test position; the
      // record form sets CR0, and the branch sense is flipped for inverted bits.
      rldicr_(temp_Reg, abort_status, tm_failure_bit[i], 0);
      if (tm_failure_inv[i]) {
        bne(CCR0, check_abort);
      } else {
        beq(CCR0, check_abort);
      }
      //atomic_inc_ptr(addr_Reg, temp_Reg); We don't increment atomically
      ldx(temp_Reg, addr_Reg);
      addi(temp_Reg, temp_Reg, 1);
      stdx(temp_Reg, addr_Reg);
      bind(check_abort);
    }
  }
  // Restore rtm_counters_Reg from addr_Reg (addr_Reg is R0, so addi is unavailable).
  li(temp_Reg, -counters_offs); // can't use addi with R0
  add(rtm_counters_Reg, addr_Reg, temp_Reg); // restore
}

// Branch if (random & (count-1) != 0), count is 2^n
// tmp and CR0 are killed
void MacroAssembler::branch_on_random_using_tb(Register tmp, int count, Label& brLabel) {
  mftb(tmp); // The time base serves as the random source.
  andi_(tmp, tmp, count-1);
  bne(CCR0, brLabel);
}

// Perform abort ratio calculation, set no_rtm bit if high ratio.
// input:  rtm_counters_Reg (RTMLockingCounters* address) - KILLED
void MacroAssembler::rtm_abort_ratio_calculation(Register rtm_counters_Reg,
                                                 RTMLockingCounters* rtm_counters,
                                                 Metadata* method_data) {
  Label L_done, L_check_always_rtm1, L_check_always_rtm2;

  if (RTMLockingCalculationDelay > 0) {
    // Delay calculation.
    ld(rtm_counters_Reg, (RegisterOrConstant)(intptr_t)RTMLockingCounters::rtm_calculation_flag_addr());
    cmpdi(CCR0, rtm_counters_Reg, 0);
    beq(CCR0, L_done);
    load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload
  }
  // Abort ratio calculation only if abort_count > RTMAbortThreshold.
  //   Aborted transactions = abort_count * 100
  //   All transactions = total_count * RTMTotalCountIncrRate
  //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
  ld(R0, RTMLockingCounters::abort_count_offset(), rtm_counters_Reg);
  if (is_simm(RTMAbortThreshold, 16)) {   // cmpdi can handle 16bit immediate only.
2502 cmpdi(CCR0, R0, RTMAbortThreshold); 2503 blt(CCR0, L_check_always_rtm2); // reload of rtm_counters_Reg not necessary 2504 } else { 2505 load_const_optimized(rtm_counters_Reg, RTMAbortThreshold); 2506 cmpd(CCR0, R0, rtm_counters_Reg); 2507 blt(CCR0, L_check_always_rtm1); // reload of rtm_counters_Reg required 2508 } 2509 mulli(R0, R0, 100); 2510 2511 const Register tmpReg = rtm_counters_Reg; 2512 ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg); 2513 mulli(tmpReg, tmpReg, RTMTotalCountIncrRate); // allowable range: int16 2514 mulli(tmpReg, tmpReg, RTMAbortRatio); // allowable range: int16 2515 cmpd(CCR0, R0, tmpReg); 2516 blt(CCR0, L_check_always_rtm1); // jump to reload 2517 if (method_data != NULL) { 2518 // Set rtm_state to "no rtm" in MDO. 2519 // Not using a metadata relocation. Method and Class Loader are kept alive anyway. 2520 // (See nmethod::metadata_do and CodeBuffer::finalize_oop_references.) 2521 load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg); 2522 atomic_ori_int(R0, tmpReg, NoRTM); 2523 } 2524 b(L_done); 2525 2526 bind(L_check_always_rtm1); 2527 load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload 2528 bind(L_check_always_rtm2); 2529 ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg); 2530 int64_t thresholdValue = RTMLockingThreshold / RTMTotalCountIncrRate; 2531 if (is_simm(thresholdValue, 16)) { // cmpdi can handle 16bit immediate only. 2532 cmpdi(CCR0, tmpReg, thresholdValue); 2533 } else { 2534 load_const_optimized(R0, thresholdValue); 2535 cmpd(CCR0, tmpReg, R0); 2536 } 2537 blt(CCR0, L_done); 2538 if (method_data != NULL) { 2539 // Set rtm_state to "always rtm" in MDO. 2540 // Not using a metadata relocation. See above. 
2541 load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg); 2542 atomic_ori_int(R0, tmpReg, UseRTM); 2543 } 2544 bind(L_done); 2545 } 2546 2547 // Update counters and perform abort ratio calculation. 2548 // input: abort_status_Reg 2549 void MacroAssembler::rtm_profiling(Register abort_status_Reg, Register temp_Reg, 2550 RTMLockingCounters* rtm_counters, 2551 Metadata* method_data, 2552 bool profile_rtm) { 2553 2554 assert(rtm_counters != NULL, "should not be NULL when profiling RTM"); 2555 // Update rtm counters based on state at abort. 2556 // Reads abort_status_Reg, updates flags. 2557 assert_different_registers(abort_status_Reg, temp_Reg); 2558 load_const_optimized(temp_Reg, (address)rtm_counters, R0); 2559 rtm_counters_update(abort_status_Reg, temp_Reg); 2560 if (profile_rtm) { 2561 assert(rtm_counters != NULL, "should not be NULL when profiling RTM"); 2562 rtm_abort_ratio_calculation(temp_Reg, rtm_counters, method_data); 2563 } 2564 } 2565 2566 // Retry on abort if abort's status indicates non-persistent failure. 2567 // inputs: retry_count_Reg 2568 // : abort_status_Reg 2569 // output: retry_count_Reg decremented by 1 2570 void MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, 2571 Label& retryLabel, Label* checkRetry) { 2572 Label doneRetry; 2573 rldicr_(R0, abort_status_Reg, tm_failure_persistent, 0); 2574 bne(CCR0, doneRetry); 2575 if (checkRetry) { bind(*checkRetry); } 2576 addic_(retry_count_Reg, retry_count_Reg, -1); 2577 blt(CCR0, doneRetry); 2578 smt_yield(); // Can't use wait(). No permission (SIGILL). 2579 b(retryLabel); 2580 bind(doneRetry); 2581 } 2582 2583 // Spin and retry if lock is busy. 
// inputs: owner_addr_Reg (monitor address)
//       : retry_count_Reg
// output: retry_count_Reg decremented by 1
// CTR is killed
void MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register owner_addr_Reg, Label& retryLabel) {
  Label SpinLoop, doneRetry;
  addic_(retry_count_Reg, retry_count_Reg, -1);
  blt(CCR0, doneRetry);

  if (RTMSpinLoopCount > 1) {
    // Bounded spin: CTR counts the remaining spin iterations.
    li(R0, RTMSpinLoopCount);
    mtctr(R0);
  }

  bind(SpinLoop);
  smt_yield(); // Can't use waitrsv(). No permission (SIGILL).

  if (RTMSpinLoopCount > 1) {
    bdz(retryLabel); // Spin budget used up: retry the transaction.
    ld(R0, 0, owner_addr_Reg);
    cmpdi(CCR0, R0, 0);
    bne(CCR0, SpinLoop); // Keep spinning while the owner field is non-null.
  }

  b(retryLabel);

  bind(doneRetry);
}

// Use RTM for normal stack locks.
// Input: objReg (object to lock)
void MacroAssembler::rtm_stack_locking(ConditionRegister flag,
                                       Register obj, Register mark_word, Register tmp,
                                       Register retry_on_abort_count_Reg,
                                       RTMLockingCounters* stack_rtm_counters,
                                       Metadata* method_data, bool profile_rtm,
                                       Label& DONE_LABEL, Label& IsInflated) {
  assert(UseRTMForStackLocks, "why call this otherwise?");
  assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
  Label L_rtm_retry, L_decrement_retry, L_on_abort;

  if (RTMRetryCount > 0) {
    load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
    bind(L_rtm_retry);
  }
  andi_(R0, mark_word, markOopDesc::monitor_value); // inflated vs stack-locked|neutral|biased
  bne(CCR0, IsInflated);

  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    Label L_noincrement;
    if (RTMTotalCountIncrRate > 1) {
      // Only sample 1/RTMTotalCountIncrRate of executions to reduce overhead.
      branch_on_random_using_tb(tmp, RTMTotalCountIncrRate, L_noincrement);
    }
    assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
    load_const_optimized(tmp, (address)stack_rtm_counters->total_count_addr(), R0);
    //atomic_inc_ptr(tmp, /*temp, will be reloaded*/mark_word); We don't increment atomically
    ldx(mark_word, tmp);
    addi(mark_word, mark_word, 1);
    stdx(mark_word, tmp);
    bind(L_noincrement);
  }
  tbegin_();
  beq(CCR0, L_on_abort); // tbegin. sets CR0 EQ on abort path.
  ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // Reload in transaction, conflicts need to be tracked.
  andi(R0, mark_word, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
  cmpwi(flag, R0, markOopDesc::unlocked_value);                // bits = 001 unlocked
  beq(flag, DONE_LABEL);                                       // all done if unlocked

  if (UseRTMXendForLockBusy) {
    // End the transaction cleanly and fall into the busy-retry path.
    tend_();
    b(L_decrement_retry);
  } else {
    tabort_();
  }
  bind(L_on_abort);
  const Register abort_status_Reg = tmp;
  mftexasr(abort_status_Reg); // Abort reason bits for profiling/retry decision.
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    rtm_profiling(abort_status_Reg, /*temp*/mark_word, stack_rtm_counters, method_data, profile_rtm);
  }
  ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // reload
  if (RTMRetryCount > 0) {
    // Retry on lock abort if abort status is not permanent.
    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry, &L_decrement_retry);
  } else {
    bind(L_decrement_retry);
  }
}

// Use RTM for inflating locks
// inputs: obj       (object to lock)
//         mark_word (current header - KILLED)
//         boxReg    (on-stack box address (displaced header location) - KILLED)
void MacroAssembler::rtm_inflated_locking(ConditionRegister flag,
                                          Register obj, Register mark_word, Register boxReg,
                                          Register retry_on_busy_count_Reg, Register retry_on_abort_count_Reg,
                                          RTMLockingCounters* rtm_counters,
                                          Metadata* method_data, bool profile_rtm,
                                          Label& DONE_LABEL) {
  assert(UseRTMLocking, "why call this otherwise?");
  Label L_rtm_retry, L_decrement_retry, L_on_abort;
  // Clean monitor_value bit to get valid pointer.
  int owner_offset = ObjectMonitor::owner_offset_in_bytes() - markOopDesc::monitor_value;

  // Store non-null, using boxReg instead of (intptr_t)markOopDesc::unused_mark().
  std(boxReg, BasicLock::displaced_header_offset_in_bytes(), boxReg);
  const Register tmpReg = boxReg;
  const Register owner_addr_Reg = mark_word;
  addi(owner_addr_Reg, mark_word, owner_offset);

  if (RTMRetryCount > 0) {
    load_const_optimized(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy.
    load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort.
    bind(L_rtm_retry);
  }
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    Label L_noincrement;
    if (RTMTotalCountIncrRate > 1) {
      // Only sample 1/RTMTotalCountIncrRate of executions to reduce overhead.
      branch_on_random_using_tb(R0, RTMTotalCountIncrRate, L_noincrement);
    }
    assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
    load_const(R0, (address)rtm_counters->total_count_addr(), tmpReg);
    //atomic_inc_ptr(R0, tmpReg); We don't increment atomically
    ldx(tmpReg, R0);
    addi(tmpReg, tmpReg, 1);
    stdx(tmpReg, R0);
    bind(L_noincrement);
  }
  tbegin_();
  beq(CCR0, L_on_abort); // tbegin. sets CR0 EQ on abort path.
  // We don't reload mark word. Will only be reset at safepoint.
  ld(R0, 0, owner_addr_Reg); // Load in transaction, conflicts need to be tracked.
  cmpdi(flag, R0, 0);
  beq(flag, DONE_LABEL); // Owner is null: elided lock acquired transactionally.

  if (UseRTMXendForLockBusy) {
    // End the transaction cleanly and fall into the busy-retry path.
    tend_();
    b(L_decrement_retry);
  } else {
    tabort_();
  }
  bind(L_on_abort);
  const Register abort_status_Reg = tmpReg;
  mftexasr(abort_status_Reg); // Abort reason bits for profiling/retry decision.
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    rtm_profiling(abort_status_Reg, /*temp*/ owner_addr_Reg, rtm_counters, method_data, profile_rtm);
    // Restore owner_addr_Reg
    ld(mark_word, oopDesc::mark_offset_in_bytes(), obj);
#ifdef ASSERT
    andi_(R0, mark_word, markOopDesc::monitor_value);
    asm_assert_ne("must be inflated", 0xa754); // Deflating only allowed at safepoint.
#endif
    addi(owner_addr_Reg, mark_word, owner_offset);
  }
  if (RTMRetryCount > 0) {
    // Retry on lock abort if abort status is not permanent.
    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
  }

  // Appears unlocked - try to swing _owner from null to non-null.
  cmpxchgd(flag, /*current val*/ R0, (intptr_t)0, /*new val*/ R16_thread, owner_addr_Reg,
           MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
           MacroAssembler::cmpxchgx_hint_acquire_lock(), noreg, &L_decrement_retry, true);

  if (RTMRetryCount > 0) {
    // success done else retry
    b(DONE_LABEL);
    bind(L_decrement_retry);
    // Spin and retry if lock is busy.
    rtm_retry_lock_on_busy(retry_on_busy_count_Reg, owner_addr_Reg, L_rtm_retry);
  } else {
    bind(L_decrement_retry);
  }
}

#endif //  INCLUDE_RTM_OPT

// "The box" is the space on the stack where we copy the object mark.
void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register oop, Register box,
                                               Register temp, Register displaced_header, Register current_header,
                                               bool try_bias,
                                               RTMLockingCounters* rtm_counters,
                                               RTMLockingCounters* stack_rtm_counters,
                                               Metadata* method_data,
                                               bool use_rtm, bool profile_rtm) {
  assert_different_registers(oop, box, temp, displaced_header, current_header);
  assert(flag != CCR0, "bad condition register");
  Label cont;
  Label object_has_monitor;
  Label cas_failed;

  // Load markOop from object into displaced_header.
  ld(displaced_header, oopDesc::mark_offset_in_bytes(), oop);


  // Always do locking in runtime.
  if (EmitSync & 0x01) {
    cmpdi(flag, oop, 0); // Oop can't be 0 here => always false.
    return;
  }

  if (try_bias) {
    biased_locking_enter(flag, oop, displaced_header, temp, current_header, cont);
  }

#if INCLUDE_RTM_OPT
  if (UseRTMForStackLocks && use_rtm) {
    rtm_stack_locking(flag, oop, displaced_header, temp, /*temp*/ current_header,
                      stack_rtm_counters, method_data, profile_rtm,
                      cont, object_has_monitor);
  }
#endif // INCLUDE_RTM_OPT

  // Handle existing monitor.
  if ((EmitSync & 0x02) == 0) {
    // The object has an existing monitor iff (mark & monitor_value) != 0.
    andi_(temp, displaced_header, markOopDesc::monitor_value);
    bne(CCR0, object_has_monitor);
  }

  // Set displaced_header to be (markOop of object | UNLOCK_VALUE).
  ori(displaced_header, displaced_header, markOopDesc::unlocked_value);

  // Load Compare Value application register.

  // Initialize the box. (Must happen before we update the object mark!)
  std(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);

  // Must fence, otherwise, preceding store(s) may float below cmpxchg.
  // Compare object markOop with mark and if equal exchange scratch1 with object markOop.
  cmpxchgd(/*flag=*/flag,
           /*current_value=*/current_header,
           /*compare_value=*/displaced_header,
           /*exchange_value=*/box,
           /*where=*/oop,
           MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
           MacroAssembler::cmpxchgx_hint_acquire_lock(),
           noreg,
           &cas_failed,
           /*check without membar and ldarx first*/true);
  assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

  // If the compare-and-exchange succeeded, then we found an unlocked
  // object and we have now locked it.
  b(cont);

  bind(cas_failed);
  // We did not see an unlocked object so try the fast recursive case.

  // Check if the owner is self by comparing the value in the markOop of object
  // (current_header) with the stack pointer.
  sub(current_header, current_header, R1_SP);
  load_const_optimized(temp, ~(os::vm_page_size()-1) | markOopDesc::lock_mask_in_place);

  and_(R0/*==0?*/, current_header, temp);
  // If condition is true we are cont and hence we can store 0 as the
  // displaced header in the box, which indicates that it is a recursive lock.
  mcrf(flag,CCR0);
  std(R0/*==0, perhaps*/, BasicLock::displaced_header_offset_in_bytes(), box);

  // Handle existing monitor.
  if ((EmitSync & 0x02) == 0) {
    b(cont);

    bind(object_has_monitor);
    // The object's monitor m is unlocked iff m->owner == NULL,
    // otherwise m->owner may contain a thread or a stack address.

#if INCLUDE_RTM_OPT
    // Use the same RTM locking code in 32- and 64-bit VM.
    if (use_rtm) {
      rtm_inflated_locking(flag, oop, displaced_header, box, temp, /*temp*/ current_header,
                           rtm_counters, method_data, profile_rtm, cont);
    } else {
#endif // INCLUDE_RTM_OPT

    // Try to CAS m->owner from NULL to current thread.
    addi(temp, displaced_header, ObjectMonitor::owner_offset_in_bytes()-markOopDesc::monitor_value);
    cmpxchgd(/*flag=*/flag,
             /*current_value=*/current_header,
             /*compare_value=*/(intptr_t)0,
             /*exchange_value=*/R16_thread,
             /*where=*/temp,
             MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
             MacroAssembler::cmpxchgx_hint_acquire_lock());

    // Store a non-null value into the box.
    std(box, BasicLock::displaced_header_offset_in_bytes(), box);

#   ifdef ASSERT
    bne(flag, cont);
    // We have acquired the monitor, check some invariants.
    addi(/*monitor=*/temp, temp, -ObjectMonitor::owner_offset_in_bytes());
    // Invariant 1: _recursions should be 0.
    //assert(ObjectMonitor::recursions_size_in_bytes() == 8, "unexpected size");
    asm_assert_mem8_is_zero(ObjectMonitor::recursions_offset_in_bytes(), temp,
                            "monitor->_recursions should be 0", -1);
    // Invariant 2: OwnerIsThread shouldn't be 0.
    //assert(ObjectMonitor::OwnerIsThread_size_in_bytes() == 4, "unexpected size");
    //asm_assert_mem4_isnot_zero(ObjectMonitor::OwnerIsThread_offset_in_bytes(), temp,
    //                           "monitor->OwnerIsThread shouldn't be 0", -1);
#   endif

#if INCLUDE_RTM_OPT
    } // use_rtm()
#endif
  }

  bind(cont);
  // flag == EQ indicates success
  // flag == NE indicates failure
}

void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register oop, Register box,
                                                 Register temp, Register displaced_header, Register current_header,
                                                 bool try_bias, bool use_rtm) {
  assert_different_registers(oop, box, temp, displaced_header, current_header);
  assert(flag != CCR0, "bad condition register");
  Label cont;
  Label object_has_monitor;

  // Always do locking in runtime.
  if (EmitSync & 0x01) {
    cmpdi(flag, oop, 0); // Oop can't be 0 here => always false.
    return;
  }

  if (try_bias) {
    biased_locking_exit(flag, oop, current_header, cont);
  }

#if INCLUDE_RTM_OPT
  if (UseRTMForStackLocks && use_rtm) {
    assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
    Label L_regular_unlock;
    ld(current_header, oopDesc::mark_offset_in_bytes(), oop); // fetch markword
    andi(R0, current_header, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
    cmpwi(flag, R0, markOopDesc::unlocked_value); // bits = 001 unlocked
    bne(flag, L_regular_unlock); // else RegularLock
    tend_(); // otherwise end...
    b(cont); // ... and we're done
    bind(L_regular_unlock);
  }
#endif

  // Find the lock address and load the displaced header from the stack.
  ld(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);

  // If the displaced header is 0, we have a recursive unlock.
  cmpdi(flag, displaced_header, 0);
  beq(flag, cont);

  // Handle existing monitor.
  if ((EmitSync & 0x02) == 0) {
    // The object has an existing monitor iff (mark & monitor_value) != 0.
    RTM_OPT_ONLY( if (!(UseRTMForStackLocks && use_rtm)) ) // skip load if already done
    ld(current_header, oopDesc::mark_offset_in_bytes(), oop);
    andi_(R0, current_header, markOopDesc::monitor_value);
    bne(CCR0, object_has_monitor);
  }

  // Check if it is still a light weight lock, this is true if we see
  // the stack address of the basicLock in the markOop of the object.
  // Cmpxchg sets flag to cmpd(current_header, box).
  cmpxchgd(/*flag=*/flag,
           /*current_value=*/current_header,
           /*compare_value=*/box,
           /*exchange_value=*/displaced_header,
           /*where=*/oop,
           MacroAssembler::MemBarRel,
           MacroAssembler::cmpxchgx_hint_release_lock(),
           noreg,
           &cont);

  assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

  // Handle existing monitor.
  if ((EmitSync & 0x02) == 0) {
    b(cont);

    bind(object_has_monitor);
    addi(current_header, current_header, -markOopDesc::monitor_value); // monitor
    ld(temp, ObjectMonitor::owner_offset_in_bytes(), current_header);

    // It's inflated.
#if INCLUDE_RTM_OPT
    if (use_rtm) {
      Label L_regular_inflated_unlock;
      // Clean monitor_value bit to get valid pointer
      cmpdi(flag, temp, 0);
      bne(flag, L_regular_inflated_unlock);
      tend_(); // Owner is null: the lock was elided, just end the transaction.
      b(cont);
      bind(L_regular_inflated_unlock);
    }
#endif

    ld(displaced_header, ObjectMonitor::recursions_offset_in_bytes(), current_header);
    xorr(temp, R16_thread, temp); // Will be 0 if we are the owner.
    orr(temp, temp, displaced_header); // Will be 0 if there are 0 recursions.
    cmpdi(flag, temp, 0);
    bne(flag, cont);

    ld(temp, ObjectMonitor::EntryList_offset_in_bytes(), current_header);
    ld(displaced_header, ObjectMonitor::cxq_offset_in_bytes(), current_header);
    orr(temp, temp, displaced_header); // Will be 0 if both are 0.
    cmpdi(flag, temp, 0);
    bne(flag, cont); // Waiters present: take the slow path.
    release(); // Release-store the null owner so prior accesses stay inside the lock.
    std(temp, ObjectMonitor::owner_offset_in_bytes(), current_header);
  }

  bind(cont);
  // flag == EQ indicates success
  // flag == NE indicates failure
}

// Write serialization page so VM thread can do a pseudo remote membar.
// We use the current thread pointer to calculate a thread specific
// offset to write to within the page. This minimizes bus traffic
// due to cache line collision.
void MacroAssembler::serialize_memory(Register thread, Register tmp1, Register tmp2) {
  srdi(tmp2, thread, os::get_serialize_page_shift_count());

  int mask = os::vm_page_size() - sizeof(int);
  if (Assembler::is_simm(mask, 16)) {
    andi(tmp2, tmp2, mask);
  } else {
    // Mask doesn't fit in a 16 bit immediate: build it in tmp1 first.
    lis(tmp1, (int)((signed short) (mask >> 16)));
    ori(tmp1, tmp1, mask & 0x0000ffff);
    andr(tmp2, tmp2, tmp1);
  }

  load_const(tmp1, (long) os::get_memory_serialize_page());
  release();
  stwx(R0, tmp1, tmp2);
}


// GC barrier helper macros

// Write the card table byte if needed.
void MacroAssembler::card_write_barrier_post(Register Rstore_addr, Register Rnew_val, Register Rtmp) {
  CardTableModRefBS* bs =
    barrier_set_cast<CardTableModRefBS>(Universe::heap()->barrier_set());
  assert(bs->kind() == BarrierSet::CardTableForRS ||
         bs->kind() == BarrierSet::CardTableExtension, "wrong barrier");
#ifdef ASSERT
  cmpdi(CCR0, Rnew_val, 0);
  asm_assert_ne("null oop not allowed", 0x321);
#endif
  card_table_write(bs->byte_map_base, Rtmp, Rstore_addr);
}

// Write the card table byte.
void MacroAssembler::card_table_write(jbyte* byte_map_base, Register Rtmp, Register Robj) {
  assert_different_registers(Robj, Rtmp, R0);
  load_const_optimized(Rtmp, (address)byte_map_base, R0);
  srdi(Robj, Robj, CardTableModRefBS::card_shift); // Card index for Robj's address.
  li(R0, 0); // dirty
  if (UseConcMarkSweepGC) membar(Assembler::StoreStore);
  stbx(R0, Rtmp, Robj);
}

// Kills R31 if value is a volatile register.
void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2, bool needs_frame) {
  Label done;
  cmpdi(CCR0, value, 0);
  beq(CCR0, done); // Use NULL as-is.

  clrrdi(tmp1, value, JNIHandles::weak_tag_size); // Strip the tag bits.
#if INCLUDE_ALL_GCS
  if (UseG1GC) { andi_(tmp2, value, JNIHandles::weak_tag_mask); }
#endif
  ld(value, 0, tmp1); // Resolve (untagged) jobject.

#if INCLUDE_ALL_GCS
  if (UseG1GC) {
    Label not_weak;
    beq(CCR0, not_weak); // Test for jweak tag.
    verify_oop(value);
    // Weak handles need a SATB pre-barrier on the resolved referent.
    g1_write_barrier_pre(noreg, // obj
                         noreg, // offset
                         value, // pre_val
                         tmp1, tmp2, needs_frame);
    bind(not_weak);
  }
#endif // INCLUDE_ALL_GCS
  verify_oop(value);
  bind(done);
}

#if INCLUDE_ALL_GCS
// General G1 pre-barrier generator.
// Goal: record the previous value if it is not null.
void MacroAssembler::g1_write_barrier_pre(Register Robj, RegisterOrConstant offset, Register Rpre_val,
                                          Register Rtmp1, Register Rtmp2, bool needs_frame) {
  Label runtime, filtered;

  // Is marking active?
  if (in_bytes(SATBMarkQueue::byte_width_of_active()) == 4) {
    lwz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_active()), R16_thread);
  } else {
    guarantee(in_bytes(SATBMarkQueue::byte_width_of_active()) == 1, "Assumption");
    lbz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_active()), R16_thread);
  }
  cmpdi(CCR0, Rtmp1, 0);
  beq(CCR0, filtered); // Marking inactive: nothing to record.

  // Do we need to load the previous value?
  if (Robj != noreg) {
    // Load the previous value...
    if (UseCompressedOops) {
      lwz(Rpre_val, offset, Robj);
    } else {
      ld(Rpre_val, offset, Robj);
    }
    // Previous value has been loaded into Rpre_val.
  }
  assert(Rpre_val != noreg, "must have a real register");

  // Is the previous value null?
  cmpdi(CCR0, Rpre_val, 0);
  beq(CCR0, filtered);

  if (Robj != noreg && UseCompressedOops) {
    decode_heap_oop_not_null(Rpre_val);
  }

  // OK, it's not filtered, so we'll need to call enqueue. In the normal
  // case, pre_val will be a scratch G-reg, but there are some cases in
  // which it's an O-reg. In the first case, do a normal call. In the
  // latter, do a save here and call the frameless version.

  // Can we store original value in the thread's buffer?
  // Is index == 0?
  // (The index field is typed as size_t.)
  const Register Rbuffer = Rtmp1, Rindex = Rtmp2;

  ld(Rindex, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_index()), R16_thread);
  cmpdi(CCR0, Rindex, 0);
  beq(CCR0, runtime); // If index == 0, goto runtime.
  ld(Rbuffer, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_buf()), R16_thread);

  addi(Rindex, Rindex, -wordSize); // Decrement index.
  std(Rindex, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_index()), R16_thread);

  // Record the previous value.
  stdx(Rpre_val, Rbuffer, Rindex);
  b(filtered);

  bind(runtime);

  // May need to preserve LR. Also needed if current frame is not compatible with C calling convention.
  if (needs_frame) {
    save_LR_CR(Rtmp1);
    push_frame_reg_args(0, Rtmp2);
  }

  if (Rpre_val->is_volatile() && Robj == noreg) mr(R31, Rpre_val); // Save pre_val across C call if it was preloaded.
  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), Rpre_val, R16_thread);
  if (Rpre_val->is_volatile() && Robj == noreg) mr(Rpre_val, R31); // restore

  if (needs_frame) {
    pop_frame();
    restore_LR_CR(Rtmp1);
  }

  bind(filtered);
}

// General G1 post-barrier generator
// Store cross-region card.
void MacroAssembler::g1_write_barrier_post(Register Rstore_addr, Register Rnew_val, Register Rtmp1, Register Rtmp2, Register Rtmp3, Label *filtered_ext) {
  Label runtime, filtered_int;
  Label& filtered = (filtered_ext != NULL) ? *filtered_ext : filtered_int;
  assert_different_registers(Rstore_addr, Rnew_val, Rtmp1, Rtmp2);

  G1SATBCardTableLoggingModRefBS* bs =
    barrier_set_cast<G1SATBCardTableLoggingModRefBS>(Universe::heap()->barrier_set());

  // Does store cross heap regions?
  if (G1RSBarrierRegionFilter) {
    xorr(Rtmp1, Rstore_addr, Rnew_val);
    srdi_(Rtmp1, Rtmp1, HeapRegion::LogOfHRGrainBytes);
    beq(CCR0, filtered); // Same region: no remembered-set work needed.
  }

  // Crosses regions, storing NULL?
#ifdef ASSERT
  cmpdi(CCR0, Rnew_val, 0);
  asm_assert_ne("null oop not allowed (G1)", 0x322); // Checked by caller on PPC64, so following branch is obsolete:
  //beq(CCR0, filtered);
#endif

  // Storing region crossing non-NULL, is card already dirty?
  assert(sizeof(*bs->byte_map_base) == sizeof(jbyte), "adjust this code");
  const Register Rcard_addr = Rtmp1;
  Register Rbase = Rtmp2;
  load_const_optimized(Rbase, (address)bs->byte_map_base, /*temp*/ Rtmp3);

  srdi(Rcard_addr, Rstore_addr, CardTableModRefBS::card_shift);

  // Get the address of the card.
  lbzx(/*card value*/ Rtmp3, Rbase, Rcard_addr);
  cmpwi(CCR0, Rtmp3, (int)G1SATBCardTableModRefBS::g1_young_card_val());
  beq(CCR0, filtered); // Young card: no logging needed.

  membar(Assembler::StoreLoad);
  lbzx(/*card value*/ Rtmp3, Rbase, Rcard_addr); // Reload after membar.
  cmpwi(CCR0, Rtmp3 /* card value */, CardTableModRefBS::dirty_card_val());
  beq(CCR0, filtered); // Already dirty: someone else logged it.

  // Storing a region crossing, non-NULL oop, card is clean.
  // Dirty card and log.
  li(Rtmp3, CardTableModRefBS::dirty_card_val());
  //release(); // G1: oops are allowed to get visible after dirty marking.
  stbx(Rtmp3, Rbase, Rcard_addr);

  add(Rcard_addr, Rbase, Rcard_addr); // This is the address which needs to get enqueued.
  Rbase = noreg; // end of lifetime

  // Enqueue the card address into the thread-local dirty card queue.
  const Register Rqueue_index = Rtmp2,
                 Rqueue_buf   = Rtmp3;
  ld(Rqueue_index, in_bytes(JavaThread::dirty_card_queue_offset() + DirtyCardQueue::byte_offset_of_index()), R16_thread);
  cmpdi(CCR0, Rqueue_index, 0);
  beq(CCR0, runtime); // index == 0 then jump to runtime
  ld(Rqueue_buf, in_bytes(JavaThread::dirty_card_queue_offset() + DirtyCardQueue::byte_offset_of_buf()), R16_thread);

  addi(Rqueue_index, Rqueue_index, -wordSize); // decrement index
  std(Rqueue_index, in_bytes(JavaThread::dirty_card_queue_offset() + DirtyCardQueue::byte_offset_of_index()), R16_thread);

  stdx(Rcard_addr, Rqueue_buf, Rqueue_index); // store card
  b(filtered);

  bind(runtime);

  // Save the live input values.
  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), Rcard_addr, R16_thread);

  bind(filtered_int);
}
#endif // INCLUDE_ALL_GCS

// Values for last_Java_pc, and last_Java_sp must comply to the rules
// in frame_ppc.hpp.
void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) {
  // Always set last_Java_pc and flags first because once last_Java_sp
  // is visible has_last_Java_frame is true and users will look at the
  // rest of the fields. (Note: flags should always be zero before we
  // get here so doesn't need to be set.)

  // Verify that last_Java_pc was zeroed on return to Java
  asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread,
                          "last_Java_pc not zeroed before leaving Java", 0x200);

  // When returning from calling out from Java mode the frame anchor's
  // last_Java_pc will always be set to NULL. It is set here so that
  // if we are doing a call to native (not VM) that we capture the
  // known pc and don't have to rely on the native call having a
  // standard frame linkage where we can find the pc.
  if (last_Java_pc != noreg)
    std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);

  // Set last_Java_sp last.
  std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
}

void MacroAssembler::reset_last_Java_frame(void) {
  asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()),
                             R16_thread, "SP was not set, still zero", 0x202);

  BLOCK_COMMENT("reset_last_Java_frame {");
  li(R0, 0);

  // _last_Java_sp = 0
  std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);

  // _last_Java_pc = 0
  std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
  BLOCK_COMMENT("} reset_last_Java_frame");
}

void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1) {
  assert_different_registers(sp, tmp1);

  // sp points to a TOP_IJAVA_FRAME, retrieve frame's PC via
  // TOP_IJAVA_FRAME_ABI.
  // FIXME: assert that we really have a TOP_IJAVA_FRAME here!
  address entry = pc();
  load_const_optimized(tmp1, entry);

  set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1);
}

// Fetch the thread's pending oop result and clear the field.
void MacroAssembler::get_vm_result(Register oop_result) {
  // Read:
  //   R16_thread
  //   R16_thread->in_bytes(JavaThread::vm_result_offset())
  //
  // Updated:
  //   oop_result
  //   R16_thread->in_bytes(JavaThread::vm_result_offset())

  verify_thread();

  ld(oop_result, in_bytes(JavaThread::vm_result_offset()), R16_thread);
  li(R0, 0);
  std(R0, in_bytes(JavaThread::vm_result_offset()), R16_thread);

  verify_oop(oop_result);
}

// Fetch the thread's pending metadata result and clear the field.
void MacroAssembler::get_vm_result_2(Register metadata_result) {
  // Read:
  //   R16_thread
  //   R16_thread->in_bytes(JavaThread::vm_result_2_offset())
  //
  // Updated:
  //   metadata_result
  //   R16_thread->in_bytes(JavaThread::vm_result_2_offset())

  ld(metadata_result, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
  li(R0, 0);
  std(R0, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
}

// Compress a klass pointer. Returns the register that holds the
// compressed klass (dst, or src when no transformation was needed).
Register MacroAssembler::encode_klass_not_null(Register dst, Register src) {
  Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided.
  if (Universe::narrow_klass_base() != 0) {
    // Use dst as temp if it is free.
    sub_const_optimized(dst, current, Universe::narrow_klass_base(), R0);
    current = dst;
  }
  if (Universe::narrow_klass_shift() != 0) {
    srdi(dst, current, Universe::narrow_klass_shift());
    current = dst;
  }
  return current;
}

void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) {
  if (UseCompressedClassPointers) {
    Register compressedKlass = encode_klass_not_null(ck, klass);
    stw(compressedKlass, oopDesc::klass_offset_in_bytes(), dst_oop);
  } else {
    std(klass, oopDesc::klass_offset_in_bytes(), dst_oop);
  }
}

void MacroAssembler::store_klass_gap(Register dst_oop, Register val) {
  if (UseCompressedClassPointers) {
    if (val == noreg) {
      val = R0;
      li(val, 0);
    }
    stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop); // klass gap if compressed
  }
}

// Size in bytes of the code emitted by decode_klass_not_null.
int MacroAssembler::instr_size_for_decode_klass_not_null() {
  if (!UseCompressedClassPointers) return 0;
  int num_instrs = 1; // shift or move
  if (Universe::narrow_klass_base() != 0) num_instrs = 7; // shift + load const + add
  return num_instrs * BytesPerInstWord;
}

void MacroAssembler::decode_klass_not_null(Register dst, Register src) {
  assert(dst != R0, "Dst reg may not be R0, as R0 is used here.");
  if (src == noreg) src = dst;
  Register shifted_src = src;
  // Note: '&&' binds tighter than '||' here; the move happens either when a
  // shift is needed or when base == 0 and the value must move from src to dst.
  if (Universe::narrow_klass_shift() != 0 ||
      Universe::narrow_klass_base() == 0 && src != dst) {  // Move required.
    shifted_src = dst;
    sldi(shifted_src, src, Universe::narrow_klass_shift());
  }
  if (Universe::narrow_klass_base() != 0) {
    add_const_optimized(dst, shifted_src, Universe::narrow_klass_base(), R0);
  }
}

void MacroAssembler::load_klass(Register dst, Register src) {
  if (UseCompressedClassPointers) {
    lwz(dst, oopDesc::klass_offset_in_bytes(), src);
    // Attention: no null check here!
    decode_klass_not_null(dst, dst);
  } else {
    ld(dst, oopDesc::klass_offset_in_bytes(), src);
  }
}

// ((OopHandle)result).resolve();
void MacroAssembler::resolve_oop_handle(Register result) {
  // OopHandle::resolve is an indirection.
  ld(result, 0, result);
}

// Load the java mirror of the klass that holds the given ConstMethod:
// const_method -> constants -> pool_holder -> java_mirror (OopHandle) -> oop.
void MacroAssembler::load_mirror_from_const_method(Register mirror, Register const_method) {
  ld(mirror, in_bytes(ConstMethod::constants_offset()), const_method);
  ld(mirror, ConstantPool::pool_holder_offset_in_bytes(), mirror);
  ld(mirror, in_bytes(Klass::java_mirror_offset()), mirror);
  resolve_oop_handle(mirror);
}

// Clear Array
// For very short arrays. tmp == R0 is allowed.
void MacroAssembler::clear_memory_unrolled(Register base_ptr, int cnt_dwords, Register tmp, int offset) {
  if (cnt_dwords > 0) { li(tmp, 0); }
  // Emit one 8-byte store per doubleword (fully unrolled).
  for (int i = 0; i < cnt_dwords; ++i) { std(tmp, offset + i * 8, base_ptr); }
}

// Version for constant short array length. Kills base_ptr. tmp == R0 is allowed.
void MacroAssembler::clear_memory_constlen(Register base_ptr, int cnt_dwords, Register tmp) {
  // Short arrays: fully unrolled stores, no loop setup overhead.
  if (cnt_dwords < 8) {
    clear_memory_unrolled(base_ptr, cnt_dwords, tmp);
    return;
  }

  // Longer arrays: 2x unrolled counted loop plus one trailing store if odd.
  Label loop;
  const long loopcnt   = cnt_dwords >> 1,
             remainder = cnt_dwords & 1;

  li(tmp, loopcnt);
  mtctr(tmp);
  li(tmp, 0);
  bind(loop);
  std(tmp, 0, base_ptr);
  std(tmp, 8, base_ptr);
  addi(base_ptr, base_ptr, 16);
  bdnz(loop);
  if (remainder) { std(tmp, 0, base_ptr); }
}

// Kills both input registers. tmp == R0 is allowed.
void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp, long const_cnt) {
  // Procedure for large arrays (uses data cache block zero instruction).
  Label startloop, fast, fastloop, small_rest, restloop, done;
  const int cl_size         = VM_Version::L1_data_cache_line_size(),
            cl_dwords       = cl_size >> 3,                    // dwords per cache line
            cl_dw_addr_bits = exact_log2(cl_dwords),
            dcbz_min        = 1,  // Min count of dcbz executions, needs to be >0.
            min_cnt         = ((dcbz_min + 1) << cl_dw_addr_bits) - 1;

  if (const_cnt >= 0) {
    // Constant case.
    if (const_cnt < min_cnt) {
      // Too small to benefit from dcbz; use the plain constant-length version.
      clear_memory_constlen(base_ptr, const_cnt, tmp);
      return;
    }
    load_const_optimized(cnt_dwords, const_cnt, tmp);
  } else {
    // cnt_dwords already loaded in register. Need to check size.
    cmpdi(CCR1, cnt_dwords, min_cnt); // Big enough? (ensure >= dcbz_min lines included).
    blt(CCR1, small_rest);
  }
  rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits); // Extract dword offset within first cache line.
  beq(CCR0, fast);                                  // Already 128byte aligned.

  subfic(tmp, tmp, cl_dwords);
  mtctr(tmp);                        // Set ctr to hit 128byte boundary (0<ctr<cl_dwords).
  subf(cnt_dwords, tmp, cnt_dwords); // rest.
  li(tmp, 0);

  bind(startloop);                   // Clear at the beginning to reach 128byte boundary.
  std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
  addi(base_ptr, base_ptr, 8);
  bdnz(startloop);

  bind(fast);                                  // Clear 128byte blocks.
  srdi(tmp, cnt_dwords, cl_dw_addr_bits);      // Loop count for 128byte loop (>0).
  andi(cnt_dwords, cnt_dwords, cl_dwords-1);   // Rest in dwords.
  mtctr(tmp);                                  // Load counter.

  bind(fastloop);
  dcbz(base_ptr);                    // Clear 128byte aligned block.
  addi(base_ptr, base_ptr, cl_size);
  bdnz(fastloop);

  bind(small_rest);
  cmpdi(CCR0, cnt_dwords, 0);        // size 0?
  beq(CCR0, done);                   // rest == 0
  li(tmp, 0);
  mtctr(cnt_dwords);                 // Load counter.

  bind(restloop);                    // Clear rest.
  std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
  addi(base_ptr, base_ptr, 8);
  bdnz(restloop);

  bind(done);
}

/////////////////////////////////////////// String intrinsics ////////////////////////////////////////////

#ifdef COMPILER2
// Intrinsics for CompactStrings

// Compress char[] to byte[] by compressing 16 bytes at once.
void MacroAssembler::string_compress_16(Register src, Register dst, Register cnt,
                                        Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5,
                                        Label& Lfailure) {

  const Register tmp0 = R0;
  assert_different_registers(src, dst, cnt, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5);
  Label Lloop, Lslow;

  // Check if cnt >= 8 (= 16 bytes)
  lis(tmp1, 0xFF);                // tmp1 = 0x00FF00FF00FF00FF
  srwi_(tmp2, cnt, 3);
  beq(CCR0, Lslow);               // Fewer than 8 chars: skip the fast loop.
  ori(tmp1, tmp1, 0xFF);
  rldimi(tmp1, tmp1, 32, 0);      // Replicate mask into the upper 32 bits.
  mtctr(tmp2);

  // 2x unrolled loop
  bind(Lloop);
  ld(tmp2, 0, src);               // _0_1_2_3 (Big Endian)
  ld(tmp4, 8, src);               // _4_5_6_7

  orr(tmp0, tmp2, tmp4);          // Accumulate all 8 chars for the latin1 test.
  rldicl(tmp3, tmp2, 6*8, 64-24); // _____1_2
  rldimi(tmp2, tmp2, 2*8, 2*8);   // _0_2_3_3
  rldicl(tmp5, tmp4, 6*8, 64-24); // _____5_6
  rldimi(tmp4, tmp4, 2*8, 2*8);   // _4_6_7_7

  andc_(tmp0, tmp0, tmp1);        // Any high byte set => char > 0xFF.
  bne(CCR0, Lfailure);            // Not latin1.
  addi(src, src, 16);

  rlwimi(tmp3, tmp2, 0*8, 24, 31);// _____1_3
  srdi(tmp2, tmp2, 3*8);          // ____0_2_
  rlwimi(tmp5, tmp4, 0*8, 24, 31);// _____5_7
  srdi(tmp4, tmp4, 3*8);          // ____4_6_

  orr(tmp2, tmp2, tmp3);          // ____0123
  orr(tmp4, tmp4, tmp5);          // ____4567

  stw(tmp2, 0, dst);
  stw(tmp4, 4, dst);
  addi(dst, dst, 8);
  bdnz(Lloop);

  bind(Lslow);                    // Fallback to slow version
}

// Compress char[] to byte[]. cnt must be positive int.
void MacroAssembler::string_compress(Register src, Register dst, Register cnt, Register tmp, Label& Lfailure) {
  Label Lloop;
  mtctr(cnt);

  // One char per iteration; branch to Lfailure on the first non-latin1 char.
  bind(Lloop);
  lhz(tmp, 0, src);
  cmplwi(CCR0, tmp, 0xff);
  bgt(CCR0, Lfailure);            // Not latin1.
  addi(src, src, 2);
  stb(tmp, 0, dst);
  addi(dst, dst, 1);
  bdnz(Lloop);
}

// Inflate byte[] to char[] by inflating 16 bytes at once.
void MacroAssembler::string_inflate_16(Register src, Register dst, Register cnt,
                                       Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) {
  const Register tmp0 = R0;
  assert_different_registers(src, dst, cnt, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5);
  Label Lloop, Lslow;

  // Check if cnt >= 8
  srwi_(tmp2, cnt, 3);
  beq(CCR0, Lslow);               // Fewer than 8 chars: skip the fast loop.
  lis(tmp1, 0xFF);                // tmp1 = 0x00FF00FF
  ori(tmp1, tmp1, 0xFF);
  mtctr(tmp2);

  // 2x unrolled loop
  bind(Lloop);
  lwz(tmp2, 0, src);              // ____0123 (Big Endian)
  lwz(tmp4, 4, src);              // ____4567
  addi(src, src, 8);

  rldicl(tmp3, tmp2, 7*8, 64-8);  // _______2
  rlwimi(tmp2, tmp2, 3*8, 16, 23);// ____0113
  rldicl(tmp5, tmp4, 7*8, 64-8);  // _______6
  rlwimi(tmp4, tmp4, 3*8, 16, 23);// ____4557

  andc(tmp0, tmp2, tmp1);         // ____0_1_
  rlwimi(tmp2, tmp3, 2*8, 0, 23); // _____2_3
  andc(tmp3, tmp4, tmp1);         // ____4_5_
  rlwimi(tmp4, tmp5, 2*8, 0, 23); // _____6_7

  rldimi(tmp2, tmp0, 3*8, 0*8);   // _0_1_2_3
  rldimi(tmp4, tmp3, 3*8, 0*8);   // _4_5_6_7

  std(tmp2, 0, dst);
  std(tmp4, 8, dst);
  addi(dst, dst, 16);
  bdnz(Lloop);

  bind(Lslow);                    // Fallback to slow version
}

// Inflate byte[] to char[]. cnt must be positive int.
void MacroAssembler::string_inflate(Register src, Register dst, Register cnt, Register tmp) {
  Label Lloop;
  mtctr(cnt);

  // One byte per iteration, zero-extended to a char.
  bind(Lloop);
  lbz(tmp, 0, src);
  addi(src, src, 1);
  sth(tmp, 0, dst);
  addi(dst, dst, 2);
  bdnz(Lloop);
}

// Compare two strings; ae encodes the operand encodings (LL/UU/LU/UL).
// Leaves a compareTo-style result (<0, 0, >0) in result.
void MacroAssembler::string_compare(Register str1, Register str2,
                                    Register cnt1, Register cnt2,
                                    Register tmp1, Register result, int ae) {
  const Register tmp0 = R0,
                 diff = tmp1;

  assert_different_registers(str1, str2, cnt1, cnt2, tmp0, tmp1, result);
  Label Ldone, Lslow, Lloop, Lreturn_diff;

  // Note: Making use of the fact that compareTo(a, b) == -compareTo(b, a)
  // we interchange str1 and str2 in the UL case and negate the result.
  // Like this, str1 is always latin1 encoded, except for the UU case.
  // In addition, we need 0 (or sign which is 0) extend.

  if (ae == StrIntrinsicNode::UU) {
    srwi(cnt1, cnt1, 1);               // Byte count -> char count.
  } else {
    clrldi(cnt1, cnt1, 32);
  }

  if (ae != StrIntrinsicNode::LL) {
    srwi(cnt2, cnt2, 1);               // Byte count -> char count.
  } else {
    clrldi(cnt2, cnt2, 32);
  }

  // See if the lengths are different, and calculate min in cnt1.
  // Save diff in case we need it for a tie-breaker.
  subf_(diff, cnt2, cnt1); // diff = cnt1 - cnt2
  // if (diff > 0) { cnt1 = cnt2; }
  if (VM_Version::has_isel()) {
    isel(cnt1, CCR0, Assembler::greater, /*invert*/ false, cnt2);
  } else {
    Label Lskip;
    blt(CCR0, Lskip);
    mr(cnt1, cnt2);
    bind(Lskip);
  }

  // Rename registers
  Register chr1 = result;
  Register chr2 = tmp0;

  // Compare multiple characters in fast loop (only implemented for same encoding).
  int stride1 = 8, stride2 = 8;
  if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
    int log2_chars_per_iter = (ae == StrIntrinsicNode::LL) ? 3 : 2;
    Label Lfastloop, Lskipfast;

    srwi_(tmp0, cnt1, log2_chars_per_iter);
    beq(CCR0, Lskipfast);
    rldicl(cnt2, cnt1, 0, 64 - log2_chars_per_iter); // Remaining characters.
    li(cnt1, 1 << log2_chars_per_iter); // Initialize for failure case: Rescan characters from current iteration.
    mtctr(tmp0);

    // 8 bytes (8 latin1 chars / 4 UTF-16 chars) per iteration.
    bind(Lfastloop);
    ld(chr1, 0, str1);
    ld(chr2, 0, str2);
    cmpd(CCR0, chr1, chr2);
    bne(CCR0, Lslow);                // Mismatch: rescan this iteration char by char.
    addi(str1, str1, stride1);
    addi(str2, str2, stride2);
    bdnz(Lfastloop);
    mr(cnt1, cnt2); // Remaining characters.
    bind(Lskipfast);
  }

  // Loop which searches the first difference character by character.
  cmpwi(CCR0, cnt1, 0);
  beq(CCR0, Lreturn_diff);
  bind(Lslow);
  mtctr(cnt1);

  switch (ae) {
    case StrIntrinsicNode::LL: stride1 = 1; stride2 = 1; break;
    case StrIntrinsicNode::UL: // fallthru (see comment above)
    case StrIntrinsicNode::LU: stride1 = 1; stride2 = 2; break;
    case StrIntrinsicNode::UU: stride1 = 2; stride2 = 2; break;
    default: ShouldNotReachHere(); break;
  }

  bind(Lloop);
  if (stride1 == 1) { lbz(chr1, 0, str1); } else { lhz(chr1, 0, str1); }
  if (stride2 == 1) { lbz(chr2, 0, str2); } else { lhz(chr2, 0, str2); }
  subf_(result, chr2, chr1); // result = chr1 - chr2
  bne(CCR0, Ldone);
  addi(str1, str1, stride1);
  addi(str2, str2, stride2);
  bdnz(Lloop);

  // If strings are equal up to min length, return the length difference.
  bind(Lreturn_diff);
  mr(result, diff);

  // Otherwise, return the difference between the first mismatched chars.
  bind(Ldone);
  if (ae == StrIntrinsicNode::UL) {
    neg(result, result); // Negate result (see note above).
  }
}

// Set result to 1 if the arrays/string values are equal, else 0.
// is_array_equ selects the Arrays.equals variant (handles NULL, length, header).
void MacroAssembler::array_equals(bool is_array_equ, Register ary1, Register ary2,
                                  Register limit, Register tmp1, Register result, bool is_byte) {
  const Register tmp0 = R0;
  assert_different_registers(ary1, ary2, limit, tmp0, tmp1, result);
  Label Ldone, Lskiploop, Lloop, Lfastloop, Lskipfast;
  bool limit_needs_shift = false;

  if (is_array_equ) {
    const int length_offset = arrayOopDesc::length_offset_in_bytes();
    const int base_offset   = arrayOopDesc::base_offset_in_bytes(is_byte ? T_BYTE : T_CHAR);

    // Return true if the same array.
    cmpd(CCR0, ary1, ary2);
    beq(CCR0, Lskiploop);

    // Return false if one of them is NULL.
    cmpdi(CCR0, ary1, 0);
    cmpdi(CCR1, ary2, 0);
    li(result, 0);
    cror(CCR0, Assembler::equal, CCR1, Assembler::equal);
    beq(CCR0, Ldone);

    // Load the lengths of arrays.
    lwz(limit, length_offset, ary1);
    lwz(tmp0, length_offset, ary2);

    // Return false if the two arrays are not equal length.
    cmpw(CCR0, limit, tmp0);
    bne(CCR0, Ldone);

    // Load array addresses.
    addi(ary1, ary1, base_offset);
    addi(ary2, ary2, base_offset);
  } else {
    limit_needs_shift = !is_byte;      // limit is in bytes; chars need /2.
    li(result, 0); // Assume not equal.
  }

  // Rename registers
  Register chr1 = tmp0;
  Register chr2 = tmp1;

  // Compare 8 bytes per iteration in fast loop.
  const int log2_chars_per_iter = is_byte ? 3 : 2;

  srwi_(tmp0, limit, log2_chars_per_iter + (limit_needs_shift ? 1 : 0));
  beq(CCR0, Lskipfast);
  mtctr(tmp0);

  bind(Lfastloop);
  ld(chr1, 0, ary1);
  ld(chr2, 0, ary2);
  addi(ary1, ary1, 8);
  addi(ary2, ary2, 8);
  cmpd(CCR0, chr1, chr2);
  bne(CCR0, Ldone);
  bdnz(Lfastloop);

  bind(Lskipfast);
  rldicl_(limit, limit, limit_needs_shift ? 64 - 1 : 0, 64 - log2_chars_per_iter); // Remaining characters.
  beq(CCR0, Lskiploop);
  mtctr(limit);

  // Character by character.
  bind(Lloop);
  if (is_byte) {
    lbz(chr1, 0, ary1);
    lbz(chr2, 0, ary2);
    addi(ary1, ary1, 1);
    addi(ary2, ary2, 1);
  } else {
    lhz(chr1, 0, ary1);
    lhz(chr2, 0, ary2);
    addi(ary1, ary1, 2);
    addi(ary2, ary2, 2);
  }
  cmpw(CCR0, chr1, chr2);
  bne(CCR0, Ldone);
  bdnz(Lloop);

  bind(Lskiploop);
  li(result, 1); // All characters are equal.
  bind(Ldone);
}

// Search for needle in haystack; result is the found index or -1.
// needlecntval == 0 means the needle length is in register needlecnt.
void MacroAssembler::string_indexof(Register result, Register haystack, Register haycnt,
                                    Register needle, ciTypeArray* needle_values, Register needlecnt, int needlecntval,
                                    Register tmp1, Register tmp2, Register tmp3, Register tmp4, int ae) {

  // Ensure 0<needlecnt<=haycnt in ideal graph as prerequisite!
  Label L_TooShort, L_Found, L_NotFound, L_End;
  Register last_addr = haycnt,  // Kill haycnt at the beginning.
           addr      = tmp1,
           n_start   = tmp2,
           ch1       = tmp3,
           ch2       = R0;

  assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
  const int h_csize = (ae == StrIntrinsicNode::LL) ? 1 : 2; // haystack char size in bytes
  const int n_csize = (ae == StrIntrinsicNode::UU) ? 2 : 1; // needle char size in bytes

  // **************************************************************************************************
  // Prepare for main loop: optimized for needle count >=2, bail out otherwise.
  // **************************************************************************************************

  // Compute last haystack addr to use if no match gets found.
  clrldi(haycnt, haycnt, 32);         // Ensure positive int is valid as 64 bit value.
  addi(addr, haystack, -h_csize);     // Accesses use pre-increment.
  if (needlecntval == 0) { // variable needlecnt
    cmpwi(CCR6, needlecnt, 2);
    clrldi(needlecnt, needlecnt, 32); // Ensure positive int is valid as 64 bit value.
    blt(CCR6, L_TooShort);            // Variable needlecnt: handle short needle separately.
  }

  if (n_csize == 2) { lwz(n_start, 0, needle); } else { lhz(n_start, 0, needle); } // Load first 2 characters of needle.

  if (needlecntval == 0) { // variable needlecnt
    subf(ch1, needlecnt, haycnt);     // Last character index to compare is haycnt-needlecnt.
    addi(needlecnt, needlecnt, -2);   // Rest of needle.
  } else { // constant needlecnt
    guarantee(needlecntval != 1, "IndexOf with single-character needle must be handled separately");
    assert((needlecntval & 0x7fff) == needlecntval, "wrong immediate");
    addi(ch1, haycnt, -needlecntval); // Last character index to compare is haycnt-needlecnt.
    if (needlecntval > 3) { li(needlecnt, needlecntval - 2); } // Rest of needle.
  }

  if (h_csize == 2) { slwi(ch1, ch1, 1); } // Scale to number of bytes.

  if (ae == StrIntrinsicNode::UL) {
    // Widen the first two latin1 needle chars into UTF-16 form for comparison.
    srwi(tmp4, n_start, 1*8);          // ___0
    rlwimi(n_start, tmp4, 2*8, 0, 23); // _0_1
  }

  add(last_addr, haystack, ch1);       // Point to last address to compare (haystack+2*(haycnt-needlecnt)).

  // Main Loop (now we have at least 2 characters).
  Label L_OuterLoop, L_InnerLoop, L_FinalCheck, L_Comp1, L_Comp2;
  bind(L_OuterLoop); // Search for 1st 2 characters.
  Register addr_diff = tmp4;
  subf(addr_diff, addr, last_addr);  // Difference between already checked address and last address to check.
  addi(addr, addr, h_csize);         // This is the new address we want to use for comparing.
  srdi_(ch2, addr_diff, h_csize);
  beq(CCR0, L_FinalCheck);           // 2 characters left?
  mtctr(ch2);                        // num of characters / 2
  bind(L_InnerLoop);                 // Main work horse (2x unrolled search loop)
  if (h_csize == 2) {                // Load 2 characters of haystack (ignore alignment).
    lwz(ch1, 0, addr);
    lwz(ch2, 2, addr);
  } else {
    lhz(ch1, 0, addr);
    lhz(ch2, 1, addr);
  }
  cmpw(CCR0, ch1, n_start);          // Compare 2 characters (1 would be sufficient but try to reduce branches to CompLoop).
  cmpw(CCR1, ch2, n_start);
  beq(CCR0, L_Comp1);                // Did we find the needle start?
  beq(CCR1, L_Comp2);
  addi(addr, addr, 2 * h_csize);
  bdnz(L_InnerLoop);
  bind(L_FinalCheck);
  andi_(addr_diff, addr_diff, h_csize); // Remaining characters not covered by InnerLoop: (num of characters) & 1.
  beq(CCR0, L_NotFound);
  if (h_csize == 2) { lwz(ch1, 0, addr); } else { lhz(ch1, 0, addr); } // One position left at which we have to compare.
  cmpw(CCR1, ch1, n_start);
  beq(CCR1, L_Comp1);
  bind(L_NotFound);
  li(result, -1); // not found
  b(L_End);

  // **************************************************************************************************
  // Special Case: unfortunately, the variable needle case can be called with needlecnt<2
  // **************************************************************************************************
  if (needlecntval == 0) { // We have to handle these cases separately.
    Label L_OneCharLoop;
    bind(L_TooShort);
    mtctr(haycnt);
    if (n_csize == 2) { lhz(n_start, 0, needle); } else { lbz(n_start, 0, needle); } // First character of needle
    bind(L_OneCharLoop);
    if (h_csize == 2) { lhzu(ch1, 2, addr); } else { lbzu(ch1, 1, addr); }
    cmpw(CCR1, ch1, n_start);
    beq(CCR1, L_Found);   // Did we find the one character needle?
    bdnz(L_OneCharLoop);
    li(result, -1);       // Not found.
    b(L_End);
  }

  // **************************************************************************************************
  // Regular Case Part II: compare rest of needle (first 2 characters have been compared already)
  // **************************************************************************************************

  // Compare the rest
  bind(L_Comp2);
  addi(addr, addr, h_csize); // First comparison has failed, 2nd one hit.
  bind(L_Comp1);             // Addr points to possible needle start.
  if (needlecntval != 2) {   // Const needlecnt==2?
    if (needlecntval != 3) {
      if (needlecntval == 0) { beq(CCR6, L_Found); } // Variable needlecnt==2?
      Register n_ind = tmp4,
               h_ind = n_ind;
      li(n_ind, 2 * n_csize); // First 2 characters are already compared, use index 2.
      mtctr(needlecnt);       // Decremented by 2, still > 0.
      Label L_CompLoop;
      bind(L_CompLoop);
      if (ae == StrIntrinsicNode::UL) {
        h_ind = ch1;          // UL: haystack index is 2x the needle index.
        sldi(h_ind, n_ind, 1);
      }
      if (n_csize == 2) { lhzx(ch2, needle, n_ind); } else { lbzx(ch2, needle, n_ind); }
      if (h_csize == 2) { lhzx(ch1, addr, h_ind); } else { lbzx(ch1, addr, h_ind); }
      cmpw(CCR1, ch1, ch2);
      bne(CCR1, L_OuterLoop); // Mismatch: resume outer search.
      addi(n_ind, n_ind, n_csize);
      bdnz(L_CompLoop);
    } else { // No loop required if there's only one needle character left.
      if (n_csize == 2) { lhz(ch2, 2 * 2, needle); } else { lbz(ch2, 2 * 1, needle); }
      if (h_csize == 2) { lhz(ch1, 2 * 2, addr); } else { lbz(ch1, 2 * 1, addr); }
      cmpw(CCR1, ch1, ch2);
      bne(CCR1, L_OuterLoop);
    }
  }
  // Return index ...
  bind(L_Found);
  subf(result, haystack, addr);                  // relative to haystack, ...
  if (h_csize == 2) { srdi(result, result, 1); } // in characters.
  bind(L_End);
} // string_indexof

// Search for a single char (immediate needleChar if needle == R0, else in register).
// result is the found index or -1. The numeric comments track instruction counts.
void MacroAssembler::string_indexof_char(Register result, Register haystack, Register haycnt,
                                         Register needle, jchar needleChar, Register tmp1, Register tmp2, bool is_byte) {
  assert_different_registers(haystack, haycnt, needle, tmp1, tmp2);

  Label L_InnerLoop, L_FinalCheck, L_Found1, L_Found2, L_NotFound, L_End;
  Register addr = tmp1,
           ch1  = tmp2,
           ch2  = R0;

  const int h_csize = is_byte ? 1 : 2;

//4:
  srwi_(tmp2, haycnt, 1);   // Shift right by exact_log2(UNROLL_FACTOR).
  mr(addr, haystack);
  beq(CCR0, L_FinalCheck);
  mtctr(tmp2);              // Move to count register.
//8:
  bind(L_InnerLoop);        // Main work horse (2x unrolled search loop).
  if (!is_byte) {
    lhz(ch1, 0, addr);
    lhz(ch2, 2, addr);
  } else {
    lbz(ch1, 0, addr);
    lbz(ch2, 1, addr);
  }
  (needle != R0) ? cmpw(CCR0, ch1, needle) : cmplwi(CCR0, ch1, (unsigned int)needleChar);
  (needle != R0) ? cmpw(CCR1, ch2, needle) : cmplwi(CCR1, ch2, (unsigned int)needleChar);
  beq(CCR0, L_Found1); // Did we find the needle?
  beq(CCR1, L_Found2);
  addi(addr, addr, 2 * h_csize);
  bdnz(L_InnerLoop);
//16:
  bind(L_FinalCheck);
  andi_(R0, haycnt, 1);
  beq(CCR0, L_NotFound);
  if (!is_byte) { lhz(ch1, 0, addr); } else { lbz(ch1, 0, addr); } // One position left at which we have to compare.
  (needle != R0) ? cmpw(CCR1, ch1, needle) : cmplwi(CCR1, ch1, (unsigned int)needleChar);
  beq(CCR1, L_Found1);
//21:
  bind(L_NotFound);
  li(result, -1); // Not found.
  b(L_End);

  bind(L_Found2);
  addi(addr, addr, h_csize);
//24:
  bind(L_Found1);               // Return index ...
  subf(result, haystack, addr); // relative to haystack, ...
  if (!is_byte) { srdi(result, result, 1); } // in characters.
  bind(L_End);
} // string_indexof_char

// Set result to 1 if any byte in src[0..cnt) has its sign bit set, else 0.
void MacroAssembler::has_negatives(Register src, Register cnt, Register result,
                                   Register tmp1, Register tmp2) {
  const Register tmp0 = R0;
  assert_different_registers(src, result, cnt, tmp0, tmp1, tmp2);
  Label Lfastloop, Lslow, Lloop, Lnoneg, Ldone;

  // Check if cnt >= 16 (the fast loop consumes 16 bytes per iteration; srwi by 4).
  lis(tmp1, (int)(short)0x8080);  // tmp1 = 0x8080808080808080
  srwi_(tmp2, cnt, 4);
  li(result, 1);                  // Assume there's a negative byte.
  beq(CCR0, Lslow);
  ori(tmp1, tmp1, 0x8080);
  rldimi(tmp1, tmp1, 32, 0);
  mtctr(tmp2);

  // 2x unrolled loop
  bind(Lfastloop);
  ld(tmp2, 0, src);
  ld(tmp0, 8, src);

  orr(tmp0, tmp2, tmp0);

  and_(tmp0, tmp0, tmp1);         // Test all 16 sign bits at once.
  bne(CCR0, Ldone);               // Found negative byte.
  addi(src, src, 16);

  bdnz(Lfastloop);

  bind(Lslow);                    // Fallback to slow version
  rldicl_(tmp0, cnt, 0, 64-4);    // Remaining bytes (cnt mod 16).
  beq(CCR0, Lnoneg);
  mtctr(tmp0);
  bind(Lloop);
  lbz(tmp0, 0, src);
  addi(src, src, 1);
  andi_(tmp0, tmp0, 0x80);
  bne(CCR0, Ldone);               // Found negative byte.
  bdnz(Lloop);
  bind(Lnoneg);
  li(result, 0);

  bind(Ldone);
}

#endif // Compiler2

// Helpers for Intrinsic Emitters
//
// Revert the byte order of a 32bit value in a register
//   src: 0x44556677
//   dst: 0x77665544
// Three steps to obtain the result:
//  1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word
//     into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared.
//     This value initializes dst.
//  2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost
//     byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go.
//     This value is mask inserted into dst with a [0..23] mask of 1s.
//  3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position.
//     This value is mask inserted into dst with a [8..15] mask of 1s.
void MacroAssembler::load_reverse_32(Register dst, Register src) {
  assert_different_registers(dst, src);

  rldicl(dst, src, (4+1)*8, 56);     // Rotate byte 4 into position 7 (rightmost), clear all to the left.
  rlwimi(dst, src, 3*8, 0, 23);      // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone.
  rlwimi(dst, src, 1*8, 8, 15);      // Insert byte 6 into position 5, leave the rest alone.
}

// Calculate the column addresses of the crc32 lookup table into distinct registers.
// This loop-invariant calculation is moved out of the loop body, reducing the loop
// body size from 20 to 16 instructions.
// Returns the offset that was used to calculate the address of column tc3.
// Due to register shortage, setting tc3 may overwrite table. With the return offset
// at hand, the original table address can be easily reconstructed.
4052 int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) { 4053 4054 #ifdef VM_LITTLE_ENDIAN 4055 // This is what we implement (the DOLIT4 part): 4056 // ========================================================================= */ 4057 // #define DOLIT4 c ^= *buf4++; \ 4058 // c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^ \ 4059 // crc_table[1][(c >> 16) & 0xff] ^ crc_table[0][c >> 24] 4060 // #define DOLIT32 DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4 4061 // ========================================================================= */ 4062 const int ix0 = 3*(4*CRC32_COLUMN_SIZE); 4063 const int ix1 = 2*(4*CRC32_COLUMN_SIZE); 4064 const int ix2 = 1*(4*CRC32_COLUMN_SIZE); 4065 const int ix3 = 0*(4*CRC32_COLUMN_SIZE); 4066 #else 4067 // This is what we implement (the DOBIG4 part): 4068 // ========================================================================= 4069 // #define DOBIG4 c ^= *++buf4; \ 4070 // c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \ 4071 // crc_table[6][(c >> 16) & 0xff] ^ crc_table[7][c >> 24] 4072 // #define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4 4073 // ========================================================================= 4074 const int ix0 = 4*(4*CRC32_COLUMN_SIZE); 4075 const int ix1 = 5*(4*CRC32_COLUMN_SIZE); 4076 const int ix2 = 6*(4*CRC32_COLUMN_SIZE); 4077 const int ix3 = 7*(4*CRC32_COLUMN_SIZE); 4078 #endif 4079 assert_different_registers(table, tc0, tc1, tc2); 4080 assert(table == tc3, "must be!"); 4081 4082 addi(tc0, table, ix0); 4083 addi(tc1, table, ix1); 4084 addi(tc2, table, ix2); 4085 if (ix3 != 0) addi(tc3, table, ix3); 4086 4087 return ix3; 4088 } 4089 4090 /** 4091 * uint32_t crc; 4092 * timesXtoThe32[crc & 0xFF] ^ (crc >> 8); 4093 */ 4094 void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) { 4095 assert_different_registers(crc, table, tmp); 
4096 assert_different_registers(val, table); 4097 4098 if (crc == val) { // Must rotate first to use the unmodified value. 4099 rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest. 4100 // As we use a word (4-byte) instruction, we have to adapt the mask bit positions. 4101 srwi(crc, crc, 8); // Unsigned shift, clear leftmost 8 bits. 4102 } else { 4103 srwi(crc, crc, 8); // Unsigned shift, clear leftmost 8 bits. 4104 rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest. 4105 } 4106 lwzx(tmp, table, tmp); 4107 xorr(crc, crc, tmp); 4108 } 4109 4110 /** 4111 * uint32_t crc; 4112 * timesXtoThe32[crc & 0xFF] ^ (crc >> 8); 4113 */ 4114 void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) { 4115 fold_byte_crc32(crc, crc, table, tmp); 4116 } 4117 4118 /** 4119 * Emits code to update CRC-32 with a byte value according to constants in table. 4120 * 4121 * @param [in,out]crc Register containing the crc. 4122 * @param [in]val Register containing the byte to fold into the CRC. 4123 * @param [in]table Register containing the table of crc constants. 
4124 * 4125 * uint32_t crc; 4126 * val = crc_table[(val ^ crc) & 0xFF]; 4127 * crc = val ^ (crc >> 8); 4128 */ 4129 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) { 4130 BLOCK_COMMENT("update_byte_crc32:"); 4131 xorr(val, val, crc); 4132 fold_byte_crc32(crc, val, table, val); 4133 } 4134 4135 /** 4136 * @param crc register containing existing CRC (32-bit) 4137 * @param buf register pointing to input byte buffer (byte*) 4138 * @param len register containing number of bytes 4139 * @param table register pointing to CRC table 4140 */ 4141 void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table, 4142 Register data, bool loopAlignment) { 4143 assert_different_registers(crc, buf, len, table, data); 4144 4145 Label L_mainLoop, L_done; 4146 const int mainLoop_stepping = 1; 4147 const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4; 4148 4149 // Process all bytes in a single-byte loop. 4150 clrldi_(len, len, 32); // Enforce 32 bit. Anything to do? 4151 beq(CCR0, L_done); 4152 4153 mtctr(len); 4154 align(mainLoop_alignment); 4155 BIND(L_mainLoop); 4156 lbz(data, 0, buf); // Byte from buffer, zero-extended. 4157 addi(buf, buf, mainLoop_stepping); // Advance buffer position. 4158 update_byte_crc32(crc, data, table); 4159 bdnz(L_mainLoop); // Iterate. 4160 4161 bind(L_done); 4162 } 4163 4164 /** 4165 * Emits code to update CRC-32 with a 4-byte value according to constants in table 4166 * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c 4167 */ 4168 // A not on the lookup table address(es): 4169 // The lookup table consists of two sets of four columns each. 4170 // The columns {0..3} are used for little-endian machines. 4171 // The columns {4..7} are used for big-endian machines. 
// To save the effort of adding the column offset to the table address each time
// a table element is looked up, it is possible to pass the pre-calculated
// column addresses.
// Uses R9..R12 as work register. Must be saved/restored by caller, if necessary.
void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc,
                                        Register t0,  Register t1,  Register t2,  Register t3,
                                        Register tc0, Register tc1, Register tc2, Register tc3) {
  assert_different_registers(crc, t3);

  // XOR crc with next four bytes of buffer.
  lwz(t3, bufDisp, buf);
  if (bufInc != 0) {
    addi(buf, buf, bufInc);                // Advance buffer position only when requested.
  }
  xorr(t3, t3, crc);

  // Chop crc into 4 single-byte pieces, shifted left 2 bits, to form the table indices.
  rlwinm(t0, t3, 2,         24-2, 31-2);   // ((t3 >>  0) & 0xff) << 2
  rlwinm(t1, t3, 32+(2- 8), 24-2, 31-2);   // ((t3 >>  8) & 0xff) << 2
  rlwinm(t2, t3, 32+(2-16), 24-2, 31-2);   // ((t3 >> 16) & 0xff) << 2
  rlwinm(t3, t3, 32+(2-24), 24-2, 31-2);   // ((t3 >> 24) & 0xff) << 2  (t3 overwritten last)

  // Use the pre-calculated column addresses.
  // Load pre-calculated table values.
  lwzx(t0, tc0, t0);
  lwzx(t1, tc1, t1);
  lwzx(t2, tc2, t2);
  lwzx(t3, tc3, t3);

  // Calculate new crc from table values.
  xorr(t0, t0, t1);
  xorr(t2, t2, t3);
  xorr(crc, t0, t2);  // Now crc contains the final checksum value.
}

/**
 * @param crc   register containing existing CRC (32-bit)
 * @param buf   register pointing to input byte buffer (byte*)
 * @param len   register containing number of bytes
 * @param table register pointing to CRC table
 *
 * Uses R9..R12 as work register. Must be saved/restored by caller!
 */
void MacroAssembler::kernel_crc32_2word(Register crc, Register buf, Register len, Register table,
                                        Register t0,  Register t1,  Register t2,  Register t3,
                                        Register tc0, Register tc1, Register tc2, Register tc3,
                                        bool invertCRC) {
  assert_different_registers(crc, buf, len, table);

  Label L_mainLoop, L_tail;
  Register tmp  = t0;                            // Aliases: tmp and data share t0.
  Register data = t0;
  Register tmp2 = t1;
  const int mainLoop_stepping  = 8;              // Two words (8 bytes) per main-loop iteration.
  const int tailLoop_stepping  = 1;
  const int log_stepping       = exact_log2(mainLoop_stepping);
  const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
  const int complexThreshold   = 2*mainLoop_stepping;

  // Don't test for len <= 0 here. This pathological case should not occur anyway.
  // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles
  // for all well-behaved cases. The situation itself is detected and handled correctly
  // within update_byteLoop_crc32.
  assert(tailLoop_stepping == 1, "check tailLoop_stepping!");

  BLOCK_COMMENT("kernel_crc32_2word {");

  // Pre-invert on request (CRC-32 is computed on the 1s complement).
  if (invertCRC) {
    nand(crc, crc, crc);                      // 1s complement of crc
  }

  // Check for short (<mainLoop_stepping) buffer.
  cmpdi(CCR0, len, complexThreshold);
  blt(CCR0, L_tail);

  // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
  // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
  {
    // Align buf addr to mainLoop_stepping boundary.
    neg(tmp2, buf);                              // Calculate # preLoop iterations for alignment.
    rldicl(tmp2, tmp2, 0, 64-log_stepping);      // Rotate tmp2 0 bits, insert into tmp2, anding with mask with 1s from 62..63.

    if (complexThreshold > mainLoop_stepping) {
      sub(len, len, tmp2);                       // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
    } else {
      sub(tmp, len, tmp2);                       // Remaining bytes for main loop.
      cmpdi(CCR0, tmp, mainLoop_stepping);
      blt(CCR0, L_tail);                         // For less than one mainloop_stepping left, do only tail processing
      mr(len, tmp);                              // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
    }
    update_byteLoop_crc32(crc, buf, tmp2, table, data, false);
  }

  srdi(tmp2, len, log_stepping);                 // #iterations for mainLoop
  andi(len, len, mainLoop_stepping-1);           // remaining bytes for tailLoop
  mtctr(tmp2);

#ifdef VM_LITTLE_ENDIAN
  Register crc_rv = crc;
#else
  Register crc_rv = tmp;                         // Load_reverse needs separate registers to work on.
                                                 // Occupies tmp, but frees up crc.
  load_reverse_32(crc_rv, crc);                  // Revert byte order because we are dealing with big-endian data.
  tmp = crc;
#endif

  int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);

  align(mainLoop_alignment);                     // Octoword-aligned loop address. Shows 2% improvement.
  BIND(L_mainLoop);
    // Two table-driven word updates per iteration; buf advanced once by mainLoop_stepping.
    update_1word_crc32(crc_rv, buf, table, 0, 0, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
    update_1word_crc32(crc_rv, buf, table, 4, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
    bdnz(L_mainLoop);

#ifndef VM_LITTLE_ENDIAN
  load_reverse_32(crc, crc_rv);                  // Revert byte order because we are dealing with big-endian data.
  tmp = crc_rv;                                  // Tmp uses its original register again.
#endif

  // Restore original table address for tailLoop.
  if (reconstructTableOffset != 0) {
    addi(table, table, -reconstructTableOffset);
  }

  // Process last few (<complexThreshold) bytes of buffer.
  BIND(L_tail);
    update_byteLoop_crc32(crc, buf, len, table, data, false);

  // Post-invert on request (symmetric to the pre-invert above).
  if (invertCRC) {
    nand(crc, crc, crc);                      // 1s complement of crc
  }
  BLOCK_COMMENT("} kernel_crc32_2word");
}

/**
 * @param crc   register containing existing CRC (32-bit)
 * @param buf   register pointing to input byte buffer (byte*)
 * @param len   register containing number of bytes
 * @param table register pointing to CRC table
 *
 * uses R9..R12 as work register. Must be saved/restored by caller!
 */
void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table,
                                        Register t0,  Register t1,  Register t2,  Register t3,
                                        Register tc0, Register tc1, Register tc2, Register tc3,
                                        bool invertCRC) {
  assert_different_registers(crc, buf, len, table);

  Label L_mainLoop, L_tail;
  Register tmp  = t0;                            // Aliases: tmp and data share t0.
  Register data = t0;
  Register tmp2 = t1;
  const int mainLoop_stepping  = 4;              // One word (4 bytes) per main-loop iteration.
  const int tailLoop_stepping  = 1;
  const int log_stepping       = exact_log2(mainLoop_stepping);
  const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
  const int complexThreshold   = 2*mainLoop_stepping;

  // Don't test for len <= 0 here. This pathological case should not occur anyway.
  // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles
  // for all well-behaved cases. The situation itself is detected and handled correctly
  // within update_byteLoop_crc32.
  assert(tailLoop_stepping == 1, "check tailLoop_stepping!");

  BLOCK_COMMENT("kernel_crc32_1word {");

  if (invertCRC) {
    nand(crc, crc, crc);                      // 1s complement of crc
  }

  // Check for short (<mainLoop_stepping) buffer.
  cmpdi(CCR0, len, complexThreshold);
  blt(CCR0, L_tail);

  // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
  // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
  {
    // Align buf addr to mainLoop_stepping boundary.
    neg(tmp2, buf);                              // Calculate # preLoop iterations for alignment.
    rldicl(tmp2, tmp2, 0, 64-log_stepping);      // Rotate tmp2 0 bits, insert into tmp2, anding with mask with 1s from 62..63.

    if (complexThreshold > mainLoop_stepping) {
      sub(len, len, tmp2);                       // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
    } else {
      sub(tmp, len, tmp2);                       // Remaining bytes for main loop.
      cmpdi(CCR0, tmp, mainLoop_stepping);
      blt(CCR0, L_tail);                         // For less than one mainloop_stepping left, do only tail processing
      mr(len, tmp);                              // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
    }
    update_byteLoop_crc32(crc, buf, tmp2, table, data, false);
  }

  srdi(tmp2, len, log_stepping);                 // #iterations for mainLoop
  andi(len, len, mainLoop_stepping-1);           // remaining bytes for tailLoop
  mtctr(tmp2);

#ifdef VM_LITTLE_ENDIAN
  Register crc_rv = crc;
#else
  Register crc_rv = tmp;                         // Load_reverse needs separate registers to work on.
                                                 // Occupies tmp, but frees up crc.
  load_reverse_32(crc_rv, crc);                  // Revert byte order because we are dealing with big-endian data.
  tmp = crc;
#endif

  int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);

  align(mainLoop_alignment);                     // Octoword-aligned loop address. Shows 2% improvement.
  BIND(L_mainLoop);
    update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
    bdnz(L_mainLoop);

#ifndef VM_LITTLE_ENDIAN
  load_reverse_32(crc, crc_rv);                  // Revert byte order because we are dealing with big-endian data.
  tmp = crc_rv;                                  // Tmp uses its original register again.
#endif

  // Restore original table address for tailLoop.
  if (reconstructTableOffset != 0) {
    addi(table, table, -reconstructTableOffset);
  }

  // Process last few (<complexThreshold) bytes of buffer.
  BIND(L_tail);
    update_byteLoop_crc32(crc, buf, len, table, data, false);

  if (invertCRC) {
    nand(crc, crc, crc);                      // 1s complement of crc
  }
  BLOCK_COMMENT("} kernel_crc32_1word");
}

/**
 * @param crc   register containing existing CRC (32-bit)
 * @param buf   register pointing to input byte buffer (byte*)
 * @param len   register containing number of bytes
 * @param table register pointing to CRC table
 *
 * Uses R7_ARG5, R8_ARG6 as work registers.
 */
void MacroAssembler::kernel_crc32_1byte(Register crc, Register buf, Register len, Register table,
                                        Register t0,  Register t1,  Register t2,  Register t3,
                                        bool invertCRC) {
  assert_different_registers(crc, buf, len, table);

  Register data = t0;                   // Holds the current byte to be folded into crc.

  BLOCK_COMMENT("kernel_crc32_1byte {");

  if (invertCRC) {
    nand(crc, crc, crc);                      // 1s complement of crc
  }

  // Process all bytes in a single-byte loop.
  update_byteLoop_crc32(crc, buf, len, table, data, true);

  if (invertCRC) {
    nand(crc, crc, crc);                      // 1s complement of crc
  }
  BLOCK_COMMENT("} kernel_crc32_1byte");
}

/**
 * @param crc             register containing existing CRC (32-bit)
 * @param buf             register pointing to input byte buffer (byte*)
 * @param len             register containing number of bytes
 * @param table           register pointing to CRC table
 * @param constants       register pointing to CRC table for 128-bit aligned memory
 * @param barretConstants register pointing to table for barrett reduction
 * @param t0              volatile register
 * @param t1              volatile register
 * @param t2              volatile register
 * @param t3              volatile register
 */
void MacroAssembler::kernel_crc32_1word_vpmsumd(Register crc, Register buf, Register len, Register table,
                        Register constants, Register barretConstants,
                        Register t0, Register t1, Register t2, Register t3, Register t4,
                        bool invertCRC) {
  assert_different_registers(crc, buf, len, table);

  Label L_alignedHead, L_tail, L_alignTail, L_start, L_end;

  Register prealign  = t0;              // Aliases: prealign and postalign share t0
  Register postalign = t0;              // (used in disjoint phases).

  // NOTE(review): BLOCK_COMMENT string says "vpmsumb" while the method is named
  // ..._vpmsumd — debug-listing label only, but inconsistent; confirm before changing.
  BLOCK_COMMENT("kernel_crc32_1word_vpmsumb {");

  // 1. use kernel_crc32_1word for shorter than 384bit
  clrldi(len, len, 32);
  cmpdi(CCR0, len, 384);
  bge(CCR0, L_start);

    Register tc0 = t4;
    Register tc1 = constants;
    Register tc2 = barretConstants;
    // NOTE(review): 'table' is also passed as the tc3 column register here —
    // apparently deliberate reuse (no further temp available); confirm that
    // crc32_table_columns tolerates tc3 == table.
    kernel_crc32_1word(crc, buf, len, table, t0, t1, t2, t3, tc0, tc1, tc2, table, invertCRC);
    b(L_end);

  BIND(L_start);

    // 2. ~c
    if (invertCRC) {
      nand(crc, crc, crc);                      // 1s complement of crc
    }

    // 3. calculate from 0 to first 128bit-aligned address
    // NOTE(review): clrldi ,57 keeps the low 7 bits, i.e. this aligns buf to a
    // 128-BYTE boundary (and below, len % 128) — the "128bit" wording in these
    // comments looks off by a factor of 8; confirm against the constants table layout.
    clrldi_(prealign, buf, 57);
    beq(CCR0, L_alignedHead);

    subfic(prealign, prealign, 128);            // Bytes up to the next 128-byte boundary.

    subf(len, prealign, len);
    update_byteLoop_crc32(crc, buf, prealign, table, t2, false);

    // 4. calculate from first 128bit-aligned address to last 128bit-aligned address
    BIND(L_alignedHead);

    clrldi(postalign, len, 57);                 // Tail bytes (len % 128) left for step 5.
    subf(len, postalign, len);

    // len must be more than 256bit
    kernel_crc32_1word_aligned(crc, buf, len, constants, barretConstants, t1, t2, t3);

    // 5. calculate remaining
    cmpdi(CCR0, postalign, 0);
    beq(CCR0, L_tail);

    update_byteLoop_crc32(crc, buf, postalign, table, t2, false);

    BIND(L_tail);

    // 6. ~c
    if (invertCRC) {
      nand(crc, crc, crc);                      // 1s complement of crc
    }

  BIND(L_end);

  BLOCK_COMMENT("} kernel_crc32_1word_vpmsumb");
}

/**
 * @param crc             register containing existing CRC (32-bit)
 * @param buf             register pointing to input byte buffer (byte*)
 * @param len             register containing number of bytes
 * @param constants       register pointing to CRC table for 128-bit aligned memory
 * @param barretConstants register pointing to table for barrett reduction
 * @param t0              volatile register
 * @param t1              volatile register
 * @param t2              volatile register
 */
void MacroAssembler::kernel_crc32_1word_aligned(Register crc, Register buf, Register len,
    Register constants, Register barretConstants, Register t0, Register t1, Register t2) {
  Label L_mainLoop, L_tail, L_alignTail, L_barrett_reduction, L_end, L_first_warm_up_done, L_first_cool_down, L_second_cool_down, L_XOR, L_test;
  Label L_lv0, L_lv1, L_lv2, L_lv3, L_lv4, L_lv5, L_lv6, L_lv7, L_lv8, L_lv9, L_lv10, L_lv11, L_lv12, L_lv13, L_lv14, L_lv15;
  Label L_1, L_2, L_3, L_4;

  Register rLoaded = t0;                // Flag: 1 once VR16..VR23 hold live data.
  Register rTmp1   = t1;
  Register rTmp2   = t2;
  Register off16 = R22;                 // Constant load offsets 16..112, kept in
  Register off32 = R23;                 // (non-volatile) registers for the lvx
  Register off48 = R24;                 // indexed-load forms below.
  Register off64 = R25;
  Register off80 = R26;
  Register off96 = R27;
  Register off112 = R28;
  Register rIdx = R29;                  // Remaining input length (bytes).
  Register rMax = R30;                  // Size of the current 32KB (or smaller) block.
  Register constantsPos = R31;          // Cursor into the constants table.

  VectorRegister mask_32bit = VR24;
  VectorRegister mask_64bit = VR25;
  VectorRegister zeroes = VR26;
  VectorRegister const1 = VR27;
  VectorRegister const2 = VR28;

  // Save non-volatile vector registers (frameless).
  // Vector saves use the 'offset' register (stvx has no displacement form);
  // the GPR saves below continue from offsetInt.
  Register offset = t1; int offsetInt = 0;
  offsetInt -= 16; li(offset, -16); stvx(VR20, offset, R1_SP);
  offsetInt -= 16; addi(offset, offset, -16); stvx(VR21, offset, R1_SP);
  offsetInt -= 16; addi(offset, offset, -16); stvx(VR22, offset, R1_SP);
  offsetInt -= 16; addi(offset, offset, -16); stvx(VR23, offset, R1_SP);
  offsetInt -= 16; addi(offset, offset, -16); stvx(VR24, offset, R1_SP);
  offsetInt -= 16; addi(offset, offset, -16); stvx(VR25, offset, R1_SP);
  offsetInt -= 16; addi(offset, offset, -16); stvx(VR26, offset, R1_SP);
  offsetInt -= 16; addi(offset, offset, -16); stvx(VR27, offset, R1_SP);
  offsetInt -= 16; addi(offset, offset, -16); stvx(VR28, offset, R1_SP);
  offsetInt -= 8; std(R22, offsetInt, R1_SP);
  offsetInt -= 8; std(R23, offsetInt, R1_SP);
  offsetInt -= 8; std(R24, offsetInt, R1_SP);
  offsetInt -= 8; std(R25, offsetInt, R1_SP);
  offsetInt -= 8; std(R26, offsetInt, R1_SP);
  offsetInt -= 8; std(R27, offsetInt, R1_SP);
  offsetInt -= 8; std(R28, offsetInt, R1_SP);
  offsetInt -= 8; std(R29, offsetInt, R1_SP);
  offsetInt -= 8; std(R30, offsetInt, R1_SP);
  offsetInt -= 8; std(R31, offsetInt, R1_SP);

  // Set constants
  li(off16, 16);
  li(off32, 32);
  li(off48, 48);
  li(off64, 64);
  li(off80, 80);
  li(off96, 96);
  li(off112, 112);

  clrldi(crc, crc, 32);

  vxor(zeroes, zeroes, zeroes);
  vspltisw(VR0, -1);                    // All-ones pattern for building the masks.

  vsldoi(mask_32bit, zeroes, VR0, 4);   // Low 32 bits set.
  vsldoi(mask_64bit, zeroes, VR0, 8);   // Low 64 bits set.

  // Get the initial value into v8
  vxor(VR8, VR8, VR8);
  mtvrd(VR8, crc);
  vsldoi(VR8, zeroes, VR8, 8); // shift into bottom 32 bits

  li (rLoaded, 0);

  rldicr(rIdx, len, 0, 56);             // Truncate len to a multiple of 128 bytes.

  {
    BIND(L_1);
    // Checksum in blocks of MAX_SIZE (32768)
    lis(rMax, 0);
    ori(rMax, rMax, 32768);
    mr(rTmp2, rMax);
    cmpd(CCR0, rIdx, rMax);
    bgt(CCR0, L_2);
    mr(rMax, rIdx);                     // Last (partial) block: process only rIdx bytes.

    BIND(L_2);
    subf(rIdx, rMax, rIdx);

    // our main loop does 128 bytes at a time
    srdi(rMax, rMax, 7);

    /*
     * Work out the offset into the constants table to start at. Each
     * constant is 16 bytes, and it is used against 128 bytes of input
     * data - 128 / 16 = 8
     */
    sldi(rTmp1, rMax, 4);
    srdi(rTmp2, rTmp2, 3);
    subf(rTmp1, rTmp1, rTmp2);

    // We reduce our final 128 bytes in a separate step
    addi(rMax, rMax, -1);
    mtctr(rMax);

    // Find the start of our constants
    add(constantsPos, constants, rTmp1);

    // zero VR0-VR7 which will contain our checksums
    vxor(VR0, VR0, VR0);
    vxor(VR1, VR1, VR1);
    vxor(VR2, VR2, VR2);
    vxor(VR3, VR3, VR3);
    vxor(VR4, VR4, VR4);
    vxor(VR5, VR5, VR5);
    vxor(VR6, VR6, VR6);
    vxor(VR7, VR7, VR7);

    lvx(const1, constantsPos);

    /*
     * If we are looping back to consume more data we use the values
     * already in VR16-VR23.
     */
    cmpdi(CCR0, rLoaded, 1);
    beq(CCR0, L_3);
    {

      // First warm up pass
      lvx(VR16, buf);
      lvx(VR17, off16, buf);
      lvx(VR18, off32, buf);
      lvx(VR19, off48, buf);
      lvx(VR20, off64, buf);
      lvx(VR21, off80, buf);
      lvx(VR22, off96, buf);
      lvx(VR23, off112, buf);
      addi(buf, buf, 8*16);

      // xor in initial value
      vxor(VR16, VR16, VR8);
    }

    BIND(L_3);
    bdz(L_first_warm_up_done);          // Only one 128-byte chunk in this block.

    addi(constantsPos, constantsPos, 16);
    lvx(const2, constantsPos);

    // Second warm up pass: multiply the previous chunk while loading the next.
    vpmsumd(VR8, VR16, const1);
    lvx(VR16, buf);

    vpmsumd(VR9, VR17, const1);
    lvx(VR17, off16, buf);

    vpmsumd(VR10, VR18, const1);
    lvx(VR18, off32, buf);

    vpmsumd(VR11, VR19, const1);
    lvx(VR19, off48, buf);

    vpmsumd(VR12, VR20, const1);
    lvx(VR20, off64, buf);

    vpmsumd(VR13, VR21, const1);
    lvx(VR21, off80, buf);

    vpmsumd(VR14, VR22, const1);
    lvx(VR22, off96, buf);

    vpmsumd(VR15, VR23, const1);
    lvx(VR23, off112, buf);

    addi(buf, buf, 8 * 16);

    bdz(L_first_cool_down);

    /*
     * main loop. We modulo schedule it such that it takes three iterations
     * to complete - first iteration load, second iteration vpmsum, third
     * iteration xor.
     */
    {
      BIND(L_4);
      lvx(const1, constantsPos); addi(constantsPos, constantsPos, 16);

      // Per stream i (0..7): fold last product into accumulator VRi,
      // multiply chunk VR(16+i) by the constant, and load the next chunk.
      vxor(VR0, VR0, VR8);
      vpmsumd(VR8, VR16, const2);
      lvx(VR16, buf);

      vxor(VR1, VR1, VR9);
      vpmsumd(VR9, VR17, const2);
      lvx(VR17, off16, buf);

      vxor(VR2, VR2, VR10);
      vpmsumd(VR10, VR18, const2);
      lvx(VR18, off32, buf);

      vxor(VR3, VR3, VR11);
      vpmsumd(VR11, VR19, const2);
      lvx(VR19, off48, buf);
      lvx(const2, constantsPos);        // Prefetch next constant mid-iteration.

      vxor(VR4, VR4, VR12);
      vpmsumd(VR12, VR20, const1);
      lvx(VR20, off64, buf);

      vxor(VR5, VR5, VR13);
      vpmsumd(VR13, VR21, const1);
      lvx(VR21, off80, buf);

      vxor(VR6, VR6, VR14);
      vpmsumd(VR14, VR22, const1);
      lvx(VR22, off96, buf);

      vxor(VR7, VR7, VR15);
      vpmsumd(VR15, VR23, const1);
      lvx(VR23, off112, buf);

      addi(buf, buf, 8 * 16);

      bdnz(L_4);
    }

    BIND(L_first_cool_down);

    // First cool down pass: drain the multiply stage (no more loads).
    lvx(const1, constantsPos);
    addi(constantsPos, constantsPos, 16);

    vxor(VR0, VR0, VR8);
    vpmsumd(VR8, VR16, const1);

    vxor(VR1, VR1, VR9);
    vpmsumd(VR9, VR17, const1);

    vxor(VR2, VR2, VR10);
    vpmsumd(VR10, VR18, const1);

    vxor(VR3, VR3, VR11);
    vpmsumd(VR11, VR19, const1);

    vxor(VR4, VR4, VR12);
    vpmsumd(VR12, VR20, const1);

    vxor(VR5, VR5, VR13);
    vpmsumd(VR13, VR21, const1);

    vxor(VR6, VR6, VR14);
    vpmsumd(VR14, VR22, const1);

    vxor(VR7, VR7, VR15);
    vpmsumd(VR15, VR23, const1);

    BIND(L_second_cool_down);
    // Second cool down pass: drain the xor stage.
    vxor(VR0, VR0, VR8);
    vxor(VR1, VR1, VR9);
    vxor(VR2, VR2, VR10);
    vxor(VR3, VR3, VR11);
    vxor(VR4, VR4, VR12);
    vxor(VR5, VR5, VR13);
    vxor(VR6, VR6, VR14);
    vxor(VR7, VR7, VR15);

    /*
     * vpmsumd produces a 96 bit result in the least significant bits
     * of the register. Since we are bit reflected we have to shift it
     * left 32 bits so it occupies the least significant bits in the
     * bit reflected domain.
     */
    vsldoi(VR0, VR0, zeroes, 4);
    vsldoi(VR1, VR1, zeroes, 4);
    vsldoi(VR2, VR2, zeroes, 4);
    vsldoi(VR3, VR3, zeroes, 4);
    vsldoi(VR4, VR4, zeroes, 4);
    vsldoi(VR5, VR5, zeroes, 4);
    vsldoi(VR6, VR6, zeroes, 4);
    vsldoi(VR7, VR7, zeroes, 4);

    // xor with last 1024 bits
    lvx(VR8, buf);
    lvx(VR9, off16, buf);
    lvx(VR10, off32, buf);
    lvx(VR11, off48, buf);
    lvx(VR12, off64, buf);
    lvx(VR13, off80, buf);
    lvx(VR14, off96, buf);
    lvx(VR15, off112, buf);
    addi(buf, buf, 8 * 16);

    vxor(VR16, VR0, VR8);
    vxor(VR17, VR1, VR9);
    vxor(VR18, VR2, VR10);
    vxor(VR19, VR3, VR11);
    vxor(VR20, VR4, VR12);
    vxor(VR21, VR5, VR13);
    vxor(VR22, VR6, VR14);
    vxor(VR23, VR7, VR15);

    li(rLoaded, 1);                     // VR16..VR23 now hold live data for the next block.
    cmpdi(CCR0, rIdx, 0);
    addi(rIdx, rIdx, 128);              // Re-add the final 128 bytes reduced separately.
    bne(CCR0, L_1);                     // More 32KB blocks to go.
  }

  // Work out how many bytes we have left
  andi_(len, len, 127);

  // Calculate where in the constant table we need to start
  subfic(rTmp1, len, 128);
  add(constantsPos, constantsPos, rTmp1);

  // How many 16 byte chunks are in the tail
  srdi(rIdx, len, 4);
  mtctr(rIdx);

  /*
   * Reduce the previously calculated 1024 bits to 64 bits, shifting
   * 32 bits to include the trailing 32 bits of zeros
   */
  lvx(VR0, constantsPos);
  lvx(VR1, off16, constantsPos);
  lvx(VR2, off32, constantsPos);
  lvx(VR3, off48, constantsPos);
  lvx(VR4, off64, constantsPos);
  lvx(VR5, off80, constantsPos);
  lvx(VR6, off96, constantsPos);
  lvx(VR7, off112, constantsPos);
  addi(constantsPos, constantsPos, 8 * 16);

  vpmsumw(VR0, VR16, VR0);
  vpmsumw(VR1, VR17, VR1);
  vpmsumw(VR2, VR18, VR2);
  vpmsumw(VR3, VR19, VR3);
  vpmsumw(VR4, VR20, VR4);
  vpmsumw(VR5, VR21, VR5);
  vpmsumw(VR6, VR22, VR6);
  vpmsumw(VR7, VR23, VR7);

  // Now reduce the tail (0 - 112 bytes)
  // NOTE(review): CTR is loaded with rIdx above (mtctr) but no bdz/bdnz follows;
  // the repeated beq(CCR0, L_XOR) below all test the one cmpdi(rIdx, 0) result,
  // which does not change between chunks (lvx/vpmsumw/vxor leave CR0 alone).
  // Looks like counter-based exits were intended — confirm against the
  // original vpmsum assembly before touching.
  cmpdi(CCR0, rIdx, 0);
  beq(CCR0, L_XOR);

  lvx(VR16, buf); addi(buf, buf, 16);
  lvx(VR17, constantsPos);
  vpmsumw(VR16, VR16, VR17);
  vxor(VR0, VR0, VR16);
  beq(CCR0, L_XOR);

  lvx(VR16, buf); addi(buf, buf, 16);
  lvx(VR17, off16, constantsPos);
  vpmsumw(VR16, VR16, VR17);
  vxor(VR0, VR0, VR16);
  beq(CCR0, L_XOR);

  lvx(VR16, buf); addi(buf, buf, 16);
  lvx(VR17, off32, constantsPos);
  vpmsumw(VR16, VR16, VR17);
  vxor(VR0, VR0, VR16);
  beq(CCR0, L_XOR);

  lvx(VR16, buf); addi(buf, buf, 16);
  lvx(VR17, off48,constantsPos);
  vpmsumw(VR16, VR16, VR17);
  vxor(VR0, VR0, VR16);
  beq(CCR0, L_XOR);

  lvx(VR16, buf); addi(buf, buf, 16);
  lvx(VR17, off64, constantsPos);
  vpmsumw(VR16, VR16, VR17);
  vxor(VR0, VR0, VR16);
  beq(CCR0, L_XOR);

  lvx(VR16, buf); addi(buf, buf, 16);
  lvx(VR17, off80, constantsPos);
  vpmsumw(VR16, VR16, VR17);
  vxor(VR0, VR0, VR16);
  beq(CCR0, L_XOR);

  lvx(VR16, buf); addi(buf, buf, 16);
  lvx(VR17, off96, constantsPos);
  vpmsumw(VR16, VR16, VR17);
  vxor(VR0, VR0, VR16);

  // Now xor all the parallel chunks together
  BIND(L_XOR);
  vxor(VR0, VR0, VR1);
  vxor(VR2, VR2, VR3);
  vxor(VR4, VR4, VR5);
  vxor(VR6, VR6, VR7);

  vxor(VR0, VR0, VR2);
  vxor(VR4, VR4, VR6);

  vxor(VR0, VR0, VR4);

  b(L_barrett_reduction);

  // Reached when the block fit in a single 128-byte chunk: multiply the warm-up
  // data once and jump straight to the xor drain.
  BIND(L_first_warm_up_done);
  lvx(const1, constantsPos);
  addi(constantsPos, constantsPos, 16);
  vpmsumd(VR8, VR16, const1);
  vpmsumd(VR9, VR17, const1);
  vpmsumd(VR10, VR18, const1);
  vpmsumd(VR11, VR19, const1);
  vpmsumd(VR12, VR20, const1);
  vpmsumd(VR13, VR21, const1);
  vpmsumd(VR14, VR22, const1);
  vpmsumd(VR15, VR23, const1);

  b(L_second_cool_down);

  BIND(L_barrett_reduction);

  lvx(const1, barretConstants);
  addi(barretConstants, barretConstants, 16);
  lvx(const2, barretConstants);

  vsldoi(VR1, VR0, VR0, 8);
  vxor(VR0, VR0, VR1);    // xor two 64 bit results together

  // shift left one bit
  vspltisb(VR1, 1);
  vsl(VR0, VR0, VR1);

  vand(VR0, VR0, mask_64bit);

  /*
   * The reflected version of Barrett reduction. Instead of bit
   * reflecting our data (which is expensive to do), we bit reflect our
   * constants and our algorithm, which means the intermediate data in
   * our vector registers goes from 0-63 instead of 63-0. We can reflect
   * the algorithm because we don't carry in mod 2 arithmetic.
   */
  vand(VR1, VR0, mask_32bit);  // bottom 32 bits of a
  vpmsumd(VR1, VR1, const1);   // ma
  vand(VR1, VR1, mask_32bit);  // bottom 32 bits of ma
  vpmsumd(VR1, VR1, const2);   // qn
  vxor(VR0, VR0, VR1);         // a - qn, subtraction is xor in GF(2)

  /*
   * Since we are bit reflected, the result (ie the low 32 bits) is in
   * the high 32 bits. We just need to shift it left 4 bytes
   * V0 [ 0 1 X 3 ]
   * V0 [ 0 X 2 3 ]
   */
  vsldoi(VR0, VR0, zeroes, 4);    // shift result into top 64 bits of

  // Get it into r3
  mfvrd(crc, VR0);

  BIND(L_end);

  offsetInt = 0;
  // Restore non-volatile Vector registers (frameless).
  // Mirrors the save sequence at function entry (same offsets, loads instead of stores).
  offsetInt -= 16; li(offset, -16); lvx(VR20, offset, R1_SP);
  offsetInt -= 16; addi(offset, offset, -16); lvx(VR21, offset, R1_SP);
  offsetInt -= 16; addi(offset, offset, -16); lvx(VR22, offset, R1_SP);
  offsetInt -= 16; addi(offset, offset, -16); lvx(VR23, offset, R1_SP);
  offsetInt -= 16; addi(offset, offset, -16); lvx(VR24, offset, R1_SP);
  offsetInt -= 16; addi(offset, offset, -16); lvx(VR25, offset, R1_SP);
  offsetInt -= 16; addi(offset, offset, -16); lvx(VR26, offset, R1_SP);
  offsetInt -= 16; addi(offset, offset, -16); lvx(VR27, offset, R1_SP);
  offsetInt -= 16; addi(offset, offset, -16); lvx(VR28, offset, R1_SP);
  offsetInt -= 8; ld(R22, offsetInt, R1_SP);
  offsetInt -= 8; ld(R23, offsetInt, R1_SP);
  offsetInt -= 8; ld(R24, offsetInt, R1_SP);
  offsetInt -= 8; ld(R25, offsetInt, R1_SP);
  offsetInt -= 8; ld(R26, offsetInt, R1_SP);
  offsetInt -= 8; ld(R27, offsetInt, R1_SP);
  offsetInt -= 8; ld(R28, offsetInt, R1_SP);
  offsetInt -= 8; ld(R29, offsetInt, R1_SP);
  offsetInt -= 8; ld(R30, offsetInt, R1_SP);
  offsetInt -= 8; ld(R31, offsetInt, R1_SP);
}

// Update CRC-32 with the single byte at *buf. len is unused (see assert).
// tmp is clobbered; buf is NOT advanced (displacement load).
void MacroAssembler::kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp, bool invertCRC) {
  assert_different_registers(crc, buf, /* len,  not used!! */ table, tmp);

  BLOCK_COMMENT("kernel_crc32_singleByte:");
  if (invertCRC) {
    nand(crc, crc, crc);                // 1s complement of crc
  }

  lbz(tmp, 0, buf);                     // Byte from buffer, zero-extended.
  update_byte_crc32(crc, tmp, table);

  if (invertCRC) {
    nand(crc, crc, crc);                // 1s complement of crc
  }
}

// Update CRC-32 with the byte value already in val (val is clobbered by
// update_byte_crc32).
void MacroAssembler::kernel_crc32_singleByteReg(Register crc, Register val, Register table, bool invertCRC) {
  assert_different_registers(crc, val, table);

  BLOCK_COMMENT("kernel_crc32_singleByteReg:");
  if (invertCRC) {
    nand(crc, crc, crc);                // 1s complement of crc
  }

  update_byte_crc32(crc, val, table);

  if (invertCRC) {
    nand(crc, crc, crc);                // 1s complement of crc
  }
}

// dest_lo += src1 + src2
// dest_hi += carry1 + carry2
// I.e. a 128-bit accumulate: the carry out of each low-word addition is
// propagated into dest_hi. Clobbers R0 (used as zero).
void MacroAssembler::add2_with_carry(Register dest_hi,
                                     Register dest_lo,
                                     Register src1, Register src2) {
  li(R0, 0);
  addc(dest_lo, dest_lo, src1);
  adde(dest_hi, dest_hi, R0);
  addc(dest_lo, dest_lo, src2);
  adde(dest_hi, dest_hi, R0);
}

// Multiply 64 bit by 64 bit first loop.
void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart,
                                           Register x_xstart,
                                           Register y, Register y_idx,
                                           Register z,
                                           Register carry,
                                           Register product_high, Register product,
                                           Register idx, Register kdx,
                                           Register tmp) {
  //  jlong carry, x[], y[], z[];
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //    huge_128 product = y[idx] * x[xstart] + carry;
  //    z[kdx] = (jlong)product;
  //    carry  = (jlong)(product >>> 64);
  //  }
  //  z[xstart] = carry;

  Label L_first_loop, L_first_loop_exit;
  Label L_one_x, L_one_y, L_multiply;

  addic_(xstart, xstart, -1);
  blt(CCR0, L_one_x);   // Special case: length of x is 1.

  // Load next two integers of x.
  sldi(tmp, xstart, LogBytesPerInt);
  ldx(x_xstart, x, tmp);
#ifdef VM_LITTLE_ENDIAN
  rldicl(x_xstart, x_xstart, 32, 0);    // Swap the two 32-bit halves: int order in the
#endif                                  // array is big-endian (BigInteger layout).

  align(32, 16);
  bind(L_first_loop);

  cmpdi(CCR0, idx, 1);
  blt(CCR0, L_first_loop_exit);         // idx < 1: all of y consumed.
  addi(idx, idx, -2);
  beq(CCR0, L_one_y);                   // Exactly one 32-bit digit of y left.

  // Load next two integers of y.
  sldi(tmp, idx, LogBytesPerInt);
  ldx(y_idx, y, tmp);
#ifdef VM_LITTLE_ENDIAN
  rldicl(y_idx, y_idx, 32, 0);
#endif


  bind(L_multiply);
  multiply64(product_high, product, x_xstart, y_idx);

  li(tmp, 0);
  addc(product, product, carry);         // Add carry to result.
  adde(product_high, product_high, tmp); // Add carry of the last addition.
  addi(kdx, kdx, -2);

  // Store result.
#ifdef VM_LITTLE_ENDIAN
  rldicl(product, product, 32, 0);      // Swap halves back before the 64-bit store.
#endif
  sldi(tmp, kdx, LogBytesPerInt);
  stdx(product, z, tmp);
  mr_if_needed(carry, product_high);
  b(L_first_loop);


  bind(L_one_y); // Load one 32 bit portion of y as (0,value).

  lwz(y_idx, 0, y);
  b(L_multiply);


  bind(L_one_x); // Load one 32 bit portion of x as (0,value).

  lwz(x_xstart, 0, x);
  b(L_first_loop);

  bind(L_first_loop_exit);
}

// Multiply 64 bit by 64 bit and add 128 bit.
// One fused step of the third loop: yz_idx, product, product_high and tmp are
// scratch; carry is read (not updated — caller propagates product_high).
void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y,
                                            Register z, Register yz_idx,
                                            Register idx, Register carry,
                                            Register product_high, Register product,
                                            Register tmp, int offset) {

  //  huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
  //  z[kdx] = (jlong)product;

  sldi(tmp, idx, LogBytesPerInt);
  if (offset) {
    addi(tmp, tmp, offset);
  }
  ldx(yz_idx, y, tmp);
#ifdef VM_LITTLE_ENDIAN
  rldicl(yz_idx, yz_idx, 32, 0);        // Swap 32-bit halves (big-endian int order in array).
#endif

  multiply64(product_high, product, x_xstart, yz_idx);
  ldx(yz_idx, z, tmp);                  // tmp still holds the byte offset; z shares it with y.
#ifdef VM_LITTLE_ENDIAN
  rldicl(yz_idx, yz_idx, 32, 0);
#endif

  add2_with_carry(product_high, product, carry, yz_idx);

  sldi(tmp, idx, LogBytesPerInt);       // Recompute offset (tmp was reused above by callers' conventions).
  if (offset) {
    addi(tmp, tmp, offset);
  }
#ifdef VM_LITTLE_ENDIAN
  rldicl(product, product, 32, 0);      // Swap halves back before storing.
#endif
  stdx(product, z, tmp);
}

// Multiply 128 bit by 128 bit. Unrolled inner loop.
// Emit the unrolled inner (third) loop of multiply_to_len: multiply the
// current 64-bit chunk x_xstart against all of y, accumulating into z.
// Processes four ints (two 64-bit chunks) per iteration via the CTR loop,
// then handles up to three left-over ints. yz_idx, carry2 and tmp are
// scratch; carry holds the running carry across calls.
void MacroAssembler::multiply_128_x_128_loop(Register x_xstart,
                                             Register y, Register z,
                                             Register yz_idx, Register idx, Register carry,
                                             Register product_high, Register product,
                                             Register carry2, Register tmp) {

  // jlong carry, x[], y[], z[];
  // int kdx = ystart+1;
  // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
  //   huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
  //   z[kdx+idx+1] = (jlong)product;
  //   jlong carry2 = (jlong)(product >>> 64);
  //   product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
  //   z[kdx+idx] = (jlong)product;
  //   carry = (jlong)(product >>> 64);
  // }
  // idx += 2;
  // if (idx > 0) {
  //   product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
  //   z[kdx+idx] = (jlong)product;
  //   carry = (jlong)(product >>> 64);
  // }

  Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
  const Register jdx = R0;

  // Scale the index: jdx = idx / 4 = number of unrolled iterations.
  srdi_(jdx, idx, 2);
  beq(CCR0, L_third_loop_exit);
  mtctr(jdx);

  align(32, 16);
  bind(L_third_loop);

  // Consume four ints of y per iteration.
  addi(idx, idx, -4);

  // Upper 64-bit chunk first (byte offset 8), then the lower chunk;
  // product_high of each step becomes the carry of the next.
  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8);
  mr_if_needed(carry2, product_high);

  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0);
  mr_if_needed(carry, product_high);
  bdnz(L_third_loop);

  bind(L_third_loop_exit); // Handle any left-over operand parts.

  // 0-3 ints may remain.
  andi_(idx, idx, 0x3);
  beq(CCR0, L_post_third_loop_done);

  Label L_check_1;

  // If at least two ints remain, do one more full 64-bit step.
  addic_(idx, idx, -2);
  blt(CCR0, L_check_1);

  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0);
  mr_if_needed(carry, product_high);

  bind(L_check_1);

  // If exactly one int remains, do a 32-bit tail step.
  addi(idx, idx, 0x2);
  andi_(idx, idx, 0x1);
  addic_(idx, idx, -1);
  blt(CCR0, L_post_third_loop_done);

  sldi(tmp, idx, LogBytesPerInt);
  lwzx(yz_idx, y, tmp);
  multiply64(product_high, product, x_xstart, yz_idx);
  lwzx(yz_idx, z, tmp);

  add2_with_carry(product_high, product, yz_idx, carry);

  sldi(tmp, idx, LogBytesPerInt);
  stwx(product, z, tmp);
  // Re-pack the remaining 96 bits of carry into one 64-bit register:
  // carry = product_high(32..63) : product(32..63).
  srdi(product, product, 32);

  sldi(product_high, product_high, 32);
  orr(product, product, product_high);
  mr_if_needed(carry, product);

  bind(L_post_third_loop_done);
} // multiply_128_x_128_loop

// Emit the intrinsic for java.math.BigInteger.multiplyToLen:
// z[0..zlen-1] = x[0..xlen-1] * y[0..ylen-1], all int arrays with the most
// significant int first. tmp1..tmp13 are clobbered as scratch registers.
void MacroAssembler::multiply_to_len(Register x, Register xlen,
                                     Register y, Register ylen,
                                     Register z, Register zlen,
                                     Register tmp1, Register tmp2,
                                     Register tmp3, Register tmp4,
                                     Register tmp5, Register tmp6,
                                     Register tmp7, Register tmp8,
                                     Register tmp9, Register tmp10,
                                     Register tmp11, Register tmp12,
                                     Register tmp13) {

  ShortBranchVerifier sbv(this);

  assert_different_registers(x, xlen, y, ylen, z, zlen,
                             tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
  assert_different_registers(x, xlen, y, ylen, z, zlen,
                             tmp1, tmp2, tmp3, tmp4, tmp5, tmp7);
  assert_different_registers(x, xlen, y, ylen, z, zlen,
                             tmp1, tmp2, tmp3, tmp4, tmp5, tmp8);

  const Register idx = tmp1;
  const Register kdx = tmp2;
  const Register xstart = tmp3;

  const Register y_idx = tmp4;
  const Register carry = tmp5;
  const Register product = tmp6;
  const Register product_high = tmp7;
  const Register x_xstart = tmp8;
  const Register tmp = tmp9;

  // First Loop.
  //
  //  final static long LONG_MASK = 0xffffffffL;
  //  int xstart = xlen - 1;
  //  int ystart = ylen - 1;
  //  long carry = 0;
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
  //    z[kdx] = (int)product;
  //    carry = product >>> 32;
  //  }
  //  z[xstart] = (int)carry;

  mr_if_needed(idx, ylen); // idx = ylen
  mr_if_needed(kdx, zlen); // kdx = xlen + ylen
  li(carry, 0);            // carry = 0

  Label L_done;

  // Nothing to do for an empty x.
  addic_(xstart, xlen, -1);
  blt(CCR0, L_done);

  multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z,
                        carry, product_high, product, idx, kdx, tmp);

  Label L_second_loop;

  cmpdi(CCR0, kdx, 0);
  beq(CCR0, L_second_loop);

  Label L_carry;

  addic_(kdx, kdx, -1);
  beq(CCR0, L_carry);

  // Store lower 32 bits of carry.
  sldi(tmp, kdx, LogBytesPerInt);
  stwx(carry, z, tmp);
  srdi(carry, carry, 32);
  addi(kdx, kdx, -1);


  bind(L_carry);

  // Store upper 32 bits of carry.
  sldi(tmp, kdx, LogBytesPerInt);
  stwx(carry, z, tmp);

  // Second and third (nested) loops.
  //
  //  for (int i = xstart-1; i >= 0; i--) { // Second loop
  //    carry = 0;
  //    for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
  //      long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
  //                     (z[k] & LONG_MASK) + carry;
  //      z[k] = (int)product;
  //      carry = product >>> 32;
  //    }
  //    z[i] = (int)carry;
  //  }
  //
  //  i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx
  //  (NOTE(review): register names in this comment appear inherited from the
  //  x86 version of this intrinsic.)

  bind(L_second_loop);

  li(carry, 0); // carry = 0;

  addic_(xstart, xstart, -1); // i = xstart-1;
  blt(CCR0, L_done);

  // z is advanced below; keep the original pointer for restoring afterwards.
  Register zsave = tmp10;

  mr(zsave, z);


  Label L_last_x;

  sldi(tmp, xstart, LogBytesPerInt);
  add(z, z, tmp); // z = z + k - j
  addi(z, z, 4);
  addic_(xstart, xstart, -1); // i = xstart-1;
  blt(CCR0, L_last_x);

  // Load the next 64-bit chunk of x.
  sldi(tmp, xstart, LogBytesPerInt);
  ldx(x_xstart, x, tmp);
#ifdef VM_LITTLE_ENDIAN
  // Swap 32-bit halves so the int pair forms the intended 64-bit value.
  rldicl(x_xstart, x_xstart, 32, 0);
#endif


  Label L_third_loop_prologue;

  bind(L_third_loop_prologue);

  // The inner loop clobbers x (passed as scratch) and consumes xstart/ylen;
  // save them around the call.
  Register xsave = tmp11;
  Register xlensave = tmp12;
  Register ylensave = tmp13;

  mr(xsave, x);
  mr(xlensave, xstart);
  mr(ylensave, ylen);


  multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen,
                          carry, product_high, product, x, tmp);

  mr(z, zsave);
  mr(x, xsave);
  mr(xlen, xlensave);   // This is the decrement of the loop counter!
  mr(ylen, ylensave);

  // Store the final carry of this row: low 32 bits at z[xlen+1], and if
  // xlen >= 0 also the high 32 bits at z[xlen].
  addi(tmp3, xlen, 1);
  sldi(tmp, tmp3, LogBytesPerInt);
  stwx(carry, z, tmp);
  addic_(tmp3, tmp3, -1);
  blt(CCR0, L_done);

  srdi(carry, carry, 32);
  sldi(tmp, tmp3, LogBytesPerInt);
  stwx(carry, z, tmp);
  b(L_second_loop);

  // Next infrequent code is moved outside loops.
  bind(L_last_x);

  // Only one int of x left: load it as (0, value).
  lwz(x_xstart, 0, x);
  b(L_third_loop_prologue);

  bind(L_done);
} // multiply_to_len

// Emit an assertion on the already-set CCR0: trap with 'msg'/'id' unless the
// condition (eq if check_equal, ne otherwise) holds. No-op in product builds.
void MacroAssembler::asm_assert(bool check_equal, const char *msg, int id) {
#ifdef ASSERT
  Label ok;
  if (check_equal) {
    beq(CCR0, ok);
  } else {
    bne(CCR0, ok);
  }
  stop(msg, id);
  bind(ok);
#endif
}

// Emit an assertion that the 4- or 8-byte word at mem_base+mem_offset is
// zero (or non-zero, if !check_equal). Kills R0. No-op in product builds.
void MacroAssembler::asm_assert_mems_zero(bool check_equal, int size, int mem_offset,
                                          Register mem_base, const char* msg, int id) {
#ifdef ASSERT
  switch (size) {
    case 4:
      lwz(R0, mem_offset, mem_base);
      cmpwi(CCR0, R0, 0);
      break;
    case 8:
      ld(R0, mem_offset, mem_base);
      cmpdi(CCR0, R0, 0);
      break;
    default:
      ShouldNotReachHere();
  }
  asm_assert(check_equal, msg, id);
#endif // ASSERT
}

// Placeholder: VerifyThread support does not exist on PPC yet.
void MacroAssembler::verify_thread() {
  if (VerifyThread) {
    unimplemented("'VerifyThread' currently not implemented on PPC");
  }
}

// READ: oop. KILL: R0. Volatile floats perhaps.
// Emit a call to the verify-oop stub for 'oop' (only when +VerifyOops).
// Volatile GPRs are saved/restored around the call; R11 is used as scratch
// but preserved for the caller.
void MacroAssembler::verify_oop(Register oop, const char* msg) {
  if (!VerifyOops) {
    return;
  }

  address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
  const Register tmp = R11; // Will be preserved.
  const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
  save_volatile_gprs(R1_SP, -nbytes_save); // except R0

  mr_if_needed(R4_ARG2, oop);
  save_LR_CR(tmp); // save in old frame
  push_frame_reg_args(nbytes_save, tmp);
  // load FunctionDescriptor** / entry_address *
  load_const_optimized(tmp, fd, R0);
  // load FunctionDescriptor* / entry_address
  ld(tmp, 0, tmp);
  load_const_optimized(R3_ARG1, (address)msg, R0);
  // Call destination for its side effect.
  call_c(tmp);

  pop_frame();
  restore_LR_CR(tmp);
  restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
}

// Like verify_oop, but the oop is first loaded from memory at base+offs.
void MacroAssembler::verify_oop_addr(RegisterOrConstant offs, Register base, const char* msg) {
  if (!VerifyOops) {
    return;
  }

  address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
  const Register tmp = R11; // Will be preserved.
  const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
  save_volatile_gprs(R1_SP, -nbytes_save); // except R0

  ld(R4_ARG2, offs, base);
  save_LR_CR(tmp); // save in old frame
  push_frame_reg_args(nbytes_save, tmp);
  // load FunctionDescriptor** / entry_address *
  load_const_optimized(tmp, fd, R0);
  // load FunctionDescriptor* / entry_address
  ld(tmp, 0, tmp);
  load_const_optimized(R3_ARG1, (address)msg, R0);
  // Call destination for its side effect.
  call_c(tmp);

  pop_frame();
  restore_LR_CR(tmp);
  restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
}

// Printable names for the stop types, indexed by the 'type' argument of
// MacroAssembler::stop.
const char* stop_types[] = {
  "stop",
  "untested",
  "unimplemented",
  "shouldnotreachhere"
};

// Runtime target of MacroAssembler::stop: print the message, then guarantee
// failure so the VM aborts with it.
static void stop_on_request(int tp, const char* msg) {
  tty->print("PPC assembly code requires stop: (%s) %s\n", stop_types[tp%/*stop_end*/4], msg);
  guarantee(false, "PPC assembly code requires stop: %s", msg);
}

// Call a C-function that prints output.
// Emit a stop: call stop_on_request(type, msg), then an illegal instruction
// followed by the 32-bit 'id' embedded in the code stream so the trap site
// can be identified.
void MacroAssembler::stop(int type, const char* msg, int id) {
#ifndef PRODUCT
  block_comment(err_msg("stop: %s %s {", stop_types[type%stop_end], msg));
#else
  block_comment("stop {");
#endif

  // setup arguments
  load_const_optimized(R3_ARG1, type);
  load_const_optimized(R4_ARG2, (void *)msg, /*tmp=*/R0);
  call_VM_leaf(CAST_FROM_FN_PTR(address, stop_on_request), R3_ARG1, R4_ARG2);
  illtrap();
  emit_int32(id); // id is data, not an executed instruction.
  block_comment("} stop;");
}

#ifndef PRODUCT
// Write pattern 0x0101010101010101 in memory region [low-before, high+after].
// Val, addr are temp registers.
// If low == addr, addr is killed.
// High is preserved.
void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) {
  if (!ZapMemory) return;

  assert_different_registers(low, val);

  BLOCK_COMMENT("zap memory region {");
  load_const_optimized(val, 0x0101010101010101);
  int size = before + after;
  if (low == high && size < 5 && size > 0) {
    // Small region with statically known extent: emit unrolled stores.
    int offset = -before*BytesPerWord;
    for (int i = 0; i < size; ++i) {
      std(val, offset, low);
      offset += (1*BytesPerWord);
    }
  } else {
    // General case: emit a store loop from low-before up to (adjusted) high.
    addi(addr, low, -before*BytesPerWord);
    assert_different_registers(high, val);
    if (after) addi(high, high, after * BytesPerWord);
    Label loop;
    bind(loop);
    std(val, 0, addr);
    addi(addr, addr, 8);
    cmpd(CCR6, addr, high);
    ble(CCR6, loop);
    if (after) addi(high, high, -after * BytesPerWord); // Correct back to old value.
  }
  BLOCK_COMMENT("} zap memory region");
}

#endif // !PRODUCT

// Emit a test of the bool at flag_addr and a branch over the code emitted
// until the destructor runs (which binds the branch target): the guarded
// code is skipped when the flag is zero. 'temp' is clobbered.
SkipIfEqualZero::SkipIfEqualZero(MacroAssembler* masm, Register temp, const bool* flag_addr) : _masm(masm), _label() {
  int simm16_offset = masm->load_const_optimized(temp, (address)flag_addr, R0, true);
  assert(sizeof(bool) == 1, "PowerPC ABI");
  masm->lbz(temp, simm16_offset, temp);
  masm->cmpwi(CCR0, temp, 0);
  masm->beq(CCR0, _label);
}

// Bind the skip target at the current code position.
SkipIfEqualZero::~SkipIfEqualZero() {
  _masm->bind(_label);
}