/*
 * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2018, SAP SE. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "compiler/disassembler.hpp"
#include "gc/shared/collectedHeap.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "interpreter/interpreter.hpp"
#include "memory/resourceArea.hpp"
#include "nativeInst_ppc.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/biasedLocking.hpp"
#include "runtime/icache.hpp"
#include "runtime/interfaceSupport.inline.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/os.hpp"
#include "runtime/safepoint.hpp"
#include "runtime/safepointMechanism.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/macros.hpp"
#ifdef COMPILER2
#include "opto/intrinsicnode.hpp"
#endif

#ifdef PRODUCT
#define BLOCK_COMMENT(str) // nothing
#else
#define BLOCK_COMMENT(str) block_comment(str)
#endif
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

#ifdef ASSERT
// On RISC, there's no benefit to verifying instruction boundaries.
bool AbstractAssembler::pd_check_instruction_mark() { return false; }
#endif

void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) {
  assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range");
  if (Assembler::is_simm(si31, 16)) {
    ld(d, si31, a);
    if (emit_filler_nop) nop();
  } else {
    const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31);
    const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31);
    addis(d, a, hi);
    ld(d, lo, d);
  }
}

void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) {
  assert_different_registers(d, a);
  ld_largeoffset_unchecked(d, si31, a, emit_filler_nop);
}
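// Illustrative arithmetic (values invented for this comment, assuming the
// usual rounding split implemented by the largeoffset_si16_si16 helpers):
// the hi/lo split must compensate for the sign extension of the 16-bit
// displacement in `ld'. For si31 = 0x1234ABCD, the lo part is 0xABCD
// sign-extended to -0x5433, so the hi part must round up to 0x1235:
//   addis(d, a, 0x1235);  // d = a + 0x12350000
//   ld(d, -0x5433, d);    // loads from a + 0x12350000 - 0x5433 = a + 0x1234ABCD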
void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base,
                                      size_t size_in_bytes, bool is_signed) {
  switch (size_in_bytes) {
  case 8: ld(dst, offs, base); break;
  case 4: is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break;
  case 2: is_signed ? lha(dst, offs, base) : lhz(dst, offs, base); break;
  case 1: lbz(dst, offs, base); if (is_signed) extsb(dst, dst); break; // lba doesn't exist :(
  default: ShouldNotReachHere();
  }
}

void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base,
                                       size_t size_in_bytes) {
  switch (size_in_bytes) {
  case 8: std(dst, offs, base); break;
  case 4: stw(dst, offs, base); break;
  case 2: sth(dst, offs, base); break;
  case 1: stb(dst, offs, base); break;
  default: ShouldNotReachHere();
  }
}

void MacroAssembler::align(int modulus, int max, int rem) {
  int padding = (rem + modulus - (offset() % modulus)) % modulus;
  if (padding > max) return;
  for (int c = (padding >> 2); c > 0; --c) { nop(); }
}

// Issue instructions that calculate given TOC from global TOC.
void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16,
                                                       bool add_relocation, bool emit_dummy_addr) {
  int offset = -1;
  if (emit_dummy_addr) {
    offset = -128; // dummy address
  } else if (addr != (address)(intptr_t)-1) {
    offset = MacroAssembler::offset_to_global_toc(addr);
  }

  if (hi16) {
    addis(dst, R29_TOC, MacroAssembler::largeoffset_si16_si16_hi(offset));
  }
  if (lo16) {
    if (add_relocation) {
      // Relocate at the addi to avoid confusion with a load from the method's TOC.
      relocate(internal_word_Relocation::spec(addr));
    }
    addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset));
  }
}

address MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) {
  const int offset = MacroAssembler::offset_to_global_toc(addr);

  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the addi,
  // and the addi reads and writes the same register dst.
  const int dst = inv_rt_field(inst2);
  assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");

  // Now, find the preceding addis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  while (inst1_addr >= bound) {
    inst1 = *(int *) inst1_addr;
    if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
      // Stop, found the addis which writes dst.
      break;
    }
    inst1_addr -= BytesPerInstWord;
  }

  assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
  set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset));
  set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset));
  return inst1_addr;
}

address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) {
  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the addi,
  // and the addi reads and writes the same register dst.
  const int dst = inv_rt_field(inst2);
  assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");

  // Now, find the preceding addis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  while (inst1_addr >= bound) {
    inst1 = *(int *) inst1_addr;
    if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
      // Stop, found the addis which writes dst.
      break;
    }
    inst1_addr -= BytesPerInstWord;
  }

  assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");

  int offset = (get_imm(inst1_addr, 0) << 16) + get_imm(inst2_addr, 0);
  // -1 is a special case
  if (offset == -1) {
    return (address)(intptr_t)-1;
  } else {
    return global_toc() + offset;
  }
}

#ifdef _LP64
// Patch compressed oops or klass constants.
// Assembler sequence is
// 1) compressed oops:
//    lis rx = const.hi
//    ori rx = rx | const.lo
// 2) compressed klass:
//    lis rx = const.hi
//    clrldi rx = rx & 0xFFFFffff // clearMS32b, optional
//    ori rx = rx | const.lo
// A clrldi is skipped over during patching.
address MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) {
  assert(UseCompressedOops, "Should only patch compressed oops");

  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the ori,
  // and the ori reads and writes the same register dst.
  const int dst = inv_rta_field(inst2);
  assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
  // Now, find the preceding addis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  bool inst1_found = false;
  while (inst1_addr >= bound) {
    inst1 = *(int *)inst1_addr;
    if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
    inst1_addr -= BytesPerInstWord;
  }
  assert(inst1_found, "inst is not lis");

  int xc = (data >> 16) & 0xffff;
  int xd = (data >>  0) & 0xffff;

  set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo
  set_imm((int *)inst2_addr, (xd)); // unsigned int
  return inst1_addr;
}

// Get compressed oop or klass constant.
narrowOop MacroAssembler::get_narrow_oop(address a, address bound) {
  assert(UseCompressedOops, "Should only patch compressed oops");

  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the ori,
  // and the ori reads and writes the same register dst.
  const int dst = inv_rta_field(inst2);
  assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
  // Now, find the preceding lis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  bool inst1_found = false;

  while (inst1_addr >= bound) {
    inst1 = *(int *) inst1_addr;
    if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
    inst1_addr -= BytesPerInstWord;
  }
  assert(inst1_found, "inst is not lis");

  uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff));
  uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16);

  return (int) (xl | xh);
}
#endif // _LP64
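// Illustrative patching example (value invented for this comment): for
// narrowOop data = 0x0089abcd, patch_set_narrow_oop() rewrites the two
// immediates of the sequence above to
//   lis rx, 0x0089      // inst1: upper 16 bits
//   ori rx, rx, 0xabcd  // inst2: lower 16 bits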
// Returns true if successful.
bool MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a,
                                                Register toc, bool fixed_size) {
  int toc_offset = 0;
  // Use RelocationHolder::none for the constant pool entry, otherwise
  // we will end up with a failing NativeCall::verify(x) where x is
  // the address of the constant pool entry.
  // FIXME: We should insert relocation information for oops at the constant
  // pool entries instead of inserting it at the loads; patching of a constant
  // pool entry should be less expensive.
  address const_address = address_constant((address)a.value(), RelocationHolder::none);
  if (const_address == NULL) { return false; } // allocation failure
  // Relocate at the pc of the load.
  relocate(a.rspec());
  toc_offset = (int)(const_address - code()->consts()->start());
  ld_largeoffset_unchecked(dst, toc_offset, toc, fixed_size);
  return true;
}

bool MacroAssembler::is_load_const_from_method_toc_at(address a) {
  const address inst1_addr = a;
  const int inst1 = *(int *)inst1_addr;

  // The relocation points to the ld or the addis.
  return (is_ld(inst1)) ||
         (is_addis(inst1) && inv_ra_field(inst1) != 0);
}

int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) {
  assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc");

  const address inst1_addr = a;
  const int inst1 = *(int *)inst1_addr;

  if (is_ld(inst1)) {
    return inv_d1_field(inst1);
  } else if (is_addis(inst1)) {
    const int dst = inv_rt_field(inst1);

    // Now, find the succeeding ld which reads and writes to dst.
    address inst2_addr = inst1_addr + BytesPerInstWord;
    int inst2 = 0;
    while (true) {
      inst2 = *(int *) inst2_addr;
      if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) {
        // Stop, found the ld which reads and writes dst.
        break;
      }
      inst2_addr += BytesPerInstWord;
    }
    return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2);
  }
  ShouldNotReachHere();
  return 0;
}

// Get the constant from a `load_const' sequence.
long MacroAssembler::get_const(address a) {
  assert(is_load_const_at(a), "not a load of a constant");
  const int *p = (const int*) a;
  unsigned long x = (((unsigned long) (get_imm(a,0) & 0xffff)) << 48);
  if (is_ori(*(p+1))) {
    x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 32);
    x |= (((unsigned long) (get_imm(a,3) & 0xffff)) << 16);
    x |= (((unsigned long) (get_imm(a,4) & 0xffff)));
  } else if (is_lis(*(p+1))) {
    x |= (((unsigned long) (get_imm(a,2) & 0xffff)) << 32);
    x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 16);
    x |= (((unsigned long) (get_imm(a,3) & 0xffff)));
  } else {
    ShouldNotReachHere();
    return (long) 0;
  }
  return (long) x;
}
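// For orientation (reconstructed from the immediate indices used by
// get_const() above and patch_const() below; the exact mnemonics of the
// emitter live elsewhere): the two `load_const' shapes are
//   1) single register:             2) with a second (tmp) register:
//      0: lis  d, x[63:48]             0: lis  d,   x[63:48]
//      1: ori  d, d, x[47:32]          1: lis  tmp, x[31:16]
//      2: sldi d, d, 32                2: ori  d,   d,   x[47:32]
//      3: oris d, d, x[31:16]          3: ori  tmp, tmp, x[15:0]
//      4: ori  d, d, x[15:0]           4: (merge d and tmp, e.g. rldimi)
// In shape 1 the instruction at index 2 carries no immediate, which is why
// indices 0,1,3,4 are read/patched; in shape 2 it is indices 0,2,1,3.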
// Patch the 64 bit constant of a `load_const' sequence. This is a low
// level procedure. It neither flushes the instruction cache nor is it
// mt safe.
void MacroAssembler::patch_const(address a, long x) {
  assert(is_load_const_at(a), "not a load of a constant");
  int *p = (int*) a;
  if (is_ori(*(p+1))) {
    set_imm(0 + p, (x >> 48) & 0xffff);
    set_imm(1 + p, (x >> 32) & 0xffff);
    set_imm(3 + p, (x >> 16) & 0xffff);
    set_imm(4 + p, x & 0xffff);
  } else if (is_lis(*(p+1))) {
    set_imm(0 + p, (x >> 48) & 0xffff);
    set_imm(2 + p, (x >> 32) & 0xffff);
    set_imm(1 + p, (x >> 16) & 0xffff);
    set_imm(3 + p, x & 0xffff);
  } else {
    ShouldNotReachHere();
  }
}

AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) {
  assert(oop_recorder() != NULL, "this assembler needs a Recorder");
  int index = oop_recorder()->allocate_metadata_index(obj);
  RelocationHolder rspec = metadata_Relocation::spec(index);
  return AddressLiteral((address)obj, rspec);
}

AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) {
  assert(oop_recorder() != NULL, "this assembler needs a Recorder");
  int index = oop_recorder()->find_index(obj);
  RelocationHolder rspec = metadata_Relocation::spec(index);
  return AddressLiteral((address)obj, rspec);
}

AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) {
  assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int oop_index = oop_recorder()->allocate_oop_index(obj);
  return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
}

AddressLiteral MacroAssembler::constant_oop_address(jobject obj) {
  assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int oop_index = oop_recorder()->find_index(obj);
  return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
}

RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
                                                      Register tmp, int offset) {
  intptr_t value = *delayed_value_addr;
  if (value != 0) {
    return RegisterOrConstant(value + offset);
  }

  // Load indirectly to solve generation ordering problem.
  // static address, no relocation
  int simm16_offset = load_const_optimized(tmp, delayed_value_addr, noreg, true);
  ld(tmp, simm16_offset, tmp); // must be aligned ((xa & 3) == 0)

  if (offset != 0) {
    addi(tmp, tmp, offset);
  }

  return RegisterOrConstant(tmp);
}

#ifndef PRODUCT
void MacroAssembler::pd_print_patched_instruction(address branch) {
  Unimplemented(); // TODO: PPC port
}
#endif // ndef PRODUCT

// Conditional far branch for destinations encodable in 24+2 bits.
void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) {

  // If requested by flag optimize, relocate the bc_far as a
  // runtime_call and prepare for optimizing it when the code gets
  // relocated.
  if (optimize == bc_far_optimize_on_relocate) {
    relocate(relocInfo::runtime_call_type);
  }

  // variant 2:
  //
  //    b!cxx SKIP
  //    bxx   DEST
  //  SKIP:
  //

  const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
                                                opposite_bcond(inv_boint_bcond(boint)));

  // We emit two branches.
  // First, a conditional branch which jumps around the far branch.
  const address not_taken_pc = pc() + 2 * BytesPerInstWord;
  const address bc_pc        = pc();
  bc(opposite_boint, biint, not_taken_pc);

  const int bc_instr = *(int*)bc_pc;
  assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition");
  assert(opposite_boint == inv_bo_field(bc_instr), "postcondition");
  assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))),
                                     opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))),
         "postcondition");
  assert(biint == inv_bi_field(bc_instr), "postcondition");

  // Second, an unconditional far branch which jumps to dest.
  // Note: target(dest) remembers the current pc (see CodeSection::target)
  // and returns the current pc if the label is not bound yet; when
  // the label gets bound, the unconditional far branch will be patched.
  const address target_pc = target(dest);
  const address b_pc      = pc();
  b(target_pc);

  assert(not_taken_pc == pc(),                 "postcondition");
  assert(dest.is_bound() || target_pc == b_pc, "postcondition");
}

// 1 or 2 instructions
void MacroAssembler::bc_far_optimized(int boint, int biint, Label& dest) {
  if (dest.is_bound() && is_within_range_of_bcxx(target(dest), pc())) {
    bc(boint, biint, dest);
  } else {
    bc_far(boint, biint, dest, MacroAssembler::bc_far_optimize_on_relocate);
  }
}

bool MacroAssembler::is_bc_far_at(address instruction_addr) {
  return is_bc_far_variant1_at(instruction_addr) ||
         is_bc_far_variant2_at(instruction_addr) ||
         is_bc_far_variant3_at(instruction_addr);
}

address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) {
  if (is_bc_far_variant1_at(instruction_addr)) {
    const address instruction_1_addr = instruction_addr;
    const int instruction_1 = *(int*)instruction_1_addr;
    return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr);
  } else if (is_bc_far_variant2_at(instruction_addr)) {
    const address instruction_2_addr = instruction_addr + 4;
    return bxx_destination(instruction_2_addr);
  } else if (is_bc_far_variant3_at(instruction_addr)) {
    return instruction_addr + 8;
  }
  // variant 4 ???
  ShouldNotReachHere();
  return NULL;
}

void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) {

  if (is_bc_far_variant3_at(instruction_addr)) {
    // variant 3, far cond branch to the next instruction, already patched to nops:
    //
    //    nop
    //    endgroup
    //  SKIP/DEST:
    //
    return;
  }

  // first, extract boint and biint from the current branch
  int boint = 0;
  int biint = 0;

  ResourceMark rm;
  const int code_size = 2 * BytesPerInstWord;
  CodeBuffer buf(instruction_addr, code_size);
  MacroAssembler masm(&buf);
  if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) {
    // Far branch to next instruction: Optimize it by patching nops (produce variant 3).
    masm.nop();
    masm.endgroup();
  } else {
    if (is_bc_far_variant1_at(instruction_addr)) {
      // variant 1, the 1st instruction contains the destination address:
      //
      //    bcxx  DEST
      //    nop
      //
      const int instruction_1 = *(int*)(instruction_addr);
      boint = inv_bo_field(instruction_1);
      biint = inv_bi_field(instruction_1);
    } else if (is_bc_far_variant2_at(instruction_addr)) {
      // variant 2, the 2nd instruction contains the destination address:
      //
      //    b!cxx SKIP
      //    bxx   DEST
      //  SKIP:
      //
      const int instruction_1 = *(int*)(instruction_addr);
      boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))),
                                 opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1))));
      biint = inv_bi_field(instruction_1);
    } else {
      // variant 4???
      ShouldNotReachHere();
    }

    // second, set the new branch destination and optimize the code
    if (dest != instruction_addr + 4 && // the bc_far is still unbound!
        masm.is_within_range_of_bcxx(dest, instruction_addr)) {
      // variant 1:
      //
      //    bcxx  DEST
      //    nop
      //
      masm.bc(boint, biint, dest);
      masm.nop();
    } else {
      // variant 2:
      //
      //    b!cxx SKIP
      //    bxx   DEST
      //  SKIP:
      //
      const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
                                                    opposite_bcond(inv_boint_bcond(boint)));
      const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord;
      masm.bc(opposite_boint, biint, not_taken_pc);
      masm.b(dest);
    }
  }
  ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
}

// Emit a NOT mt-safe patchable 64 bit absolute call/jump.
void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) {
  // get current pc
  uint64_t start_pc = (uint64_t) pc();

  const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last
  const address pc_of_b  = (address) (start_pc + (0*BytesPerInstWord)); // b is first

  // relocate here
  if (rt != relocInfo::none) {
    relocate(rt);
  }

  if ( ReoptimizeCallSequences &&
       (( link && is_within_range_of_b(dest, pc_of_bl)) ||
        (!link && is_within_range_of_b(dest, pc_of_b)))) {
    // variant 2:
    // Emit an optimized, pc-relative call/jump.

    if (link) {
      // some padding
      nop();
      nop();
      nop();
      nop();
      nop();
      nop();

      // do the call
      assert(pc() == pc_of_bl, "just checking");
      bl(dest, relocInfo::none);
    } else {
      // do the jump
      assert(pc() == pc_of_b, "just checking");
      b(dest, relocInfo::none);

      // some padding
      nop();
      nop();
      nop();
      nop();
      nop();
      nop();
    }

    // Assert that we can identify the emitted call/jump.
    assert(is_bxx64_patchable_variant2_at((address)start_pc, link),
           "can't identify emitted call");
  } else {
    // variant 1:
    mr(R0, R11); // spill R11 -> R0.

    // Load the destination address into CTR,
    // calculate destination relative to global toc.
    calculate_address_from_global_toc(R11, dest, true, true, false);

    mtctr(R11);
    mr(R11, R0); // spill R11 <- R0.
    nop();

    // do the call/jump
    if (link) {
      bctrl();
    } else {
      bctr();
    }
    // Assert that we can identify the emitted call/jump.
    assert(is_bxx64_patchable_variant1b_at((address)start_pc, link),
           "can't identify emitted call");
  }

  // Assert that we can identify the emitted call/jump.
  assert(is_bxx64_patchable_at((address)start_pc, link),
         "can't identify emitted call");
  assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest,
         "wrong encoding of dest address");
}

// Identify a bxx64_patchable instruction.
bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) {
  return is_bxx64_patchable_variant1b_at(instruction_addr, link)
    //|| is_bxx64_patchable_variant1_at(instruction_addr, link)
      || is_bxx64_patchable_variant2_at(instruction_addr, link);
}

// Does the call64_patchable instruction use a pc-relative encoding of
// the call destination?
bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) {
  // variant 2 is pc-relative
  return is_bxx64_patchable_variant2_at(instruction_addr, link);
}

// Identify variant 1.
bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) {
  unsigned int* instr = (unsigned int*) instruction_addr;
  return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
      && is_mtctr(instr[5]) // mtctr
      && is_load_const_at(instruction_addr);
}

// Identify variant 1b: load destination relative to global toc.
bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) {
  unsigned int* instr = (unsigned int*) instruction_addr;
  return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
      && is_mtctr(instr[3]) // mtctr
      && is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr);
}

// Identify variant 2.
bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) {
  unsigned int* instr = (unsigned int*) instruction_addr;
  if (link) {
    return is_bl (instr[6])  // bl dest is last
        && is_nop(instr[0])  // nop
        && is_nop(instr[1])  // nop
        && is_nop(instr[2])  // nop
        && is_nop(instr[3])  // nop
        && is_nop(instr[4])  // nop
        && is_nop(instr[5]); // nop
  } else {
    return is_b  (instr[0])  // b dest is first
        && is_nop(instr[1])  // nop
        && is_nop(instr[2])  // nop
        && is_nop(instr[3])  // nop
        && is_nop(instr[4])  // nop
        && is_nop(instr[5])  // nop
        && is_nop(instr[6]); // nop
  }
}

// Set dest address of a bxx64_patchable instruction.
void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) {
  ResourceMark rm;
  int code_size = MacroAssembler::bxx64_patchable_size;
  CodeBuffer buf(instruction_addr, code_size);
  MacroAssembler masm(&buf);
  masm.bxx64_patchable(dest, relocInfo::none, link);
  ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
}
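// For orientation, the 7-slot layouts checked above (derived from the
// emitter in bxx64_patchable() and the predicates in this file):
//   variant 1b (TOC-relative):          variant 2 (pc-relative, link):
//     0: mr    R0, R11                    0..5: nop
//     1: addis R11, R29_TOC, hi           6:    bl DEST
//     2: addi  R11, R11, lo             variant 2 (pc-relative, no link):
//     3: mtctr R11                        0:    b DEST
//     4: mr    R11, R0                    1..6: nop
//     5: nop
//     6: bctr[l]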
// Get dest address of a bxx64_patchable instruction.
address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) {
  if (is_bxx64_patchable_variant1_at(instruction_addr, link)) {
    return (address) (unsigned long) get_const(instruction_addr);
  } else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) {
    unsigned int* instr = (unsigned int*) instruction_addr;
    if (link) {
      const int instr_idx = 6; // bl is last
      int branchoffset = branch_destination(instr[instr_idx], 0);
      return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
    } else {
      const int instr_idx = 0; // b is first
      int branchoffset = branch_destination(instr[instr_idx], 0);
      return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
    }
  // Load dest relative to global toc.
  } else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) {
    return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord,
                                                               instruction_addr);
  } else {
    ShouldNotReachHere();
    return NULL;
  }
}

// Uses ordering which corresponds to ABI:
//    _savegpr0_14: std r14,-144(r1)
//    _savegpr0_15: std r15,-136(r1)
//    _savegpr0_16: std r16,-128(r1)
void MacroAssembler::save_nonvolatile_gprs(Register dst, int offset) {
  std(R14, offset, dst);  offset += 8;
  std(R15, offset, dst);  offset += 8;
  std(R16, offset, dst);  offset += 8;
  std(R17, offset, dst);  offset += 8;
  std(R18, offset, dst);  offset += 8;
  std(R19, offset, dst);  offset += 8;
  std(R20, offset, dst);  offset += 8;
  std(R21, offset, dst);  offset += 8;
  std(R22, offset, dst);  offset += 8;
  std(R23, offset, dst);  offset += 8;
  std(R24, offset, dst);  offset += 8;
  std(R25, offset, dst);  offset += 8;
  std(R26, offset, dst);  offset += 8;
  std(R27, offset, dst);  offset += 8;
  std(R28, offset, dst);  offset += 8;
  std(R29, offset, dst);  offset += 8;
  std(R30, offset, dst);  offset += 8;
  std(R31, offset, dst);  offset += 8;

  stfd(F14, offset, dst); offset += 8;
  stfd(F15, offset, dst); offset += 8;
  stfd(F16, offset, dst); offset += 8;
  stfd(F17, offset, dst); offset += 8;
  stfd(F18, offset, dst); offset += 8;
  stfd(F19, offset, dst); offset += 8;
  stfd(F20, offset, dst); offset += 8;
  stfd(F21, offset, dst); offset += 8;
  stfd(F22, offset, dst); offset += 8;
  stfd(F23, offset, dst); offset += 8;
  stfd(F24, offset, dst); offset += 8;
  stfd(F25, offset, dst); offset += 8;
  stfd(F26, offset, dst); offset += 8;
  stfd(F27, offset, dst); offset += 8;
  stfd(F28, offset, dst); offset += 8;
  stfd(F29, offset, dst); offset += 8;
  stfd(F30, offset, dst); offset += 8;
  stfd(F31, offset, dst);
}

// Uses ordering which corresponds to ABI:
//    _restgpr0_14: ld r14,-144(r1)
//    _restgpr0_15: ld r15,-136(r1)
//    _restgpr0_16: ld r16,-128(r1)
void MacroAssembler::restore_nonvolatile_gprs(Register src, int offset) {
  ld(R14, offset, src);   offset += 8;
  ld(R15, offset, src);   offset += 8;
  ld(R16, offset, src);   offset += 8;
  ld(R17, offset, src);   offset += 8;
  ld(R18, offset, src);   offset += 8;
  ld(R19, offset, src);   offset += 8;
  ld(R20, offset, src);   offset += 8;
  ld(R21, offset, src);   offset += 8;
  ld(R22, offset, src);   offset += 8;
  ld(R23, offset, src);   offset += 8;
  ld(R24, offset, src);   offset += 8;
  ld(R25, offset, src);   offset += 8;
  ld(R26, offset, src);   offset += 8;
  ld(R27, offset, src);   offset += 8;
  ld(R28, offset, src);   offset += 8;
  ld(R29, offset, src);   offset += 8;
  ld(R30, offset, src);   offset += 8;
  ld(R31, offset, src);   offset += 8;

  // FP registers
  lfd(F14, offset, src);  offset += 8;
  lfd(F15, offset, src);  offset += 8;
  lfd(F16, offset, src);  offset += 8;
  lfd(F17, offset, src);  offset += 8;
  lfd(F18, offset, src);  offset += 8;
  lfd(F19, offset, src);  offset += 8;
  lfd(F20, offset, src);  offset += 8;
  lfd(F21, offset, src);  offset += 8;
  lfd(F22, offset, src);  offset += 8;
  lfd(F23, offset, src);  offset += 8;
  lfd(F24, offset, src);  offset += 8;
  lfd(F25, offset, src);  offset += 8;
  lfd(F26, offset, src);  offset += 8;
  lfd(F27, offset, src);  offset += 8;
  lfd(F28, offset, src);  offset += 8;
  lfd(F29, offset, src);  offset += 8;
  lfd(F30, offset, src);  offset += 8;
  lfd(F31, offset, src);
}

// For verify_oops.
void MacroAssembler::save_volatile_gprs(Register dst, int offset) {
  std(R2,  offset, dst);  offset += 8;
  std(R3,  offset, dst);  offset += 8;
  std(R4,  offset, dst);  offset += 8;
  std(R5,  offset, dst);  offset += 8;
  std(R6,  offset, dst);  offset += 8;
  std(R7,  offset, dst);  offset += 8;
  std(R8,  offset, dst);  offset += 8;
  std(R9,  offset, dst);  offset += 8;
  std(R10, offset, dst);  offset += 8;
  std(R11, offset, dst);  offset += 8;
  std(R12, offset, dst);  offset += 8;

  stfd(F0,  offset, dst); offset += 8;
  stfd(F1,  offset, dst); offset += 8;
  stfd(F2,  offset, dst); offset += 8;
  stfd(F3,  offset, dst); offset += 8;
  stfd(F4,  offset, dst); offset += 8;
  stfd(F5,  offset, dst); offset += 8;
  stfd(F6,  offset, dst); offset += 8;
  stfd(F7,  offset, dst); offset += 8;
  stfd(F8,  offset, dst); offset += 8;
  stfd(F9,  offset, dst); offset += 8;
  stfd(F10, offset, dst); offset += 8;
  stfd(F11, offset, dst); offset += 8;
  stfd(F12, offset, dst); offset += 8;
  stfd(F13, offset, dst);
}

// For verify_oops.
void MacroAssembler::restore_volatile_gprs(Register src, int offset) {
  ld(R2,  offset, src);   offset += 8;
  ld(R3,  offset, src);   offset += 8;
  ld(R4,  offset, src);   offset += 8;
  ld(R5,  offset, src);   offset += 8;
  ld(R6,  offset, src);   offset += 8;
  ld(R7,  offset, src);   offset += 8;
  ld(R8,  offset, src);   offset += 8;
  ld(R9,  offset, src);   offset += 8;
  ld(R10, offset, src);   offset += 8;
  ld(R11, offset, src);   offset += 8;
  ld(R12, offset, src);   offset += 8;

  lfd(F0,  offset, src);  offset += 8;
  lfd(F1,  offset, src);  offset += 8;
  lfd(F2,  offset, src);  offset += 8;
  lfd(F3,  offset, src);  offset += 8;
  lfd(F4,  offset, src);  offset += 8;
  lfd(F5,  offset, src);  offset += 8;
  lfd(F6,  offset, src);  offset += 8;
  lfd(F7,  offset, src);  offset += 8;
  lfd(F8,  offset, src);  offset += 8;
  lfd(F9,  offset, src);  offset += 8;
  lfd(F10, offset, src);  offset += 8;
  lfd(F11, offset, src);  offset += 8;
  lfd(F12, offset, src);  offset += 8;
  lfd(F13, offset, src);
}

void MacroAssembler::save_LR_CR(Register tmp) {
  mfcr(tmp);
  std(tmp, _abi(cr), R1_SP);
  mflr(tmp);
  std(tmp, _abi(lr), R1_SP);
  // Tmp must contain lr on exit! (see return_addr and prolog in ppc64.ad)
}

void MacroAssembler::restore_LR_CR(Register tmp) {
  assert(tmp != R1_SP, "must be distinct");
  ld(tmp, _abi(lr), R1_SP);
  mtlr(tmp);
  ld(tmp, _abi(cr), R1_SP);
  mtcr(tmp);
}

address MacroAssembler::get_PC_trash_LR(Register result) {
  Label L;
  bl(L);
  bind(L);
  address lr_pc = pc();
  mflr(result);
  return lr_pc;
}

void MacroAssembler::resize_frame(Register offset, Register tmp) {
#ifdef ASSERT
  assert_different_registers(offset, tmp, R1_SP);
  andi_(tmp, offset, frame::alignment_in_bytes-1);
  asm_assert_eq("resize_frame: unaligned", 0x204);
#endif

  // tmp <- *(SP)
  ld(tmp, _abi(callers_sp), R1_SP);
  // addr <- SP + offset;
  // *(addr) <- tmp;
  // SP <- addr
  stdux(tmp, R1_SP, offset);
}

void MacroAssembler::resize_frame(int offset, Register tmp) {
  assert(is_simm(offset, 16), "too big an offset");
  assert_different_registers(tmp, R1_SP);
  assert((offset & (frame::alignment_in_bytes-1)) == 0, "resize_frame: unaligned");
  // tmp <- *(SP)
  ld(tmp, _abi(callers_sp), R1_SP);
  // addr <- SP + offset;
  // *(addr) <- tmp;
  // SP <- addr
  stdu(tmp, offset, R1_SP);
}

void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) {
  // (addr == tmp1) || (addr == tmp2) is allowed here!
  assert(tmp1 != tmp2, "must be distinct");

  // compute offset w.r.t. current stack pointer
  // tmp_1 <- addr - SP (!)
  subf(tmp1, R1_SP, addr);

  // atomically update SP keeping back link.
  resize_frame(tmp1 /* offset */, tmp2 /* tmp */);
}

void MacroAssembler::push_frame(Register bytes, Register tmp) {
#ifdef ASSERT
  assert(bytes != R0, "r0 not allowed here");
  andi_(R0, bytes, frame::alignment_in_bytes-1);
  asm_assert_eq("push_frame(Reg, Reg): unaligned", 0x203);
#endif
  neg(tmp, bytes);
  stdux(R1_SP, R1_SP, tmp);
}

// Push a frame of size `bytes'.
void MacroAssembler::push_frame(unsigned int bytes, Register tmp) {
  long offset = align_addr(bytes, frame::alignment_in_bytes);
  if (is_simm(-offset, 16)) {
    stdu(R1_SP, -offset, R1_SP);
  } else {
    load_const_optimized(tmp, -offset);
    stdux(R1_SP, R1_SP, tmp);
  }
}

// Push a frame of size `bytes' plus abi_reg_args on top.
void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) {
  push_frame(bytes + frame::abi_reg_args_size, tmp);
}

// Set up a new C frame with a spill area for non-volatile GPRs and
// additional space for local variables.
void MacroAssembler::push_frame_reg_args_nonvolatiles(unsigned int bytes,
                                                      Register tmp) {
  push_frame(bytes + frame::abi_reg_args_size + frame::spill_nonvolatiles_size, tmp);
}

// Pop current C frame.
void MacroAssembler::pop_frame() {
  ld(R1_SP, _abi(callers_sp), R1_SP);
}

#if defined(ABI_ELFv2)
address MacroAssembler::branch_to(Register r_function_entry, bool and_link) {
  // TODO(asmundak): make sure the caller uses R12 as function descriptor
  // most of the time.
  if (R12 != r_function_entry) {
    mr(R12, r_function_entry);
  }
  mtctr(R12);
  // Do a call or a branch.
  if (and_link) {
    bctrl();
  } else {
    bctr();
  }
  _last_calls_return_pc = pc();

  return _last_calls_return_pc;
}

// Call a C function via a function descriptor and use full C
// calling conventions. Updates and returns _last_calls_return_pc.
address MacroAssembler::call_c(Register r_function_entry) {
  return branch_to(r_function_entry, /*and_link=*/true);
}

// For tail calls: only branch, don't link, so callee returns to caller of this function.
address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) {
  return branch_to(r_function_entry, /*and_link=*/false);
}

address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) {
  load_const(R12, function_entry, R0);
  return branch_to(R12, /*and_link=*/true);
}

#else
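// For orientation (standard ELFv1 convention, not spelled out in this file):
// a function descriptor is three doublewords,
//   fd + 0:  entry - address of the function's first instruction
//   fd + 8:  toc   - TOC base the callee expects in R2
//   fd + 16: env   - environment pointer, unused by C and zeroed below
// accessed via FunctionDescriptor::entry_offset()/toc_offset()/env_offset().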
// Generic version of a call to C function via a function descriptor
// with variable support for C calling conventions (TOC, ENV, etc.).
// Updates and returns _last_calls_return_pc.
address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call,
                                  bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) {
  // we emit standard ptrgl glue code here
  assert((function_descriptor != R0), "function_descriptor cannot be R0");

  // retrieve necessary entries from the function descriptor
  ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor);
  mtctr(R0);

  if (load_toc_of_callee) {
    ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor);
  }
  if (load_env_of_callee) {
    ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor);
  } else if (load_toc_of_callee) {
    li(R11, 0);
  }

  // do a call or a branch
  if (and_link) {
    bctrl();
  } else {
    bctr();
  }
  _last_calls_return_pc = pc();

  return _last_calls_return_pc;
}

// Call a C function via a function descriptor and use full C calling
// conventions.
// We don't use the TOC in generated code, so there is no need to save
// and restore its value.
address MacroAssembler::call_c(Register fd) {
  return branch_to(fd, /*and_link=*/true,
                       /*save toc=*/false,
                       /*restore toc=*/false,
                       /*load toc=*/true,
                       /*load env=*/true);
}

address MacroAssembler::call_c_and_return_to_caller(Register fd) {
  return branch_to(fd, /*and_link=*/false,
                       /*save toc=*/false,
                       /*restore toc=*/false,
                       /*load toc=*/true,
                       /*load env=*/true);
}

address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) {
  if (rt != relocInfo::none) {
    // this call needs to be relocatable
    if (!ReoptimizeCallSequences
        || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
        || fd == NULL   // support code-size estimation
        || !fd->is_friend_function()
        || fd->entry() == NULL) {
      // It's not a friend function as defined by class FunctionDescriptor,
      // so do a full call-c here.
      load_const(R11, (address)fd, R0);

      bool has_env = (fd != NULL && fd->env() != NULL);
      return branch_to(R11, /*and_link=*/true,
                            /*save toc=*/false,
                            /*restore toc=*/false,
                            /*load toc=*/true,
                            /*load env=*/has_env);
    } else {
      // It's a friend function. Load the entry point and don't care about
      // toc and env. Use an optimizable call instruction, but ensure the
      // same code-size as in the case of a non-friend function.
      nop();
      nop();
      nop();
      bl64_patchable(fd->entry(), rt);
      _last_calls_return_pc = pc();
      return _last_calls_return_pc;
    }
  } else {
    // This call does not need to be relocatable, do more aggressive
    // optimizations.
    if (!ReoptimizeCallSequences
        || !fd->is_friend_function()) {
      // It's not a friend function as defined by class FunctionDescriptor,
      // so do a full call-c here.
      load_const(R11, (address)fd, R0);
      return branch_to(R11, /*and_link=*/true,
                            /*save toc=*/false,
                            /*restore toc=*/false,
                            /*load toc=*/true,
                            /*load env=*/true);
    } else {
      // It's a friend function, load the entry point and don't care about
      // toc and env.
      address dest = fd->entry();
      if (is_within_range_of_b(dest, pc())) {
        bl(dest);
      } else {
        bl64_patchable(dest, rt);
      }
      _last_calls_return_pc = pc();
      return _last_calls_return_pc;
    }
  }
}

// Call a C function. All constants needed reside in TOC.
//
// Read the address to call from the TOC.
// Read env from TOC, if fd specifies an env.
// Read new TOC from TOC.
address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd,
                                         relocInfo::relocType rt, Register toc) {
  if (!ReoptimizeCallSequences
      || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
      || !fd->is_friend_function()) {
    // It's not a friend function as defined by class FunctionDescriptor,
    // so do a full call-c here.
    assert(fd->entry() != NULL, "function must be linked");

    AddressLiteral fd_entry(fd->entry());
    bool success = load_const_from_method_toc(R11, fd_entry, toc, /*fixed_size*/ true);
    mtctr(R11);
    if (fd->env() == NULL) {
      li(R11, 0);
      nop();
    } else {
      AddressLiteral fd_env(fd->env());
      success = success && load_const_from_method_toc(R11, fd_env, toc, /*fixed_size*/ true);
    }
    AddressLiteral fd_toc(fd->toc());
    // Set R2_TOC (load from toc)
    success = success && load_const_from_method_toc(R2_TOC, fd_toc, toc, /*fixed_size*/ true);
    bctrl();
    _last_calls_return_pc = pc();
    if (!success) { return NULL; }
  } else {
    // It's a friend function, load the entry point and don't care about
    // toc and env. Use an optimizable call instruction, but ensure the
    // same code-size as in the case of a non-friend function.
    nop();
    bl64_patchable(fd->entry(), rt);
    _last_calls_return_pc = pc();
  }
  return _last_calls_return_pc;
}
#endif // ABI_ELFv2

void MacroAssembler::call_VM_base(Register oop_result,
                                  Register last_java_sp,
                                  address entry_point,
                                  bool check_exceptions) {
  BLOCK_COMMENT("call_VM {");
  // Determine last_java_sp register.
  if (!last_java_sp->is_valid()) {
    last_java_sp = R1_SP;
  }
  set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1);

  // ARG1 must hold thread address.
  mr(R3_ARG1, R16_thread);
#if defined(ABI_ELFv2)
  address return_pc = call_c(entry_point, relocInfo::none);
#else
  address return_pc = call_c((FunctionDescriptor*)entry_point, relocInfo::none);
#endif

  reset_last_Java_frame();

  // Check for pending exceptions.
  if (check_exceptions) {
    // We don't check for exceptions here.
    ShouldNotReachHere();
  }

  // Get oop result if there is one and reset the value in the thread.
  if (oop_result->is_valid()) {
    get_vm_result(oop_result);
  }

  _last_calls_return_pc = return_pc;
  BLOCK_COMMENT("} call_VM");
}

void MacroAssembler::call_VM_leaf_base(address entry_point) {
  BLOCK_COMMENT("call_VM_leaf {");
#if defined(ABI_ELFv2)
  call_c(entry_point, relocInfo::none);
#else
  call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, entry_point), relocInfo::none);
#endif
  BLOCK_COMMENT("} call_VM_leaf");
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions) {
  call_VM_base(oop_result, noreg, entry_point, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread.
  mr_if_needed(R4_ARG2, arg_1);
  call_VM(oop_result, entry_point, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread
  mr_if_needed(R4_ARG2, arg_1);
  assert(arg_2 != R4_ARG2, "smashed argument");
  mr_if_needed(R5_ARG3, arg_2);
  call_VM(oop_result, entry_point, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread
  mr_if_needed(R4_ARG2, arg_1);
  assert(arg_2 != R4_ARG2, "smashed argument");
  mr_if_needed(R5_ARG3, arg_2);
  mr_if_needed(R6_ARG4, arg_3);
  call_VM(oop_result, entry_point, check_exceptions);
}

void MacroAssembler::call_VM_leaf(address entry_point) {
  call_VM_leaf_base(entry_point);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) {
  mr_if_needed(R3_ARG1, arg_1);
  call_VM_leaf(entry_point);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) {
  mr_if_needed(R3_ARG1, arg_1);
  assert(arg_2 != R3_ARG1, "smashed argument");
  mr_if_needed(R4_ARG2, arg_2);
  call_VM_leaf(entry_point);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) {
  mr_if_needed(R3_ARG1, arg_1);
  assert(arg_2 != R3_ARG1, "smashed argument");
  mr_if_needed(R4_ARG2, arg_2);
  assert(arg_3 != R3_ARG1 && arg_3 != R4_ARG2, "smashed argument");
  mr_if_needed(R5_ARG3, arg_3);
  call_VM_leaf(entry_point);
}

// Check whether instruction is a read access to the polling page
// which was emitted by load_from_polling_page(..).
bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext,
                                               address* polling_address_ptr) {
  if (!is_ld(instruction))
    return false; // It's not a ld. Fail.

  int rt = inv_rt_field(instruction);
  int ra = inv_ra_field(instruction);
  int ds = inv_ds_field(instruction);
  if (!(ds == 0 && ra != 0 && rt == 0)) {
    return false; // It's not a ld(r0, X, ra). Fail.
  }

  if (!ucontext) {
    // Set polling address.
    if (polling_address_ptr != NULL) {
      *polling_address_ptr = NULL;
    }
    return true; // No ucontext given. Can't check value of ra. Assume true.
  }

#ifdef LINUX
  // Ucontext given. Check that register ra contains the address of
  // the safepoint polling page.
  ucontext_t* uc = (ucontext_t*) ucontext;
  // Set polling address.
  address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds;
  if (polling_address_ptr != NULL) {
    *polling_address_ptr = addr;
  }
  return os::is_poll_address(addr);
#else
  // Not on Linux, ucontext must be NULL.
  ShouldNotReachHere();
  return false;
#endif
}

bool MacroAssembler::is_memory_serialization(int instruction, JavaThread* thread, void* ucontext) {
#ifdef LINUX
  ucontext_t* uc = (ucontext_t*) ucontext;

  if (is_stwx(instruction) || is_stwux(instruction)) {
    int ra = inv_ra_field(instruction);
    int rb = inv_rb_field(instruction);

    // look up content of ra and rb in ucontext
    address ra_val = (address)uc->uc_mcontext.regs->gpr[ra];
    long rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
    return os::is_memory_serialize_page(thread, ra_val + rb_val);
  } else if (is_stw(instruction) || is_stwu(instruction)) {
    int ra = inv_ra_field(instruction);
    int d1 = inv_d1_field(instruction);

    // look up content of ra in ucontext
    address ra_val = (address)uc->uc_mcontext.regs->gpr[ra];
    return os::is_memory_serialize_page(thread, ra_val + d1);
  } else {
    return false;
  }
#else
  // workaround not needed on !LINUX :-)
  ShouldNotCallThis();
  return false;
#endif
}

void MacroAssembler::bang_stack_with_offset(int offset) {
  // When increasing the stack, the old stack pointer will be written
  // to the new top of stack according to the PPC64 abi.
  // Therefore, stack banging is not necessary when increasing
  // the stack by <= os::vm_page_size() bytes.
  // When increasing the stack by a larger amount, this method is
  // called repeatedly to bang the intermediate pages.

  // Stack grows down, caller passes positive offset.
  assert(offset > 0, "must bang with positive offset");

  long stdoffset = -offset;

  if (is_simm(stdoffset, 16)) {
    // Signed 16 bit offset, a simple std is ok.
    if (UseLoadInstructionsForStackBangingPPC64) {
      ld(R0, (int)(signed short)stdoffset, R1_SP);
    } else {
      std(R0, (int)(signed short)stdoffset, R1_SP);
    }
  } else if (is_simm(stdoffset, 31)) {
    const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset);
    const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset);

    Register tmp = R11;
    addis(tmp, R1_SP, hi);
    if (UseLoadInstructionsForStackBangingPPC64) {
      ld(R0, lo, tmp);
    } else {
      std(R0, lo, tmp);
    }
  } else {
    ShouldNotReachHere();
  }
}
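// Illustrative caller pattern (hypothetical, for this comment only): a frame
// larger than one page would be banged page by page before being pushed, e.g.
//   for (int off = page_size; off <= frame_size_in_bytes; off += page_size) {
//     bang_stack_with_offset(off);
//   }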
// If instruction is a stack bang of the form
//    std    R0,    x(Ry),       (see bang_stack_with_offset())
//    stdu   R1_SP, x(R1_SP),    (see push_frame(), resize_frame())
// or stdux  R1_SP, Rx, R1_SP    (see push_frame(), resize_frame())
// return the banged address. Otherwise, return 0.
address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) {
#ifdef LINUX
  ucontext_t* uc = (ucontext_t*) ucontext;
  int rs = inv_rs_field(instruction);
  int ra = inv_ra_field(instruction);
  if (   (is_ld(instruction)   && rs == 0 &&  UseLoadInstructionsForStackBangingPPC64)
      || (is_std(instruction)  && rs == 0 && !UseLoadInstructionsForStackBangingPPC64)
      || (is_stdu(instruction) && rs == 1)) {
    int ds = inv_ds_field(instruction);
    // return banged address
    return ds + (address)uc->uc_mcontext.regs->gpr[ra];
  } else if (is_stdux(instruction) && rs == 1) {
    int rb = inv_rb_field(instruction);
    address sp = (address)uc->uc_mcontext.regs->gpr[1];
    long rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
    return ra != 1 || rb_val >= 0 ? NULL         // not a stack bang
                                  : sp + rb_val; // banged address
  }
  return NULL; // not a stack bang
#else
  // workaround not needed on !LINUX :-)
  ShouldNotCallThis();
  return NULL;
#endif
}

void MacroAssembler::reserved_stack_check(Register return_pc) {
  // Test if reserved zone needs to be enabled.
  Label no_reserved_zone_enabling;

  ld_ptr(R0, JavaThread::reserved_stack_activation_offset(), R16_thread);
  cmpld(CCR0, R1_SP, R0);
  blt_predict_taken(CCR0, no_reserved_zone_enabling);

  // Enable reserved zone again, throw stack overflow exception.
  push_frame_reg_args(0, R0);
  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), R16_thread);
  pop_frame();
  mtlr(return_pc);
  load_const_optimized(R0, StubRoutines::throw_delayed_StackOverflowError_entry());
  mtctr(R0);
  bctr();

  should_not_reach_here();

  bind(no_reserved_zone_enabling);
}

void MacroAssembler::getandsetd(Register dest_current_value, Register exchange_value, Register addr_base,
                                bool cmpxchgx_hint) {
  Label retry;
  bind(retry);
  ldarx(dest_current_value, addr_base, cmpxchgx_hint);
  stdcx_(exchange_value, addr_base);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
  } else {
    bne(                  CCR0, retry); // StXcx_ sets CCR0.
  }
}

void MacroAssembler::getandaddd(Register dest_current_value, Register inc_value, Register addr_base,
                                Register tmp, bool cmpxchgx_hint) {
  Label retry;
  bind(retry);
  ldarx(dest_current_value, addr_base, cmpxchgx_hint);
  add(tmp, dest_current_value, inc_value);
  stdcx_(tmp, addr_base);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
  } else {
    bne(                  CCR0, retry); // StXcx_ sets CCR0.
  }
}

// Word/sub-word atomic helper functions

// Temps and addr_base are killed if size < 4 and processor does not support respective instructions.
// Only signed types are supported with size < 4.
// Atomic add always kills tmp1.
void MacroAssembler::atomic_get_and_modify_generic(Register dest_current_value, Register exchange_value,
                                                   Register addr_base, Register tmp1, Register tmp2, Register tmp3,
                                                   bool cmpxchgx_hint, bool is_add, int size) {
  // Sub-word instructions are available since Power 8.
  // For older processors, instruction_type != size holds, and we
  // emulate the sub-word instructions by constructing a 4-byte value
  // that leaves the other bytes unchanged.
  const int instruction_type = VM_Version::has_lqarx() ? size : 4;

  Label retry;
  Register shift_amount = noreg,
           val32        = dest_current_value,
           modval       = is_add ? tmp1 : exchange_value;

  if (instruction_type != size) {
    assert_different_registers(tmp1, tmp2, tmp3, dest_current_value, exchange_value, addr_base);
    modval       = tmp1;
    shift_amount = tmp2;
    val32        = tmp3;
    // Need some preparation: Compute shift amount, align address. Note: shorts must be 2 byte aligned.
#ifdef VM_LITTLE_ENDIAN
    rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
    clrrdi(addr_base, addr_base, 2);
#else
    xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
    clrrdi(addr_base, addr_base, 2);
    rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
#endif
  }

  // atomic emulation loop
  bind(retry);

  switch (instruction_type) {
    case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
    case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
    case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
    default: ShouldNotReachHere();
  }

  if (instruction_type != size) {
    srw(dest_current_value, val32, shift_amount);
  }

  if (is_add) { add(modval, dest_current_value, exchange_value); }

  if (instruction_type != size) {
    // Transform exchange value such that the replacement can be done by one xor instruction.
    xorr(modval, dest_current_value, is_add ? modval : exchange_value);
    clrldi(modval, modval, (size == 1) ? 56 : 48);
    slw(modval, modval, shift_amount);
    xorr(modval, val32, modval);
  }

  switch (instruction_type) {
    case 4: stwcx_(modval, addr_base); break;
    case 2: sthcx_(modval, addr_base); break;
    case 1: stbcx_(modval, addr_base); break;
    default: ShouldNotReachHere();
  }

  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
  } else {
    bne(                  CCR0, retry); // StXcx_ sets CCR0.
  }

  // l?arx zero-extends, but Java wants byte/short values sign-extended.
  if (size == 1) {
    extsb(dest_current_value, dest_current_value);
  } else if (size == 2) {
    extsh(dest_current_value, dest_current_value);
  }
}
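// Worked example of the emulation above (little-endian, size == 1, values
// invented for this comment): for a byte at addr_base with (addr_base & 3) == 1,
// shift_amount becomes 8 and addr_base is aligned down to the word. lwarx then
// reads the whole word, srw extracts the byte, and the xor/clrldi/slw/xor
// chain builds a word that differs from val32 only in that byte, so stwcx_
// leaves the three neighboring bytes untouched.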
// Temps, addr_base and exchange_value are killed if size < 4 and processor does not support respective instructions.
// Only signed types are supported with size < 4.
void MacroAssembler::cmpxchg_loop_body(ConditionRegister flag, Register dest_current_value,
                                       Register compare_value, Register exchange_value,
                                       Register addr_base, Register tmp1, Register tmp2,
                                       Label &retry, Label &failed, bool cmpxchgx_hint, int size) {
  // Sub-word instructions are available since Power 8.
  // For older processors, instruction_type != size holds, and we
  // emulate the sub-word instructions by constructing a 4-byte value
  // that leaves the other bytes unchanged.
  const int instruction_type = VM_Version::has_lqarx() ? size : 4;

  Register shift_amount = noreg,
           val32        = dest_current_value,
           modval       = exchange_value;

  if (instruction_type != size) {
    assert_different_registers(tmp1, tmp2, dest_current_value, compare_value, exchange_value, addr_base);
    shift_amount = tmp1;
    val32        = tmp2;
    modval       = tmp2;
    // Need some preparation: Compute shift amount, align address. Note: shorts must be 2 byte aligned.
#ifdef VM_LITTLE_ENDIAN
    rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
    clrrdi(addr_base, addr_base, 2);
#else
    xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
    clrrdi(addr_base, addr_base, 2);
    rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
#endif
    // Transform exchange value such that the replacement can be done by one xor instruction.
    xorr(exchange_value, compare_value, exchange_value);
    clrldi(exchange_value, exchange_value, (size == 1) ? 56 : 48);
    slw(exchange_value, exchange_value, shift_amount);
  }

  // atomic emulation loop
  bind(retry);

  switch (instruction_type) {
    case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
    case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
    case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
    default: ShouldNotReachHere();
  }

  if (instruction_type != size) {
    srw(dest_current_value, val32, shift_amount);
  }
  if (size == 1) {
    extsb(dest_current_value, dest_current_value);
  } else if (size == 2) {
    extsh(dest_current_value, dest_current_value);
  }

  cmpw(flag, dest_current_value, compare_value);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(flag, failed);
  } else {
    bne(                  flag, failed);
  }
  // branch to done  => (flag == ne), (dest_current_value != compare_value)
  // fall through    => (flag == eq), (dest_current_value == compare_value)

  if (instruction_type != size) {
    xorr(modval, val32, exchange_value);
  }

  switch (instruction_type) {
    case 4: stwcx_(modval, addr_base); break;
    case 2: sthcx_(modval, addr_base); break;
    case 1: stbcx_(modval, addr_base); break;
    default: ShouldNotReachHere();
  }
}
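// Note on the `semantics' bits used by cmpxchg_generic() and cmpxchgd() below
// (as implemented in this file): MemBarRel emits a release barrier before the
// update, MemBarFenceAfter a full fence afterwards, and MemBarAcq an isync;
// the after-barriers sit before the failure label, so they only execute on
// the success path.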
1631     switch (size) {
1632       case 1: lbz(dest_current_value, 0, addr_base); extsb(dest_current_value, dest_current_value); break;
1633       case 2: lha(dest_current_value, 0, addr_base); break;
1634       case 4: lwz(dest_current_value, 0, addr_base); break;
1635       default: ShouldNotReachHere();
1636     }
1637     cmpw(flag, dest_current_value, compare_value);
1638     bne(flag, failed);
1639   }
1640
1641   // release/fence semantics
1642   if (semantics & MemBarRel) {
1643     release();
1644   }
1645
1646   cmpxchg_loop_body(flag, dest_current_value, compare_value, exchange_value, addr_base, tmp1, tmp2,
1647                     retry, failed, cmpxchgx_hint, size);
1648   if (!weak || use_result_reg) {
1649     if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1650       bne_predict_not_taken(CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
1651     } else {
1652       bne(                  CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
1653     }
1654   }
1655   // fall through => (flag == eq), (dest_current_value == compare_value), (swapped)
1656
1657   // Result in register (must do this at the end because int_flag_success can be the
1658   // same register as one above).
1659   if (use_result_reg) {
1660     li(int_flag_success, 1);
1661   }
1662
1663   if (semantics & MemBarFenceAfter) {
1664     fence();
1665   } else if (semantics & MemBarAcq) {
1666     isync();
1667   }
1668
1669   if (use_result_reg && !preset_result_reg) {
1670     b(done);
1671   }
1672
1673   bind(failed);
1674   if (use_result_reg && !preset_result_reg) {
1675     li(int_flag_success, 0);
1676   }
1677
1678   bind(done);
1679   // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1680   // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1681 }
1682
1683 // Performs atomic compare exchange:
1684 //   if (compare_value == *addr_base)
1685 //     *addr_base = exchange_value
1686 //     int_flag_success = 1;
1687 //   else
1688 //     int_flag_success = 0;
1689 //
1690 // ConditionRegister flag      = cmp(compare_value, *addr_base)
1691 // Register dest_current_value = *addr_base
1692 // Register compare_value      Used to compare with value in memory
1693 // Register exchange_value     Written to memory if compare_value == *addr_base
1694 // Register addr_base          The memory location to compareXChange
1695 // Register int_flag_success   Set to 1 if exchange_value was written to *addr_base
1696 //
1697 // To avoid the costly compare exchange, the value is tested beforehand.
1698 // Several special cases exist to avoid generating unnecessary code.
1699 //
1700 void MacroAssembler::cmpxchgd(ConditionRegister flag,
1701                               Register dest_current_value, RegisterOrConstant compare_value, Register exchange_value,
1702                               Register addr_base, int semantics, bool cmpxchgx_hint,
1703                               Register int_flag_success, Label* failed_ext, bool contention_hint, bool weak) {
1704   Label retry;
1705   Label failed_int;
1706   Label& failed = (failed_ext != NULL) ? *failed_ext : failed_int;
1707   Label done;
1708
1709   // Save one branch if result is returned via register and result register is different from the other ones.
1710 bool use_result_reg = (int_flag_success!=noreg); 1711 bool preset_result_reg = (int_flag_success!=dest_current_value && int_flag_success!=compare_value.register_or_noreg() && 1712 int_flag_success!=exchange_value && int_flag_success!=addr_base); 1713 assert(!weak || flag == CCR0, "weak only supported with CCR0"); 1714 assert(int_flag_success == noreg || failed_ext == NULL, "cannot have both"); 1715 1716 if (use_result_reg && preset_result_reg) { 1717 li(int_flag_success, 0); // preset (assume cas failed) 1718 } 1719 1720 // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM). 1721 if (contention_hint) { // Don't try to reserve if cmp fails. 1722 ld(dest_current_value, 0, addr_base); 1723 cmpd(flag, compare_value, dest_current_value); 1724 bne(flag, failed); 1725 } 1726 1727 // release/fence semantics 1728 if (semantics & MemBarRel) { 1729 release(); 1730 } 1731 1732 // atomic emulation loop 1733 bind(retry); 1734 1735 ldarx(dest_current_value, addr_base, cmpxchgx_hint); 1736 cmpd(flag, compare_value, dest_current_value); 1737 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1738 bne_predict_not_taken(flag, failed); 1739 } else { 1740 bne( flag, failed); 1741 } 1742 1743 stdcx_(exchange_value, addr_base); 1744 if (!weak || use_result_reg || failed_ext) { 1745 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1746 bne_predict_not_taken(CCR0, weak ? failed : retry); // stXcx_ sets CCR0 1747 } else { 1748 bne( CCR0, weak ? failed : retry); // stXcx_ sets CCR0 1749 } 1750 } 1751 1752 // result in register (must do this at the end because int_flag_success can be the same register as one above) 1753 if (use_result_reg) { 1754 li(int_flag_success, 1); 1755 } 1756 1757 if (semantics & MemBarFenceAfter) { 1758 fence(); 1759 } else if (semantics & MemBarAcq) { 1760 isync(); 1761 } 1762 1763 if (use_result_reg && !preset_result_reg) { 1764 b(done); 1765 } 1766 1767 bind(failed_int); 1768 if (use_result_reg && !preset_result_reg) { 1769 li(int_flag_success, 0); 1770 } 1771 1772 bind(done); 1773 // (flag == ne) => (dest_current_value != compare_value), (!swapped) 1774 // (flag == eq) => (dest_current_value == compare_value), ( swapped) 1775 } 1776 1777 // Look up the method for a megamorphic invokeinterface call. 1778 // The target method is determined by <intf_klass, itable_index>. 1779 // The receiver klass is in recv_klass. 1780 // On success, the result will be in method_result, and execution falls through. 1781 // On failure, execution transfers to the given label. 1782 void MacroAssembler::lookup_interface_method(Register recv_klass, 1783 Register intf_klass, 1784 RegisterOrConstant itable_index, 1785 Register method_result, 1786 Register scan_temp, 1787 Register temp2, 1788 Label& L_no_such_interface, 1789 bool return_method) { 1790 assert_different_registers(recv_klass, intf_klass, method_result, scan_temp); 1791 1792 // Compute start of first itableOffsetEntry (which is at the end of the vtable). 1793 int vtable_base = in_bytes(Klass::vtable_start_offset()); 1794 int itentry_off = itableMethodEntry::method_offset_in_bytes(); 1795 int logMEsize = exact_log2(itableMethodEntry::size() * wordSize); 1796 int scan_step = itableOffsetEntry::size() * wordSize; 1797 int log_vte_size= exact_log2(vtableEntry::size_in_bytes()); 1798 1799 lwz(scan_temp, in_bytes(Klass::vtable_length_offset()), recv_klass); 1800 // %%% We should store the aligned, prescaled offset in the klassoop. 1801 // Then the next several instructions would fold away. 
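// In effect, the next three instructions compute (illustrative sketch):
//   scan_temp = recv_klass + vtable_start_offset + (vtable_length << log_vte_size)
// i.e. the address of the first itableOffsetEntry, which is laid out
// immediately after the embedded vtable.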
1802 1803 sldi(scan_temp, scan_temp, log_vte_size); 1804 addi(scan_temp, scan_temp, vtable_base); 1805 add(scan_temp, recv_klass, scan_temp); 1806 1807 // Adjust recv_klass by scaled itable_index, so we can free itable_index. 1808 if (return_method) { 1809 if (itable_index.is_register()) { 1810 Register itable_offset = itable_index.as_register(); 1811 sldi(method_result, itable_offset, logMEsize); 1812 if (itentry_off) { addi(method_result, method_result, itentry_off); } 1813 add(method_result, method_result, recv_klass); 1814 } else { 1815 long itable_offset = (long)itable_index.as_constant(); 1816 // static address, no relocation 1817 add_const_optimized(method_result, recv_klass, (itable_offset << logMEsize) + itentry_off, temp2); 1818 } 1819 } 1820 1821 // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) { 1822 // if (scan->interface() == intf) { 1823 // result = (klass + scan->offset() + itable_index); 1824 // } 1825 // } 1826 Label search, found_method; 1827 1828 for (int peel = 1; peel >= 0; peel--) { 1829 // %%%% Could load both offset and interface in one ldx, if they were 1830 // in the opposite order. This would save a load. 1831 ld(temp2, itableOffsetEntry::interface_offset_in_bytes(), scan_temp); 1832 1833 // Check that this entry is non-null. A null entry means that 1834 // the receiver class doesn't implement the interface, and wasn't the 1835 // same as when the caller was compiled. 1836 cmpd(CCR0, temp2, intf_klass); 1837 1838 if (peel) { 1839 beq(CCR0, found_method); 1840 } else { 1841 bne(CCR0, search); 1842 // (invert the test to fall through to found_method...) 1843 } 1844 1845 if (!peel) break; 1846 1847 bind(search); 1848 1849 cmpdi(CCR0, temp2, 0); 1850 beq(CCR0, L_no_such_interface); 1851 addi(scan_temp, scan_temp, scan_step); 1852 } 1853 1854 bind(found_method); 1855 1856 // Got a hit. 
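// On a hit, scan_temp points at the matching itableOffsetEntry; its offset
// field locates this interface's method block relative to recv_klass, and
// method_result already holds recv_klass + scaled itable_index, so one lwz
// plus one ldx below suffice to fetch the Method*.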
1857 if (return_method) { 1858 int ito_offset = itableOffsetEntry::offset_offset_in_bytes(); 1859 lwz(scan_temp, ito_offset, scan_temp); 1860 ldx(method_result, scan_temp, method_result); 1861 } 1862 } 1863 1864 // virtual method calling 1865 void MacroAssembler::lookup_virtual_method(Register recv_klass, 1866 RegisterOrConstant vtable_index, 1867 Register method_result) { 1868 1869 assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg()); 1870 1871 const int base = in_bytes(Klass::vtable_start_offset()); 1872 assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below"); 1873 1874 if (vtable_index.is_register()) { 1875 sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord); 1876 add(recv_klass, vtable_index.as_register(), recv_klass); 1877 } else { 1878 addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord); 1879 } 1880 ld(R19_method, base + vtableEntry::method_offset_in_bytes(), recv_klass); 1881 } 1882 1883 /////////////////////////////////////////// subtype checking //////////////////////////////////////////// 1884 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass, 1885 Register super_klass, 1886 Register temp1_reg, 1887 Register temp2_reg, 1888 Label* L_success, 1889 Label* L_failure, 1890 Label* L_slow_path, 1891 RegisterOrConstant super_check_offset) { 1892 1893 const Register check_cache_offset = temp1_reg; 1894 const Register cached_super = temp2_reg; 1895 1896 assert_different_registers(sub_klass, super_klass, check_cache_offset, cached_super); 1897 1898 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1899 int sc_offset = in_bytes(Klass::secondary_super_cache_offset()); 1900 1901 bool must_load_sco = (super_check_offset.constant_or_zero() == -1); 1902 bool need_slow_path = (must_load_sco || super_check_offset.constant_or_zero() == sco_offset); 1903 1904 Label L_fallthrough; 1905 int label_nulls = 0; 1906 if (L_success == NULL) { L_success = &L_fallthrough; label_nulls++; } 1907 if (L_failure == NULL) { L_failure = &L_fallthrough; label_nulls++; } 1908 if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; } 1909 assert(label_nulls <= 1 || 1910 (L_slow_path == &L_fallthrough && label_nulls <= 2 && !need_slow_path), 1911 "at most one NULL in the batch, usually"); 1912 1913 // If the pointers are equal, we are done (e.g., String[] elements). 1914 // This self-check enables sharing of secondary supertype arrays among 1915 // non-primary types such as array-of-interface. Otherwise, each such 1916 // type would need its own customized SSA. 1917 // We move this check to the front of the fast path because many 1918 // type checks are in fact trivially successful in this manner, 1919 // so we get a nicely predicted branch right at the start of the check. 1920 cmpd(CCR0, sub_klass, super_klass); 1921 beq(CCR0, *L_success); 1922 1923 // Check the supertype display: 1924 if (must_load_sco) { 1925 // The super check offset is always positive... 1926 lwz(check_cache_offset, sco_offset, super_klass); 1927 super_check_offset = RegisterOrConstant(check_cache_offset); 1928 // super_check_offset is register. 1929 assert_different_registers(sub_klass, super_klass, cached_super, super_check_offset.as_register()); 1930 } 1931 // The loaded value is the offset from KlassOopDesc. 1932 1933 ld(cached_super, super_check_offset, sub_klass); 1934 cmpd(CCR0, cached_super, super_klass); 1935 1936 // This check has worked decisively for primary supers. 
1937 // Secondary supers are sought in the super_cache ('super_cache_addr'). 1938 // (Secondary supers are interfaces and very deeply nested subtypes.) 1939 // This works in the same check above because of a tricky aliasing 1940 // between the super_cache and the primary super display elements. 1941 // (The 'super_check_addr' can address either, as the case requires.) 1942 // Note that the cache is updated below if it does not help us find 1943 // what we need immediately. 1944 // So if it was a primary super, we can just fail immediately. 1945 // Otherwise, it's the slow path for us (no success at this point). 1946 1947 #define FINAL_JUMP(label) if (&(label) != &L_fallthrough) { b(label); } 1948 1949 if (super_check_offset.is_register()) { 1950 beq(CCR0, *L_success); 1951 cmpwi(CCR0, super_check_offset.as_register(), sc_offset); 1952 if (L_failure == &L_fallthrough) { 1953 beq(CCR0, *L_slow_path); 1954 } else { 1955 bne(CCR0, *L_failure); 1956 FINAL_JUMP(*L_slow_path); 1957 } 1958 } else { 1959 if (super_check_offset.as_constant() == sc_offset) { 1960 // Need a slow path; fast failure is impossible. 1961 if (L_slow_path == &L_fallthrough) { 1962 beq(CCR0, *L_success); 1963 } else { 1964 bne(CCR0, *L_slow_path); 1965 FINAL_JUMP(*L_success); 1966 } 1967 } else { 1968 // No slow path; it's a fast decision. 1969 if (L_failure == &L_fallthrough) { 1970 beq(CCR0, *L_success); 1971 } else { 1972 bne(CCR0, *L_failure); 1973 FINAL_JUMP(*L_success); 1974 } 1975 } 1976 } 1977 1978 bind(L_fallthrough); 1979 #undef FINAL_JUMP 1980 } 1981 1982 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass, 1983 Register super_klass, 1984 Register temp1_reg, 1985 Register temp2_reg, 1986 Label* L_success, 1987 Register result_reg) { 1988 const Register array_ptr = temp1_reg; // current value from cache array 1989 const Register temp = temp2_reg; 1990 1991 assert_different_registers(sub_klass, super_klass, array_ptr, temp); 1992 1993 int source_offset = in_bytes(Klass::secondary_supers_offset()); 1994 int target_offset = in_bytes(Klass::secondary_super_cache_offset()); 1995 1996 int length_offset = Array<Klass*>::length_offset_in_bytes(); 1997 int base_offset = Array<Klass*>::base_offset_in_bytes(); 1998 1999 Label hit, loop, failure, fallthru; 2000 2001 ld(array_ptr, source_offset, sub_klass); 2002 2003 // TODO: PPC port: assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated."); 2004 lwz(temp, length_offset, array_ptr); 2005 cmpwi(CCR0, temp, 0); 2006 beq(CCR0, result_reg!=noreg ? failure : fallthru); // length 0 2007 2008 mtctr(temp); // load ctr 2009 2010 bind(loop); 2011 // Oops in table are NO MORE compressed. 
2012 ld(temp, base_offset, array_ptr); 2013 cmpd(CCR0, temp, super_klass); 2014 beq(CCR0, hit); 2015 addi(array_ptr, array_ptr, BytesPerWord); 2016 bdnz(loop); 2017 2018 bind(failure); 2019 if (result_reg!=noreg) li(result_reg, 1); // load non-zero result (indicates a miss) 2020 b(fallthru); 2021 2022 bind(hit); 2023 std(super_klass, target_offset, sub_klass); // save result to cache 2024 if (result_reg != noreg) { li(result_reg, 0); } // load zero result (indicates a hit) 2025 if (L_success != NULL) { b(*L_success); } 2026 else if (result_reg == noreg) { blr(); } // return with CR0.eq if neither label nor result reg provided 2027 2028 bind(fallthru); 2029 } 2030 2031 // Try fast path, then go to slow one if not successful 2032 void MacroAssembler::check_klass_subtype(Register sub_klass, 2033 Register super_klass, 2034 Register temp1_reg, 2035 Register temp2_reg, 2036 Label& L_success) { 2037 Label L_failure; 2038 check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success, &L_failure); 2039 check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success); 2040 bind(L_failure); // Fallthru if not successful. 2041 } 2042 2043 void MacroAssembler::check_method_handle_type(Register mtype_reg, Register mh_reg, 2044 Register temp_reg, 2045 Label& wrong_method_type) { 2046 assert_different_registers(mtype_reg, mh_reg, temp_reg); 2047 // Compare method type against that of the receiver. 2048 load_heap_oop(temp_reg, delayed_value(java_lang_invoke_MethodHandle::type_offset_in_bytes, temp_reg), mh_reg, 2049 noreg, noreg, false, IS_NOT_NULL); 2050 cmpd(CCR0, temp_reg, mtype_reg); 2051 bne(CCR0, wrong_method_type); 2052 } 2053 2054 RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot, 2055 Register temp_reg, 2056 int extra_slot_offset) { 2057 // cf. TemplateTable::prepare_invoke(), if (load_receiver). 2058 int stackElementSize = Interpreter::stackElementSize; 2059 int offset = extra_slot_offset * stackElementSize; 2060 if (arg_slot.is_constant()) { 2061 offset += arg_slot.as_constant() * stackElementSize; 2062 return offset; 2063 } else { 2064 assert(temp_reg != noreg, "must specify"); 2065 sldi(temp_reg, arg_slot.as_register(), exact_log2(stackElementSize)); 2066 if (offset != 0) 2067 addi(temp_reg, temp_reg, offset); 2068 return temp_reg; 2069 } 2070 } 2071 2072 // Supports temp2_reg = R0. 2073 void MacroAssembler::biased_locking_enter(ConditionRegister cr_reg, Register obj_reg, 2074 Register mark_reg, Register temp_reg, 2075 Register temp2_reg, Label& done, Label* slow_case) { 2076 assert(UseBiasedLocking, "why call this otherwise?"); 2077 2078 #ifdef ASSERT 2079 assert_different_registers(obj_reg, mark_reg, temp_reg, temp2_reg); 2080 #endif 2081 2082 Label cas_label; 2083 2084 // Branch to done if fast path fails and no slow_case provided. 2085 Label *slow_case_int = (slow_case != NULL) ? 
slow_case : &done; 2086 2087 // Biased locking 2088 // See whether the lock is currently biased toward our thread and 2089 // whether the epoch is still valid 2090 // Note that the runtime guarantees sufficient alignment of JavaThread 2091 // pointers to allow age to be placed into low bits 2092 assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, 2093 "biased locking makes assumptions about bit layout"); 2094 2095 if (PrintBiasedLockingStatistics) { 2096 load_const(temp2_reg, (address) BiasedLocking::total_entry_count_addr(), temp_reg); 2097 lwzx(temp_reg, temp2_reg); 2098 addi(temp_reg, temp_reg, 1); 2099 stwx(temp_reg, temp2_reg); 2100 } 2101 2102 andi(temp_reg, mark_reg, markOopDesc::biased_lock_mask_in_place); 2103 cmpwi(cr_reg, temp_reg, markOopDesc::biased_lock_pattern); 2104 bne(cr_reg, cas_label); 2105 2106 load_klass(temp_reg, obj_reg); 2107 2108 load_const_optimized(temp2_reg, ~((int) markOopDesc::age_mask_in_place)); 2109 ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg); 2110 orr(temp_reg, R16_thread, temp_reg); 2111 xorr(temp_reg, mark_reg, temp_reg); 2112 andr(temp_reg, temp_reg, temp2_reg); 2113 cmpdi(cr_reg, temp_reg, 0); 2114 if (PrintBiasedLockingStatistics) { 2115 Label l; 2116 bne(cr_reg, l); 2117 load_const(temp2_reg, (address) BiasedLocking::biased_lock_entry_count_addr()); 2118 lwzx(mark_reg, temp2_reg); 2119 addi(mark_reg, mark_reg, 1); 2120 stwx(mark_reg, temp2_reg); 2121 // restore mark_reg 2122 ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg); 2123 bind(l); 2124 } 2125 beq(cr_reg, done); 2126 2127 Label try_revoke_bias; 2128 Label try_rebias; 2129 2130 // At this point we know that the header has the bias pattern and 2131 // that we are not the bias owner in the current epoch. We need to 2132 // figure out more details about the state of the header in order to 2133 // know what operations can be legally performed on the object's 2134 // header. 2135 2136 // If the low three bits in the xor result aren't clear, that means 2137 // the prototype header is no longer biased and we have to revoke 2138 // the bias on this object. 2139 andi(temp2_reg, temp_reg, markOopDesc::biased_lock_mask_in_place); 2140 cmpwi(cr_reg, temp2_reg, 0); 2141 bne(cr_reg, try_revoke_bias); 2142 2143 // Biasing is still enabled for this data type. See whether the 2144 // epoch of the current bias is still valid, meaning that the epoch 2145 // bits of the mark word are equal to the epoch bits of the 2146 // prototype header. (Note that the prototype header's epoch bits 2147 // only change at a safepoint.) If not, attempt to rebias the object 2148 // toward the current thread. Note that we must be absolutely sure 2149 // that the current epoch is invalid in order to do this because 2150 // otherwise the manipulations it performs on the mark word are 2151 // illegal. 2152 2153 int shift_amount = 64 - markOopDesc::epoch_shift; 2154 // rotate epoch bits to right (little) end and set other bits to 0 2155 // [ big part | epoch | little part ] -> [ 0..0 | epoch ] 2156 rldicl_(temp2_reg, temp_reg, shift_amount, 64 - markOopDesc::epoch_bits); 2157 // branch if epoch bits are != 0, i.e. they differ, because the epoch has been incremented 2158 bne(CCR0, try_rebias); 2159 2160 // The epoch of the current bias is still valid but we know nothing 2161 // about the owner; it might be set or it might be clear. Try to 2162 // acquire the bias of the object using an atomic operation. 
If this
2163   // fails we will go into the runtime to revoke the object's bias.
2164   // Note that we first construct the presumed unbiased header so we
2165   // don't accidentally blow away another thread's valid bias.
2166   andi(mark_reg, mark_reg, (markOopDesc::biased_lock_mask_in_place |
2167                             markOopDesc::age_mask_in_place |
2168                             markOopDesc::epoch_mask_in_place));
2169   orr(temp_reg, R16_thread, mark_reg);
2170
2171   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2172
2173   // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg).
2174   cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
2175            /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
2176            /*where=*/obj_reg,
2177            MacroAssembler::MemBarAcq,
2178            MacroAssembler::cmpxchgx_hint_acquire_lock(),
2179            noreg, slow_case_int); // bail out if failed
2180
2181   // If the biasing toward our thread failed, this means that
2182   // another thread succeeded in biasing it toward itself and we
2183   // need to revoke that bias. The revocation will occur in the
2184   // interpreter runtime in the slow case.
2185   if (PrintBiasedLockingStatistics) {
2186     load_const(temp2_reg, (address) BiasedLocking::anonymously_biased_lock_entry_count_addr(), temp_reg);
2187     lwzx(temp_reg, temp2_reg);
2188     addi(temp_reg, temp_reg, 1);
2189     stwx(temp_reg, temp2_reg);
2190   }
2191   b(done);
2192
2193   bind(try_rebias);
2194   // At this point we know the epoch has expired, meaning that the
2195   // current "bias owner", if any, is actually invalid. Under these
2196   // circumstances _only_, we are allowed to use the current header's
2197   // value as the comparison value when doing the cas to acquire the
2198   // bias in the current epoch. In other words, we allow transfer of
2199   // the bias from one thread to another directly in this situation.
2200   load_klass(temp_reg, obj_reg);
2201   andi(temp2_reg, mark_reg, markOopDesc::age_mask_in_place);
2202   orr(temp2_reg, R16_thread, temp2_reg);
2203   ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
2204   orr(temp_reg, temp2_reg, temp_reg);
2205
2206   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2207
2208   cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
2209            /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
2210            /*where=*/obj_reg,
2211            MacroAssembler::MemBarAcq,
2212            MacroAssembler::cmpxchgx_hint_acquire_lock(),
2213            noreg, slow_case_int); // bail out if failed
2214
2215   // If the biasing toward our thread failed, this means that
2216   // another thread succeeded in biasing it toward itself and we
2217   // need to revoke that bias. The revocation will occur in the
2218   // interpreter runtime in the slow case.
2219   if (PrintBiasedLockingStatistics) {
2220     load_const(temp2_reg, (address) BiasedLocking::rebiased_lock_entry_count_addr(), temp_reg);
2221     lwzx(temp_reg, temp2_reg);
2222     addi(temp_reg, temp_reg, 1);
2223     stwx(temp_reg, temp2_reg);
2224   }
2225   b(done);
2226
2227   bind(try_revoke_bias);
2228   // The prototype mark in the klass doesn't have the bias bit set any
2229   // more, indicating that objects of this data type are not supposed
2230   // to be biased any more. We are going to try to reset the mark of
2231   // this object to the prototype value and fall through to the
2232   // CAS-based locking scheme. Note that if our CAS fails, it means
2233   // that another thread raced us for the privilege of revoking the
2234   // bias of this particular object, so it's okay to continue in the
2235   // normal locking code.
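// The code below constructs the unbiased exchange value (illustrative sketch):
//   new_mark = klass->prototype_header() | (mark & age_mask_in_place)
// so only the age bits of the old header survive the revocation CAS.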
2236 load_klass(temp_reg, obj_reg); 2237 ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg); 2238 andi(temp2_reg, mark_reg, markOopDesc::age_mask_in_place); 2239 orr(temp_reg, temp_reg, temp2_reg); 2240 2241 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 2242 2243 // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg). 2244 cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg, 2245 /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg, 2246 /*where=*/obj_reg, 2247 MacroAssembler::MemBarAcq, 2248 MacroAssembler::cmpxchgx_hint_acquire_lock()); 2249 2250 // reload markOop in mark_reg before continuing with lightweight locking 2251 ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg); 2252 2253 // Fall through to the normal CAS-based lock, because no matter what 2254 // the result of the above CAS, some thread must have succeeded in 2255 // removing the bias bit from the object's header. 2256 if (PrintBiasedLockingStatistics) { 2257 Label l; 2258 bne(cr_reg, l); 2259 load_const(temp2_reg, (address) BiasedLocking::revoked_lock_entry_count_addr(), temp_reg); 2260 lwzx(temp_reg, temp2_reg); 2261 addi(temp_reg, temp_reg, 1); 2262 stwx(temp_reg, temp2_reg); 2263 bind(l); 2264 } 2265 2266 bind(cas_label); 2267 } 2268 2269 void MacroAssembler::biased_locking_exit (ConditionRegister cr_reg, Register mark_addr, Register temp_reg, Label& done) { 2270 // Check for biased locking unlock case, which is a no-op 2271 // Note: we do not have to check the thread ID for two reasons. 2272 // First, the interpreter checks for IllegalMonitorStateException at 2273 // a higher level. Second, if the bias was revoked while we held the 2274 // lock, the object could not be rebiased toward another thread, so 2275 // the bias bit would be clear. 
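// A minimal sketch of the check below, assuming the usual markOop encoding of
// the low bits as [biased_lock:1 | lock:2]:
//   if ((mark & biased_lock_mask_in_place) == biased_lock_pattern /* 0b101 */)
//     goto done;  // biased: nothing to undo
// Any other pattern falls through to the regular unlock path.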
2276 2277 ld(temp_reg, 0, mark_addr); 2278 andi(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place); 2279 2280 cmpwi(cr_reg, temp_reg, markOopDesc::biased_lock_pattern); 2281 beq(cr_reg, done); 2282 } 2283 2284 // allocation (for C1) 2285 void MacroAssembler::eden_allocate( 2286 Register obj, // result: pointer to object after successful allocation 2287 Register var_size_in_bytes, // object size in bytes if unknown at compile time; invalid otherwise 2288 int con_size_in_bytes, // object size in bytes if known at compile time 2289 Register t1, // temp register 2290 Register t2, // temp register 2291 Label& slow_case // continuation point if fast allocation fails 2292 ) { 2293 b(slow_case); 2294 } 2295 2296 void MacroAssembler::tlab_allocate( 2297 Register obj, // result: pointer to object after successful allocation 2298 Register var_size_in_bytes, // object size in bytes if unknown at compile time; invalid otherwise 2299 int con_size_in_bytes, // object size in bytes if known at compile time 2300 Register t1, // temp register 2301 Label& slow_case // continuation point if fast allocation fails 2302 ) { 2303 // make sure arguments make sense 2304 assert_different_registers(obj, var_size_in_bytes, t1); 2305 assert(0 <= con_size_in_bytes && is_simm13(con_size_in_bytes), "illegal object size"); 2306 assert((con_size_in_bytes & MinObjAlignmentInBytesMask) == 0, "object size is not multiple of alignment"); 2307 2308 const Register new_top = t1; 2309 //verify_tlab(); not implemented 2310 2311 ld(obj, in_bytes(JavaThread::tlab_top_offset()), R16_thread); 2312 ld(R0, in_bytes(JavaThread::tlab_end_offset()), R16_thread); 2313 if (var_size_in_bytes == noreg) { 2314 addi(new_top, obj, con_size_in_bytes); 2315 } else { 2316 add(new_top, obj, var_size_in_bytes); 2317 } 2318 cmpld(CCR0, new_top, R0); 2319 bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_case); 2320 2321 #ifdef ASSERT 2322 // make sure new free pointer is properly aligned 2323 { 2324 Label L; 2325 andi_(R0, new_top, MinObjAlignmentInBytesMask); 2326 beq(CCR0, L); 2327 stop("updated TLAB free is not properly aligned", 0x934); 2328 bind(L); 2329 } 2330 #endif // ASSERT 2331 2332 // update the tlab top pointer 2333 std(new_top, in_bytes(JavaThread::tlab_top_offset()), R16_thread); 2334 //verify_tlab(); not implemented 2335 } 2336 void MacroAssembler::incr_allocated_bytes(RegisterOrConstant size_in_bytes, Register t1, Register t2) { 2337 unimplemented("incr_allocated_bytes"); 2338 } 2339 2340 address MacroAssembler::emit_trampoline_stub(int destination_toc_offset, 2341 int insts_call_instruction_offset, Register Rtoc) { 2342 // Start the stub. 2343 address stub = start_a_stub(64); 2344 if (stub == NULL) { return NULL; } // CodeCache full: bail out 2345 2346 // Create a trampoline stub relocation which relates this trampoline stub 2347 // with the call instruction at insts_call_instruction_offset in the 2348 // instructions code-section. 2349 relocate(trampoline_stub_Relocation::spec(code()->insts()->start() + insts_call_instruction_offset)); 2350 const int stub_start_offset = offset(); 2351 2352 // For java_to_interp stubs we use R11_scratch1 as scratch register 2353 // and in call trampoline stubs we use R12_scratch2. This way we 2354 // can distinguish them (see is_NativeCallTrampolineStub_at()). 
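// Shape of the trampoline emitted below (illustrative; the load may expand to
// an addis+ld pair for large TOC offsets):
//   [compute TOC into R12 if no Rtoc was passed]
//   ld    R12, <destination_toc_offset>(Rtoc)   // call target from constant pool
//   mtctr R12
//   bctr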
2355 Register reg_scratch = R12_scratch2; 2356 2357 // Now, create the trampoline stub's code: 2358 // - load the TOC 2359 // - load the call target from the constant pool 2360 // - call 2361 if (Rtoc == noreg) { 2362 calculate_address_from_global_toc(reg_scratch, method_toc()); 2363 Rtoc = reg_scratch; 2364 } 2365 2366 ld_largeoffset_unchecked(reg_scratch, destination_toc_offset, Rtoc, false); 2367 mtctr(reg_scratch); 2368 bctr(); 2369 2370 const address stub_start_addr = addr_at(stub_start_offset); 2371 2372 // Assert that the encoded destination_toc_offset can be identified and that it is correct. 2373 assert(destination_toc_offset == NativeCallTrampolineStub_at(stub_start_addr)->destination_toc_offset(), 2374 "encoded offset into the constant pool must match"); 2375 // Trampoline_stub_size should be good. 2376 assert((uint)(offset() - stub_start_offset) <= trampoline_stub_size, "should be good size"); 2377 assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline"); 2378 2379 // End the stub. 2380 end_a_stub(); 2381 return stub; 2382 } 2383 2384 // TM on PPC64. 2385 void MacroAssembler::atomic_inc_ptr(Register addr, Register result, int simm16) { 2386 Label retry; 2387 bind(retry); 2388 ldarx(result, addr, /*hint*/ false); 2389 addi(result, result, simm16); 2390 stdcx_(result, addr); 2391 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 2392 bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0 2393 } else { 2394 bne( CCR0, retry); // stXcx_ sets CCR0 2395 } 2396 } 2397 2398 void MacroAssembler::atomic_ori_int(Register addr, Register result, int uimm16) { 2399 Label retry; 2400 bind(retry); 2401 lwarx(result, addr, /*hint*/ false); 2402 ori(result, result, uimm16); 2403 stwcx_(result, addr); 2404 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 2405 bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0 2406 } else { 2407 bne( CCR0, retry); // stXcx_ sets CCR0 2408 } 2409 } 2410 2411 #if INCLUDE_RTM_OPT 2412 2413 // Update rtm_counters based on abort status 2414 // input: abort_status 2415 // rtm_counters_Reg (RTMLockingCounters*) 2416 void MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters_Reg) { 2417 // Mapping to keep PreciseRTMLockingStatistics similar to x86. 2418 // x86 ppc (! means inverted, ? means not the same) 2419 // 0 31 Set if abort caused by XABORT instruction. 2420 // 1 ! 7 If set, the transaction may succeed on a retry. This bit is always clear if bit 0 is set. 2421 // 2 13 Set if another logical processor conflicted with a memory address that was part of the transaction that aborted. 2422 // 3 10 Set if an internal buffer overflowed. 2423 // 4 ?12 Set if a debug breakpoint was hit. 2424 // 5 ?32 Set if an abort occurred during execution of a nested transaction. 2425 const int failure_bit[] = {tm_tabort, // Signal handler will set this too. 2426 tm_failure_persistent, // Inverted: transient. 
2427                              tm_non_trans_cf,
2428                              tm_trans_cf,
2429                              tm_footprint_of,
2430                              tm_failure_code,
2431                              tm_transaction_level};
2432
2433   const bool failure_logic_inv[] = {false, // tabort
2434                                     true,  // failure_persistent
2435                                     false, // non_trans_cf
2436                                     false, // trans_cf
2437                                     false, // footprint_of
2438                                     true,  // failure_code
2439                                     false}; // transaction_level
2440
2441   const int num_failure_bits = sizeof(failure_bit) / sizeof(int);
2442   const int num_counters = RTMLockingCounters::ABORT_STATUS_LIMIT;
2443
2444   bool bit2counter_map[][num_counters] =
2445   // counters:
2446   //   0       1        2         3         4       5
2447   // abort , persist, conflict, overflow, debug , nested   bits:
2448   {{ true  , false  , false   , false   , false , false },  // abort
2449    { false , true   , false   , false   , false , false },  // failure_persistent
2450    { false , false  , true    , false   , false , false },  // non_trans_cf
2451    { false , false  , true    , false   , false , false },  // trans_cf
2452    { false , false  , false   , true    , false , false },  // footprint_of
2453    { false , false  , false   , false   , true  , false },  // failure_code = 0xD4
2454    { false , false  , false   , false   , false , true  }}; // transaction_level > 1
2455   // ...
2456
2457   // Move abort_status value to R0 and use abort_status register as a
2458   // temporary register because R0 as third operand in ld/std is treated
2459   // as base address zero (value). Likewise, R0 as second operand in addi
2460   // is problematic because it amounts to li.
2461   const Register temp_Reg = abort_status;
2462   const Register abort_status_R0 = R0;
2463   mr(abort_status_R0, abort_status);
2464
2465   // Keep track of offsets added to rtm_counters_Reg to restore it back.
2466   int counters_offs = RTMLockingCounters::abort_count_offset();
2467   addi(rtm_counters_Reg, rtm_counters_Reg, counters_offs);
2468
2469   // Increment total abort counter.
2470   // atomic_inc_ptr(addr_Reg, temp_Reg); We don't increment atomically.
2471   ldx(temp_Reg, rtm_counters_Reg);
2472   addi(temp_Reg, temp_Reg, 1);
2473   stdx(temp_Reg, rtm_counters_Reg);
2474
2475   // Increment specific abort counters.
2476   if (PrintPreciseRTMLockingStatistics) {
2477
2478     int abort_offs;
2479
2480     abort_offs = RTMLockingCounters::abortX_count_offset() - counters_offs;
2481     addi(rtm_counters_Reg, rtm_counters_Reg, abort_offs);
2482
2483     // Keep track of offsets added to rtm_counters_Reg.
2484     counters_offs += abort_offs;
2485
2486     for (int nbit = 0; nbit < num_failure_bits; nbit++) {
2487       for (int ncounter = 0; ncounter < num_counters; ncounter++) {
2488         if (bit2counter_map[nbit][ncounter] == true) {
2489
2490           Label check_abort;
2491
2492           // Counter offset based on counter number (counter * 8 bytes).
2493           abort_offs = ncounter << 3;
2494
2495           if (failure_bit[nbit] == tm_transaction_level) {
2496             // Don't check outer transaction, TL = 1 (bit 63). Hence only
2497             // 11 bits in the TL field are checked to find out if failure
2498             // occurred in a nested transaction. This check also matches
2499             // the case when nesting_of = 1 (nesting overflow).
2500             rldicr_(temp_Reg, abort_status_R0, failure_bit[nbit], 10);
2501           } else if (failure_bit[nbit] == tm_failure_code) {
2502             // Check failure code for trap or illegal caught in TM.
2503             // Bits 0:7 are tested as bit 7 (persistent) is copied from
2504             // tabort or treclaim source operand.
2505             // On Linux: trap or illegal is TM_CAUSE_SIGNAL (0xD4).
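// The rldicl below rotates the 64-bit status left by 8, bringing bits 0:7
// (the failure-code byte) into the low byte and clearing the rest, so the
// following cmpdi can compare it directly against TM_CAUSE_SIGNAL (0xD4).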
2506 rldicl(temp_Reg, abort_status_R0, 8, 56); 2507 cmpdi(CCR0, temp_Reg, 0xD4); 2508 } else { 2509 rldicr_(temp_Reg, abort_status_R0, failure_bit[nbit], 0); 2510 } 2511 2512 if (failure_logic_inv[nbit] == true) { 2513 bne(CCR0, check_abort); 2514 } else { 2515 beq(CCR0, check_abort); 2516 } 2517 2518 // We don't increment atomically. 2519 ld(temp_Reg, abort_offs, rtm_counters_Reg); 2520 addi(temp_Reg, temp_Reg, 1); 2521 std(temp_Reg, abort_offs, rtm_counters_Reg); 2522 2523 bind(check_abort); 2524 } 2525 } 2526 } 2527 } 2528 2529 // Restore rtm_counters_Reg and abort_status. 2530 addi(rtm_counters_Reg, rtm_counters_Reg, -counters_offs); 2531 mr(abort_status, abort_status_R0); 2532 } 2533 2534 // Branch if (random & (count-1) != 0), count is 2^n 2535 // tmp and CR0 are killed 2536 void MacroAssembler::branch_on_random_using_tb(Register tmp, int count, Label& brLabel) { 2537 mftb(tmp); 2538 andi_(tmp, tmp, count-1); 2539 bne(CCR0, brLabel); 2540 } 2541 2542 // Perform abort ratio calculation, set no_rtm bit if high ratio. 2543 // input: rtm_counters_Reg (RTMLockingCounters* address) - KILLED 2544 void MacroAssembler::rtm_abort_ratio_calculation(Register rtm_counters_Reg, 2545 RTMLockingCounters* rtm_counters, 2546 Metadata* method_data) { 2547 Label L_done, L_check_always_rtm1, L_check_always_rtm2; 2548 2549 if (RTMLockingCalculationDelay > 0) { 2550 // Delay calculation. 2551 ld(rtm_counters_Reg, (RegisterOrConstant)(intptr_t)RTMLockingCounters::rtm_calculation_flag_addr()); 2552 cmpdi(CCR0, rtm_counters_Reg, 0); 2553 beq(CCR0, L_done); 2554 load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload 2555 } 2556 // Abort ratio calculation only if abort_count > RTMAbortThreshold. 2557 // Aborted transactions = abort_count * 100 2558 // All transactions = total_count * RTMTotalCountIncrRate 2559 // Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio) 2560 ld(R0, RTMLockingCounters::abort_count_offset(), rtm_counters_Reg); 2561 if (is_simm(RTMAbortThreshold, 16)) { // cmpdi can handle 16bit immediate only. 2562 cmpdi(CCR0, R0, RTMAbortThreshold); 2563 blt(CCR0, L_check_always_rtm2); // reload of rtm_counters_Reg not necessary 2564 } else { 2565 load_const_optimized(rtm_counters_Reg, RTMAbortThreshold); 2566 cmpd(CCR0, R0, rtm_counters_Reg); 2567 blt(CCR0, L_check_always_rtm1); // reload of rtm_counters_Reg required 2568 } 2569 mulli(R0, R0, 100); 2570 2571 const Register tmpReg = rtm_counters_Reg; 2572 ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg); 2573 mulli(tmpReg, tmpReg, RTMTotalCountIncrRate); // allowable range: int16 2574 mulli(tmpReg, tmpReg, RTMAbortRatio); // allowable range: int16 2575 cmpd(CCR0, R0, tmpReg); 2576 blt(CCR0, L_check_always_rtm1); // jump to reload 2577 if (method_data != NULL) { 2578 // Set rtm_state to "no rtm" in MDO. 2579 // Not using a metadata relocation. Method and Class Loader are kept alive anyway. 2580 // (See nmethod::metadata_do and CodeBuffer::finalize_oop_references.) 
2581 load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg); 2582 atomic_ori_int(R0, tmpReg, NoRTM); 2583 } 2584 b(L_done); 2585 2586 bind(L_check_always_rtm1); 2587 load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload 2588 bind(L_check_always_rtm2); 2589 ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg); 2590 int64_t thresholdValue = RTMLockingThreshold / RTMTotalCountIncrRate; 2591 if (is_simm(thresholdValue, 16)) { // cmpdi can handle 16bit immediate only. 2592 cmpdi(CCR0, tmpReg, thresholdValue); 2593 } else { 2594 load_const_optimized(R0, thresholdValue); 2595 cmpd(CCR0, tmpReg, R0); 2596 } 2597 blt(CCR0, L_done); 2598 if (method_data != NULL) { 2599 // Set rtm_state to "always rtm" in MDO. 2600 // Not using a metadata relocation. See above. 2601 load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg); 2602 atomic_ori_int(R0, tmpReg, UseRTM); 2603 } 2604 bind(L_done); 2605 } 2606 2607 // Update counters and perform abort ratio calculation. 2608 // input: abort_status_Reg 2609 void MacroAssembler::rtm_profiling(Register abort_status_Reg, Register temp_Reg, 2610 RTMLockingCounters* rtm_counters, 2611 Metadata* method_data, 2612 bool profile_rtm) { 2613 2614 assert(rtm_counters != NULL, "should not be NULL when profiling RTM"); 2615 // Update rtm counters based on state at abort. 2616 // Reads abort_status_Reg, updates flags. 2617 assert_different_registers(abort_status_Reg, temp_Reg); 2618 load_const_optimized(temp_Reg, (address)rtm_counters, R0); 2619 rtm_counters_update(abort_status_Reg, temp_Reg); 2620 if (profile_rtm) { 2621 assert(rtm_counters != NULL, "should not be NULL when profiling RTM"); 2622 rtm_abort_ratio_calculation(temp_Reg, rtm_counters, method_data); 2623 } 2624 } 2625 2626 // Retry on abort if abort's status indicates non-persistent failure. 2627 // inputs: retry_count_Reg 2628 // : abort_status_Reg 2629 // output: retry_count_Reg decremented by 1 2630 void MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, 2631 Label& retryLabel, Label* checkRetry) { 2632 Label doneRetry; 2633 rldicr_(R0, abort_status_Reg, tm_failure_persistent, 0); 2634 bne(CCR0, doneRetry); 2635 if (checkRetry) { bind(*checkRetry); } 2636 addic_(retry_count_Reg, retry_count_Reg, -1); 2637 blt(CCR0, doneRetry); 2638 b(retryLabel); 2639 bind(doneRetry); 2640 } 2641 2642 // Spin and retry if lock is busy. 2643 // inputs: owner_addr_Reg (monitor address) 2644 // : retry_count_Reg 2645 // output: retry_count_Reg decremented by 1 2646 // CTR is killed 2647 void MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register owner_addr_Reg, Label& retryLabel) { 2648 Label SpinLoop, doneRetry, doRetry; 2649 addic_(retry_count_Reg, retry_count_Reg, -1); 2650 blt(CCR0, doneRetry); 2651 2652 if (RTMSpinLoopCount > 1) { 2653 li(R0, RTMSpinLoopCount); 2654 mtctr(R0); 2655 } 2656 2657 // low thread priority 2658 smt_prio_low(); 2659 bind(SpinLoop); 2660 2661 if (RTMSpinLoopCount > 1) { 2662 bdz(doRetry); 2663 ld(R0, 0, owner_addr_Reg); 2664 cmpdi(CCR0, R0, 0); 2665 bne(CCR0, SpinLoop); 2666 } 2667 2668 bind(doRetry); 2669 2670 // restore thread priority to default in userspace 2671 #ifdef LINUX 2672 smt_prio_medium_low(); 2673 #else 2674 smt_prio_medium(); 2675 #endif 2676 2677 b(retryLabel); 2678 2679 bind(doneRetry); 2680 } 2681 2682 // Use RTM for normal stack locks. 
2683 // Input: objReg (object to lock) 2684 void MacroAssembler::rtm_stack_locking(ConditionRegister flag, 2685 Register obj, Register mark_word, Register tmp, 2686 Register retry_on_abort_count_Reg, 2687 RTMLockingCounters* stack_rtm_counters, 2688 Metadata* method_data, bool profile_rtm, 2689 Label& DONE_LABEL, Label& IsInflated) { 2690 assert(UseRTMForStackLocks, "why call this otherwise?"); 2691 assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking"); 2692 Label L_rtm_retry, L_decrement_retry, L_on_abort; 2693 2694 if (RTMRetryCount > 0) { 2695 load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort 2696 bind(L_rtm_retry); 2697 } 2698 andi_(R0, mark_word, markOopDesc::monitor_value); // inflated vs stack-locked|neutral|biased 2699 bne(CCR0, IsInflated); 2700 2701 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 2702 Label L_noincrement; 2703 if (RTMTotalCountIncrRate > 1) { 2704 branch_on_random_using_tb(tmp, RTMTotalCountIncrRate, L_noincrement); 2705 } 2706 assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM"); 2707 load_const_optimized(tmp, (address)stack_rtm_counters->total_count_addr(), R0); 2708 //atomic_inc_ptr(tmp, /*temp, will be reloaded*/mark_word); We don't increment atomically 2709 ldx(mark_word, tmp); 2710 addi(mark_word, mark_word, 1); 2711 stdx(mark_word, tmp); 2712 bind(L_noincrement); 2713 } 2714 tbegin_(); 2715 beq(CCR0, L_on_abort); 2716 ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // Reload in transaction, conflicts need to be tracked. 2717 andi(R0, mark_word, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits 2718 cmpwi(flag, R0, markOopDesc::unlocked_value); // bits = 001 unlocked 2719 beq(flag, DONE_LABEL); // all done if unlocked 2720 2721 if (UseRTMXendForLockBusy) { 2722 tend_(); 2723 b(L_decrement_retry); 2724 } else { 2725 tabort_(); 2726 } 2727 bind(L_on_abort); 2728 const Register abort_status_Reg = tmp; 2729 mftexasr(abort_status_Reg); 2730 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 2731 rtm_profiling(abort_status_Reg, /*temp*/mark_word, stack_rtm_counters, method_data, profile_rtm); 2732 } 2733 ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // reload 2734 if (RTMRetryCount > 0) { 2735 // Retry on lock abort if abort status is not permanent. 2736 rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry, &L_decrement_retry); 2737 } else { 2738 bind(L_decrement_retry); 2739 } 2740 } 2741 2742 // Use RTM for inflating locks 2743 // inputs: obj (object to lock) 2744 // mark_word (current header - KILLED) 2745 // boxReg (on-stack box address (displaced header location) - KILLED) 2746 void MacroAssembler::rtm_inflated_locking(ConditionRegister flag, 2747 Register obj, Register mark_word, Register boxReg, 2748 Register retry_on_busy_count_Reg, Register retry_on_abort_count_Reg, 2749 RTMLockingCounters* rtm_counters, 2750 Metadata* method_data, bool profile_rtm, 2751 Label& DONE_LABEL) { 2752 assert(UseRTMLocking, "why call this otherwise?"); 2753 Label L_rtm_retry, L_decrement_retry, L_on_abort; 2754 // Clean monitor_value bit to get valid pointer. 2755 int owner_offset = ObjectMonitor::owner_offset_in_bytes() - markOopDesc::monitor_value; 2756 2757 // Store non-null, using boxReg instead of (intptr_t)markOopDesc::unused_mark(). 
2758 std(boxReg, BasicLock::displaced_header_offset_in_bytes(), boxReg); 2759 const Register tmpReg = boxReg; 2760 const Register owner_addr_Reg = mark_word; 2761 addi(owner_addr_Reg, mark_word, owner_offset); 2762 2763 if (RTMRetryCount > 0) { 2764 load_const_optimized(retry_on_busy_count_Reg, RTMRetryCount); // Retry on lock busy. 2765 load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort. 2766 bind(L_rtm_retry); 2767 } 2768 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 2769 Label L_noincrement; 2770 if (RTMTotalCountIncrRate > 1) { 2771 branch_on_random_using_tb(R0, RTMTotalCountIncrRate, L_noincrement); 2772 } 2773 assert(rtm_counters != NULL, "should not be NULL when profiling RTM"); 2774 load_const(R0, (address)rtm_counters->total_count_addr(), tmpReg); 2775 //atomic_inc_ptr(R0, tmpReg); We don't increment atomically 2776 ldx(tmpReg, R0); 2777 addi(tmpReg, tmpReg, 1); 2778 stdx(tmpReg, R0); 2779 bind(L_noincrement); 2780 } 2781 tbegin_(); 2782 beq(CCR0, L_on_abort); 2783 // We don't reload mark word. Will only be reset at safepoint. 2784 ld(R0, 0, owner_addr_Reg); // Load in transaction, conflicts need to be tracked. 2785 cmpdi(flag, R0, 0); 2786 beq(flag, DONE_LABEL); 2787 2788 if (UseRTMXendForLockBusy) { 2789 tend_(); 2790 b(L_decrement_retry); 2791 } else { 2792 tabort_(); 2793 } 2794 bind(L_on_abort); 2795 const Register abort_status_Reg = tmpReg; 2796 mftexasr(abort_status_Reg); 2797 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 2798 rtm_profiling(abort_status_Reg, /*temp*/ owner_addr_Reg, rtm_counters, method_data, profile_rtm); 2799 // Restore owner_addr_Reg 2800 ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); 2801 #ifdef ASSERT 2802 andi_(R0, mark_word, markOopDesc::monitor_value); 2803 asm_assert_ne("must be inflated", 0xa754); // Deflating only allowed at safepoint. 2804 #endif 2805 addi(owner_addr_Reg, mark_word, owner_offset); 2806 } 2807 if (RTMRetryCount > 0) { 2808 // Retry on lock abort if abort status is not permanent. 2809 rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry); 2810 } 2811 2812 // Appears unlocked - try to swing _owner from null to non-null. 2813 cmpxchgd(flag, /*current val*/ R0, (intptr_t)0, /*new val*/ R16_thread, owner_addr_Reg, 2814 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq, 2815 MacroAssembler::cmpxchgx_hint_acquire_lock(), noreg, &L_decrement_retry, true); 2816 2817 if (RTMRetryCount > 0) { 2818 // success done else retry 2819 b(DONE_LABEL); 2820 bind(L_decrement_retry); 2821 // Spin and retry if lock is busy. 2822 rtm_retry_lock_on_busy(retry_on_busy_count_Reg, owner_addr_Reg, L_rtm_retry); 2823 } else { 2824 bind(L_decrement_retry); 2825 } 2826 } 2827 2828 #endif // INCLUDE_RTM_OPT 2829 2830 // "The box" is the space on the stack where we copy the object mark. 2831 void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register oop, Register box, 2832 Register temp, Register displaced_header, Register current_header, 2833 bool try_bias, 2834 RTMLockingCounters* rtm_counters, 2835 RTMLockingCounters* stack_rtm_counters, 2836 Metadata* method_data, 2837 bool use_rtm, bool profile_rtm) { 2838 assert_different_registers(oop, box, temp, displaced_header, current_header); 2839 assert(flag != CCR0, "bad condition register"); 2840 Label cont; 2841 Label object_has_monitor; 2842 Label cas_failed; 2843 2844 // Load markOop from object into displaced_header. 
2845 ld(displaced_header, oopDesc::mark_offset_in_bytes(), oop); 2846 2847 2848 // Always do locking in runtime. 2849 if (EmitSync & 0x01) { 2850 cmpdi(flag, oop, 0); // Oop can't be 0 here => always false. 2851 return; 2852 } 2853 2854 if (try_bias) { 2855 biased_locking_enter(flag, oop, displaced_header, temp, current_header, cont); 2856 } 2857 2858 #if INCLUDE_RTM_OPT 2859 if (UseRTMForStackLocks && use_rtm) { 2860 rtm_stack_locking(flag, oop, displaced_header, temp, /*temp*/ current_header, 2861 stack_rtm_counters, method_data, profile_rtm, 2862 cont, object_has_monitor); 2863 } 2864 #endif // INCLUDE_RTM_OPT 2865 2866 // Handle existing monitor. 2867 if ((EmitSync & 0x02) == 0) { 2868 // The object has an existing monitor iff (mark & monitor_value) != 0. 2869 andi_(temp, displaced_header, markOopDesc::monitor_value); 2870 bne(CCR0, object_has_monitor); 2871 } 2872 2873 // Set displaced_header to be (markOop of object | UNLOCK_VALUE). 2874 ori(displaced_header, displaced_header, markOopDesc::unlocked_value); 2875 2876 // Load Compare Value application register. 2877 2878 // Initialize the box. (Must happen before we update the object mark!) 2879 std(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box); 2880 2881 // Must fence, otherwise, preceding store(s) may float below cmpxchg. 2882 // Compare object markOop with mark and if equal exchange scratch1 with object markOop. 2883 cmpxchgd(/*flag=*/flag, 2884 /*current_value=*/current_header, 2885 /*compare_value=*/displaced_header, 2886 /*exchange_value=*/box, 2887 /*where=*/oop, 2888 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq, 2889 MacroAssembler::cmpxchgx_hint_acquire_lock(), 2890 noreg, 2891 &cas_failed, 2892 /*check without membar and ldarx first*/true); 2893 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 2894 2895 // If the compare-and-exchange succeeded, then we found an unlocked 2896 // object and we have now locked it. 2897 b(cont); 2898 2899 bind(cas_failed); 2900 // We did not see an unlocked object so try the fast recursive case. 2901 2902 // Check if the owner is self by comparing the value in the markOop of object 2903 // (current_header) with the stack pointer. 2904 sub(current_header, current_header, R1_SP); 2905 load_const_optimized(temp, ~(os::vm_page_size()-1) | markOopDesc::lock_mask_in_place); 2906 2907 and_(R0/*==0?*/, current_header, temp); 2908 // If condition is true we are cont and hence we can store 0 as the 2909 // displaced header in the box, which indicates that it is a recursive lock. 2910 mcrf(flag,CCR0); 2911 std(R0/*==0, perhaps*/, BasicLock::displaced_header_offset_in_bytes(), box); 2912 2913 // Handle existing monitor. 2914 if ((EmitSync & 0x02) == 0) { 2915 b(cont); 2916 2917 bind(object_has_monitor); 2918 // The object's monitor m is unlocked iff m->owner == NULL, 2919 // otherwise m->owner may contain a thread or a stack address. 2920 2921 #if INCLUDE_RTM_OPT 2922 // Use the same RTM locking code in 32- and 64-bit VM. 2923 if (use_rtm) { 2924 rtm_inflated_locking(flag, oop, displaced_header, box, temp, /*temp*/ current_header, 2925 rtm_counters, method_data, profile_rtm, cont); 2926 } else { 2927 #endif // INCLUDE_RTM_OPT 2928 2929 // Try to CAS m->owner from NULL to current thread. 
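// Note (illustrative): for an inflated lock the mark word is the
// ObjectMonitor* tagged with monitor_value in its low bits, so the single
// addi below both strips the tag and adds the owner-field offset, leaving
// temp = &monitor->_owner.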
addi(temp, displaced_header, ObjectMonitor::owner_offset_in_bytes()-markOopDesc::monitor_value);
2931   cmpxchgd(/*flag=*/flag,
2932            /*current_value=*/current_header,
2933            /*compare_value=*/(intptr_t)0,
2934            /*exchange_value=*/R16_thread,
2935            /*where=*/temp,
2936            MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2937            MacroAssembler::cmpxchgx_hint_acquire_lock());
2938
2939   // Store a non-null value into the box.
2940   std(box, BasicLock::displaced_header_offset_in_bytes(), box);
2941
2942 # ifdef ASSERT
2943   bne(flag, cont);
2944   // We have acquired the monitor, check some invariants.
2945   addi(/*monitor=*/temp, temp, -ObjectMonitor::owner_offset_in_bytes());
2946   // Invariant 1: _recursions should be 0.
2947   //assert(ObjectMonitor::recursions_size_in_bytes() == 8, "unexpected size");
2948   asm_assert_mem8_is_zero(ObjectMonitor::recursions_offset_in_bytes(), temp,
2949                           "monitor->_recursions should be 0", -1);
2950 # endif
2951
2952 #if INCLUDE_RTM_OPT
2953   } // use_rtm()
2954 #endif
2955   }
2956
2957   bind(cont);
2958   // flag == EQ indicates success
2959   // flag == NE indicates failure
2960 }
2961
2962 void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register oop, Register box,
2963                                                  Register temp, Register displaced_header, Register current_header,
2964                                                  bool try_bias, bool use_rtm) {
2965   assert_different_registers(oop, box, temp, displaced_header, current_header);
2966   assert(flag != CCR0, "bad condition register");
2967   Label cont;
2968   Label object_has_monitor;
2969
2970   // Always do locking in runtime.
2971   if (EmitSync & 0x01) {
2972     cmpdi(flag, oop, 0); // Oop can't be 0 here => always false.
2973     return;
2974   }
2975
2976   if (try_bias) {
2977     biased_locking_exit(flag, oop, current_header, cont);
2978   }
2979
2980 #if INCLUDE_RTM_OPT
2981   if (UseRTMForStackLocks && use_rtm) {
2982     assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
2983     Label L_regular_unlock;
2984     ld(current_header, oopDesc::mark_offset_in_bytes(), oop); // fetch markword
2985     andi(R0, current_header, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
2986     cmpwi(flag, R0, markOopDesc::unlocked_value); // bits = 001 unlocked
2987     bne(flag, L_regular_unlock); // else RegularLock
2988     tend_(); // otherwise end...
2989     b(cont); // ... and we're done
2990     bind(L_regular_unlock);
2991   }
2992 #endif
2993
2994   // Find the lock address and load the displaced header from the stack.
2995   ld(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2996
2997   // If the displaced header is 0, we have a recursive unlock.
2998   cmpdi(flag, displaced_header, 0);
2999   beq(flag, cont);
3000
3001   // Handle existing monitor.
3002   if ((EmitSync & 0x02) == 0) {
3003     // The object has an existing monitor iff (mark & monitor_value) != 0.
3004     RTM_OPT_ONLY( if (!(UseRTMForStackLocks && use_rtm)) ) // skip load if already done
3005     ld(current_header, oopDesc::mark_offset_in_bytes(), oop);
3006     andi_(R0, current_header, markOopDesc::monitor_value);
3007     bne(CCR0, object_has_monitor);
3008   }
3009
3010   // Check if it is still a lightweight lock; this is true if we see
3011   // the stack address of the basicLock in the markOop of the object.
3012   // Cmpxchg sets flag to cmpd(current_header, box).
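// Semantics of the cmpxchgd below (illustrative, release semantics):
//   if (obj->mark == box) { obj->mark = displaced_header; flag = EQ; } // header restored, unlocked
//   else                  { flag = NE; }                               // not our lightweight lock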
3013 cmpxchgd(/*flag=*/flag, 3014 /*current_value=*/current_header, 3015 /*compare_value=*/box, 3016 /*exchange_value=*/displaced_header, 3017 /*where=*/oop, 3018 MacroAssembler::MemBarRel, 3019 MacroAssembler::cmpxchgx_hint_release_lock(), 3020 noreg, 3021 &cont); 3022 3023 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 3024 3025 // Handle existing monitor. 3026 if ((EmitSync & 0x02) == 0) { 3027 b(cont); 3028 3029 bind(object_has_monitor); 3030 addi(current_header, current_header, -markOopDesc::monitor_value); // monitor 3031 ld(temp, ObjectMonitor::owner_offset_in_bytes(), current_header); 3032 3033 // It's inflated. 3034 #if INCLUDE_RTM_OPT 3035 if (use_rtm) { 3036 Label L_regular_inflated_unlock; 3037 // Clean monitor_value bit to get valid pointer 3038 cmpdi(flag, temp, 0); 3039 bne(flag, L_regular_inflated_unlock); 3040 tend_(); 3041 b(cont); 3042 bind(L_regular_inflated_unlock); 3043 } 3044 #endif 3045 3046 ld(displaced_header, ObjectMonitor::recursions_offset_in_bytes(), current_header); 3047 xorr(temp, R16_thread, temp); // Will be 0 if we are the owner. 3048 orr(temp, temp, displaced_header); // Will be 0 if there are 0 recursions. 3049 cmpdi(flag, temp, 0); 3050 bne(flag, cont); 3051 3052 ld(temp, ObjectMonitor::EntryList_offset_in_bytes(), current_header); 3053 ld(displaced_header, ObjectMonitor::cxq_offset_in_bytes(), current_header); 3054 orr(temp, temp, displaced_header); // Will be 0 if both are 0. 3055 cmpdi(flag, temp, 0); 3056 bne(flag, cont); 3057 release(); 3058 std(temp, ObjectMonitor::owner_offset_in_bytes(), current_header); 3059 } 3060 3061 bind(cont); 3062 // flag == EQ indicates success 3063 // flag == NE indicates failure 3064 } 3065 3066 // Write serialization page so VM thread can do a pseudo remote membar. 3067 // We use the current thread pointer to calculate a thread specific 3068 // offset to write to within the page. This minimizes bus traffic 3069 // due to cache line collision. 3070 void MacroAssembler::serialize_memory(Register thread, Register tmp1, Register tmp2) { 3071 srdi(tmp2, thread, os::get_serialize_page_shift_count()); 3072 3073 int mask = os::vm_page_size() - sizeof(int); 3074 if (Assembler::is_simm(mask, 16)) { 3075 andi(tmp2, tmp2, mask); 3076 } else { 3077 lis(tmp1, (int)((signed short) (mask >> 16))); 3078 ori(tmp1, tmp1, mask & 0x0000ffff); 3079 andr(tmp2, tmp2, tmp1); 3080 } 3081 3082 load_const(tmp1, (long) os::get_memory_serialize_page()); 3083 release(); 3084 stwx(R0, tmp1, tmp2); 3085 } 3086 3087 void MacroAssembler::safepoint_poll(Label& slow_path, Register temp_reg) { 3088 if (SafepointMechanism::uses_thread_local_poll()) { 3089 ld(temp_reg, in_bytes(Thread::polling_page_offset()), R16_thread); 3090 // Armed page has poll_bit set. 3091 andi_(temp_reg, temp_reg, SafepointMechanism::poll_bit()); 3092 } else { 3093 lwz(temp_reg, (RegisterOrConstant)(intptr_t)SafepointSynchronize::address_of_state()); 3094 cmpwi(CCR0, temp_reg, SafepointSynchronize::_not_synchronized); 3095 } 3096 bne(CCR0, slow_path); 3097 } 3098 3099 void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2, bool needs_frame) { 3100 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 3101 bs->resolve_jobject(this, value, tmp1, tmp2, needs_frame); 3102 } 3103 3104 // Values for last_Java_pc, and last_Java_sp must comply to the rules 3105 // in frame_ppc.hpp. 
3106 void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) { 3107 // Always set last_Java_pc and flags first because once last_Java_sp 3108 // is visible has_last_Java_frame is true and users will look at the 3109 // rest of the fields. (Note: flags should always be zero before we 3110 // get here so doesn't need to be set.) 3111 3112 // Verify that last_Java_pc was zeroed on return to Java 3113 asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread, 3114 "last_Java_pc not zeroed before leaving Java", 0x200); 3115 3116 // When returning from calling out from Java mode the frame anchor's 3117 // last_Java_pc will always be set to NULL. It is set here so that 3118 // if we are doing a call to native (not VM) that we capture the 3119 // known pc and don't have to rely on the native call having a 3120 // standard frame linkage where we can find the pc. 3121 if (last_Java_pc != noreg) 3122 std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread); 3123 3124 // Set last_Java_sp last. 3125 std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread); 3126 } 3127 3128 void MacroAssembler::reset_last_Java_frame(void) { 3129 asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()), 3130 R16_thread, "SP was not set, still zero", 0x202); 3131 3132 BLOCK_COMMENT("reset_last_Java_frame {"); 3133 li(R0, 0); 3134 3135 // _last_Java_sp = 0 3136 std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread); 3137 3138 // _last_Java_pc = 0 3139 std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread); 3140 BLOCK_COMMENT("} reset_last_Java_frame"); 3141 } 3142 3143 void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1) { 3144 assert_different_registers(sp, tmp1); 3145 3146 // sp points to a TOP_IJAVA_FRAME, retrieve frame's PC via 3147 // TOP_IJAVA_FRAME_ABI. 3148 // FIXME: assert that we really have a TOP_IJAVA_FRAME here! 3149 address entry = pc(); 3150 load_const_optimized(tmp1, entry); 3151 3152 set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1); 3153 } 3154 3155 void MacroAssembler::get_vm_result(Register oop_result) { 3156 // Read: 3157 // R16_thread 3158 // R16_thread->in_bytes(JavaThread::vm_result_offset()) 3159 // 3160 // Updated: 3161 // oop_result 3162 // R16_thread->in_bytes(JavaThread::vm_result_offset()) 3163 3164 verify_thread(); 3165 3166 ld(oop_result, in_bytes(JavaThread::vm_result_offset()), R16_thread); 3167 li(R0, 0); 3168 std(R0, in_bytes(JavaThread::vm_result_offset()), R16_thread); 3169 3170 verify_oop(oop_result); 3171 } 3172 3173 void MacroAssembler::get_vm_result_2(Register metadata_result) { 3174 // Read: 3175 // R16_thread 3176 // R16_thread->in_bytes(JavaThread::vm_result_2_offset()) 3177 // 3178 // Updated: 3179 // metadata_result 3180 // R16_thread->in_bytes(JavaThread::vm_result_2_offset()) 3181 3182 ld(metadata_result, in_bytes(JavaThread::vm_result_2_offset()), R16_thread); 3183 li(R0, 0); 3184 std(R0, in_bytes(JavaThread::vm_result_2_offset()), R16_thread); 3185 } 3186 3187 Register MacroAssembler::encode_klass_not_null(Register dst, Register src) { 3188 Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided. 3189 if (Universe::narrow_klass_base() != 0) { 3190 // Use dst as temp if it is free. 
3191 sub_const_optimized(dst, current, Universe::narrow_klass_base(), R0); 3192 current = dst; 3193 } 3194 if (Universe::narrow_klass_shift() != 0) { 3195 srdi(dst, current, Universe::narrow_klass_shift()); 3196 current = dst; 3197 } 3198 return current; 3199 } 3200 3201 void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) { 3202 if (UseCompressedClassPointers) { 3203 Register compressedKlass = encode_klass_not_null(ck, klass); 3204 stw(compressedKlass, oopDesc::klass_offset_in_bytes(), dst_oop); 3205 } else { 3206 std(klass, oopDesc::klass_offset_in_bytes(), dst_oop); 3207 } 3208 } 3209 3210 void MacroAssembler::store_klass_gap(Register dst_oop, Register val) { 3211 if (UseCompressedClassPointers) { 3212 if (val == noreg) { 3213 val = R0; 3214 li(val, 0); 3215 } 3216 stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop); // klass gap if compressed 3217 } 3218 } 3219 3220 int MacroAssembler::instr_size_for_decode_klass_not_null() { 3221 if (!UseCompressedClassPointers) return 0; 3222 int num_instrs = 1; // shift or move 3223 if (Universe::narrow_klass_base() != 0) num_instrs = 7; // shift + load const + add 3224 return num_instrs * BytesPerInstWord; 3225 } 3226 3227 void MacroAssembler::decode_klass_not_null(Register dst, Register src) { 3228 assert(dst != R0, "Dst reg may not be R0, as R0 is used here."); 3229 if (src == noreg) src = dst; 3230 Register shifted_src = src; 3231 if (Universe::narrow_klass_shift() != 0 || 3232 Universe::narrow_klass_base() == 0 && src != dst) { // Move required. 3233 shifted_src = dst; 3234 sldi(shifted_src, src, Universe::narrow_klass_shift()); 3235 } 3236 if (Universe::narrow_klass_base() != 0) { 3237 add_const_optimized(dst, shifted_src, Universe::narrow_klass_base(), R0); 3238 } 3239 } 3240 3241 void MacroAssembler::load_klass(Register dst, Register src) { 3242 if (UseCompressedClassPointers) { 3243 lwz(dst, oopDesc::klass_offset_in_bytes(), src); 3244 // Attention: no null check here! 3245 decode_klass_not_null(dst, dst); 3246 } else { 3247 ld(dst, oopDesc::klass_offset_in_bytes(), src); 3248 } 3249 } 3250 3251 // ((OopHandle)result).resolve(); 3252 void MacroAssembler::resolve_oop_handle(Register result) { 3253 // OopHandle::resolve is an indirection. 3254 ld(result, 0, result); 3255 } 3256 3257 void MacroAssembler::load_mirror_from_const_method(Register mirror, Register const_method) { 3258 ld(mirror, in_bytes(ConstMethod::constants_offset()), const_method); 3259 ld(mirror, ConstantPool::pool_holder_offset_in_bytes(), mirror); 3260 ld(mirror, in_bytes(Klass::java_mirror_offset()), mirror); 3261 resolve_oop_handle(mirror); 3262 } 3263 3264 // Clear Array 3265 // For very short arrays. tmp == R0 is allowed. 3266 void MacroAssembler::clear_memory_unrolled(Register base_ptr, int cnt_dwords, Register tmp, int offset) { 3267 if (cnt_dwords > 0) { li(tmp, 0); } 3268 for (int i = 0; i < cnt_dwords; ++i) { std(tmp, offset + i * 8, base_ptr); } 3269 } 3270 3271 // Version for constant short array length. Kills base_ptr. tmp == R0 is allowed. 
3272 void MacroAssembler::clear_memory_constlen(Register base_ptr, int cnt_dwords, Register tmp) { 3273 if (cnt_dwords < 8) { 3274 clear_memory_unrolled(base_ptr, cnt_dwords, tmp); 3275 return; 3276 } 3277 3278 Label loop; 3279 const long loopcnt = cnt_dwords >> 1, 3280 remainder = cnt_dwords & 1; 3281 3282 li(tmp, loopcnt); 3283 mtctr(tmp); 3284 li(tmp, 0); 3285 bind(loop); 3286 std(tmp, 0, base_ptr); 3287 std(tmp, 8, base_ptr); 3288 addi(base_ptr, base_ptr, 16); 3289 bdnz(loop); 3290 if (remainder) { std(tmp, 0, base_ptr); } 3291 } 3292 3293 // Kills both input registers. tmp == R0 is allowed. 3294 void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp, long const_cnt) { 3295 // Procedure for large arrays (uses data cache block zero instruction). 3296 Label startloop, fast, fastloop, small_rest, restloop, done; 3297 const int cl_size = VM_Version::L1_data_cache_line_size(), 3298 cl_dwords = cl_size >> 3, 3299 cl_dw_addr_bits = exact_log2(cl_dwords), 3300 dcbz_min = 1, // Min count of dcbz executions, needs to be >0. 3301 min_cnt = ((dcbz_min + 1) << cl_dw_addr_bits) - 1; 3302 3303 if (const_cnt >= 0) { 3304 // Constant case. 3305 if (const_cnt < min_cnt) { 3306 clear_memory_constlen(base_ptr, const_cnt, tmp); 3307 return; 3308 } 3309 load_const_optimized(cnt_dwords, const_cnt, tmp); 3310 } else { 3311 // cnt_dwords already loaded in register. Need to check size. 3312 cmpdi(CCR1, cnt_dwords, min_cnt); // Big enough? (ensure >= dcbz_min lines included). 3313 blt(CCR1, small_rest); 3314 } 3315 rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits); // Extract dword offset within first cache line. 3316 beq(CCR0, fast); // Already 128byte aligned. 3317 3318 subfic(tmp, tmp, cl_dwords); 3319 mtctr(tmp); // Set ctr to hit 128byte boundary (0<ctr<cl_dwords). 3320 subf(cnt_dwords, tmp, cnt_dwords); // rest. 3321 li(tmp, 0); 3322 3323 bind(startloop); // Clear at the beginning to reach 128byte boundary. 3324 std(tmp, 0, base_ptr); // Clear 8byte aligned block. 3325 addi(base_ptr, base_ptr, 8); 3326 bdnz(startloop); 3327 3328 bind(fast); // Clear 128byte blocks. 3329 srdi(tmp, cnt_dwords, cl_dw_addr_bits); // Loop count for 128byte loop (>0). 3330 andi(cnt_dwords, cnt_dwords, cl_dwords-1); // Rest in dwords. 3331 mtctr(tmp); // Load counter. 3332 3333 bind(fastloop); 3334 dcbz(base_ptr); // Clear 128byte aligned block. 3335 addi(base_ptr, base_ptr, cl_size); 3336 bdnz(fastloop); 3337 3338 bind(small_rest); 3339 cmpdi(CCR0, cnt_dwords, 0); // size 0? 3340 beq(CCR0, done); // rest == 0 3341 li(tmp, 0); 3342 mtctr(cnt_dwords); // Load counter. 3343 3344 bind(restloop); // Clear rest. 3345 std(tmp, 0, base_ptr); // Clear 8byte aligned block. 3346 addi(base_ptr, base_ptr, 8); 3347 bdnz(restloop); 3348 3349 bind(done); 3350 } 3351 3352 /////////////////////////////////////////// String intrinsics //////////////////////////////////////////// 3353 3354 #ifdef COMPILER2 3355 // Intrinsics for CompactStrings 3356 3357 // Compress char[] to byte[] by compressing 16 bytes at once. 
3358 void MacroAssembler::string_compress_16(Register src, Register dst, Register cnt, 3359 Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, 3360 Label& Lfailure) { 3361 3362 const Register tmp0 = R0; 3363 assert_different_registers(src, dst, cnt, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5); 3364 Label Lloop, Lslow; 3365 3366 // Check if cnt >= 8 (= 16 bytes) 3367 lis(tmp1, 0xFF); // tmp1 = 0x00FF00FF00FF00FF 3368 srwi_(tmp2, cnt, 3); 3369 beq(CCR0, Lslow); 3370 ori(tmp1, tmp1, 0xFF); 3371 rldimi(tmp1, tmp1, 32, 0); 3372 mtctr(tmp2); 3373 3374 // 2x unrolled loop 3375 bind(Lloop); 3376 ld(tmp2, 0, src); // _0_1_2_3 (Big Endian) 3377 ld(tmp4, 8, src); // _4_5_6_7 3378 3379 orr(tmp0, tmp2, tmp4); 3380 rldicl(tmp3, tmp2, 6*8, 64-24); // _____1_2 3381 rldimi(tmp2, tmp2, 2*8, 2*8); // _0_2_3_3 3382 rldicl(tmp5, tmp4, 6*8, 64-24); // _____5_6 3383 rldimi(tmp4, tmp4, 2*8, 2*8); // _4_6_7_7 3384 3385 andc_(tmp0, tmp0, tmp1); 3386 bne(CCR0, Lfailure); // Not latin1. 3387 addi(src, src, 16); 3388 3389 rlwimi(tmp3, tmp2, 0*8, 24, 31);// _____1_3 3390 srdi(tmp2, tmp2, 3*8); // ____0_2_ 3391 rlwimi(tmp5, tmp4, 0*8, 24, 31);// _____5_7 3392 srdi(tmp4, tmp4, 3*8); // ____4_6_ 3393 3394 orr(tmp2, tmp2, tmp3); // ____0123 3395 orr(tmp4, tmp4, tmp5); // ____4567 3396 3397 stw(tmp2, 0, dst); 3398 stw(tmp4, 4, dst); 3399 addi(dst, dst, 8); 3400 bdnz(Lloop); 3401 3402 bind(Lslow); // Fallback to slow version 3403 } 3404 3405 // Compress char[] to byte[]. cnt must be positive int. 3406 void MacroAssembler::string_compress(Register src, Register dst, Register cnt, Register tmp, Label& Lfailure) { 3407 Label Lloop; 3408 mtctr(cnt); 3409 3410 bind(Lloop); 3411 lhz(tmp, 0, src); 3412 cmplwi(CCR0, tmp, 0xff); 3413 bgt(CCR0, Lfailure); // Not latin1. 3414 addi(src, src, 2); 3415 stb(tmp, 0, dst); 3416 addi(dst, dst, 1); 3417 bdnz(Lloop); 3418 } 3419 3420 // Inflate byte[] to char[] by inflating 16 bytes at once. 3421 void MacroAssembler::string_inflate_16(Register src, Register dst, Register cnt, 3422 Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) { 3423 const Register tmp0 = R0; 3424 assert_different_registers(src, dst, cnt, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5); 3425 Label Lloop, Lslow; 3426 3427 // Check if cnt >= 8 3428 srwi_(tmp2, cnt, 3); 3429 beq(CCR0, Lslow); 3430 lis(tmp1, 0xFF); // tmp1 = 0x00FF00FF 3431 ori(tmp1, tmp1, 0xFF); 3432 mtctr(tmp2); 3433 3434 // 2x unrolled loop 3435 bind(Lloop); 3436 lwz(tmp2, 0, src); // ____0123 (Big Endian) 3437 lwz(tmp4, 4, src); // ____4567 3438 addi(src, src, 8); 3439 3440 rldicl(tmp3, tmp2, 7*8, 64-8); // _______2 3441 rlwimi(tmp2, tmp2, 3*8, 16, 23);// ____0113 3442 rldicl(tmp5, tmp4, 7*8, 64-8); // _______6 3443 rlwimi(tmp4, tmp4, 3*8, 16, 23);// ____4557 3444 3445 andc(tmp0, tmp2, tmp1); // ____0_1_ 3446 rlwimi(tmp2, tmp3, 2*8, 0, 23); // _____2_3 3447 andc(tmp3, tmp4, tmp1); // ____4_5_ 3448 rlwimi(tmp4, tmp5, 2*8, 0, 23); // _____6_7 3449 3450 rldimi(tmp2, tmp0, 3*8, 0*8); // _0_1_2_3 3451 rldimi(tmp4, tmp3, 3*8, 0*8); // _4_5_6_7 3452 3453 std(tmp2, 0, dst); 3454 std(tmp4, 8, dst); 3455 addi(dst, dst, 16); 3456 bdnz(Lloop); 3457 3458 bind(Lslow); // Fallback to slow version 3459 } 3460 3461 // Inflate byte[] to char[]. cnt must be positive int. 
3462 void MacroAssembler::string_inflate(Register src, Register dst, Register cnt, Register tmp) { 3463 Label Lloop; 3464 mtctr(cnt); 3465 3466 bind(Lloop); 3467 lbz(tmp, 0, src); 3468 addi(src, src, 1); 3469 sth(tmp, 0, dst); 3470 addi(dst, dst, 2); 3471 bdnz(Lloop); 3472 } 3473 3474 void MacroAssembler::string_compare(Register str1, Register str2, 3475 Register cnt1, Register cnt2, 3476 Register tmp1, Register result, int ae) { 3477 const Register tmp0 = R0, 3478 diff = tmp1; 3479 3480 assert_different_registers(str1, str2, cnt1, cnt2, tmp0, tmp1, result); 3481 Label Ldone, Lslow, Lloop, Lreturn_diff; 3482 3483 // Note: Making use of the fact that compareTo(a, b) == -compareTo(b, a) 3484 // we interchange str1 and str2 in the UL case and negate the result. 3485 // Like this, str1 is always latin1 encoded, except for the UU case. 3486 // In addition, we need 0 (or sign which is 0) extend. 3487 3488 if (ae == StrIntrinsicNode::UU) { 3489 srwi(cnt1, cnt1, 1); 3490 } else { 3491 clrldi(cnt1, cnt1, 32); 3492 } 3493 3494 if (ae != StrIntrinsicNode::LL) { 3495 srwi(cnt2, cnt2, 1); 3496 } else { 3497 clrldi(cnt2, cnt2, 32); 3498 } 3499 3500 // See if the lengths are different, and calculate min in cnt1. 3501 // Save diff in case we need it for a tie-breaker. 3502 subf_(diff, cnt2, cnt1); // diff = cnt1 - cnt2 3503 // if (diff > 0) { cnt1 = cnt2; } 3504 if (VM_Version::has_isel()) { 3505 isel(cnt1, CCR0, Assembler::greater, /*invert*/ false, cnt2); 3506 } else { 3507 Label Lskip; 3508 blt(CCR0, Lskip); 3509 mr(cnt1, cnt2); 3510 bind(Lskip); 3511 } 3512 3513 // Rename registers 3514 Register chr1 = result; 3515 Register chr2 = tmp0; 3516 3517 // Compare multiple characters in fast loop (only implemented for same encoding). 3518 int stride1 = 8, stride2 = 8; 3519 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3520 int log2_chars_per_iter = (ae == StrIntrinsicNode::LL) ? 3 : 2; 3521 Label Lfastloop, Lskipfast; 3522 3523 srwi_(tmp0, cnt1, log2_chars_per_iter); 3524 beq(CCR0, Lskipfast); 3525 rldicl(cnt2, cnt1, 0, 64 - log2_chars_per_iter); // Remaining characters. 3526 li(cnt1, 1 << log2_chars_per_iter); // Initialize for failure case: Rescan characters from current iteration. 3527 mtctr(tmp0); 3528 3529 bind(Lfastloop); 3530 ld(chr1, 0, str1); 3531 ld(chr2, 0, str2); 3532 cmpd(CCR0, chr1, chr2); 3533 bne(CCR0, Lslow); 3534 addi(str1, str1, stride1); 3535 addi(str2, str2, stride2); 3536 bdnz(Lfastloop); 3537 mr(cnt1, cnt2); // Remaining characters. 3538 bind(Lskipfast); 3539 } 3540 3541 // Loop which searches the first difference character by character. 3542 cmpwi(CCR0, cnt1, 0); 3543 beq(CCR0, Lreturn_diff); 3544 bind(Lslow); 3545 mtctr(cnt1); 3546 3547 switch (ae) { 3548 case StrIntrinsicNode::LL: stride1 = 1; stride2 = 1; break; 3549 case StrIntrinsicNode::UL: // fallthru (see comment above) 3550 case StrIntrinsicNode::LU: stride1 = 1; stride2 = 2; break; 3551 case StrIntrinsicNode::UU: stride1 = 2; stride2 = 2; break; 3552 default: ShouldNotReachHere(); break; 3553 } 3554 3555 bind(Lloop); 3556 if (stride1 == 1) { lbz(chr1, 0, str1); } else { lhz(chr1, 0, str1); } 3557 if (stride2 == 1) { lbz(chr2, 0, str2); } else { lhz(chr2, 0, str2); } 3558 subf_(result, chr2, chr1); // result = chr1 - chr2 3559 bne(CCR0, Ldone); 3560 addi(str1, str1, stride1); 3561 addi(str2, str2, stride2); 3562 bdnz(Lloop); 3563 3564 // If strings are equal up to min length, return the length difference. 
3565 bind(Lreturn_diff); 3566 mr(result, diff); 3567 3568 // Otherwise, return the difference between the first mismatched chars. 3569 bind(Ldone); 3570 if (ae == StrIntrinsicNode::UL) { 3571 neg(result, result); // Negate result (see note above). 3572 } 3573 } 3574 3575 void MacroAssembler::array_equals(bool is_array_equ, Register ary1, Register ary2, 3576 Register limit, Register tmp1, Register result, bool is_byte) { 3577 const Register tmp0 = R0; 3578 assert_different_registers(ary1, ary2, limit, tmp0, tmp1, result); 3579 Label Ldone, Lskiploop, Lloop, Lfastloop, Lskipfast; 3580 bool limit_needs_shift = false; 3581 3582 if (is_array_equ) { 3583 const int length_offset = arrayOopDesc::length_offset_in_bytes(); 3584 const int base_offset = arrayOopDesc::base_offset_in_bytes(is_byte ? T_BYTE : T_CHAR); 3585 3586 // Return true if the same array. 3587 cmpd(CCR0, ary1, ary2); 3588 beq(CCR0, Lskiploop); 3589 3590 // Return false if one of them is NULL. 3591 cmpdi(CCR0, ary1, 0); 3592 cmpdi(CCR1, ary2, 0); 3593 li(result, 0); 3594 cror(CCR0, Assembler::equal, CCR1, Assembler::equal); 3595 beq(CCR0, Ldone); 3596 3597 // Load the lengths of arrays. 3598 lwz(limit, length_offset, ary1); 3599 lwz(tmp0, length_offset, ary2); 3600 3601 // Return false if the two arrays are not equal length. 3602 cmpw(CCR0, limit, tmp0); 3603 bne(CCR0, Ldone); 3604 3605 // Load array addresses. 3606 addi(ary1, ary1, base_offset); 3607 addi(ary2, ary2, base_offset); 3608 } else { 3609 limit_needs_shift = !is_byte; 3610 li(result, 0); // Assume not equal. 3611 } 3612 3613 // Rename registers 3614 Register chr1 = tmp0; 3615 Register chr2 = tmp1; 3616 3617 // Compare 8 bytes per iteration in fast loop. 3618 const int log2_chars_per_iter = is_byte ? 3 : 2; 3619 3620 srwi_(tmp0, limit, log2_chars_per_iter + (limit_needs_shift ? 1 : 0)); 3621 beq(CCR0, Lskipfast); 3622 mtctr(tmp0); 3623 3624 bind(Lfastloop); 3625 ld(chr1, 0, ary1); 3626 ld(chr2, 0, ary2); 3627 addi(ary1, ary1, 8); 3628 addi(ary2, ary2, 8); 3629 cmpd(CCR0, chr1, chr2); 3630 bne(CCR0, Ldone); 3631 bdnz(Lfastloop); 3632 3633 bind(Lskipfast); 3634 rldicl_(limit, limit, limit_needs_shift ? 64 - 1 : 0, 64 - log2_chars_per_iter); // Remaining characters. 3635 beq(CCR0, Lskiploop); 3636 mtctr(limit); 3637 3638 // Character by character. 3639 bind(Lloop); 3640 if (is_byte) { 3641 lbz(chr1, 0, ary1); 3642 lbz(chr2, 0, ary2); 3643 addi(ary1, ary1, 1); 3644 addi(ary2, ary2, 1); 3645 } else { 3646 lhz(chr1, 0, ary1); 3647 lhz(chr2, 0, ary2); 3648 addi(ary1, ary1, 2); 3649 addi(ary2, ary2, 2); 3650 } 3651 cmpw(CCR0, chr1, chr2); 3652 bne(CCR0, Ldone); 3653 bdnz(Lloop); 3654 3655 bind(Lskiploop); 3656 li(result, 1); // All characters are equal. 3657 bind(Ldone); 3658 } 3659 3660 void MacroAssembler::string_indexof(Register result, Register haystack, Register haycnt, 3661 Register needle, ciTypeArray* needle_values, Register needlecnt, int needlecntval, 3662 Register tmp1, Register tmp2, Register tmp3, Register tmp4, int ae) { 3663 3664 // Ensure 0<needlecnt<=haycnt in ideal graph as prerequisite! 3665 Label L_TooShort, L_Found, L_NotFound, L_End; 3666 Register last_addr = haycnt, // Kill haycnt at the beginning. 3667 addr = tmp1, 3668 n_start = tmp2, 3669 ch1 = tmp3, 3670 ch2 = R0; 3671 3672 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 3673 const int h_csize = (ae == StrIntrinsicNode::LL) ? 1 : 2; 3674 const int n_csize = (ae == StrIntrinsicNode::UU) ? 
2 : 1;
3675
3676   // **************************************************************************************************
3677   // Prepare for main loop: optimized for needle count >=2, bail out otherwise.
3678   // **************************************************************************************************
3679
3680   // Compute last haystack addr to use if no match gets found.
3681   clrldi(haycnt, haycnt, 32);         // Ensure positive int is valid as 64 bit value.
3682   addi(addr, haystack, -h_csize);     // Accesses use pre-increment.
3683   if (needlecntval == 0) { // variable needlecnt
3684     cmpwi(CCR6, needlecnt, 2);
3685     clrldi(needlecnt, needlecnt, 32); // Ensure positive int is valid as 64 bit value.
3686     blt(CCR6, L_TooShort);            // Variable needlecnt: handle short needle separately.
3687   }
3688
3689   if (n_csize == 2) { lwz(n_start, 0, needle); } else { lhz(n_start, 0, needle); } // Load first 2 characters of needle.
3690
3691   if (needlecntval == 0) { // variable needlecnt
3692     subf(ch1, needlecnt, haycnt);     // Last character index to compare is haycnt-needlecnt.
3693     addi(needlecnt, needlecnt, -2);   // Rest of needle.
3694   } else { // constant needlecnt
3695     guarantee(needlecntval != 1, "IndexOf with single-character needle must be handled separately");
3696     assert((needlecntval & 0x7fff) == needlecntval, "wrong immediate");
3697     addi(ch1, haycnt, -needlecntval); // Last character index to compare is haycnt-needlecnt.
3698     if (needlecntval > 3) { li(needlecnt, needlecntval - 2); } // Rest of needle.
3699   }
3700
3701   if (h_csize == 2) { slwi(ch1, ch1, 1); } // Scale to number of bytes.
3702
3703   if (ae == StrIntrinsicNode::UL) {
3704     srwi(tmp4, n_start, 1*8);          // ___0
3705     rlwimi(n_start, tmp4, 2*8, 0, 23); // _0_1
3706   }
3707
3708   add(last_addr, haystack, ch1);       // Point to last address to compare (haystack+2*(haycnt-needlecnt)).
3709
3710   // Main Loop (now we have at least 2 characters).
3711   Label L_OuterLoop, L_InnerLoop, L_FinalCheck, L_Comp1, L_Comp2;
3712   bind(L_OuterLoop); // Search for 1st 2 characters.
3713   Register addr_diff = tmp4;
3714   subf(addr_diff, addr, last_addr);  // Difference between already checked address and last address to check.
3715   addi(addr, addr, h_csize);         // This is the new address we want to use for comparing.
3716   srdi_(ch2, addr_diff, h_csize);
3717   beq(CCR0, L_FinalCheck);           // 2 characters left?
3718   mtctr(ch2);                        // num of characters / 2
3719   bind(L_InnerLoop);                 // Main work horse (2x unrolled search loop)
3720   if (h_csize == 2) {                // Load 2 characters of haystack (ignore alignment).
3721     lwz(ch1, 0, addr);
3722     lwz(ch2, 2, addr);
3723   } else {
3724     lhz(ch1, 0, addr);
3725     lhz(ch2, 1, addr);
3726   }
3727   cmpw(CCR0, ch1, n_start);          // Compare 2 characters (1 would be sufficient but try to reduce branches to CompLoop).
3728   cmpw(CCR1, ch2, n_start);
3729   beq(CCR0, L_Comp1);                // Did we find the needle start?
3730   beq(CCR1, L_Comp2);
3731   addi(addr, addr, 2 * h_csize);
3732   bdnz(L_InnerLoop);
3733   bind(L_FinalCheck);
3734   andi_(addr_diff, addr_diff, h_csize); // Remaining characters not covered by InnerLoop: (num of characters) & 1.
3735   beq(CCR0, L_NotFound);
3736   if (h_csize == 2) { lwz(ch1, 0, addr); } else { lhz(ch1, 0, addr); } // One position left at which we have to compare.
3737   cmpw(CCR1, ch1, n_start);
3738   beq(CCR1, L_Comp1);
3739   bind(L_NotFound);
3740   li(result, -1);                    // not found
3741   b(L_End);
3742
3743   // **************************************************************************************************
3744   // Special Case: unfortunately, the variable needle case can be called with needlecnt<2
3745   // **************************************************************************************************
3746   if (needlecntval == 0) { // We have to handle these cases separately.
3747     Label L_OneCharLoop;
3748     bind(L_TooShort);
3749     mtctr(haycnt);
3750     if (n_csize == 2) { lhz(n_start, 0, needle); } else { lbz(n_start, 0, needle); } // First character of needle
3751     bind(L_OneCharLoop);
3752     if (h_csize == 2) { lhzu(ch1, 2, addr); } else { lbzu(ch1, 1, addr); }
3753     cmpw(CCR1, ch1, n_start);
3754     beq(CCR1, L_Found);              // Did we find the one character needle?
3755     bdnz(L_OneCharLoop);
3756     li(result, -1);                  // Not found.
3757     b(L_End);
3758   }
3759
3760   // **************************************************************************************************
3761   // Regular Case Part II: compare rest of needle (first 2 characters have been compared already)
3762   // **************************************************************************************************
3763
3764   // Compare the rest
3765   bind(L_Comp2);
3766   addi(addr, addr, h_csize);         // First comparison has failed, 2nd one hit.
3767   bind(L_Comp1);                     // Addr points to possible needle start.
3768   if (needlecntval != 2) {           // Const needlecnt==2?
3769     if (needlecntval != 3) {
3770       if (needlecntval == 0) { beq(CCR6, L_Found); } // Variable needlecnt==2?
3771       Register n_ind = tmp4,
3772                h_ind = n_ind;
3773       li(n_ind, 2 * n_csize);        // First 2 characters are already compared, use index 2.
3774       mtctr(needlecnt);              // Decremented by 2, still > 0.
3775       Label L_CompLoop;
3776       bind(L_CompLoop);
3777       if (ae == StrIntrinsicNode::UL) {
3778         h_ind = ch1;
3779         sldi(h_ind, n_ind, 1);
3780       }
3781       if (n_csize == 2) { lhzx(ch2, needle, n_ind); } else { lbzx(ch2, needle, n_ind); }
3782       if (h_csize == 2) { lhzx(ch1, addr, h_ind); } else { lbzx(ch1, addr, h_ind); }
3783       cmpw(CCR1, ch1, ch2);
3784       bne(CCR1, L_OuterLoop);
3785       addi(n_ind, n_ind, n_csize);
3786       bdnz(L_CompLoop);
3787     } else { // No loop required if there's only one needle character left.
3788       if (n_csize == 2) { lhz(ch2, 2 * 2, needle); } else { lbz(ch2, 2 * 1, needle); }
3789       if (h_csize == 2) { lhz(ch1, 2 * 2, addr); } else { lbz(ch1, 2 * 1, addr); }
3790       cmpw(CCR1, ch1, ch2);
3791       bne(CCR1, L_OuterLoop);
3792     }
3793   }
3794   // Return index ...
3795   bind(L_Found);
3796   subf(result, haystack, addr);      // relative to haystack, ...
3797   if (h_csize == 2) { srdi(result, result, 1); } // in characters.
3798   bind(L_End);
3799 } // string_indexof
3800
3801 void MacroAssembler::string_indexof_char(Register result, Register haystack, Register haycnt,
3802                                          Register needle, jchar needleChar, Register tmp1, Register tmp2, bool is_byte) {
3803   assert_different_registers(haystack, haycnt, needle, tmp1, tmp2);
3804
3805   Label L_InnerLoop, L_FinalCheck, L_Found1, L_Found2, L_NotFound, L_End;
3806   Register addr = tmp1,
3807            ch1 = tmp2,
3808            ch2 = R0;
3809
3810   const int h_csize = is_byte ? 1 : 2;
3811
3812 //4:
3813   srwi_(tmp2, haycnt, 1);    // Shift right by exact_log2(UNROLL_FACTOR).
3814   mr(addr, haystack);
3815   beq(CCR0, L_FinalCheck);
3816   mtctr(tmp2);               // Move to count register.
3817 //8:
3818   bind(L_InnerLoop);         // Main work horse (2x unrolled search loop).
3819   if (!is_byte) {
3820     lhz(ch1, 0, addr);
3821     lhz(ch2, 2, addr);
3822   } else {
3823     lbz(ch1, 0, addr);
3824     lbz(ch2, 1, addr);
3825   }
3826   (needle != R0) ? cmpw(CCR0, ch1, needle) : cmplwi(CCR0, ch1, (unsigned int)needleChar);
3827   (needle != R0) ? cmpw(CCR1, ch2, needle) : cmplwi(CCR1, ch2, (unsigned int)needleChar);
3828   beq(CCR0, L_Found1);       // Did we find the needle?
3829   beq(CCR1, L_Found2);
3830   addi(addr, addr, 2 * h_csize);
3831   bdnz(L_InnerLoop);
3832 //16:
3833   bind(L_FinalCheck);
3834   andi_(R0, haycnt, 1);
3835   beq(CCR0, L_NotFound);
3836   if (!is_byte) { lhz(ch1, 0, addr); } else { lbz(ch1, 0, addr); } // One position left at which we have to compare.
3837   (needle != R0) ? cmpw(CCR1, ch1, needle) : cmplwi(CCR1, ch1, (unsigned int)needleChar);
3838   beq(CCR1, L_Found1);
3839 //21:
3840   bind(L_NotFound);
3841   li(result, -1);            // Not found.
3842   b(L_End);
3843
3844   bind(L_Found2);
3845   addi(addr, addr, h_csize);
3846 //24:
3847   bind(L_Found1);            // Return index ...
3848   subf(result, haystack, addr); // relative to haystack, ...
3849   if (!is_byte) { srdi(result, result, 1); } // in characters.
3850   bind(L_End);
3851 } // string_indexof_char
3852
3853
3854 void MacroAssembler::has_negatives(Register src, Register cnt, Register result,
3855                                    Register tmp1, Register tmp2) {
3856   const Register tmp0 = R0;
3857   assert_different_registers(src, result, cnt, tmp0, tmp1, tmp2);
3858   Label Lfastloop, Lslow, Lloop, Lnoneg, Ldone;
3859
3860   // Check if cnt >= 16 (the 2x unrolled fast loop processes 16 bytes per iteration).
3861   lis(tmp1, (int)(short)0x8080); // tmp1 = 0x8080808080808080
3862   srwi_(tmp2, cnt, 4);
3863   li(result, 1);               // Assume there's a negative byte.
3864   beq(CCR0, Lslow);
3865   ori(tmp1, tmp1, 0x8080);
3866   rldimi(tmp1, tmp1, 32, 0);
3867   mtctr(tmp2);
3868
3869   // 2x unrolled loop
3870   bind(Lfastloop);
3871   ld(tmp2, 0, src);
3872   ld(tmp0, 8, src);
3873
3874   orr(tmp0, tmp2, tmp0);
3875
3876   and_(tmp0, tmp0, tmp1);
3877   bne(CCR0, Ldone);            // Found negative byte.
3878   addi(src, src, 16);
3879
3880   bdnz(Lfastloop);
3881
3882   bind(Lslow);                 // Fallback to slow version
3883   rldicl_(tmp0, cnt, 0, 64-4);
3884   beq(CCR0, Lnoneg);
3885   mtctr(tmp0);
3886   bind(Lloop);
3887   lbz(tmp0, 0, src);
3888   addi(src, src, 1);
3889   andi_(tmp0, tmp0, 0x80);
3890   bne(CCR0, Ldone);            // Found negative byte.
3891   bdnz(Lloop);
3892   bind(Lnoneg);
3893   li(result, 0);
3894
3895   bind(Ldone);
3896 }
3897
3898 #endif // COMPILER2
3899
3900 // Helpers for Intrinsic Emitters
3901 //
3902 // Revert the byte order of a 32bit value in a register
3903 //   src: 0x44556677
3904 //   dst: 0x77665544
3905 // Three steps to obtain the result:
3906 //  1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word
3907 //     into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared.
3908 //     This value initializes dst.
3909 //  2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost
3910 //     byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go.
3911 //     This value is mask inserted into dst with a [0..23] mask of 1s.
3912 //  3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position.
3913 //     This value is mask inserted into dst with a [8..15] mask of 1s.
3914 void MacroAssembler::load_reverse_32(Register dst, Register src) {
3915   assert_different_registers(dst, src);
3916
3917   rldicl(dst, src, (4+1)*8, 56); // Rotate byte 4 into position 7 (rightmost), clear all to the left.
3918   rlwimi(dst, src, 3*8, 0, 23); // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone.
3919   rlwimi(dst, src, 1*8, 8, 15); // Insert byte 6 into position 5, leave the rest alone.
3920 }
3921
3922 // Calculate the column addresses of the crc32 lookup table into distinct registers.
3923 // This loop-invariant calculation is moved out of the loop body, reducing the loop
3924 // body size from 20 to 16 instructions.
3925 // Returns the offset that was used to calculate the address of column tc3.
3926 // Due to register shortage, setting tc3 may overwrite table. With the return offset
3927 // at hand, the original table address can be easily reconstructed.
3928 int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) {
3929
3930 #ifdef VM_LITTLE_ENDIAN
3931   // This is what we implement (the DOLIT4 part):
3932   // =========================================================================
3933   // #define DOLIT4 c ^= *buf4++; \
3934   //        c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^ \
3935   //            crc_table[1][(c >> 16) & 0xff] ^ crc_table[0][c >> 24]
3936   // #define DOLIT32 DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4
3937   // =========================================================================
3938   const int ix0 = 3*(4*CRC32_COLUMN_SIZE);
3939   const int ix1 = 2*(4*CRC32_COLUMN_SIZE);
3940   const int ix2 = 1*(4*CRC32_COLUMN_SIZE);
3941   const int ix3 = 0*(4*CRC32_COLUMN_SIZE);
3942 #else
3943   // This is what we implement (the DOBIG4 part):
3944   // =========================================================================
3945   // #define DOBIG4 c ^= *++buf4; \
3946   //        c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \
3947   //            crc_table[6][(c >> 16) & 0xff] ^ crc_table[7][c >> 24]
3948   // #define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4
3949   // =========================================================================
3950   const int ix0 = 4*(4*CRC32_COLUMN_SIZE);
3951   const int ix1 = 5*(4*CRC32_COLUMN_SIZE);
3952   const int ix2 = 6*(4*CRC32_COLUMN_SIZE);
3953   const int ix3 = 7*(4*CRC32_COLUMN_SIZE);
3954 #endif
3955   assert_different_registers(table, tc0, tc1, tc2);
3956   assert(table == tc3, "must be!");
3957
3958   addi(tc0, table, ix0);
3959   addi(tc1, table, ix1);
3960   addi(tc2, table, ix2);
3961   if (ix3 != 0) addi(tc3, table, ix3);
3962
3963   return ix3;
3964 }
3965
3966 /**
3967  * uint32_t crc;
3968  * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
3969  */
3970 void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) {
3971   assert_different_registers(crc, table, tmp);
3972   assert_different_registers(val, table);
3973
3974   if (crc == val) {                  // Must rotate first to use the unmodified value.
3975     rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
3976                                      // As we use a word (4-byte) instruction, we have to adapt the mask bit positions.
3977     srwi(crc, crc, 8);               // Unsigned shift, clear leftmost 8 bits.
3978   } else {
3979     srwi(crc, crc, 8);               // Unsigned shift, clear leftmost 8 bits.
3980     rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
3981   }
3982   lwzx(tmp, table, tmp);
3983   xorr(crc, crc, tmp);
3984 }
3985
3986 /**
3987  * uint32_t crc;
3988  * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
3989  */
3990 void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
3991   fold_byte_crc32(crc, crc, table, tmp);
3992 }
3993
3994 /**
3995  * Emits code to update CRC-32 with a byte value according to constants in table.
3996  *
3997  * @param [in,out]crc Register containing the crc.
3998  * @param [in]val     Register containing the byte to fold into the CRC.
3999  * @param [in]table   Register containing the table of crc constants.
4000  *
4001  * uint32_t crc;
4002  * val = crc_table[(val ^ crc) & 0xFF];
4003  * crc = val ^ (crc >> 8);
4004  */
4005 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
4006   BLOCK_COMMENT("update_byte_crc32:");
4007   xorr(val, val, crc);
4008   fold_byte_crc32(crc, val, table, val);
4009 }
4010
4011 /**
4012  * @param crc   register containing existing CRC (32-bit)
4013  * @param buf   register pointing to input byte buffer (byte*)
4014  * @param len   register containing number of bytes
4015  * @param table register pointing to CRC table
4016  */
4017 void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table,
4018                                            Register data, bool loopAlignment) {
4019   assert_different_registers(crc, buf, len, table, data);
4020
4021   Label L_mainLoop, L_done;
4022   const int mainLoop_stepping  = 1;
4023   const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4;
4024
4025   // Process all bytes in a single-byte loop.
4026   clrldi_(len, len, 32);               // Enforce 32 bit. Anything to do?
4027   beq(CCR0, L_done);
4028
4029   mtctr(len);
4030   align(mainLoop_alignment);
4031   BIND(L_mainLoop);
4032     lbz(data, 0, buf);                 // Byte from buffer, zero-extended.
4033     addi(buf, buf, mainLoop_stepping); // Advance buffer position.
4034     update_byte_crc32(crc, data, table);
4035     bdnz(L_mainLoop);                  // Iterate.
4036
4037   bind(L_done);
4038 }
4039
4040 /**
4041  * Emits code to update CRC-32 with a 4-byte value according to constants in table
4042  * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c
4043  */
4044 // A note on the lookup table address(es):
4045 // The lookup table consists of two sets of four columns each.
4046 // The columns {0..3} are used for little-endian machines.
4047 // The columns {4..7} are used for big-endian machines.
4048 // To save the effort of adding the column offset to the table address each time
4049 // a table element is looked up, it is possible to pass the pre-calculated
4050 // column addresses.
4051 // Uses R9..R12 as work registers. Must be saved/restored by caller, if necessary.
4052 void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc,
4053                                         Register t0,  Register t1,  Register t2,  Register t3,
4054                                         Register tc0, Register tc1, Register tc2, Register tc3) {
4055   assert_different_registers(crc, t3);
4056
4057   // XOR crc with next four bytes of buffer.
4058   lwz(t3, bufDisp, buf);
4059   if (bufInc != 0) {
4060     addi(buf, buf, bufInc);
4061   }
4062   xorr(t3, t3, crc);
4063
4064   // Chop crc into 4 single-byte pieces, shifted left 2 bits, to form the table indices.
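  // In C terms (illustrative; at this point t3 holds crc ^ next_word):
  //   t0 = ((t3 >>  0) & 0xff) << 2;   t1 = ((t3 >>  8) & 0xff) << 2;
  //   t2 = ((t3 >> 16) & 0xff) << 2;   t3 = ((t3 >> 24) & 0xff) << 2;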
4065   rlwinm(t0, t3,  2,         24-2, 31-2); // ((t3 >>  0) & 0xff) << 2
4066   rlwinm(t1, t3,  32+(2- 8), 24-2, 31-2); // ((t3 >>  8) & 0xff) << 2
4067   rlwinm(t2, t3,  32+(2-16), 24-2, 31-2); // ((t3 >> 16) & 0xff) << 2
4068   rlwinm(t3, t3,  32+(2-24), 24-2, 31-2); // ((t3 >> 24) & 0xff) << 2
4069
4070   // Use the pre-calculated column addresses.
4071   // Load pre-calculated table values.
4072   lwzx(t0, tc0, t0);
4073   lwzx(t1, tc1, t1);
4074   lwzx(t2, tc2, t2);
4075   lwzx(t3, tc3, t3);
4076
4077   // Calculate new crc from table values.
4078   xorr(t0,  t0, t1);
4079   xorr(t2,  t2, t3);
4080   xorr(crc, t0, t2); // Now crc contains the final checksum value.
4081 }
4082
4083 /**
4084  * @param crc   register containing existing CRC (32-bit)
4085  * @param buf   register pointing to input byte buffer (byte*)
4086  * @param len   register containing number of bytes
4087  * @param table register pointing to CRC table
4088  *
4089  * Uses R9..R12 as work registers. Must be saved/restored by caller!
4090  */
4091 void MacroAssembler::kernel_crc32_2word(Register crc, Register buf, Register len, Register table,
4092                                         Register t0,  Register t1,  Register t2,  Register t3,
4093                                         Register tc0, Register tc1, Register tc2, Register tc3,
4094                                         bool invertCRC) {
4095   assert_different_registers(crc, buf, len, table);
4096
4097   Label L_mainLoop, L_tail;
4098   Register  tmp  = t0;
4099   Register  data = t0;
4100   Register  tmp2 = t1;
4101   const int mainLoop_stepping  = 8;
4102   const int tailLoop_stepping  = 1;
4103   const int log_stepping       = exact_log2(mainLoop_stepping);
4104   const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
4105   const int complexThreshold   = 2*mainLoop_stepping;
4106
4107   // Don't test for len <= 0 here. This pathological case should not occur anyway.
4108   // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles
4109   // for all well-behaved cases. The situation itself is detected and handled correctly
4110   // within update_byteLoop_crc32.
4111   assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
4112
4113   BLOCK_COMMENT("kernel_crc32_2word {");
4114
4115   if (invertCRC) {
4116     nand(crc, crc, crc); // 1s complement of crc
4117   }
4118
4119   // Check for short (<mainLoop_stepping) buffer.
4120   cmpdi(CCR0, len, complexThreshold);
4121   blt(CCR0, L_tail);
4122
4123   // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
4124   // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
4125   {
4126     // Align buf addr to mainLoop_stepping boundary.
4127     neg(tmp2, buf);                         // Calculate # preLoop iterations for alignment.
4128     rldicl(tmp2, tmp2, 0, 64-log_stepping); // Rotate tmp2 0 bits, AND with a mask of 1s in bits 61..63.
4129
4130     if (complexThreshold > mainLoop_stepping) {
4131       sub(len, len, tmp2);                  // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4132     } else {
4133       sub(tmp, len, tmp2);                  // Remaining bytes for main loop.
4134       cmpdi(CCR0, tmp, mainLoop_stepping);
4135       blt(CCR0, L_tail);                    // For less than one mainloop_stepping left, do only tail processing
4136       mr(len, tmp);                         // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4137     }
4138     update_byteLoop_crc32(crc, buf, tmp2, table, data, false);
4139   }
4140
4141   srdi(tmp2, len, log_stepping);       // #iterations for mainLoop
4142   andi(len, len, mainLoop_stepping-1); // remaining bytes for tailLoop
4143   mtctr(tmp2);
4144
4145 #ifdef VM_LITTLE_ENDIAN
4146   Register crc_rv = crc;
4147 #else
4148   Register crc_rv = tmp;               // Load_reverse needs separate registers to work on.
4149                                        // Occupies tmp, but frees up crc.
4150   load_reverse_32(crc_rv, crc);        // Revert byte order because we are dealing with big-endian data.
4151   tmp = crc;
4152 #endif
4153
4154   int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
4155
4156   align(mainLoop_alignment);           // Octoword-aligned loop address. Shows 2% improvement.
4157   BIND(L_mainLoop);
4158     update_1word_crc32(crc_rv, buf, table, 0, 0, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
4159     update_1word_crc32(crc_rv, buf, table, 4, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
4160     bdnz(L_mainLoop);
4161
4162 #ifndef VM_LITTLE_ENDIAN
4163   load_reverse_32(crc, crc_rv);        // Revert byte order because we are dealing with big-endian data.
4164   tmp = crc_rv;                        // Tmp uses its original register again.
4165 #endif
4166
4167   // Restore original table address for tailLoop.
4168   if (reconstructTableOffset != 0) {
4169     addi(table, table, -reconstructTableOffset);
4170   }
4171
4172   // Process last few (<complexThreshold) bytes of buffer.
4173   BIND(L_tail);
4174   update_byteLoop_crc32(crc, buf, len, table, data, false);
4175
4176   if (invertCRC) {
4177     nand(crc, crc, crc); // 1s complement of crc
4178   }
4179   BLOCK_COMMENT("} kernel_crc32_2word");
4180 }
4181
4182 /**
4183  * @param crc   register containing existing CRC (32-bit)
4184  * @param buf   register pointing to input byte buffer (byte*)
4185  * @param len   register containing number of bytes
4186  * @param table register pointing to CRC table
4187  *
4188  * Uses R9..R12 as work registers. Must be saved/restored by caller!
4189  */
4190 void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table,
4191                                         Register t0,  Register t1,  Register t2,  Register t3,
4192                                         Register tc0, Register tc1, Register tc2, Register tc3,
4193                                         bool invertCRC) {
4194   assert_different_registers(crc, buf, len, table);
4195
4196   Label L_mainLoop, L_tail;
4197   Register  tmp  = t0;
4198   Register  data = t0;
4199   Register  tmp2 = t1;
4200   const int mainLoop_stepping  = 4;
4201   const int tailLoop_stepping  = 1;
4202   const int log_stepping       = exact_log2(mainLoop_stepping);
4203   const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
4204   const int complexThreshold   = 2*mainLoop_stepping;
4205
4206   // Don't test for len <= 0 here. This pathological case should not occur anyway.
4207   // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles
4208   // for all well-behaved cases. The situation itself is detected and handled correctly
4209   // within update_byteLoop_crc32.
4210   assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
4211
4212   BLOCK_COMMENT("kernel_crc32_1word {");
4213
4214   if (invertCRC) {
4215     nand(crc, crc, crc); // 1s complement of crc
4216   }
4217
4218   // Check for short (<mainLoop_stepping) buffer.
4219   cmpdi(CCR0, len, complexThreshold);
4220   blt(CCR0, L_tail);
4221
4222   // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
4223   // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
4224   {
4225     // Align buf addr to mainLoop_stepping boundary.
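    // Sketch (illustrative): the two instructions below compute
    //   tmp2 = (-(uintptr_t)buf) & (mainLoop_stepping - 1),
    // i.e. the number of bytes up to the next aligned buffer address.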
4226     neg(tmp2, buf);                         // Calculate # preLoop iterations for alignment.
4227     rldicl(tmp2, tmp2, 0, 64-log_stepping); // Rotate tmp2 0 bits, AND with a mask of 1s in bits 62..63.
4228
4229     if (complexThreshold > mainLoop_stepping) {
4230       sub(len, len, tmp2);                  // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4231     } else {
4232       sub(tmp, len, tmp2);                  // Remaining bytes for main loop.
4233       cmpdi(CCR0, tmp, mainLoop_stepping);
4234       blt(CCR0, L_tail);                    // For less than one mainloop_stepping left, do only tail processing
4235       mr(len, tmp);                         // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4236     }
4237     update_byteLoop_crc32(crc, buf, tmp2, table, data, false);
4238   }
4239
4240   srdi(tmp2, len, log_stepping);       // #iterations for mainLoop
4241   andi(len, len, mainLoop_stepping-1); // remaining bytes for tailLoop
4242   mtctr(tmp2);
4243
4244 #ifdef VM_LITTLE_ENDIAN
4245   Register crc_rv = crc;
4246 #else
4247   Register crc_rv = tmp;               // Load_reverse needs separate registers to work on.
4248                                        // Occupies tmp, but frees up crc.
4249   load_reverse_32(crc_rv, crc);        // Revert byte order because we are dealing with big-endian data.
4250   tmp = crc;
4251 #endif
4252
4253   int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
4254
4255   align(mainLoop_alignment);           // Octoword-aligned loop address. Shows 2% improvement.
4256   BIND(L_mainLoop);
4257     update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
4258     bdnz(L_mainLoop);
4259
4260 #ifndef VM_LITTLE_ENDIAN
4261   load_reverse_32(crc, crc_rv);        // Revert byte order because we are dealing with big-endian data.
4262   tmp = crc_rv;                        // Tmp uses its original register again.
4263 #endif
4264
4265   // Restore original table address for tailLoop.
4266   if (reconstructTableOffset != 0) {
4267     addi(table, table, -reconstructTableOffset);
4268   }
4269
4270   // Process last few (<complexThreshold) bytes of buffer.
4271   BIND(L_tail);
4272   update_byteLoop_crc32(crc, buf, len, table, data, false);
4273
4274   if (invertCRC) {
4275     nand(crc, crc, crc); // 1s complement of crc
4276   }
4277   BLOCK_COMMENT("} kernel_crc32_1word");
4278 }
4279
4280 /**
4281  * @param crc   register containing existing CRC (32-bit)
4282  * @param buf   register pointing to input byte buffer (byte*)
4283  * @param len   register containing number of bytes
4284  * @param table register pointing to CRC table
4285  *
4286  * Uses R7_ARG5, R8_ARG6 as work registers.
4287  */
4288 void MacroAssembler::kernel_crc32_1byte(Register crc, Register buf, Register len, Register table,
4289                                         Register t0,  Register t1,  Register t2,  Register t3,
4290                                         bool invertCRC) {
4291   assert_different_registers(crc, buf, len, table);
4292
4293   Register data = t0; // Holds the current byte to be folded into crc.
4294
4295   BLOCK_COMMENT("kernel_crc32_1byte {");
4296
4297   if (invertCRC) {
4298     nand(crc, crc, crc); // 1s complement of crc
4299   }
4300
4301   // Process all bytes in a single-byte loop.
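  // One byte step in C terms (illustrative):
  //   while (len-- > 0) { crc = table[(crc ^ *buf++) & 0xff] ^ (crc >> 8); }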
4302   update_byteLoop_crc32(crc, buf, len, table, data, true);
4303
4304   if (invertCRC) {
4305     nand(crc, crc, crc); // 1s complement of crc
4306   }
4307   BLOCK_COMMENT("} kernel_crc32_1byte");
4308 }
4309
4310 /**
4311  * @param crc             register containing existing CRC (32-bit)
4312  * @param buf             register pointing to input byte buffer (byte*)
4313  * @param len             register containing number of bytes
4314  * @param table           register pointing to CRC table
4315  * @param constants       register pointing to CRC table for 128-bit aligned memory
4316  * @param barretConstants register pointing to table for Barrett reduction
4317  * @param t0-t4           temp registers
4318  */
4319 void MacroAssembler::kernel_crc32_1word_vpmsum(Register crc, Register buf, Register len, Register table,
4320                                                Register constants, Register barretConstants,
4321                                                Register t0, Register t1, Register t2, Register t3, Register t4,
4322                                                bool invertCRC) {
4323   assert_different_registers(crc, buf, len, table);
4324
4325   Label L_alignedHead, L_tail;
4326
4327   BLOCK_COMMENT("kernel_crc32_1word_vpmsum {");
4328
4329   // 1. ~c
4330   if (invertCRC) {
4331     nand(crc, crc, crc); // 1s complement of crc
4332   }
4333
4334   // 2. use kernel_crc32_1word for short len
4335   clrldi(len, len, 32);
4336   cmpdi(CCR0, len, 512);
4337   blt(CCR0, L_tail);
4338
4339   // 3. calculate from 0 to first aligned address
4340   const int alignment = 16;
4341   Register prealign = t0;
4342
4343   andi_(prealign, buf, alignment - 1);
4344   beq(CCR0, L_alignedHead);
4345   subfic(prealign, prealign, alignment);
4346
4347   subf(len, prealign, len);
4348   update_byteLoop_crc32(crc, buf, prealign, table, t2, false);
4349
4350   // 4. calculate from first aligned address as far as possible
4351   BIND(L_alignedHead);
4352   kernel_crc32_1word_aligned(crc, buf, len, constants, barretConstants, t0, t1, t2, t3, t4);
4353
4354   // 5. remaining bytes
4355   BIND(L_tail);
4356   Register tc0 = t4;
4357   Register tc1 = constants;
4358   Register tc2 = barretConstants;
4359   kernel_crc32_1word(crc, buf, len, table, t0, t1, t2, t3, tc0, tc1, tc2, table, false);
4360
4361   // 6. ~c
4362   if (invertCRC) {
4363     nand(crc, crc, crc); // 1s complement of crc
4364   }
4365
4366   BLOCK_COMMENT("} kernel_crc32_1word_vpmsum");
4367 }
4368
4369 /**
4370  * @param crc             register containing existing CRC (32-bit)
4371  * @param buf             register pointing to input byte buffer (byte*)
4372  * @param len             register containing number of bytes (will get updated to remaining bytes)
4373  * @param constants       register pointing to CRC table for 128-bit aligned memory
4374  * @param barretConstants register pointing to table for Barrett reduction
4375  * @param t0-t4           temp registers
4376  * Precondition: len should be >= 512. Otherwise, nothing will be done.
4377  */
4378 void MacroAssembler::kernel_crc32_1word_aligned(Register crc, Register buf, Register len,
4379                                                 Register constants, Register barretConstants,
4380                                                 Register t0, Register t1, Register t2, Register t3, Register t4) {
4381
4382   // Save non-volatile vector registers (frameless).
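  // The stores below use negative offsets relative to R1_SP, i.e. the data is
  // spilled into the area below the stack pointer that the PPC64 ABI treats as
  // volatile scratch storage, so no stack frame needs to be pushed (hence
  // "frameless").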
4383   Register offset = t1;
4384   int offsetInt = 0;
4385   offsetInt -= 16; li(offset, offsetInt); stvx(VR20, offset, R1_SP);
4386   offsetInt -= 16; li(offset, offsetInt); stvx(VR21, offset, R1_SP);
4387   offsetInt -= 16; li(offset, offsetInt); stvx(VR22, offset, R1_SP);
4388   offsetInt -= 16; li(offset, offsetInt); stvx(VR23, offset, R1_SP);
4389   offsetInt -= 16; li(offset, offsetInt); stvx(VR24, offset, R1_SP);
4390   offsetInt -= 16; li(offset, offsetInt); stvx(VR25, offset, R1_SP);
4391 #ifndef VM_LITTLE_ENDIAN
4392   offsetInt -= 16; li(offset, offsetInt); stvx(VR26, offset, R1_SP);
4393 #endif
4394   offsetInt -= 8; std(R14, offsetInt, R1_SP);
4395   offsetInt -= 8; std(R15, offsetInt, R1_SP);
4396   offsetInt -= 8; std(R16, offsetInt, R1_SP);
4397   offsetInt -= 8; std(R17, offsetInt, R1_SP);
4398
4399   // Implementation uses an inner loop which uses between 256 and 16 * unroll_factor
4400   // bytes per iteration. The basic scheme is:
4401   // lvx: load vector (Big Endian needs reversal)
4402   // vpmsumw: carry-less 32 bit multiplications with constant representing a large CRC shift
4403   // vxor: xor partial results together to get unroll_factor2 vectors
4404
4405   // Outer loop performs the CRC shifts needed to combine the unroll_factor2 vectors.
4406
4407   // Using 16 * unroll_factor / unroll_factor2 bytes for constants.
4408   const int unroll_factor = 2048;
4409   const int unroll_factor2 = 8;
4410
4411   // Support registers.
4412   Register offs[] = { noreg, t0, t1, t2, t3, t4, crc /* will live in VCRC */, R14 };
4413   Register num_bytes = R15,
4414            loop_count = R16,
4415            cur_const = R17;
4416   // Constant array for outer loop: unroll_factor2 - 1 registers,
4417   // Constant array for inner loop: unroll_factor / unroll_factor2 registers.
4418   VectorRegister consts0[] = { VR16, VR17, VR18, VR19, VR20, VR21, VR22 },
4419                  consts1[] = { VR23, VR24 };
4420   // Data register arrays: 2 arrays with unroll_factor2 registers.
4421   VectorRegister data0[] = { VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7 },
4422                  data1[] = { VR8, VR9, VR10, VR11, VR12, VR13, VR14, VR15 };
4423
4424   VectorRegister VCRC = data0[0];
4425   VectorRegister Vc = VR25;
4426   VectorRegister swap_bytes = VR26; // Only for Big Endian.
4427
4428   // We have at least 1 iteration (ensured by caller).
4429   Label L_outer_loop, L_inner_loop, L_last;
4430
4431   // If supported, set DSCR pre-fetch to deepest.
4432   if (VM_Version::has_mfdscr()) {
4433     load_const_optimized(t0, VM_Version::_dscr_val | 7);
4434     mtdscr(t0);
4435   }
4436
4437   mtvrwz(VCRC, crc); // crc lives in VCRC now.
4438
4439   for (int i = 1; i < unroll_factor2; ++i) {
4440     li(offs[i], 16 * i);
4441   }
4442
4443   // Load consts for outer loop
4444   lvx(consts0[0], constants);
4445   for (int i = 1; i < unroll_factor2 - 1; ++i) {
4446     lvx(consts0[i], offs[i], constants);
4447   }
4448   addi(constants, constants, (unroll_factor2 - 1) * 16);
4449
4450   load_const_optimized(num_bytes, 16 * unroll_factor);
4451   load_const_optimized(loop_count, unroll_factor / (2 * unroll_factor2) - 1); // One double-iteration peeled off.
4452
4453   // Reuse data registers outside of the loop.
4454   VectorRegister Vtmp = data1[0];
4455   VectorRegister Vtmp2 = data1[1];
4456   VectorRegister zeroes = data1[2];
4457
4458   vspltisb(Vtmp, 0);
4459   vsldoi(VCRC, Vtmp, VCRC, 8); // 96 bit zeroes, 32 bit CRC.
4460
4461   // Load vector for vpermxor (to xor both 64 bit parts together)
4462   lvsl(Vtmp, buf);   // 000102030405060708090a0b0c0d0e0f
4463   vspltisb(Vc, 4);
4464   vsl(Vc, Vtmp, Vc); // 00102030405060708090a0b0c0d0e0f0
4465   xxspltd(Vc->to_vsr(), Vc->to_vsr(), 0);
4466   vor(Vc, Vtmp, Vc); // 001122334455667708192a3b4c5d6e7f
4467
4468 #ifdef VM_LITTLE_ENDIAN
4469 #define BE_swap_bytes(x)
4470 #else
4471   vspltisb(Vtmp2, 0xf);
4472   vxor(swap_bytes, Vtmp, Vtmp2);
4473 #define BE_swap_bytes(x) vperm(x, x, x, swap_bytes)
4474 #endif
4475
4476   cmpd(CCR0, len, num_bytes);
4477   blt(CCR0, L_last);
4478
4479   // ********** Main loop start **********
4480   align(32);
4481   bind(L_outer_loop);
4482
4483   // Begin of unrolled first iteration (no xor).
4484   lvx(data1[0], buf);
4485   mr(cur_const, constants);
4486   for (int i = 1; i < unroll_factor2 / 2; ++i) {
4487     lvx(data1[i], offs[i], buf);
4488   }
4489   vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
4490   lvx(consts1[0], cur_const);
4491   mtctr(loop_count);
4492   for (int i = 0; i < unroll_factor2 / 2; ++i) {
4493     BE_swap_bytes(data1[i]);
4494     if (i == 0) { vxor(data1[0], data1[0], VCRC); } // xor in previous CRC.
4495     lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf);
4496     vpmsumw(data0[i], data1[i], consts1[0]);
4497   }
4498   addi(buf, buf, 16 * unroll_factor2);
4499   subf(len, num_bytes, len);
4500   lvx(consts1[1], offs[1], cur_const);
4501   addi(cur_const, cur_const, 32);
4502   // Begin of unrolled second iteration (head).
4503   for (int i = 0; i < unroll_factor2 / 2; ++i) {
4504     BE_swap_bytes(data1[i + unroll_factor2 / 2]);
4505     if (i == 0) { lvx(data1[0], buf); } else { lvx(data1[i], offs[i], buf); }
4506     vpmsumw(data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[0]);
4507   }
4508   for (int i = 0; i < unroll_factor2 / 2; ++i) {
4509     BE_swap_bytes(data1[i]);
4510     lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf);
4511     vpmsumw(data1[i], data1[i], consts1[1]);
4512   }
4513   addi(buf, buf, 16 * unroll_factor2);
4514
4515   // Generate the most performance-relevant code. Loads + half of the vpmsumw have been generated.
4516   // Double-iteration allows using the 2 constant registers alternately.
4517   align(32);
4518   bind(L_inner_loop);
4519   for (int j = 1; j < 3; ++j) { // j < unroll_factor / unroll_factor2 - 1 for complete unrolling.
4520     if (j & 1) {
4521       lvx(consts1[0], cur_const);
4522     } else {
4523       lvx(consts1[1], offs[1], cur_const);
4524       addi(cur_const, cur_const, 32);
4525     }
4526     for (int i = 0; i < unroll_factor2; ++i) {
4527       int idx = i + unroll_factor2 / 2, inc = 0; // For modulo-scheduled input.
4528       if (idx >= unroll_factor2) { idx -= unroll_factor2; inc = 1; }
4529       BE_swap_bytes(data1[idx]);
4530       vxor(data0[i], data0[i], data1[i]);
4531       if (i == 0) lvx(data1[0], buf); else lvx(data1[i], offs[i], buf);
4532       vpmsumw(data1[idx], data1[idx], consts1[(j + inc) & 1]);
4533     }
4534     addi(buf, buf, 16 * unroll_factor2);
4535   }
4536   bdnz(L_inner_loop);
4537
4538   // Tail of last iteration (no loads).
4539   for (int i = 0; i < unroll_factor2 / 2; ++i) {
4540     BE_swap_bytes(data1[i + unroll_factor2 / 2]);
4541     vxor(data0[i], data0[i], data1[i]);
4542     vpmsumw(data1[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[1]);
4543   }
4544   for (int i = 0; i < unroll_factor2 / 2; ++i) {
4545     vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]); // First half of fixup shifts.
4546 vxor(data0[i + unroll_factor2 / 2], data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2]); 4547 } 4548 4549 // Last data register is ok, other ones need fixup shift. 4550 for (int i = unroll_factor2 / 2; i < unroll_factor2 - 1; ++i) { 4551 vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]); 4552 } 4553 4554 // Combine to 128 bit result vector VCRC = data0[0]. 4555 for (int i = 1; i < unroll_factor2; i<<=1) { 4556 for (int j = 0; j <= unroll_factor2 - 2*i; j+=2*i) { 4557 vxor(data0[j], data0[j], data0[j+i]); 4558 } 4559 } 4560 cmpd(CCR0, len, num_bytes); 4561 bge(CCR0, L_outer_loop); 4562 4563 // Last chance with lower num_bytes. 4564 bind(L_last); 4565 srdi(loop_count, len, exact_log2(16 * 2 * unroll_factor2)); // Use double-iterations. 4566 add_const_optimized(constants, constants, 16 * (unroll_factor / unroll_factor2)); // Point behind last one. 4567 sldi(R0, loop_count, exact_log2(16 * 2)); // Bytes of constants to be used. 4568 clrrdi(num_bytes, len, exact_log2(16 * 2 * unroll_factor2)); 4569 subf(constants, R0, constants); // Point to constant to be used first. 4570 4571 addic_(loop_count, loop_count, -1); // One double-iteration peeled off. 4572 bgt(CCR0, L_outer_loop); 4573 // ********** Main loop end ********** 4574 #undef BE_swap_bytes 4575 4576 // Restore DSCR pre-fetch value. 4577 if (VM_Version::has_mfdscr()) { 4578 load_const_optimized(t0, VM_Version::_dscr_val); 4579 mtdscr(t0); 4580 } 4581 4582 vspltisb(zeroes, 0); 4583 4584 // Combine to 64 bit result. 4585 vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result. 4586 4587 // Reduce to 32 bit CRC: Remainder by multiply-high. 4588 lvx(Vtmp, barretConstants); 4589 vsldoi(Vtmp2, zeroes, VCRC, 12); // Extract high 32 bit. 4590 vpmsumd(Vtmp2, Vtmp2, Vtmp); // Multiply by inverse long poly. 4591 vsldoi(Vtmp2, zeroes, Vtmp2, 12); // Extract high 32 bit. 4592 vsldoi(Vtmp, zeroes, Vtmp, 8); 4593 vpmsumd(Vtmp2, Vtmp2, Vtmp); // Multiply quotient by long poly. 4594 vxor(VCRC, VCRC, Vtmp2); // Remainder fits into 32 bit. 4595 4596 // Move result. len is already updated. 4597 vsldoi(VCRC, VCRC, zeroes, 8); 4598 mfvrd(crc, VCRC); 4599 4600 // Restore non-volatile Vector registers (frameless). 4601 offsetInt = 0; 4602 offsetInt -= 16; li(offset, offsetInt); lvx(VR20, offset, R1_SP); 4603 offsetInt -= 16; li(offset, offsetInt); lvx(VR21, offset, R1_SP); 4604 offsetInt -= 16; li(offset, offsetInt); lvx(VR22, offset, R1_SP); 4605 offsetInt -= 16; li(offset, offsetInt); lvx(VR23, offset, R1_SP); 4606 offsetInt -= 16; li(offset, offsetInt); lvx(VR24, offset, R1_SP); 4607 offsetInt -= 16; li(offset, offsetInt); lvx(VR25, offset, R1_SP); 4608 #ifndef VM_LITTLE_ENDIAN 4609 offsetInt -= 16; li(offset, offsetInt); lvx(VR26, offset, R1_SP); 4610 #endif 4611 offsetInt -= 8; ld(R14, offsetInt, R1_SP); 4612 offsetInt -= 8; ld(R15, offsetInt, R1_SP); 4613 offsetInt -= 8; ld(R16, offsetInt, R1_SP); 4614 offsetInt -= 8; ld(R17, offsetInt, R1_SP); 4615 } 4616 4617 void MacroAssembler::kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp, bool invertCRC) { 4618 assert_different_registers(crc, buf, /* len, not used!! */ table, tmp); 4619 4620 BLOCK_COMMENT("kernel_crc32_singleByte:"); 4621 if (invertCRC) { 4622 nand(crc, crc, crc); // 1s complement of crc 4623 } 4624 4625 lbz(tmp, 0, buf); // Byte from buffer, zero-extended. 
  update_byte_crc32(crc, tmp, table);

  if (invertCRC) {
    nand(crc, crc, crc);                // 1s complement of crc
  }
}

void MacroAssembler::kernel_crc32_singleByteReg(Register crc, Register val, Register table, bool invertCRC) {
  assert_different_registers(crc, val, table);

  BLOCK_COMMENT("kernel_crc32_singleByteReg:");
  if (invertCRC) {
    nand(crc, crc, crc);                // 1s complement of crc
  }

  update_byte_crc32(crc, val, table);

  if (invertCRC) {
    nand(crc, crc, crc);                // 1s complement of crc
  }
}

// dest_lo += src1 + src2
// dest_hi += carry out of each of the two additions above
void MacroAssembler::add2_with_carry(Register dest_hi,
                                     Register dest_lo,
                                     Register src1, Register src2) {
  li(R0, 0);
  addc(dest_lo, dest_lo, src1);
  adde(dest_hi, dest_hi, R0);
  addc(dest_lo, dest_lo, src2);
  adde(dest_hi, dest_hi, R0);
}

// Multiply 64 bit by 64 bit first loop.
void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart,
                                           Register x_xstart,
                                           Register y, Register y_idx,
                                           Register z,
                                           Register carry,
                                           Register product_high, Register product,
                                           Register idx, Register kdx,
                                           Register tmp) {
  // jlong carry, x[], y[], z[];
  // for (int idx = ystart, kdx = ystart + 1 + xstart; idx >= 0; idx--, kdx--) {
  //   huge_128 product = y[idx] * x[xstart] + carry;
  //   z[kdx] = (jlong)product;
  //   carry  = (jlong)(product >>> 64);
  // }
  // z[xstart] = carry;

  Label L_first_loop, L_first_loop_exit;
  Label L_one_x, L_one_y, L_multiply;

  addic_(xstart, xstart, -1);
  blt(CCR0, L_one_x);   // Special case: length of x is 1.

  // Load next two integers of x.
  sldi(tmp, xstart, LogBytesPerInt);
  ldx(x_xstart, x, tmp);
#ifdef VM_LITTLE_ENDIAN
  rldicl(x_xstart, x_xstart, 32, 0);
#endif

  align(32, 16);
  bind(L_first_loop);

  cmpdi(CCR0, idx, 1);
  blt(CCR0, L_first_loop_exit);
  addi(idx, idx, -2);
  beq(CCR0, L_one_y);

  // Load next two integers of y.
  sldi(tmp, idx, LogBytesPerInt);
  ldx(y_idx, y, tmp);
#ifdef VM_LITTLE_ENDIAN
  rldicl(y_idx, y_idx, 32, 0);
#endif

  bind(L_multiply);
  multiply64(product_high, product, x_xstart, y_idx);

  li(tmp, 0);
  addc(product, product, carry);         // Add carry to result.
  adde(product_high, product_high, tmp); // Add carry of the last addition.
  addi(kdx, kdx, -2);

  // Store result.
#ifdef VM_LITTLE_ENDIAN
  rldicl(product, product, 32, 0);
#endif
  sldi(tmp, kdx, LogBytesPerInt);
  stdx(product, z, tmp);
  mr_if_needed(carry, product_high);
  b(L_first_loop);

  bind(L_one_y); // Load one 32 bit portion of y as (0,value).
  lwz(y_idx, 0, y);
  b(L_multiply);

  bind(L_one_x); // Load one 32 bit portion of x as (0,value).
  lwz(x_xstart, 0, x);
  b(L_first_loop);

  bind(L_first_loop_exit);
}

// Multiply 64 bit by 64 bit and add 128 bit.
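// What one call computes, as an illustrative C-level sketch (assuming a
// 128-bit intermediate type; this is not part of the emitted code):
//   unsigned __int128 product = (unsigned __int128)y[idx] * x_xstart
//                             + z[idx] + carry;
//   z[idx] = (jlong)product;   // low 64 bits are stored back
//   // product_high receives product >> 64; the caller feeds it back as carry.
// Note: the 8-byte ldx/stdx below access two adjacent 32-bit digits at once;
// on little-endian the halves arrive swapped, hence the rldicl(..., 32, 0)
// rotations around them.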
void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y,
                                            Register z, Register yz_idx,
                                            Register idx, Register carry,
                                            Register product_high, Register product,
                                            Register tmp, int offset) {

  // huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
  // z[kdx] = (jlong)product;

  sldi(tmp, idx, LogBytesPerInt);
  if (offset) {
    addi(tmp, tmp, offset);
  }
  ldx(yz_idx, y, tmp);
#ifdef VM_LITTLE_ENDIAN
  rldicl(yz_idx, yz_idx, 32, 0);
#endif

  multiply64(product_high, product, x_xstart, yz_idx);
  ldx(yz_idx, z, tmp);
#ifdef VM_LITTLE_ENDIAN
  rldicl(yz_idx, yz_idx, 32, 0);
#endif

  add2_with_carry(product_high, product, carry, yz_idx);

  sldi(tmp, idx, LogBytesPerInt);
  if (offset) {
    addi(tmp, tmp, offset);
  }
#ifdef VM_LITTLE_ENDIAN
  rldicl(product, product, 32, 0);
#endif
  stdx(product, z, tmp);
}

// Multiply 128 bit by 128 bit. Unrolled inner loop.
void MacroAssembler::multiply_128_x_128_loop(Register x_xstart,
                                             Register y, Register z,
                                             Register yz_idx, Register idx, Register carry,
                                             Register product_high, Register product,
                                             Register carry2, Register tmp) {

  // jlong carry, x[], y[], z[];
  // int kdx = ystart + 1;
  // for (int idx = ystart - 2; idx >= 0; idx -= 2) { // Third loop
  //   huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
  //   z[kdx+idx+1] = (jlong)product;
  //   jlong carry2 = (jlong)(product >>> 64);
  //   product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
  //   z[kdx+idx] = (jlong)product;
  //   carry = (jlong)(product >>> 64);
  // }
  // idx += 2;
  // if (idx > 0) {
  //   product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
  //   z[kdx+idx] = (jlong)product;
  //   carry = (jlong)(product >>> 64);
  // }

  Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
  const Register jdx = R0;

  // Scale the index.
  srdi_(jdx, idx, 2);
  beq(CCR0, L_third_loop_exit);
  mtctr(jdx);

  align(32, 16);
  bind(L_third_loop);

  addi(idx, idx, -4);

  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8);
  mr_if_needed(carry2, product_high);

  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0);
  mr_if_needed(carry, product_high);
  bdnz(L_third_loop);

  bind(L_third_loop_exit); // Handle any left-over operand parts.
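  // At this point at most 3 unprocessed 32-bit digits remain (idx & 3):
  // one more 64-bit pair is handled below if idx >= 2, then a final single
  // 32-bit digit if idx is odd.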

  andi_(idx, idx, 0x3);
  beq(CCR0, L_post_third_loop_done);

  Label L_check_1;

  addic_(idx, idx, -2);
  blt(CCR0, L_check_1);

  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0);
  mr_if_needed(carry, product_high);

  bind(L_check_1);

  addi(idx, idx, 0x2);
  andi_(idx, idx, 0x1);
  addic_(idx, idx, -1);
  blt(CCR0, L_post_third_loop_done);

  sldi(tmp, idx, LogBytesPerInt);
  lwzx(yz_idx, y, tmp);
  multiply64(product_high, product, x_xstart, yz_idx);
  lwzx(yz_idx, z, tmp);

  add2_with_carry(product_high, product, yz_idx, carry);

  sldi(tmp, idx, LogBytesPerInt);
  stwx(product, z, tmp);
  srdi(product, product, 32);

  sldi(product_high, product_high, 32);
  orr(product, product, product_high);
  mr_if_needed(carry, product);

  bind(L_post_third_loop_done);
} // multiply_128_x_128_loop

void MacroAssembler::muladd(Register out, Register in,
                            Register offset, Register len, Register k,
                            Register tmp1, Register tmp2, Register carry) {

  // Labels
  Label LOOP, SKIP;

  // Make sure length is positive.
  cmpdi(CCR0, len, 0);

  // Prepare variables
  subi(offset, offset, 4);
  li(carry, 0);
  ble(CCR0, SKIP);

  mtctr(len);
  subi(len, len, 1);
  sldi(len, len, 2);

  // Main loop
  bind(LOOP);
  lwzx(tmp1, len, in);
  lwzx(tmp2, offset, out);
  mulld(tmp1, tmp1, k);
  add(tmp2, carry, tmp2);
  add(tmp2, tmp1, tmp2);
  stwx(tmp2, offset, out);
  srdi(carry, tmp2, 32);
  subi(offset, offset, 4);
  subi(len, len, 4);
  bdnz(LOOP);
  bind(SKIP);
}

void MacroAssembler::multiply_to_len(Register x, Register xlen,
                                     Register y, Register ylen,
                                     Register z, Register zlen,
                                     Register tmp1, Register tmp2,
                                     Register tmp3, Register tmp4,
                                     Register tmp5, Register tmp6,
                                     Register tmp7, Register tmp8,
                                     Register tmp9, Register tmp10,
                                     Register tmp11, Register tmp12,
                                     Register tmp13) {

  ShortBranchVerifier sbv(this);

  assert_different_registers(x, xlen, y, ylen, z, zlen,
                             tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
  assert_different_registers(x, xlen, y, ylen, z, zlen,
                             tmp1, tmp2, tmp3, tmp4, tmp5, tmp7);
  assert_different_registers(x, xlen, y, ylen, z, zlen,
                             tmp1, tmp2, tmp3, tmp4, tmp5, tmp8);

  const Register idx = tmp1;
  const Register kdx = tmp2;
  const Register xstart = tmp3;

  const Register y_idx = tmp4;
  const Register carry = tmp5;
  const Register product = tmp6;
  const Register product_high = tmp7;
  const Register x_xstart = tmp8;
  const Register tmp = tmp9;

  // First Loop.
  //
  // final static long LONG_MASK = 0xffffffffL;
  // int xstart = xlen - 1;
  // int ystart = ylen - 1;
  // long carry = 0;
  // for (int idx = ystart, kdx = ystart + 1 + xstart; idx >= 0; idx--, kdx--) {
  //   long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
  //   z[kdx] = (int)product;
  //   carry = product >>> 32;
  // }
  // z[xstart] = (int)carry;

  mr_if_needed(idx, ylen);  // idx = ylen
  mr_if_needed(kdx, zlen);  // kdx = xlen + ylen
  li(carry, 0);             // carry = 0

  Label L_done;

  addic_(xstart, xlen, -1);
  blt(CCR0, L_done);

  multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z,
                        carry, product_high, product, idx, kdx, tmp);

  Label L_second_loop;

  cmpdi(CCR0, kdx, 0);
  beq(CCR0, L_second_loop);

  Label L_carry;

  addic_(kdx, kdx, -1);
  beq(CCR0, L_carry);

  // Store lower 32 bits of carry.
  sldi(tmp, kdx, LogBytesPerInt);
  stwx(carry, z, tmp);
  srdi(carry, carry, 32);
  addi(kdx, kdx, -1);

  bind(L_carry);

  // Store upper 32 bits of carry.
  sldi(tmp, kdx, LogBytesPerInt);
  stwx(carry, z, tmp);

  // Second and third (nested) loops.
  //
  // for (int i = xstart - 1; i >= 0; i--) { // Second loop
  //   carry = 0;
  //   for (int jdx = ystart, k = ystart + 1 + i; jdx >= 0; jdx--, k--) { // Third loop
  //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
  //                    (z[k] & LONG_MASK) + carry;
  //     z[k] = (int)product;
  //     carry = product >>> 32;
  //   }
  //   z[i] = (int)carry;
  // }
  //
  // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx

  bind(L_second_loop);

  li(carry, 0);               // carry = 0;

  addic_(xstart, xstart, -1); // i = xstart-1;
  blt(CCR0, L_done);

  Register zsave = tmp10;

  mr(zsave, z);

  Label L_last_x;

  sldi(tmp, xstart, LogBytesPerInt);
  add(z, z, tmp);             // z = z + k - j
  addi(z, z, 4);
  addic_(xstart, xstart, -1); // i = xstart-1;
  blt(CCR0, L_last_x);

  sldi(tmp, xstart, LogBytesPerInt);
  ldx(x_xstart, x, tmp);
#ifdef VM_LITTLE_ENDIAN
  rldicl(x_xstart, x_xstart, 32, 0);
#endif

  Label L_third_loop_prologue;

  bind(L_third_loop_prologue);

  Register xsave = tmp11;
  Register xlensave = tmp12;
  Register ylensave = tmp13;

  mr(xsave, x);
  mr(xlensave, xstart);
  mr(ylensave, ylen);

  multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen,
                          carry, product_high, product, x, tmp);

  mr(z, zsave);
  mr(x, xsave);
  mr(xlen, xlensave); // This is the decrement of the loop counter!
  mr(ylen, ylensave);

  addi(tmp3, xlen, 1);
  sldi(tmp, tmp3, LogBytesPerInt);
  stwx(carry, z, tmp);
  addic_(tmp3, tmp3, -1);
  blt(CCR0, L_done);

  srdi(carry, carry, 32);
  sldi(tmp, tmp3, LogBytesPerInt);
  stwx(carry, z, tmp);
  b(L_second_loop);

  // Next infrequent code is moved outside loops.
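  // L_last_x: x has odd length, so its top 32 bit digit is loaded as (0, value),
  // matching the L_one_x handling in multiply_64_x_64_loop.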
  bind(L_last_x);

  lwz(x_xstart, 0, x);
  b(L_third_loop_prologue);

  bind(L_done);
} // multiply_to_len

void MacroAssembler::asm_assert(bool check_equal, const char *msg, int id) {
#ifdef ASSERT
  Label ok;
  if (check_equal) {
    beq(CCR0, ok);
  } else {
    bne(CCR0, ok);
  }
  stop(msg, id);
  bind(ok);
#endif
}

void MacroAssembler::asm_assert_mems_zero(bool check_equal, int size, int mem_offset,
                                          Register mem_base, const char* msg, int id) {
#ifdef ASSERT
  switch (size) {
    case 4:
      lwz(R0, mem_offset, mem_base);
      cmpwi(CCR0, R0, 0);
      break;
    case 8:
      ld(R0, mem_offset, mem_base);
      cmpdi(CCR0, R0, 0);
      break;
    default:
      ShouldNotReachHere();
  }
  asm_assert(check_equal, msg, id);
#endif // ASSERT
}

void MacroAssembler::verify_thread() {
  if (VerifyThread) {
    unimplemented("'VerifyThread' currently not implemented on PPC");
  }
}

// READ: oop. KILL: R0. Volatile floats perhaps.
void MacroAssembler::verify_oop(Register oop, const char* msg) {
  if (!VerifyOops) {
    return;
  }

  address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
  const Register tmp = R11; // Will be preserved.
  const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
  save_volatile_gprs(R1_SP, -nbytes_save); // except R0

  mr_if_needed(R4_ARG2, oop);
  save_LR_CR(tmp); // save in old frame
  push_frame_reg_args(nbytes_save, tmp);
  // load FunctionDescriptor** / entry_address *
  load_const_optimized(tmp, fd, R0);
  // load FunctionDescriptor* / entry_address
  ld(tmp, 0, tmp);
  load_const_optimized(R3_ARG1, (address)msg, R0);
  // Call destination for its side effect.
  call_c(tmp);

  pop_frame();
  restore_LR_CR(tmp);
  restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
}

void MacroAssembler::verify_oop_addr(RegisterOrConstant offs, Register base, const char* msg) {
  if (!VerifyOops) {
    return;
  }

  address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
  const Register tmp = R11; // Will be preserved.
  const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
  save_volatile_gprs(R1_SP, -nbytes_save); // except R0

  ld(R4_ARG2, offs, base);
  save_LR_CR(tmp); // save in old frame
  push_frame_reg_args(nbytes_save, tmp);
  // load FunctionDescriptor** / entry_address *
  load_const_optimized(tmp, fd, R0);
  // load FunctionDescriptor* / entry_address
  ld(tmp, 0, tmp);
  load_const_optimized(R3_ARG1, (address)msg, R0);
  // Call destination for its side effect.
  call_c(tmp);

  pop_frame();
  restore_LR_CR(tmp);
  restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
}

const char* stop_types[] = {
  "stop",
  "untested",
  "unimplemented",
  "shouldnotreachhere"
};

static void stop_on_request(int tp, const char* msg) {
  tty->print("PPC assembly code requires stop: (%s) %s\n", stop_types[tp%/*stop_end*/4], msg);
  guarantee(false, "PPC assembly code requires stop: %s", msg);
}

// Call a C-function that prints output.
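// The emitted sequence is, in effect:
//   R3_ARG1 = type; R4_ARG2 = msg;
//   call stop_on_request(type, msg);  // prints the message and guarantees(false)
//   illtrap                           // hard stop if the call ever returns
//   .int id                           // id word trails the trap for later decoding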
void MacroAssembler::stop(int type, const char* msg, int id) {
#ifndef PRODUCT
  block_comment(err_msg("stop: %s %s {", stop_types[type%stop_end], msg));
#else
  block_comment("stop {");
#endif

  // setup arguments
  load_const_optimized(R3_ARG1, type);
  load_const_optimized(R4_ARG2, (void *)msg, /*tmp=*/R0);
  call_VM_leaf(CAST_FROM_FN_PTR(address, stop_on_request), R3_ARG1, R4_ARG2);
  illtrap();
  emit_int32(id);
  block_comment("} stop;");
}

#ifndef PRODUCT
// Write pattern 0x0101010101010101 in memory region [low-before, high+after].
// Val, addr are temp registers.
// If low == addr, addr is killed.
// High is preserved.
void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) {
  if (!ZapMemory) return;

  assert_different_registers(low, val);

  BLOCK_COMMENT("zap memory region {");
  load_const_optimized(val, 0x0101010101010101);
  int size = before + after;
  if (low == high && size < 5 && size > 0) {
    int offset = -before * BytesPerWord;
    for (int i = 0; i < size; ++i) {
      std(val, offset, low);
      offset += (1*BytesPerWord);
    }
  } else {
    addi(addr, low, -before * BytesPerWord);
    assert_different_registers(high, val);
    if (after) addi(high, high, after * BytesPerWord);
    Label loop;
    bind(loop);
    std(val, 0, addr);
    addi(addr, addr, 8);
    cmpd(CCR6, addr, high);
    ble(CCR6, loop);
    if (after) addi(high, high, -after * BytesPerWord); // Correct back to old value.
  }
  BLOCK_COMMENT("} zap memory region");
}

#endif // !PRODUCT

void SkipIfEqualZero::skip_to_label_if_equal_zero(MacroAssembler* masm, Register temp,
                                                  const bool* flag_addr, Label& label) {
  int simm16_offset = masm->load_const_optimized(temp, (address)flag_addr, R0, true);
  assert(sizeof(bool) == 1, "PowerPC ABI");
  masm->lbz(temp, simm16_offset, temp);
  masm->cmpwi(CCR0, temp, 0);
  masm->beq(CCR0, label);
}

SkipIfEqualZero::SkipIfEqualZero(MacroAssembler* masm, Register temp, const bool* flag_addr) : _masm(masm), _label() {
  skip_to_label_if_equal_zero(masm, temp, flag_addr, _label);
}

SkipIfEqualZero::~SkipIfEqualZero() {
  _masm->bind(_label);
}
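// Illustrative use of SkipIfEqualZero (a sketch, not code from this file):
// wrap emitted instructions so they are bypassed at runtime whenever the
// watched flag reads zero. 'SomeDiagnosticFlag' is a hypothetical example.
//
//   {
//     SkipIfEqualZero skip(masm, R11_scratch1, &SomeDiagnosticFlag);
//     // ... instructions emitted here execute only if the flag is non-zero ...
//   } // The destructor binds the skip target right after this scope.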