/*
 * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2018, SAP SE. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "compiler/disassembler.hpp"
#include "gc/shared/collectedHeap.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "interpreter/interpreter.hpp"
#include "memory/resourceArea.hpp"
#include "nativeInst_ppc.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/biasedLocking.hpp"
#include "runtime/icache.hpp"
#include "runtime/interfaceSupport.inline.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/os.hpp"
#include "runtime/safepoint.hpp"
#include "runtime/safepointMechanism.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/macros.hpp"
#ifdef COMPILER2
#include "opto/intrinsicnode.hpp"
#endif

#ifdef PRODUCT
#define BLOCK_COMMENT(str) // nothing
#else
#define BLOCK_COMMENT(str) block_comment(str)
#endif
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

#ifdef ASSERT
// On RISC, there's no benefit to verifying instruction boundaries.
bool AbstractAssembler::pd_check_instruction_mark() { return false; }
#endif

void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) {
  assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range");
  if (Assembler::is_simm(si31, 16)) {
    ld(d, si31, a);
    if (emit_filler_nop) nop();
  } else {
    const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31);
    const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31);
    addis(d, a, hi);
    ld(d, lo, d);
  }
}

void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) {
  assert_different_registers(d, a);
  ld_largeoffset_unchecked(d, si31, a, emit_filler_nop);
}

void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base,
                                      size_t size_in_bytes, bool is_signed) {
  switch (size_in_bytes) {
  case 8: ld(dst, offs, base); break;
  case 4: is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break;
  case 2: is_signed ? lha(dst, offs, base) : lhz(dst, offs, base); break;
  case 1: lbz(dst, offs, base); if (is_signed) extsb(dst, dst); break; // lba doesn't exist :(
  default: ShouldNotReachHere();
  }
}

void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base,
                                       size_t size_in_bytes) {
  switch (size_in_bytes) {
  case 8: std(dst, offs, base); break;
  case 4: stw(dst, offs, base); break;
  case 2: sth(dst, offs, base); break;
  case 1: stb(dst, offs, base); break;
  default: ShouldNotReachHere();
  }
}
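
// Align the code position to the given modulus by emitting nops, such that
// offset() % modulus == rem afterwards. If more than max bytes of padding
// would be needed, emit nothing.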
void MacroAssembler::align(int modulus, int max, int rem) {
  int padding = (rem + modulus - (offset() % modulus)) % modulus;
  if (padding > max) return;
  for (int c = (padding >> 2); c > 0; --c) { nop(); }
}

// Issue instructions that calculate given TOC from global TOC.
void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16,
                                                       bool add_relocation, bool emit_dummy_addr) {
  int offset = -1;
  if (emit_dummy_addr) {
    offset = -128; // dummy address
  } else if (addr != (address)(intptr_t)-1) {
    offset = MacroAssembler::offset_to_global_toc(addr);
  }

  if (hi16) {
    addis(dst, R29_TOC, MacroAssembler::largeoffset_si16_si16_hi(offset));
  }
  if (lo16) {
    if (add_relocation) {
      // Relocate at the addi to avoid confusion with a load from the method's TOC.
      relocate(internal_word_Relocation::spec(addr));
    }
    addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset));
  }
}
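
// The sequence emitted by calculate_address_from_global_toc() is
//   addis dst, R29_TOC, hi16
//   addi  dst, dst, lo16
// with the relocation attached to the addi. The patching and inspection
// routines below therefore start at the addi and scan backwards for the
// matching addis that writes dst.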
address MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) {
  const int offset = MacroAssembler::offset_to_global_toc(addr);

  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the addi,
  // and the addi reads and writes the same register dst.
  const int dst = inv_rt_field(inst2);
  assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");

  // Now, find the preceding addis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  while (inst1_addr >= bound) {
    inst1 = *(int *) inst1_addr;
    if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
      // Stop, found the addis which writes dst.
      break;
    }
    inst1_addr -= BytesPerInstWord;
  }

  assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
  set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset));
  set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset));
  return inst1_addr;
}

address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) {
  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the addi,
  // and the addi reads and writes the same register dst.
  const int dst = inv_rt_field(inst2);
  assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");

  // Now, find the preceding addis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  while (inst1_addr >= bound) {
    inst1 = *(int *) inst1_addr;
    if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
      // stop, found the addis which writes dst
      break;
    }
    inst1_addr -= BytesPerInstWord;
  }

  assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");

  int offset = (get_imm(inst1_addr, 0) << 16) + get_imm(inst2_addr, 0);
  // -1 is a special case
  if (offset == -1) {
    return (address)(intptr_t)-1;
  } else {
    return global_toc() + offset;
  }
}

#ifdef _LP64
// Patch compressed oops or klass constants.
// Assembler sequence is
// 1) compressed oops:
//    lis  rx = const.hi
//    ori  rx = rx | const.lo
// 2) compressed klass:
//    lis  rx = const.hi
//    clrldi rx = rx & 0xFFFFffff // clearMS32b, optional
//    ori  rx = rx | const.lo
// Clrldi will be passed by.
address MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) {
  assert(UseCompressedOops, "Should only patch compressed oops");

  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the ori,
  // and the ori reads and writes the same register dst.
  const int dst = inv_rta_field(inst2);
  assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
  // Now, find the preceding addis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  bool inst1_found = false;
  while (inst1_addr >= bound) {
    inst1 = *(int *)inst1_addr;
    if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
    inst1_addr -= BytesPerInstWord;
  }
  assert(inst1_found, "inst is not lis");

  int xc = (data >> 16) & 0xffff;
  int xd = (data >>  0) & 0xffff;

  set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo
  set_imm((int *)inst2_addr, (xd)); // unsigned int
  return inst1_addr;
}

// Get compressed oop or klass constant.
narrowOop MacroAssembler::get_narrow_oop(address a, address bound) {
  assert(UseCompressedOops, "Should only patch compressed oops");

  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the ori,
  // and the ori reads and writes the same register dst.
  const int dst = inv_rta_field(inst2);
  assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
  // Now, find the preceding lis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  bool inst1_found = false;

  while (inst1_addr >= bound) {
    inst1 = *(int *) inst1_addr;
    if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
    inst1_addr -= BytesPerInstWord;
  }
  assert(inst1_found, "inst is not lis");

  uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff));
  uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16);

  return (int) (xl | xh);
}
#endif // _LP64

// Returns true if successful.
bool MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a,
                                                Register toc, bool fixed_size) {
  int toc_offset = 0;
  // Use RelocationHolder::none for the constant pool entry, otherwise
  // we will end up with a failing NativeCall::verify(x) where x is
  // the address of the constant pool entry.
  // FIXME: We should insert relocation information for oops at the constant
  // pool entries instead of inserting it at the loads; patching of a constant
  // pool entry should be less expensive.
  address const_address = address_constant((address)a.value(), RelocationHolder::none);
  if (const_address == NULL) { return false; } // allocation failure
  // Relocate at the pc of the load.
  relocate(a.rspec());
  toc_offset = (int)(const_address - code()->consts()->start());
  ld_largeoffset_unchecked(dst, toc_offset, toc, fixed_size);
  return true;
}

bool MacroAssembler::is_load_const_from_method_toc_at(address a) {
  const address inst1_addr = a;
  const int inst1 = *(int *)inst1_addr;

  // The relocation points to the ld or the addis.
  return (is_ld(inst1)) ||
         (is_addis(inst1) && inv_ra_field(inst1) != 0);
}

int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) {
  assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc");

  const address inst1_addr = a;
  const int inst1 = *(int *)inst1_addr;

  if (is_ld(inst1)) {
    return inv_d1_field(inst1);
  } else if (is_addis(inst1)) {
    const int dst = inv_rt_field(inst1);

    // Now, find the succeeding ld which reads and writes to dst.
    address inst2_addr = inst1_addr + BytesPerInstWord;
    int inst2 = 0;
    while (true) {
      inst2 = *(int *) inst2_addr;
      if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) {
        // Stop, found the ld which reads and writes dst.
        break;
      }
      inst2_addr += BytesPerInstWord;
    }
    return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2);
  }
  ShouldNotReachHere();
  return 0;
}
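
// Note on the slot indices used in get_const() and patch_const() below:
// load_const emits one of two five-instruction variants (distinguished by
// whether the second instruction is an ori or a lis), and the four 16-bit
// halves of the constant sit in different instruction slots depending on
// the variant.
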
// Get the constant from a `load_const' sequence.
long MacroAssembler::get_const(address a) {
  assert(is_load_const_at(a), "not a load of a constant");
  const int *p = (const int*) a;
  unsigned long x = (((unsigned long) (get_imm(a,0) & 0xffff)) << 48);
  if (is_ori(*(p+1))) {
    x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 32);
    x |= (((unsigned long) (get_imm(a,3) & 0xffff)) << 16);
    x |= (((unsigned long) (get_imm(a,4) & 0xffff)));
  } else if (is_lis(*(p+1))) {
    x |= (((unsigned long) (get_imm(a,2) & 0xffff)) << 32);
    x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 16);
    x |= (((unsigned long) (get_imm(a,3) & 0xffff)));
  } else {
    ShouldNotReachHere();
    return (long) 0;
  }
  return (long) x;
}

// Patch the 64 bit constant of a `load_const' sequence. This is a low-level
// procedure. It neither flushes the instruction cache nor is it MT-safe.
void MacroAssembler::patch_const(address a, long x) {
  assert(is_load_const_at(a), "not a load of a constant");
  int *p = (int*) a;
  if (is_ori(*(p+1))) {
    set_imm(0 + p, (x >> 48) & 0xffff);
    set_imm(1 + p, (x >> 32) & 0xffff);
    set_imm(3 + p, (x >> 16) & 0xffff);
    set_imm(4 + p, x & 0xffff);
  } else if (is_lis(*(p+1))) {
    set_imm(0 + p, (x >> 48) & 0xffff);
    set_imm(2 + p, (x >> 32) & 0xffff);
    set_imm(1 + p, (x >> 16) & 0xffff);
    set_imm(3 + p, x & 0xffff);
  } else {
    ShouldNotReachHere();
  }
}

AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) {
  assert(oop_recorder() != NULL, "this assembler needs a Recorder");
  int index = oop_recorder()->allocate_metadata_index(obj);
  RelocationHolder rspec = metadata_Relocation::spec(index);
  return AddressLiteral((address)obj, rspec);
}

AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) {
  assert(oop_recorder() != NULL, "this assembler needs a Recorder");
  int index = oop_recorder()->find_index(obj);
  RelocationHolder rspec = metadata_Relocation::spec(index);
  return AddressLiteral((address)obj, rspec);
}

AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) {
  assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int oop_index = oop_recorder()->allocate_oop_index(obj);
  return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
}

AddressLiteral MacroAssembler::constant_oop_address(jobject obj) {
  assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int oop_index = oop_recorder()->find_index(obj);
  return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
}

RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
                                                      Register tmp, int offset) {
  intptr_t value = *delayed_value_addr;
  if (value != 0) {
    return RegisterOrConstant(value + offset);
  }

  // Load indirectly to solve generation ordering problem.
  // static address, no relocation
  int simm16_offset = load_const_optimized(tmp, delayed_value_addr, noreg, true);
  ld(tmp, simm16_offset, tmp); // must be aligned ((xa & 3) == 0)

  if (offset != 0) {
    addi(tmp, tmp, offset);
  }

  return RegisterOrConstant(tmp);
}

#ifndef PRODUCT
void MacroAssembler::pd_print_patched_instruction(address branch) {
  Unimplemented(); // TODO: PPC port
}
#endif // ndef PRODUCT
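
// A bc_far occupies two instruction slots and exists in three patchable
// variants (see set_dest_of_bc_far_at() below):
//   variant 1: bcxx  DEST; nop         (destination in reach of bcxx)
//   variant 2: b!cxx SKIP; bxx DEST    (destination needs an unconditional far branch)
//   variant 3: nop; endgroup           (branch to the immediately following instruction)
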
// Conditional far branch for destinations encodable in 24+2 bits.
void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) {

  // If requested by flag optimize, relocate the bc_far as a
  // runtime_call and prepare for optimizing it when the code gets
  // relocated.
  if (optimize == bc_far_optimize_on_relocate) {
    relocate(relocInfo::runtime_call_type);
  }

  // variant 2:
  //
  //    b!cxx SKIP
  //    bxx   DEST
  //  SKIP:
  //

  const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
                                                opposite_bcond(inv_boint_bcond(boint)));

  // We emit two branches.
  // First, a conditional branch which jumps around the far branch.
  const address not_taken_pc = pc() + 2 * BytesPerInstWord;
  const address bc_pc        = pc();
  bc(opposite_boint, biint, not_taken_pc);

  const int bc_instr = *(int*)bc_pc;
  assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition");
  assert(opposite_boint == inv_bo_field(bc_instr), "postcondition");
  assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))),
                                     opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))),
         "postcondition");
  assert(biint == inv_bi_field(bc_instr), "postcondition");

  // Second, an unconditional far branch which jumps to dest.
  // Note: target(dest) remembers the current pc (see CodeSection::target)
  // and returns the current pc if the label is not bound yet; when
  // the label gets bound, the unconditional far branch will be patched.
  const address target_pc = target(dest);
  const address b_pc      = pc();
  b(target_pc);

  assert(not_taken_pc == pc(),                 "postcondition");
  assert(dest.is_bound() || target_pc == b_pc, "postcondition");
}

// 1 or 2 instructions
void MacroAssembler::bc_far_optimized(int boint, int biint, Label& dest) {
  if (dest.is_bound() && is_within_range_of_bcxx(target(dest), pc())) {
    bc(boint, biint, dest);
  } else {
    bc_far(boint, biint, dest, MacroAssembler::bc_far_optimize_on_relocate);
  }
}

bool MacroAssembler::is_bc_far_at(address instruction_addr) {
  return is_bc_far_variant1_at(instruction_addr) ||
         is_bc_far_variant2_at(instruction_addr) ||
         is_bc_far_variant3_at(instruction_addr);
}

address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) {
  if (is_bc_far_variant1_at(instruction_addr)) {
    const address instruction_1_addr = instruction_addr;
    const int instruction_1 = *(int*)instruction_1_addr;
    return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr);
  } else if (is_bc_far_variant2_at(instruction_addr)) {
    const address instruction_2_addr = instruction_addr + 4;
    return bxx_destination(instruction_2_addr);
  } else if (is_bc_far_variant3_at(instruction_addr)) {
    return instruction_addr + 8;
  }
  // variant 4 ???
  ShouldNotReachHere();
  return NULL;
}

void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) {

  if (is_bc_far_variant3_at(instruction_addr)) {
    // variant 3, far cond branch to the next instruction, already patched to nops:
    //
    //    nop
    //    endgroup
    //  SKIP/DEST:
    //
    return;
  }

  // first, extract boint and biint from the current branch
  int boint = 0;
  int biint = 0;

  ResourceMark rm;
  const int code_size = 2 * BytesPerInstWord;
  CodeBuffer buf(instruction_addr, code_size);
  MacroAssembler masm(&buf);
  if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) {
    // Far branch to next instruction: Optimize it by patching nops (produce variant 3).
    masm.nop();
    masm.endgroup();
  } else {
    if (is_bc_far_variant1_at(instruction_addr)) {
      // variant 1, the 1st instruction contains the destination address:
      //
      //    bcxx  DEST
      //    nop
      //
      const int instruction_1 = *(int*)(instruction_addr);
      boint = inv_bo_field(instruction_1);
      biint = inv_bi_field(instruction_1);
    } else if (is_bc_far_variant2_at(instruction_addr)) {
      // variant 2, the 2nd instruction contains the destination address:
      //
      //    b!cxx SKIP
      //    bxx   DEST
      //  SKIP:
      //
      const int instruction_1 = *(int*)(instruction_addr);
      boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))),
                                 opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1))));
      biint = inv_bi_field(instruction_1);
    } else {
      // variant 4???
      ShouldNotReachHere();
    }

    // second, set the new branch destination and optimize the code
    if (dest != instruction_addr + 4 && // the bc_far is still unbound!
        masm.is_within_range_of_bcxx(dest, instruction_addr)) {
      // variant 1:
      //
      //    bcxx  DEST
      //    nop
      //
      masm.bc(boint, biint, dest);
      masm.nop();
    } else {
      // variant 2:
      //
      //    b!cxx SKIP
      //    bxx   DEST
      //  SKIP:
      //
      const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
                                                    opposite_bcond(inv_boint_bcond(boint)));
      const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord;
      masm.bc(opposite_boint, biint, not_taken_pc);
      masm.b(dest);
    }
  }
  ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
}
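
// Each bxx64_patchable sequence below occupies the same number of
// instructions (bxx64_patchable_size), so a call site can be repatched
// between the pc-relative form (variant 2) and the TOC-relative form
// (variant 1b) without changing the code size.
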
// Emit a NOT mt-safe patchable 64 bit absolute call/jump.
void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) {
  // get current pc
  uint64_t start_pc = (uint64_t) pc();

  const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last
  const address pc_of_b  = (address) (start_pc + (0*BytesPerInstWord)); // b is first

  // relocate here
  if (rt != relocInfo::none) {
    relocate(rt);
  }

  if ( ReoptimizeCallSequences &&
       (( link && is_within_range_of_b(dest, pc_of_bl)) ||
        (!link && is_within_range_of_b(dest, pc_of_b)))) {
    // variant 2:
    // Emit an optimized, pc-relative call/jump.

    if (link) {
      // some padding
      nop();
      nop();
      nop();
      nop();
      nop();
      nop();

      // do the call
      assert(pc() == pc_of_bl, "just checking");
      bl(dest, relocInfo::none);
    } else {
      // do the jump
      assert(pc() == pc_of_b, "just checking");
      b(dest, relocInfo::none);

      // some padding
      nop();
      nop();
      nop();
      nop();
      nop();
      nop();
    }

    // Assert that we can identify the emitted call/jump.
    assert(is_bxx64_patchable_variant2_at((address)start_pc, link),
           "can't identify emitted call");
  } else {
    // variant 1:
    mr(R0, R11);  // spill R11 -> R0.

    // Load the destination address into CTR,
    // calculate destination relative to global toc.
    calculate_address_from_global_toc(R11, dest, true, true, false);

    mtctr(R11);
    mr(R11, R0);  // spill R11 <- R0.
    nop();

    // do the call/jump
    if (link) {
      bctrl();
    } else {
      bctr();
    }
    // Assert that we can identify the emitted call/jump.
    assert(is_bxx64_patchable_variant1b_at((address)start_pc, link),
           "can't identify emitted call");
  }

  // Assert that we can identify the emitted call/jump.
  assert(is_bxx64_patchable_at((address)start_pc, link),
         "can't identify emitted call");
  assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest,
         "wrong encoding of dest address");
}

// Identify a bxx64_patchable instruction.
bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) {
  return is_bxx64_patchable_variant1b_at(instruction_addr, link)
    //|| is_bxx64_patchable_variant1_at(instruction_addr, link)
      || is_bxx64_patchable_variant2_at(instruction_addr, link);
}

// Does the call64_patchable instruction use a pc-relative encoding of
// the call destination?
bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) {
  // variant 2 is pc-relative
  return is_bxx64_patchable_variant2_at(instruction_addr, link);
}

// Identify variant 1.
bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) {
  unsigned int* instr = (unsigned int*) instruction_addr;
  return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
      && is_mtctr(instr[5]) // mtctr
      && is_load_const_at(instruction_addr);
}

// Identify variant 1b: load destination relative to global toc.
bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) {
  unsigned int* instr = (unsigned int*) instruction_addr;
  return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
      && is_mtctr(instr[3]) // mtctr
      && is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr);
}

// Identify variant 2.
bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) {
  unsigned int* instr = (unsigned int*) instruction_addr;
  if (link) {
    return is_bl (instr[6])  // bl dest is last
        && is_nop(instr[0])  // nop
        && is_nop(instr[1])  // nop
        && is_nop(instr[2])  // nop
        && is_nop(instr[3])  // nop
        && is_nop(instr[4])  // nop
        && is_nop(instr[5]); // nop
  } else {
    return is_b  (instr[0])  // b dest is first
        && is_nop(instr[1])  // nop
        && is_nop(instr[2])  // nop
        && is_nop(instr[3])  // nop
        && is_nop(instr[4])  // nop
        && is_nop(instr[5])  // nop
        && is_nop(instr[6]); // nop
  }
}

// Set dest address of a bxx64_patchable instruction.
void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) {
  ResourceMark rm;
  int code_size = MacroAssembler::bxx64_patchable_size;
  CodeBuffer buf(instruction_addr, code_size);
  MacroAssembler masm(&buf);
  masm.bxx64_patchable(dest, relocInfo::none, link);
  ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
}

// Get dest address of a bxx64_patchable instruction.
address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) {
  if (is_bxx64_patchable_variant1_at(instruction_addr, link)) {
    return (address) (unsigned long) get_const(instruction_addr);
  } else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) {
    unsigned int* instr = (unsigned int*) instruction_addr;
    if (link) {
      const int instr_idx = 6; // bl is last
      int branchoffset = branch_destination(instr[instr_idx], 0);
      return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
    } else {
      const int instr_idx = 0; // b is first
      int branchoffset = branch_destination(instr[instr_idx], 0);
      return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
    }
  // Load dest relative to global toc.
  } else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) {
    return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord,
                                                               instruction_addr);
  } else {
    ShouldNotReachHere();
    return NULL;
  }
}

// Uses ordering which corresponds to ABI:
//    _savegpr0_14:  std  r14,-144(r1)
//    _savegpr0_15:  std  r15,-136(r1)
//    _savegpr0_16:  std  r16,-128(r1)
void MacroAssembler::save_nonvolatile_gprs(Register dst, int offset) {
  std(R14, offset, dst);   offset += 8;
  std(R15, offset, dst);   offset += 8;
  std(R16, offset, dst);   offset += 8;
  std(R17, offset, dst);   offset += 8;
  std(R18, offset, dst);   offset += 8;
  std(R19, offset, dst);   offset += 8;
  std(R20, offset, dst);   offset += 8;
  std(R21, offset, dst);   offset += 8;
  std(R22, offset, dst);   offset += 8;
  std(R23, offset, dst);   offset += 8;
  std(R24, offset, dst);   offset += 8;
  std(R25, offset, dst);   offset += 8;
  std(R26, offset, dst);   offset += 8;
  std(R27, offset, dst);   offset += 8;
  std(R28, offset, dst);   offset += 8;
  std(R29, offset, dst);   offset += 8;
  std(R30, offset, dst);   offset += 8;
  std(R31, offset, dst);   offset += 8;

  stfd(F14, offset, dst);  offset += 8;
  stfd(F15, offset, dst);  offset += 8;
  stfd(F16, offset, dst);  offset += 8;
  stfd(F17, offset, dst);  offset += 8;
  stfd(F18, offset, dst);  offset += 8;
  stfd(F19, offset, dst);  offset += 8;
  stfd(F20, offset, dst);  offset += 8;
  stfd(F21, offset, dst);  offset += 8;
  stfd(F22, offset, dst);  offset += 8;
  stfd(F23, offset, dst);  offset += 8;
  stfd(F24, offset, dst);  offset += 8;
  stfd(F25, offset, dst);  offset += 8;
  stfd(F26, offset, dst);  offset += 8;
  stfd(F27, offset, dst);  offset += 8;
  stfd(F28, offset, dst);  offset += 8;
  stfd(F29, offset, dst);  offset += 8;
  stfd(F30, offset, dst);  offset += 8;
  stfd(F31, offset, dst);
}

// Uses ordering which corresponds to ABI:
//    _restgpr0_14:  ld   r14,-144(r1)
//    _restgpr0_15:  ld   r15,-136(r1)
//    _restgpr0_16:  ld   r16,-128(r1)
void MacroAssembler::restore_nonvolatile_gprs(Register src, int offset) {
  ld(R14, offset, src);   offset += 8;
  ld(R15, offset, src);   offset += 8;
  ld(R16, offset, src);   offset += 8;
  ld(R17, offset, src);   offset += 8;
  ld(R18, offset, src);   offset += 8;
  ld(R19, offset, src);   offset += 8;
  ld(R20, offset, src);   offset += 8;
  ld(R21, offset, src);   offset += 8;
  ld(R22, offset, src);   offset += 8;
  ld(R23, offset, src);   offset += 8;
  ld(R24, offset, src);   offset += 8;
  ld(R25, offset, src);   offset += 8;
  ld(R26, offset, src);   offset += 8;
  ld(R27, offset, src);   offset += 8;
  ld(R28, offset, src);   offset += 8;
  ld(R29, offset, src);   offset += 8;
  ld(R30, offset, src);   offset += 8;
  ld(R31, offset, src);   offset += 8;

  // FP registers
  lfd(F14, offset, src);  offset += 8;
  lfd(F15, offset, src);  offset += 8;
  lfd(F16, offset, src);  offset += 8;
  lfd(F17, offset, src);  offset += 8;
  lfd(F18, offset, src);  offset += 8;
  lfd(F19, offset, src);  offset += 8;
  lfd(F20, offset, src);  offset += 8;
  lfd(F21, offset, src);  offset += 8;
  lfd(F22, offset, src);  offset += 8;
  lfd(F23, offset, src);  offset += 8;
  lfd(F24, offset, src);  offset += 8;
  lfd(F25, offset, src);  offset += 8;
  lfd(F26, offset, src);  offset += 8;
  lfd(F27, offset, src);  offset += 8;
  lfd(F28, offset, src);  offset += 8;
  lfd(F29, offset, src);  offset += 8;
  lfd(F30, offset, src);  offset += 8;
  lfd(F31, offset, src);
}

// For verify_oops.
void MacroAssembler::save_volatile_gprs(Register dst, int offset) {
  std(R2,  offset, dst);  offset += 8;
  std(R3,  offset, dst);  offset += 8;
  std(R4,  offset, dst);  offset += 8;
  std(R5,  offset, dst);  offset += 8;
  std(R6,  offset, dst);  offset += 8;
  std(R7,  offset, dst);  offset += 8;
  std(R8,  offset, dst);  offset += 8;
  std(R9,  offset, dst);  offset += 8;
  std(R10, offset, dst);  offset += 8;
  std(R11, offset, dst);  offset += 8;
  std(R12, offset, dst);  offset += 8;

  stfd(F0, offset, dst);  offset += 8;
  stfd(F1, offset, dst);  offset += 8;
  stfd(F2, offset, dst);  offset += 8;
  stfd(F3, offset, dst);  offset += 8;
  stfd(F4, offset, dst);  offset += 8;
  stfd(F5, offset, dst);  offset += 8;
  stfd(F6, offset, dst);  offset += 8;
  stfd(F7, offset, dst);  offset += 8;
  stfd(F8, offset, dst);  offset += 8;
  stfd(F9, offset, dst);  offset += 8;
  stfd(F10, offset, dst); offset += 8;
  stfd(F11, offset, dst); offset += 8;
  stfd(F12, offset, dst); offset += 8;
  stfd(F13, offset, dst);
}

// For verify_oops.
void MacroAssembler::restore_volatile_gprs(Register src, int offset) {
  ld(R2,  offset, src);   offset += 8;
  ld(R3,  offset, src);   offset += 8;
  ld(R4,  offset, src);   offset += 8;
  ld(R5,  offset, src);   offset += 8;
  ld(R6,  offset, src);   offset += 8;
  ld(R7,  offset, src);   offset += 8;
  ld(R8,  offset, src);   offset += 8;
  ld(R9,  offset, src);   offset += 8;
  ld(R10, offset, src);   offset += 8;
  ld(R11, offset, src);   offset += 8;
  ld(R12, offset, src);   offset += 8;

  lfd(F0, offset, src);   offset += 8;
  lfd(F1, offset, src);   offset += 8;
  lfd(F2, offset, src);   offset += 8;
  lfd(F3, offset, src);   offset += 8;
  lfd(F4, offset, src);   offset += 8;
  lfd(F5, offset, src);   offset += 8;
  lfd(F6, offset, src);   offset += 8;
  lfd(F7, offset, src);   offset += 8;
  lfd(F8, offset, src);   offset += 8;
  lfd(F9, offset, src);   offset += 8;
  lfd(F10, offset, src);  offset += 8;
  lfd(F11, offset, src);  offset += 8;
  lfd(F12, offset, src);  offset += 8;
  lfd(F13, offset, src);
}

void MacroAssembler::save_LR_CR(Register tmp) {
  mfcr(tmp);
  std(tmp, _abi(cr), R1_SP);
  mflr(tmp);
  std(tmp, _abi(lr), R1_SP);
  // Tmp must contain lr on exit! (see return_addr and prolog in ppc64.ad)
}

void MacroAssembler::restore_LR_CR(Register tmp) {
  assert(tmp != R1_SP, "must be distinct");
  ld(tmp, _abi(lr), R1_SP);
  mtlr(tmp);
  ld(tmp, _abi(cr), R1_SP);
  mtcr(tmp);
}
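
// Get the current pc. Branches with link to the immediately following
// instruction so that LR contains its address, then copies LR into result.
// Trashes LR; returns the captured pc.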
address MacroAssembler::get_PC_trash_LR(Register result) {
  Label L;
  bl(L);
  bind(L);
  address lr_pc = pc();
  mflr(result);
  return lr_pc;
}

void MacroAssembler::resize_frame(Register offset, Register tmp) {
#ifdef ASSERT
  assert_different_registers(offset, tmp, R1_SP);
  andi_(tmp, offset, frame::alignment_in_bytes-1);
  asm_assert_eq("resize_frame: unaligned", 0x204);
#endif

  // tmp <- *(SP)
  ld(tmp, _abi(callers_sp), R1_SP);
  // addr <- SP + offset;
  // *(addr) <- tmp;
  // SP <- addr
  stdux(tmp, R1_SP, offset);
}

void MacroAssembler::resize_frame(int offset, Register tmp) {
  assert(is_simm(offset, 16), "too big an offset");
  assert_different_registers(tmp, R1_SP);
  assert((offset & (frame::alignment_in_bytes-1))==0, "resize_frame: unaligned");
  // tmp <- *(SP)
  ld(tmp, _abi(callers_sp), R1_SP);
  // addr <- SP + offset;
  // *(addr) <- tmp;
  // SP <- addr
  stdu(tmp, offset, R1_SP);
}

void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) {
  // (addr == tmp1) || (addr == tmp2) is allowed here!
  assert(tmp1 != tmp2, "must be distinct");

  // compute offset w.r.t. current stack pointer
  // tmp_1 <- addr - SP (!)
  subf(tmp1, R1_SP, addr);

  // atomically update SP keeping back link.
  resize_frame(tmp1/* offset */, tmp2/* tmp */);
}

void MacroAssembler::push_frame(Register bytes, Register tmp) {
#ifdef ASSERT
  assert(bytes != R0, "r0 not allowed here");
  andi_(R0, bytes, frame::alignment_in_bytes-1);
  asm_assert_eq("push_frame(Reg, Reg): unaligned", 0x203);
#endif
  neg(tmp, bytes);
  stdux(R1_SP, R1_SP, tmp);
}

// Push a frame of size `bytes'.
void MacroAssembler::push_frame(unsigned int bytes, Register tmp) {
  long offset = align_addr(bytes, frame::alignment_in_bytes);
  if (is_simm(-offset, 16)) {
    stdu(R1_SP, -offset, R1_SP);
  } else {
    load_const_optimized(tmp, -offset);
    stdux(R1_SP, R1_SP, tmp);
  }
}

// Push a frame of size `bytes' plus abi_reg_args on top.
void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) {
  push_frame(bytes + frame::abi_reg_args_size, tmp);
}

// Set up a new C frame with a spill area for non-volatile GPRs and
// additional space for local variables.
void MacroAssembler::push_frame_reg_args_nonvolatiles(unsigned int bytes,
                                                      Register tmp) {
  push_frame(bytes + frame::abi_reg_args_size + frame::spill_nonvolatiles_size, tmp);
}

// Pop current C frame.
void MacroAssembler::pop_frame() {
  ld(R1_SP, _abi(callers_sp), R1_SP);
}

#if defined(ABI_ELFv2)
address MacroAssembler::branch_to(Register r_function_entry, bool and_link) {
  // TODO(asmundak): make sure the caller uses R12 as function descriptor
  // most of the time.
  if (R12 != r_function_entry) {
    mr(R12, r_function_entry);
  }
  mtctr(R12);
  // Do a call or a branch.
  if (and_link) {
    bctrl();
  } else {
    bctr();
  }
  _last_calls_return_pc = pc();

  return _last_calls_return_pc;
}

// Call a C function via a function descriptor and use full C
// calling conventions. Updates and returns _last_calls_return_pc.
address MacroAssembler::call_c(Register r_function_entry) {
  return branch_to(r_function_entry, /*and_link=*/true);
}

// For tail calls: only branch, don't link, so callee returns to caller of this function.
address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) {
  return branch_to(r_function_entry, /*and_link=*/false);
}

address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) {
  load_const(R12, function_entry, R0);
  return branch_to(R12, /*and_link=*/true);
}

#else
// Generic version of a call to C function via a function descriptor
// with variable support for C calling conventions (TOC, ENV, etc.).
// Updates and returns _last_calls_return_pc.
address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call,
                                  bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) {
  // we emit standard ptrgl glue code here
  assert((function_descriptor != R0), "function_descriptor cannot be R0");

  // retrieve necessary entries from the function descriptor
  ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor);
  mtctr(R0);

  if (load_toc_of_callee) {
    ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor);
  }
  if (load_env_of_callee) {
    ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor);
  } else if (load_toc_of_callee) {
    li(R11, 0);
  }

  // do a call or a branch
  if (and_link) {
    bctrl();
  } else {
    bctr();
  }
  _last_calls_return_pc = pc();

  return _last_calls_return_pc;
}

// Call a C function via a function descriptor and use full C calling
// conventions.
// We don't use the TOC in generated code, so there is no need to save
// and restore its value.
address MacroAssembler::call_c(Register fd) {
  return branch_to(fd, /*and_link=*/true,
                       /*save toc=*/false,
                       /*restore toc=*/false,
                       /*load toc=*/true,
                       /*load env=*/true);
}

address MacroAssembler::call_c_and_return_to_caller(Register fd) {
  return branch_to(fd, /*and_link=*/false,
                       /*save toc=*/false,
                       /*restore toc=*/false,
                       /*load toc=*/true,
                       /*load env=*/true);
}

address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) {
  if (rt != relocInfo::none) {
    // this call needs to be relocatable
    if (!ReoptimizeCallSequences
        || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
        || fd == NULL   // support code-size estimation
        || !fd->is_friend_function()
        || fd->entry() == NULL) {
      // it's not a friend function as defined by class FunctionDescriptor,
      // so do a full call-c here.
      load_const(R11, (address)fd, R0);

      bool has_env = (fd != NULL && fd->env() != NULL);
      return branch_to(R11, /*and_link=*/true,
                            /*save toc=*/false,
                            /*restore toc=*/false,
                            /*load toc=*/true,
                            /*load env=*/has_env);
    } else {
      // It's a friend function. Load the entry point and don't care about
      // toc and env. Use an optimizable call instruction, but ensure the
      // same code-size as in the case of a non-friend function.
      nop();
      nop();
      nop();
      bl64_patchable(fd->entry(), rt);
      _last_calls_return_pc = pc();
      return _last_calls_return_pc;
    }
  } else {
    // This call does not need to be relocatable, do more aggressive
    // optimizations.
    if (!ReoptimizeCallSequences
        || !fd->is_friend_function()) {
      // It's not a friend function as defined by class FunctionDescriptor,
      // so do a full call-c here.
      load_const(R11, (address)fd, R0);
      return branch_to(R11, /*and_link=*/true,
                            /*save toc=*/false,
                            /*restore toc=*/false,
                            /*load toc=*/true,
                            /*load env=*/true);
    } else {
      // it's a friend function, load the entry point and don't care about
      // toc and env.
      address dest = fd->entry();
      if (is_within_range_of_b(dest, pc())) {
        bl(dest);
      } else {
        bl64_patchable(dest, rt);
      }
      _last_calls_return_pc = pc();
      return _last_calls_return_pc;
    }
  }
}

// Call a C function. All constants needed reside in TOC.
//
// Read the address to call from the TOC.
// Read env from TOC, if fd specifies an env.
// Read new TOC from TOC.
address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd,
                                         relocInfo::relocType rt, Register toc) {
  if (!ReoptimizeCallSequences
      || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
      || !fd->is_friend_function()) {
    // It's not a friend function as defined by class FunctionDescriptor,
    // so do a full call-c here.
    assert(fd->entry() != NULL, "function must be linked");

    AddressLiteral fd_entry(fd->entry());
    bool success = load_const_from_method_toc(R11, fd_entry, toc, /*fixed_size*/ true);
    mtctr(R11);
    if (fd->env() == NULL) {
      li(R11, 0);
      nop();
    } else {
      AddressLiteral fd_env(fd->env());
      success = success && load_const_from_method_toc(R11, fd_env, toc, /*fixed_size*/ true);
    }
    AddressLiteral fd_toc(fd->toc());
    // Set R2_TOC (load from toc)
    success = success && load_const_from_method_toc(R2_TOC, fd_toc, toc, /*fixed_size*/ true);
    bctrl();
    _last_calls_return_pc = pc();
    if (!success) { return NULL; }
  } else {
    // It's a friend function, load the entry point and don't care about
    // toc and env. Use an optimizable call instruction, but ensure the
    // same code-size as in the case of a non-friend function.
    nop();
    bl64_patchable(fd->entry(), rt);
    _last_calls_return_pc = pc();
  }
  return _last_calls_return_pc;
}
#endif // ABI_ELFv2
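
// Call a helper in the VM runtime: set up the last Java frame, pass the
// current thread in R3_ARG1, call entry_point, reset the last Java frame,
// and fetch a potential oop result from the thread state.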
void MacroAssembler::call_VM_base(Register oop_result,
                                  Register last_java_sp,
                                  address  entry_point,
                                  bool     check_exceptions) {
  BLOCK_COMMENT("call_VM {");
  // Determine last_java_sp register.
  if (!last_java_sp->is_valid()) {
    last_java_sp = R1_SP;
  }
  set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1);

  // ARG1 must hold thread address.
  mr(R3_ARG1, R16_thread);
#if defined(ABI_ELFv2)
  address return_pc = call_c(entry_point, relocInfo::none);
#else
  address return_pc = call_c((FunctionDescriptor*)entry_point, relocInfo::none);
#endif

  reset_last_Java_frame();

  // Check for pending exceptions.
  if (check_exceptions) {
    // We don't check for exceptions here.
    ShouldNotReachHere();
  }

  // Get oop result if there is one and reset the value in the thread.
  if (oop_result->is_valid()) {
    get_vm_result(oop_result);
  }

  _last_calls_return_pc = return_pc;
  BLOCK_COMMENT("} call_VM");
}

void MacroAssembler::call_VM_leaf_base(address entry_point) {
  BLOCK_COMMENT("call_VM_leaf {");
#if defined(ABI_ELFv2)
  call_c(entry_point, relocInfo::none);
#else
  call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, entry_point), relocInfo::none);
#endif
  BLOCK_COMMENT("} call_VM_leaf");
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions) {
  call_VM_base(oop_result, noreg, entry_point, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread.
  mr_if_needed(R4_ARG2, arg_1);
  call_VM(oop_result, entry_point, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread
  mr_if_needed(R4_ARG2, arg_1);
  assert(arg_2 != R4_ARG2, "smashed argument");
  mr_if_needed(R5_ARG3, arg_2);
  call_VM(oop_result, entry_point, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread
  mr_if_needed(R4_ARG2, arg_1);
  assert(arg_2 != R4_ARG2, "smashed argument");
  mr_if_needed(R5_ARG3, arg_2);
  mr_if_needed(R6_ARG4, arg_3);
  call_VM(oop_result, entry_point, check_exceptions);
}

void MacroAssembler::call_VM_leaf(address entry_point) {
  call_VM_leaf_base(entry_point);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) {
  mr_if_needed(R3_ARG1, arg_1);
  call_VM_leaf(entry_point);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) {
  mr_if_needed(R3_ARG1, arg_1);
  assert(arg_2 != R3_ARG1, "smashed argument");
  mr_if_needed(R4_ARG2, arg_2);
  call_VM_leaf(entry_point);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) {
  mr_if_needed(R3_ARG1, arg_1);
  assert(arg_2 != R3_ARG1, "smashed argument");
  mr_if_needed(R4_ARG2, arg_2);
  assert(arg_3 != R3_ARG1 && arg_3 != R4_ARG2, "smashed argument");
  mr_if_needed(R5_ARG3, arg_3);
  call_VM_leaf(entry_point);
}

// Check whether instruction is a read access to the polling page
// which was emitted by load_from_polling_page(..).
bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext,
                                               address* polling_address_ptr) {
  if (!is_ld(instruction))
    return false; // It's not a ld. Fail.

  int rt = inv_rt_field(instruction);
  int ra = inv_ra_field(instruction);
  int ds = inv_ds_field(instruction);
  if (!(ds == 0 && ra != 0 && rt == 0)) {
    return false; // It's not a ld(r0, X, ra). Fail.
  }

  if (!ucontext) {
    // Set polling address.
    if (polling_address_ptr != NULL) {
      *polling_address_ptr = NULL;
    }
    return true; // No ucontext given. Can't check value of ra. Assume true.
  }

#ifdef LINUX
  // Ucontext given. Check that register ra contains the address of
  // the safepoint polling page.
  ucontext_t* uc = (ucontext_t*) ucontext;
  // Set polling address.
  address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds;
  if (polling_address_ptr != NULL) {
    *polling_address_ptr = addr;
  }
  return os::is_poll_address(addr);
#else
  // Not on Linux, ucontext must be NULL.
  ShouldNotReachHere();
  return false;
#endif
}
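
// Check whether the instruction is a 32-bit store (stw/stwu/stwx/stwux)
// to the thread's memory serialization page.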
bool MacroAssembler::is_memory_serialization(int instruction, JavaThread* thread, void* ucontext) {
#ifdef LINUX
  ucontext_t* uc = (ucontext_t*) ucontext;

  if (is_stwx(instruction) || is_stwux(instruction)) {
    int ra = inv_ra_field(instruction);
    int rb = inv_rb_field(instruction);

    // look up content of ra and rb in ucontext
    address ra_val=(address)uc->uc_mcontext.regs->gpr[ra];
    long rb_val=(long)uc->uc_mcontext.regs->gpr[rb];
    return os::is_memory_serialize_page(thread, ra_val+rb_val);
  } else if (is_stw(instruction) || is_stwu(instruction)) {
    int ra = inv_ra_field(instruction);
    int d1 = inv_d1_field(instruction);

    // look up content of ra in ucontext
    address ra_val=(address)uc->uc_mcontext.regs->gpr[ra];
    return os::is_memory_serialize_page(thread, ra_val+d1);
  } else {
    return false;
  }
#else
  // workaround not needed on !LINUX :-)
  ShouldNotCallThis();
  return false;
#endif
}

void MacroAssembler::bang_stack_with_offset(int offset) {
  // When increasing the stack, the old stack pointer will be written
  // to the new top of stack according to the PPC64 abi.
  // Therefore, stack banging is not necessary when increasing
  // the stack by <= os::vm_page_size() bytes.
  // When increasing the stack by a larger amount, this method is
  // called repeatedly to bang the intermediate pages.

  // Stack grows down, caller passes positive offset.
  assert(offset > 0, "must bang with positive offset");

  long stdoffset = -offset;

  if (is_simm(stdoffset, 16)) {
    // Signed 16 bit offset, a simple std is ok.
    if (UseLoadInstructionsForStackBangingPPC64) {
      ld(R0, (int)(signed short)stdoffset, R1_SP);
    } else {
      std(R0,(int)(signed short)stdoffset, R1_SP);
    }
  } else if (is_simm(stdoffset, 31)) {
    const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset);
    const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset);

    Register tmp = R11;
    addis(tmp, R1_SP, hi);
    if (UseLoadInstructionsForStackBangingPPC64) {
      ld(R0, lo, tmp);
    } else {
      std(R0, lo, tmp);
    }
  } else {
    ShouldNotReachHere();
  }
}

// If instruction is a stack bang of the form
//     std    R0,    x(Ry),       (see bang_stack_with_offset())
//     stdu   R1_SP, x(R1_SP),    (see push_frame(), resize_frame())
// or  stdux  R1_SP, Rx, R1_SP    (see push_frame(), resize_frame())
// return the banged address. Otherwise, return 0.
address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) {
#ifdef LINUX
  ucontext_t* uc = (ucontext_t*) ucontext;
  int rs = inv_rs_field(instruction);
  int ra = inv_ra_field(instruction);
  if (   (is_ld(instruction)   && rs == 0 &&  UseLoadInstructionsForStackBangingPPC64)
      || (is_std(instruction)  && rs == 0 && !UseLoadInstructionsForStackBangingPPC64)
      || (is_stdu(instruction) && rs == 1)) {
    int ds = inv_ds_field(instruction);
    // return banged address
    return ds+(address)uc->uc_mcontext.regs->gpr[ra];
  } else if (is_stdux(instruction) && rs == 1) {
    int rb = inv_rb_field(instruction);
    address sp = (address)uc->uc_mcontext.regs->gpr[1];
    long rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
    return ra != 1 || rb_val >= 0 ? NULL         // not a stack bang
                                  : sp + rb_val; // banged address
  }
  return NULL; // not a stack bang
#else
  // workaround not needed on !LINUX :-)
  ShouldNotCallThis();
  return NULL;
#endif
}

void MacroAssembler::reserved_stack_check(Register return_pc) {
  // Test if reserved zone needs to be enabled.
  Label no_reserved_zone_enabling;

  ld_ptr(R0, JavaThread::reserved_stack_activation_offset(), R16_thread);
  cmpld(CCR0, R1_SP, R0);
  blt_predict_taken(CCR0, no_reserved_zone_enabling);

  // Enable reserved zone again, throw stack overflow exception.
  push_frame_reg_args(0, R0);
  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), R16_thread);
  pop_frame();
  mtlr(return_pc);
  load_const_optimized(R0, StubRoutines::throw_delayed_StackOverflowError_entry());
  mtctr(R0);
  bctr();

  should_not_reach_here();

  bind(no_reserved_zone_enabling);
}
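
// Atomic 64-bit exchange (getandsetd) and fetch-and-add (getandaddd),
// implemented as ldarx/stdcx_ retry loops. stdcx_ sets CCR0, which is
// what the loops branch on.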
void MacroAssembler::getandsetd(Register dest_current_value, Register exchange_value, Register addr_base,
                                bool cmpxchgx_hint) {
  Label retry;
  bind(retry);
  ldarx(dest_current_value, addr_base, cmpxchgx_hint);
  stdcx_(exchange_value, addr_base);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
  } else {
    bne(                  CCR0, retry); // StXcx_ sets CCR0.
  }
}

void MacroAssembler::getandaddd(Register dest_current_value, Register inc_value, Register addr_base,
                                Register tmp, bool cmpxchgx_hint) {
  Label retry;
  bind(retry);
  ldarx(dest_current_value, addr_base, cmpxchgx_hint);
  add(tmp, dest_current_value, inc_value);
  stdcx_(tmp, addr_base);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
  } else {
    bne(                  CCR0, retry); // StXcx_ sets CCR0.
  }
}

// Word/sub-word atomic helper functions

// Temps and addr_base are killed if size < 4 and processor does not support respective instructions.
// Only signed types are supported with size < 4.
// Atomic add always kills tmp1.
void MacroAssembler::atomic_get_and_modify_generic(Register dest_current_value, Register exchange_value,
                                                   Register addr_base, Register tmp1, Register tmp2, Register tmp3,
                                                   bool cmpxchgx_hint, bool is_add, int size) {
  // Sub-word instructions are available since Power 8.
  // For older processors, instruction_type != size holds, and we
  // emulate the sub-word instructions by constructing a 4-byte value
  // that leaves the other bytes unchanged.
  const int instruction_type = VM_Version::has_lqarx() ? size : 4;

  Label retry;
  Register shift_amount = noreg,
           val32 = dest_current_value,
           modval = is_add ? tmp1 : exchange_value;

  if (instruction_type != size) {
    assert_different_registers(tmp1, tmp2, tmp3, dest_current_value, exchange_value, addr_base);
    modval = tmp1;
    shift_amount = tmp2;
    val32 = tmp3;
    // Need some preparation: Compute shift amount, align address. Note: shorts must be 2 byte aligned.
#ifdef VM_LITTLE_ENDIAN
    rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
    clrrdi(addr_base, addr_base, 2);
#else
    xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
    clrrdi(addr_base, addr_base, 2);
    rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
#endif
  }

  // atomic emulation loop
  bind(retry);

  switch (instruction_type) {
    case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
    case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
    case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
    default: ShouldNotReachHere();
  }

  if (instruction_type != size) {
    srw(dest_current_value, val32, shift_amount);
  }

  if (is_add) { add(modval, dest_current_value, exchange_value); }

  if (instruction_type != size) {
    // Transform exchange value such that the replacement can be done by one xor instruction.
    xorr(modval, dest_current_value, is_add ? modval : exchange_value);
    clrldi(modval, modval, (size == 1) ? 56 : 48);
    slw(modval, modval, shift_amount);
    xorr(modval, val32, modval);
  }

  switch (instruction_type) {
    case 4: stwcx_(modval, addr_base); break;
    case 2: sthcx_(modval, addr_base); break;
    case 1: stbcx_(modval, addr_base); break;
    default: ShouldNotReachHere();
  }

  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
  } else {
    bne(                  CCR0, retry); // StXcx_ sets CCR0.
  }

  // l?arx zero-extends, but Java wants byte/short values sign-extended.
  if (size == 1) {
    extsb(dest_current_value, dest_current_value);
  } else if (size == 2) {
    extsh(dest_current_value, dest_current_value);
  };
}

// Temps, addr_base and exchange_value are killed if size < 4 and processor does not support respective instructions.
// Only signed types are supported with size < 4.
void MacroAssembler::cmpxchg_loop_body(ConditionRegister flag, Register dest_current_value,
                                       Register compare_value, Register exchange_value,
                                       Register addr_base, Register tmp1, Register tmp2,
                                       Label &retry, Label &failed, bool cmpxchgx_hint, int size) {
  // Sub-word instructions are available since Power 8.
  // For older processors, instruction_type != size holds, and we
  // emulate the sub-word instructions by constructing a 4-byte value
  // that leaves the other bytes unchanged.
  const int instruction_type = VM_Version::has_lqarx() ? size : 4;

  Register shift_amount = noreg,
           val32 = dest_current_value,
           modval = exchange_value;

  if (instruction_type != size) {
    assert_different_registers(tmp1, tmp2, dest_current_value, compare_value, exchange_value, addr_base);
    shift_amount = tmp1;
    val32 = tmp2;
    modval = tmp2;
    // Need some preparation: Compute shift amount, align address. Note: shorts must be 2 byte aligned.
#ifdef VM_LITTLE_ENDIAN
    rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
    clrrdi(addr_base, addr_base, 2);
#else
    xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
    clrrdi(addr_base, addr_base, 2);
    rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
#endif
    // Transform exchange value such that the replacement can be done by one xor instruction.
    xorr(exchange_value, compare_value, exchange_value);
    clrldi(exchange_value, exchange_value, (size == 1) ? 56 : 48);
    slw(exchange_value, exchange_value, shift_amount);
  }

  // atomic emulation loop
  bind(retry);

  switch (instruction_type) {
    case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
    case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
    case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
    default: ShouldNotReachHere();
  }

  if (instruction_type != size) {
    srw(dest_current_value, val32, shift_amount);
  }
  if (size == 1) {
    extsb(dest_current_value, dest_current_value);
  } else if (size == 2) {
    extsh(dest_current_value, dest_current_value);
  };

  cmpw(flag, dest_current_value, compare_value);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(flag, failed);
  } else {
    bne(                  flag, failed);
  }
  // branch to done  => (flag == ne), (dest_current_value != compare_value)
  // fall through    => (flag == eq), (dest_current_value == compare_value)

  if (instruction_type != size) {
    xorr(modval, val32, exchange_value);
  }

  switch (instruction_type) {
    case 4: stwcx_(modval, addr_base); break;
    case 2: sthcx_(modval, addr_base); break;
    case 1: stbcx_(modval, addr_base); break;
    default: ShouldNotReachHere();
  }
}

// CmpxchgX sets condition register to cmpX(current, compare).
void MacroAssembler::cmpxchg_generic(ConditionRegister flag, Register dest_current_value,
                                     Register compare_value, Register exchange_value,
                                     Register addr_base, Register tmp1, Register tmp2,
                                     int semantics, bool cmpxchgx_hint,
                                     Register int_flag_success, bool contention_hint, bool weak, int size) {
  Label retry;
  Label failed;
  Label done;

  // Save one branch if result is returned via register and
  // result register is different from the other ones.
  bool use_result_reg    = (int_flag_success != noreg);
  bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value &&
                            int_flag_success != exchange_value && int_flag_success != addr_base &&
                            int_flag_success != tmp1 && int_flag_success != tmp2);
  assert(!weak || flag == CCR0, "weak only supported with CCR0");
  assert(size == 1 || size == 2 || size == 4, "unsupported");

  if (use_result_reg && preset_result_reg) {
    li(int_flag_success, 0); // preset (assume cas failed)
  }

  // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
  if (contention_hint) { // Don't try to reserve if cmp fails.
    switch (size) {
      case 1: lbz(dest_current_value, 0, addr_base); extsb(dest_current_value, dest_current_value); break;
      case 2: lha(dest_current_value, 0, addr_base); break;
      case 4: lwz(dest_current_value, 0, addr_base); break;
      default: ShouldNotReachHere();
    }
    cmpw(flag, dest_current_value, compare_value);
    bne(flag, failed);
  }

  // release/fence semantics
  if (semantics & MemBarRel) {
    release();
  }

  cmpxchg_loop_body(flag, dest_current_value, compare_value, exchange_value, addr_base, tmp1, tmp2,
                    retry, failed, cmpxchgx_hint, size);
  if (!weak || use_result_reg) {
    if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
      bne_predict_not_taken(CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
    } else {
      bne(                  CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
    }
  }
  // fall through => (flag == eq), (dest_current_value == compare_value), (swapped)

  // Result in register (must do this at the end because int_flag_success can be the
  // same register as one above).
  if (use_result_reg) {
    li(int_flag_success, 1);
  }

  if (semantics & MemBarFenceAfter) {
    fence();
  } else if (semantics & MemBarAcq) {
    isync();
  }

  if (use_result_reg && !preset_result_reg) {
    b(done);
  }

  bind(failed);
  if (use_result_reg && !preset_result_reg) {
    li(int_flag_success, 0);
  }

  bind(done);
  // (flag == ne) => (dest_current_value != compare_value), (!swapped)
  // (flag == eq) => (dest_current_value == compare_value), ( swapped)
}

// Performs atomic compare exchange:
//   if (compare_value == *addr_base)
//     *addr_base = exchange_value
//     int_flag_success = 1;
//   else
//     int_flag_success = 0;
//
// ConditionRegister flag      = cmp(compare_value, *addr_base)
// Register dest_current_value = *addr_base
// Register compare_value      Used to compare with value in memory
// Register exchange_value     Written to memory if compare_value == *addr_base
// Register addr_base          The memory location to compareXChange
// Register int_flag_success   Set to 1 if exchange_value was written to *addr_base
//
// To avoid the costly compare exchange, the value is tested beforehand.
// Several special cases exist to avoid generating unnecessary code.
//
void MacroAssembler::cmpxchgd(ConditionRegister flag,
                              Register dest_current_value, RegisterOrConstant compare_value, Register exchange_value,
                              Register addr_base, int semantics, bool cmpxchgx_hint,
                              Register int_flag_success, Label* failed_ext, bool contention_hint, bool weak) {
  Label retry;
  Label failed_int;
  Label& failed = (failed_ext != NULL) ? *failed_ext : failed_int;
  Label done;

  // Save one branch if result is returned via register and result register is different from the other ones.
1710 bool use_result_reg = (int_flag_success!=noreg); 1711 bool preset_result_reg = (int_flag_success!=dest_current_value && int_flag_success!=compare_value.register_or_noreg() && 1712 int_flag_success!=exchange_value && int_flag_success!=addr_base); 1713 assert(!weak || flag == CCR0, "weak only supported with CCR0"); 1714 assert(int_flag_success == noreg || failed_ext == NULL, "cannot have both"); 1715 1716 if (use_result_reg && preset_result_reg) { 1717 li(int_flag_success, 0); // preset (assume cas failed) 1718 } 1719 1720 // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM). 1721 if (contention_hint) { // Don't try to reserve if cmp fails. 1722 ld(dest_current_value, 0, addr_base); 1723 cmpd(flag, compare_value, dest_current_value); 1724 bne(flag, failed); 1725 } 1726 1727 // release/fence semantics 1728 if (semantics & MemBarRel) { 1729 release(); 1730 } 1731 1732 // atomic emulation loop 1733 bind(retry); 1734 1735 ldarx(dest_current_value, addr_base, cmpxchgx_hint); 1736 cmpd(flag, compare_value, dest_current_value); 1737 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1738 bne_predict_not_taken(flag, failed); 1739 } else { 1740 bne( flag, failed); 1741 } 1742 1743 stdcx_(exchange_value, addr_base); 1744 if (!weak || use_result_reg || failed_ext) { 1745 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1746 bne_predict_not_taken(CCR0, weak ? failed : retry); // stXcx_ sets CCR0 1747 } else { 1748 bne( CCR0, weak ? failed : retry); // stXcx_ sets CCR0 1749 } 1750 } 1751 1752 // result in register (must do this at the end because int_flag_success can be the same register as one above) 1753 if (use_result_reg) { 1754 li(int_flag_success, 1); 1755 } 1756 1757 if (semantics & MemBarFenceAfter) { 1758 fence(); 1759 } else if (semantics & MemBarAcq) { 1760 isync(); 1761 } 1762 1763 if (use_result_reg && !preset_result_reg) { 1764 b(done); 1765 } 1766 1767 bind(failed_int); 1768 if (use_result_reg && !preset_result_reg) { 1769 li(int_flag_success, 0); 1770 } 1771 1772 bind(done); 1773 // (flag == ne) => (dest_current_value != compare_value), (!swapped) 1774 // (flag == eq) => (dest_current_value == compare_value), ( swapped) 1775 } 1776 1777 // Look up the method for a megamorphic invokeinterface call. 1778 // The target method is determined by <intf_klass, itable_index>. 1779 // The receiver klass is in recv_klass. 1780 // On success, the result will be in method_result, and execution falls through. 1781 // On failure, execution transfers to the given label. 1782 void MacroAssembler::lookup_interface_method(Register recv_klass, 1783 Register intf_klass, 1784 RegisterOrConstant itable_index, 1785 Register method_result, 1786 Register scan_temp, 1787 Register temp2, 1788 Label& L_no_such_interface, 1789 bool return_method) { 1790 assert_different_registers(recv_klass, intf_klass, method_result, scan_temp); 1791 1792 // Compute start of first itableOffsetEntry (which is at the end of the vtable). 1793 int vtable_base = in_bytes(Klass::vtable_start_offset()); 1794 int itentry_off = itableMethodEntry::method_offset_in_bytes(); 1795 int logMEsize = exact_log2(itableMethodEntry::size() * wordSize); 1796 int scan_step = itableOffsetEntry::size() * wordSize; 1797 int log_vte_size= exact_log2(vtableEntry::size_in_bytes()); 1798 1799 lwz(scan_temp, in_bytes(Klass::vtable_length_offset()), recv_klass); 1800 // %%% We should store the aligned, prescaled offset in the klassoop. 1801 // Then the next several instructions would fold away. 
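  // In effect (a sketch, not emitted code):
  //   scan_temp = recv_klass + vtable_start_offset + vtable_length * vtableEntry_size,
  // i.e. the address of the first itableOffsetEntry, which follows the vtable.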
1802 1803 sldi(scan_temp, scan_temp, log_vte_size); 1804 addi(scan_temp, scan_temp, vtable_base); 1805 add(scan_temp, recv_klass, scan_temp); 1806 1807 // Adjust recv_klass by scaled itable_index, so we can free itable_index. 1808 if (return_method) { 1809 if (itable_index.is_register()) { 1810 Register itable_offset = itable_index.as_register(); 1811 sldi(method_result, itable_offset, logMEsize); 1812 if (itentry_off) { addi(method_result, method_result, itentry_off); } 1813 add(method_result, method_result, recv_klass); 1814 } else { 1815 long itable_offset = (long)itable_index.as_constant(); 1816 // static address, no relocation 1817 add_const_optimized(method_result, recv_klass, (itable_offset << logMEsize) + itentry_off, temp2); 1818 } 1819 } 1820 1821 // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) { 1822 // if (scan->interface() == intf) { 1823 // result = (klass + scan->offset() + itable_index); 1824 // } 1825 // } 1826 Label search, found_method; 1827 1828 for (int peel = 1; peel >= 0; peel--) { 1829 // %%%% Could load both offset and interface in one ldx, if they were 1830 // in the opposite order. This would save a load. 1831 ld(temp2, itableOffsetEntry::interface_offset_in_bytes(), scan_temp); 1832 1833 // Check that this entry is non-null. A null entry means that 1834 // the receiver class doesn't implement the interface, and wasn't the 1835 // same as when the caller was compiled. 1836 cmpd(CCR0, temp2, intf_klass); 1837 1838 if (peel) { 1839 beq(CCR0, found_method); 1840 } else { 1841 bne(CCR0, search); 1842 // (invert the test to fall through to found_method...) 1843 } 1844 1845 if (!peel) break; 1846 1847 bind(search); 1848 1849 cmpdi(CCR0, temp2, 0); 1850 beq(CCR0, L_no_such_interface); 1851 addi(scan_temp, scan_temp, scan_step); 1852 } 1853 1854 bind(found_method); 1855 1856 // Got a hit. 
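  // What follows computes, in effect:
  //   method = *(recv_klass + scan->offset() + itable_index * itableMethodEntry_size + method_offset)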
1857 if (return_method) { 1858 int ito_offset = itableOffsetEntry::offset_offset_in_bytes(); 1859 lwz(scan_temp, ito_offset, scan_temp); 1860 ldx(method_result, scan_temp, method_result); 1861 } 1862 } 1863 1864 // virtual method calling 1865 void MacroAssembler::lookup_virtual_method(Register recv_klass, 1866 RegisterOrConstant vtable_index, 1867 Register method_result) { 1868 1869 assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg()); 1870 1871 const int base = in_bytes(Klass::vtable_start_offset()); 1872 assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below"); 1873 1874 if (vtable_index.is_register()) { 1875 sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord); 1876 add(recv_klass, vtable_index.as_register(), recv_klass); 1877 } else { 1878 addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord); 1879 } 1880 ld(R19_method, base + vtableEntry::method_offset_in_bytes(), recv_klass); 1881 } 1882 1883 /////////////////////////////////////////// subtype checking //////////////////////////////////////////// 1884 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass, 1885 Register super_klass, 1886 Register temp1_reg, 1887 Register temp2_reg, 1888 Label* L_success, 1889 Label* L_failure, 1890 Label* L_slow_path, 1891 RegisterOrConstant super_check_offset) { 1892 1893 const Register check_cache_offset = temp1_reg; 1894 const Register cached_super = temp2_reg; 1895 1896 assert_different_registers(sub_klass, super_klass, check_cache_offset, cached_super); 1897 1898 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1899 int sc_offset = in_bytes(Klass::secondary_super_cache_offset()); 1900 1901 bool must_load_sco = (super_check_offset.constant_or_zero() == -1); 1902 bool need_slow_path = (must_load_sco || super_check_offset.constant_or_zero() == sco_offset); 1903 1904 Label L_fallthrough; 1905 int label_nulls = 0; 1906 if (L_success == NULL) { L_success = &L_fallthrough; label_nulls++; } 1907 if (L_failure == NULL) { L_failure = &L_fallthrough; label_nulls++; } 1908 if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; } 1909 assert(label_nulls <= 1 || 1910 (L_slow_path == &L_fallthrough && label_nulls <= 2 && !need_slow_path), 1911 "at most one NULL in the batch, usually"); 1912 1913 // If the pointers are equal, we are done (e.g., String[] elements). 1914 // This self-check enables sharing of secondary supertype arrays among 1915 // non-primary types such as array-of-interface. Otherwise, each such 1916 // type would need its own customized SSA. 1917 // We move this check to the front of the fast path because many 1918 // type checks are in fact trivially successful in this manner, 1919 // so we get a nicely predicted branch right at the start of the check. 1920 cmpd(CCR0, sub_klass, super_klass); 1921 beq(CCR0, *L_success); 1922 1923 // Check the supertype display: 1924 if (must_load_sco) { 1925 // The super check offset is always positive... 1926 lwz(check_cache_offset, sco_offset, super_klass); 1927 super_check_offset = RegisterOrConstant(check_cache_offset); 1928 // super_check_offset is register. 1929 assert_different_registers(sub_klass, super_klass, cached_super, super_check_offset.as_register()); 1930 } 1931 // The loaded value is the offset from KlassOopDesc. 1932 1933 ld(cached_super, super_check_offset, sub_klass); 1934 cmpd(CCR0, cached_super, super_klass); 1935 1936 // This check has worked decisively for primary supers. 
1937 // Secondary supers are sought in the super_cache ('super_cache_addr'). 1938 // (Secondary supers are interfaces and very deeply nested subtypes.) 1939 // This works in the same check above because of a tricky aliasing 1940 // between the super_cache and the primary super display elements. 1941 // (The 'super_check_addr' can address either, as the case requires.) 1942 // Note that the cache is updated below if it does not help us find 1943 // what we need immediately. 1944 // So if it was a primary super, we can just fail immediately. 1945 // Otherwise, it's the slow path for us (no success at this point). 1946 1947 #define FINAL_JUMP(label) if (&(label) != &L_fallthrough) { b(label); } 1948 1949 if (super_check_offset.is_register()) { 1950 beq(CCR0, *L_success); 1951 cmpwi(CCR0, super_check_offset.as_register(), sc_offset); 1952 if (L_failure == &L_fallthrough) { 1953 beq(CCR0, *L_slow_path); 1954 } else { 1955 bne(CCR0, *L_failure); 1956 FINAL_JUMP(*L_slow_path); 1957 } 1958 } else { 1959 if (super_check_offset.as_constant() == sc_offset) { 1960 // Need a slow path; fast failure is impossible. 1961 if (L_slow_path == &L_fallthrough) { 1962 beq(CCR0, *L_success); 1963 } else { 1964 bne(CCR0, *L_slow_path); 1965 FINAL_JUMP(*L_success); 1966 } 1967 } else { 1968 // No slow path; it's a fast decision. 1969 if (L_failure == &L_fallthrough) { 1970 beq(CCR0, *L_success); 1971 } else { 1972 bne(CCR0, *L_failure); 1973 FINAL_JUMP(*L_success); 1974 } 1975 } 1976 } 1977 1978 bind(L_fallthrough); 1979 #undef FINAL_JUMP 1980 } 1981 1982 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass, 1983 Register super_klass, 1984 Register temp1_reg, 1985 Register temp2_reg, 1986 Label* L_success, 1987 Register result_reg) { 1988 const Register array_ptr = temp1_reg; // current value from cache array 1989 const Register temp = temp2_reg; 1990 1991 assert_different_registers(sub_klass, super_klass, array_ptr, temp); 1992 1993 int source_offset = in_bytes(Klass::secondary_supers_offset()); 1994 int target_offset = in_bytes(Klass::secondary_super_cache_offset()); 1995 1996 int length_offset = Array<Klass*>::length_offset_in_bytes(); 1997 int base_offset = Array<Klass*>::base_offset_in_bytes(); 1998 1999 Label hit, loop, failure, fallthru; 2000 2001 ld(array_ptr, source_offset, sub_klass); 2002 2003 // TODO: PPC port: assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated."); 2004 lwz(temp, length_offset, array_ptr); 2005 cmpwi(CCR0, temp, 0); 2006 beq(CCR0, result_reg!=noreg ? failure : fallthru); // length 0 2007 2008 mtctr(temp); // load ctr 2009 2010 bind(loop); 2011 // Oops in table are NO MORE compressed. 
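    // Linear scan over the secondary supers array; CTR holds the remaining
    // element count, so bdnz ends the loop when the array is exhausted.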
2012 ld(temp, base_offset, array_ptr); 2013 cmpd(CCR0, temp, super_klass); 2014 beq(CCR0, hit); 2015 addi(array_ptr, array_ptr, BytesPerWord); 2016 bdnz(loop); 2017 2018 bind(failure); 2019 if (result_reg!=noreg) li(result_reg, 1); // load non-zero result (indicates a miss) 2020 b(fallthru); 2021 2022 bind(hit); 2023 std(super_klass, target_offset, sub_klass); // save result to cache 2024 if (result_reg != noreg) { li(result_reg, 0); } // load zero result (indicates a hit) 2025 if (L_success != NULL) { b(*L_success); } 2026 else if (result_reg == noreg) { blr(); } // return with CR0.eq if neither label nor result reg provided 2027 2028 bind(fallthru); 2029 } 2030 2031 // Try fast path, then go to slow one if not successful 2032 void MacroAssembler::check_klass_subtype(Register sub_klass, 2033 Register super_klass, 2034 Register temp1_reg, 2035 Register temp2_reg, 2036 Label& L_success) { 2037 Label L_failure; 2038 check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success, &L_failure); 2039 check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success); 2040 bind(L_failure); // Fallthru if not successful. 2041 } 2042 2043 void MacroAssembler::check_method_handle_type(Register mtype_reg, Register mh_reg, 2044 Register temp_reg, 2045 Label& wrong_method_type) { 2046 assert_different_registers(mtype_reg, mh_reg, temp_reg); 2047 // Compare method type against that of the receiver. 2048 load_heap_oop_not_null(temp_reg, delayed_value(java_lang_invoke_MethodHandle::type_offset_in_bytes, temp_reg), mh_reg); 2049 cmpd(CCR0, temp_reg, mtype_reg); 2050 bne(CCR0, wrong_method_type); 2051 } 2052 2053 RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot, 2054 Register temp_reg, 2055 int extra_slot_offset) { 2056 // cf. TemplateTable::prepare_invoke(), if (load_receiver). 2057 int stackElementSize = Interpreter::stackElementSize; 2058 int offset = extra_slot_offset * stackElementSize; 2059 if (arg_slot.is_constant()) { 2060 offset += arg_slot.as_constant() * stackElementSize; 2061 return offset; 2062 } else { 2063 assert(temp_reg != noreg, "must specify"); 2064 sldi(temp_reg, arg_slot.as_register(), exact_log2(stackElementSize)); 2065 if (offset != 0) 2066 addi(temp_reg, temp_reg, offset); 2067 return temp_reg; 2068 } 2069 } 2070 2071 // Supports temp2_reg = R0. 2072 void MacroAssembler::biased_locking_enter(ConditionRegister cr_reg, Register obj_reg, 2073 Register mark_reg, Register temp_reg, 2074 Register temp2_reg, Label& done, Label* slow_case) { 2075 assert(UseBiasedLocking, "why call this otherwise?"); 2076 2077 #ifdef ASSERT 2078 assert_different_registers(obj_reg, mark_reg, temp_reg, temp2_reg); 2079 #endif 2080 2081 Label cas_label; 2082 2083 // Branch to done if fast path fails and no slow_case provided. 2084 Label *slow_case_int = (slow_case != NULL) ? 
slow_case : &done; 2085 2086 // Biased locking 2087 // See whether the lock is currently biased toward our thread and 2088 // whether the epoch is still valid 2089 // Note that the runtime guarantees sufficient alignment of JavaThread 2090 // pointers to allow age to be placed into low bits 2091 assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, 2092 "biased locking makes assumptions about bit layout"); 2093 2094 if (PrintBiasedLockingStatistics) { 2095 load_const(temp2_reg, (address) BiasedLocking::total_entry_count_addr(), temp_reg); 2096 lwzx(temp_reg, temp2_reg); 2097 addi(temp_reg, temp_reg, 1); 2098 stwx(temp_reg, temp2_reg); 2099 } 2100 2101 andi(temp_reg, mark_reg, markOopDesc::biased_lock_mask_in_place); 2102 cmpwi(cr_reg, temp_reg, markOopDesc::biased_lock_pattern); 2103 bne(cr_reg, cas_label); 2104 2105 load_klass(temp_reg, obj_reg); 2106 2107 load_const_optimized(temp2_reg, ~((int) markOopDesc::age_mask_in_place)); 2108 ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg); 2109 orr(temp_reg, R16_thread, temp_reg); 2110 xorr(temp_reg, mark_reg, temp_reg); 2111 andr(temp_reg, temp_reg, temp2_reg); 2112 cmpdi(cr_reg, temp_reg, 0); 2113 if (PrintBiasedLockingStatistics) { 2114 Label l; 2115 bne(cr_reg, l); 2116 load_const(temp2_reg, (address) BiasedLocking::biased_lock_entry_count_addr()); 2117 lwzx(mark_reg, temp2_reg); 2118 addi(mark_reg, mark_reg, 1); 2119 stwx(mark_reg, temp2_reg); 2120 // restore mark_reg 2121 ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg); 2122 bind(l); 2123 } 2124 beq(cr_reg, done); 2125 2126 Label try_revoke_bias; 2127 Label try_rebias; 2128 2129 // At this point we know that the header has the bias pattern and 2130 // that we are not the bias owner in the current epoch. We need to 2131 // figure out more details about the state of the header in order to 2132 // know what operations can be legally performed on the object's 2133 // header. 2134 2135 // If the low three bits in the xor result aren't clear, that means 2136 // the prototype header is no longer biased and we have to revoke 2137 // the bias on this object. 2138 andi(temp2_reg, temp_reg, markOopDesc::biased_lock_mask_in_place); 2139 cmpwi(cr_reg, temp2_reg, 0); 2140 bne(cr_reg, try_revoke_bias); 2141 2142 // Biasing is still enabled for this data type. See whether the 2143 // epoch of the current bias is still valid, meaning that the epoch 2144 // bits of the mark word are equal to the epoch bits of the 2145 // prototype header. (Note that the prototype header's epoch bits 2146 // only change at a safepoint.) If not, attempt to rebias the object 2147 // toward the current thread. Note that we must be absolutely sure 2148 // that the current epoch is invalid in order to do this because 2149 // otherwise the manipulations it performs on the mark word are 2150 // illegal. 2151 2152 int shift_amount = 64 - markOopDesc::epoch_shift; 2153 // rotate epoch bits to right (little) end and set other bits to 0 2154 // [ big part | epoch | little part ] -> [ 0..0 | epoch ] 2155 rldicl_(temp2_reg, temp_reg, shift_amount, 64 - markOopDesc::epoch_bits); 2156 // branch if epoch bits are != 0, i.e. they differ, because the epoch has been incremented 2157 bne(CCR0, try_rebias); 2158 2159 // The epoch of the current bias is still valid but we know nothing 2160 // about the owner; it might be set or it might be clear. Try to 2161 // acquire the bias of the object using an atomic operation. 
If this 2162 // fails we will go in to the runtime to revoke the object's bias. 2163 // Note that we first construct the presumed unbiased header so we 2164 // don't accidentally blow away another thread's valid bias. 2165 andi(mark_reg, mark_reg, (markOopDesc::biased_lock_mask_in_place | 2166 markOopDesc::age_mask_in_place | 2167 markOopDesc::epoch_mask_in_place)); 2168 orr(temp_reg, R16_thread, mark_reg); 2169 2170 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 2171 2172 // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg). 2173 cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg, 2174 /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg, 2175 /*where=*/obj_reg, 2176 MacroAssembler::MemBarAcq, 2177 MacroAssembler::cmpxchgx_hint_acquire_lock(), 2178 noreg, slow_case_int); // bail out if failed 2179 2180 // If the biasing toward our thread failed, this means that 2181 // another thread succeeded in biasing it toward itself and we 2182 // need to revoke that bias. The revocation will occur in the 2183 // interpreter runtime in the slow case. 2184 if (PrintBiasedLockingStatistics) { 2185 load_const(temp2_reg, (address) BiasedLocking::anonymously_biased_lock_entry_count_addr(), temp_reg); 2186 lwzx(temp_reg, temp2_reg); 2187 addi(temp_reg, temp_reg, 1); 2188 stwx(temp_reg, temp2_reg); 2189 } 2190 b(done); 2191 2192 bind(try_rebias); 2193 // At this point we know the epoch has expired, meaning that the 2194 // current "bias owner", if any, is actually invalid. Under these 2195 // circumstances _only_, we are allowed to use the current header's 2196 // value as the comparison value when doing the cas to acquire the 2197 // bias in the current epoch. In other words, we allow transfer of 2198 // the bias from one thread to another directly in this situation. 2199 load_klass(temp_reg, obj_reg); 2200 andi(temp2_reg, mark_reg, markOopDesc::age_mask_in_place); 2201 orr(temp2_reg, R16_thread, temp2_reg); 2202 ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg); 2203 orr(temp_reg, temp2_reg, temp_reg); 2204 2205 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 2206 2207 cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg, 2208 /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg, 2209 /*where=*/obj_reg, 2210 MacroAssembler::MemBarAcq, 2211 MacroAssembler::cmpxchgx_hint_acquire_lock(), 2212 noreg, slow_case_int); // bail out if failed 2213 2214 // If the biasing toward our thread failed, this means that 2215 // another thread succeeded in biasing it toward itself and we 2216 // need to revoke that bias. The revocation will occur in the 2217 // interpreter runtime in the slow case. 2218 if (PrintBiasedLockingStatistics) { 2219 load_const(temp2_reg, (address) BiasedLocking::rebiased_lock_entry_count_addr(), temp_reg); 2220 lwzx(temp_reg, temp2_reg); 2221 addi(temp_reg, temp_reg, 1); 2222 stwx(temp_reg, temp2_reg); 2223 } 2224 b(done); 2225 2226 bind(try_revoke_bias); 2227 // The prototype mark in the klass doesn't have the bias bit set any 2228 // more, indicating that objects of this data type are not supposed 2229 // to be biased any more. We are going to try to reset the mark of 2230 // this object to the prototype value and fall through to the 2231 // CAS-based locking scheme. Note that if our CAS fails, it means 2232 // that another thread raced us for the privilege of revoking the 2233 // bias of this particular object, so it's okay to continue in the 2234 // normal locking code. 
2235 load_klass(temp_reg, obj_reg); 2236 ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg); 2237 andi(temp2_reg, mark_reg, markOopDesc::age_mask_in_place); 2238 orr(temp_reg, temp_reg, temp2_reg); 2239 2240 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 2241 2242 // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg). 2243 cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg, 2244 /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg, 2245 /*where=*/obj_reg, 2246 MacroAssembler::MemBarAcq, 2247 MacroAssembler::cmpxchgx_hint_acquire_lock()); 2248 2249 // reload markOop in mark_reg before continuing with lightweight locking 2250 ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg); 2251 2252 // Fall through to the normal CAS-based lock, because no matter what 2253 // the result of the above CAS, some thread must have succeeded in 2254 // removing the bias bit from the object's header. 2255 if (PrintBiasedLockingStatistics) { 2256 Label l; 2257 bne(cr_reg, l); 2258 load_const(temp2_reg, (address) BiasedLocking::revoked_lock_entry_count_addr(), temp_reg); 2259 lwzx(temp_reg, temp2_reg); 2260 addi(temp_reg, temp_reg, 1); 2261 stwx(temp_reg, temp2_reg); 2262 bind(l); 2263 } 2264 2265 bind(cas_label); 2266 } 2267 2268 void MacroAssembler::biased_locking_exit (ConditionRegister cr_reg, Register mark_addr, Register temp_reg, Label& done) { 2269 // Check for biased locking unlock case, which is a no-op 2270 // Note: we do not have to check the thread ID for two reasons. 2271 // First, the interpreter checks for IllegalMonitorStateException at 2272 // a higher level. Second, if the bias was revoked while we held the 2273 // lock, the object could not be rebiased toward another thread, so 2274 // the bias bit would be clear. 
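  // In essence: unlocking is a no-op iff (*mark_addr & biased_lock_mask) == biased_lock_pattern,
  // i.e. the mark word still carries the bias pattern.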
2275 2276 ld(temp_reg, 0, mark_addr); 2277 andi(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place); 2278 2279 cmpwi(cr_reg, temp_reg, markOopDesc::biased_lock_pattern); 2280 beq(cr_reg, done); 2281 } 2282 2283 // allocation (for C1) 2284 void MacroAssembler::eden_allocate( 2285 Register obj, // result: pointer to object after successful allocation 2286 Register var_size_in_bytes, // object size in bytes if unknown at compile time; invalid otherwise 2287 int con_size_in_bytes, // object size in bytes if known at compile time 2288 Register t1, // temp register 2289 Register t2, // temp register 2290 Label& slow_case // continuation point if fast allocation fails 2291 ) { 2292 b(slow_case); 2293 } 2294 2295 void MacroAssembler::tlab_allocate( 2296 Register obj, // result: pointer to object after successful allocation 2297 Register var_size_in_bytes, // object size in bytes if unknown at compile time; invalid otherwise 2298 int con_size_in_bytes, // object size in bytes if known at compile time 2299 Register t1, // temp register 2300 Label& slow_case // continuation point if fast allocation fails 2301 ) { 2302 // make sure arguments make sense 2303 assert_different_registers(obj, var_size_in_bytes, t1); 2304 assert(0 <= con_size_in_bytes && is_simm13(con_size_in_bytes), "illegal object size"); 2305 assert((con_size_in_bytes & MinObjAlignmentInBytesMask) == 0, "object size is not multiple of alignment"); 2306 2307 const Register new_top = t1; 2308 //verify_tlab(); not implemented 2309 2310 ld(obj, in_bytes(JavaThread::tlab_top_offset()), R16_thread); 2311 ld(R0, in_bytes(JavaThread::tlab_end_offset()), R16_thread); 2312 if (var_size_in_bytes == noreg) { 2313 addi(new_top, obj, con_size_in_bytes); 2314 } else { 2315 add(new_top, obj, var_size_in_bytes); 2316 } 2317 cmpld(CCR0, new_top, R0); 2318 bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_case); 2319 2320 #ifdef ASSERT 2321 // make sure new free pointer is properly aligned 2322 { 2323 Label L; 2324 andi_(R0, new_top, MinObjAlignmentInBytesMask); 2325 beq(CCR0, L); 2326 stop("updated TLAB free is not properly aligned", 0x934); 2327 bind(L); 2328 } 2329 #endif // ASSERT 2330 2331 // update the tlab top pointer 2332 std(new_top, in_bytes(JavaThread::tlab_top_offset()), R16_thread); 2333 //verify_tlab(); not implemented 2334 } 2335 void MacroAssembler::incr_allocated_bytes(RegisterOrConstant size_in_bytes, Register t1, Register t2) { 2336 unimplemented("incr_allocated_bytes"); 2337 } 2338 2339 address MacroAssembler::emit_trampoline_stub(int destination_toc_offset, 2340 int insts_call_instruction_offset, Register Rtoc) { 2341 // Start the stub. 2342 address stub = start_a_stub(64); 2343 if (stub == NULL) { return NULL; } // CodeCache full: bail out 2344 2345 // Create a trampoline stub relocation which relates this trampoline stub 2346 // with the call instruction at insts_call_instruction_offset in the 2347 // instructions code-section. 2348 relocate(trampoline_stub_Relocation::spec(code()->insts()->start() + insts_call_instruction_offset)); 2349 const int stub_start_offset = offset(); 2350 2351 // For java_to_interp stubs we use R11_scratch1 as scratch register 2352 // and in call trampoline stubs we use R12_scratch2. This way we 2353 // can distinguish them (see is_NativeCallTrampolineStub_at()). 
2354 Register reg_scratch = R12_scratch2; 2355 2356 // Now, create the trampoline stub's code: 2357 // - load the TOC 2358 // - load the call target from the constant pool 2359 // - call 2360 if (Rtoc == noreg) { 2361 calculate_address_from_global_toc(reg_scratch, method_toc()); 2362 Rtoc = reg_scratch; 2363 } 2364 2365 ld_largeoffset_unchecked(reg_scratch, destination_toc_offset, Rtoc, false); 2366 mtctr(reg_scratch); 2367 bctr(); 2368 2369 const address stub_start_addr = addr_at(stub_start_offset); 2370 2371 // Assert that the encoded destination_toc_offset can be identified and that it is correct. 2372 assert(destination_toc_offset == NativeCallTrampolineStub_at(stub_start_addr)->destination_toc_offset(), 2373 "encoded offset into the constant pool must match"); 2374 // Trampoline_stub_size should be good. 2375 assert((uint)(offset() - stub_start_offset) <= trampoline_stub_size, "should be good size"); 2376 assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline"); 2377 2378 // End the stub. 2379 end_a_stub(); 2380 return stub; 2381 } 2382 2383 // TM on PPC64. 2384 void MacroAssembler::atomic_inc_ptr(Register addr, Register result, int simm16) { 2385 Label retry; 2386 bind(retry); 2387 ldarx(result, addr, /*hint*/ false); 2388 addi(result, result, simm16); 2389 stdcx_(result, addr); 2390 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 2391 bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0 2392 } else { 2393 bne( CCR0, retry); // stXcx_ sets CCR0 2394 } 2395 } 2396 2397 void MacroAssembler::atomic_ori_int(Register addr, Register result, int uimm16) { 2398 Label retry; 2399 bind(retry); 2400 lwarx(result, addr, /*hint*/ false); 2401 ori(result, result, uimm16); 2402 stwcx_(result, addr); 2403 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 2404 bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0 2405 } else { 2406 bne( CCR0, retry); // stXcx_ sets CCR0 2407 } 2408 } 2409 2410 #if INCLUDE_RTM_OPT 2411 2412 // Update rtm_counters based on abort status 2413 // input: abort_status 2414 // rtm_counters (RTMLockingCounters*) 2415 void MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters_Reg) { 2416 // Mapping to keep PreciseRTMLockingStatistics similar to x86. 2417 // x86 ppc (! means inverted, ? means not the same) 2418 // 0 31 Set if abort caused by XABORT instruction. 2419 // 1 ! 7 If set, the transaction may succeed on a retry. This bit is always clear if bit 0 is set. 2420 // 2 13 Set if another logical processor conflicted with a memory address that was part of the transaction that aborted. 2421 // 3 10 Set if an internal buffer overflowed. 2422 // 4 ?12 Set if a debug breakpoint was hit. 2423 // 5 ?32 Set if an abort occurred during execution of a nested transaction. 2424 const int tm_failure_bit[] = {Assembler::tm_tabort, // Note: Seems like signal handler sets this, too. 2425 Assembler::tm_failure_persistent, // inverted: transient 2426 Assembler::tm_trans_cf, 2427 Assembler::tm_footprint_of, 2428 Assembler::tm_non_trans_cf, 2429 Assembler::tm_suspended}; 2430 const bool tm_failure_inv[] = {false, true, false, false, false, false}; 2431 assert(sizeof(tm_failure_bit)/sizeof(int) == RTMLockingCounters::ABORT_STATUS_LIMIT, "adapt mapping!"); 2432 2433 const Register addr_Reg = R0; 2434 // Keep track of offset to where rtm_counters_Reg had pointed to. 
2435 int counters_offs = RTMLockingCounters::abort_count_offset(); 2436 addi(addr_Reg, rtm_counters_Reg, counters_offs); 2437 const Register temp_Reg = rtm_counters_Reg; 2438 2439 //atomic_inc_ptr(addr_Reg, temp_Reg); We don't increment atomically 2440 ldx(temp_Reg, addr_Reg); 2441 addi(temp_Reg, temp_Reg, 1); 2442 stdx(temp_Reg, addr_Reg); 2443 2444 if (PrintPreciseRTMLockingStatistics) { 2445 int counters_offs_delta = RTMLockingCounters::abortX_count_offset() - counters_offs; 2446 2447 //mftexasr(abort_status); done by caller 2448 for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) { 2449 counters_offs += counters_offs_delta; 2450 li(temp_Reg, counters_offs_delta); // can't use addi with R0 2451 add(addr_Reg, addr_Reg, temp_Reg); // point to next counter 2452 counters_offs_delta = sizeof(uintx); 2453 2454 Label check_abort; 2455 rldicr_(temp_Reg, abort_status, tm_failure_bit[i], 0); 2456 if (tm_failure_inv[i]) { 2457 bne(CCR0, check_abort); 2458 } else { 2459 beq(CCR0, check_abort); 2460 } 2461 //atomic_inc_ptr(addr_Reg, temp_Reg); We don't increment atomically 2462 ldx(temp_Reg, addr_Reg); 2463 addi(temp_Reg, temp_Reg, 1); 2464 stdx(temp_Reg, addr_Reg); 2465 bind(check_abort); 2466 } 2467 } 2468 li(temp_Reg, -counters_offs); // can't use addi with R0 2469 add(rtm_counters_Reg, addr_Reg, temp_Reg); // restore 2470 } 2471 2472 // Branch if (random & (count-1) != 0), count is 2^n 2473 // tmp and CR0 are killed 2474 void MacroAssembler::branch_on_random_using_tb(Register tmp, int count, Label& brLabel) { 2475 mftb(tmp); 2476 andi_(tmp, tmp, count-1); 2477 bne(CCR0, brLabel); 2478 } 2479 2480 // Perform abort ratio calculation, set no_rtm bit if high ratio. 2481 // input: rtm_counters_Reg (RTMLockingCounters* address) - KILLED 2482 void MacroAssembler::rtm_abort_ratio_calculation(Register rtm_counters_Reg, 2483 RTMLockingCounters* rtm_counters, 2484 Metadata* method_data) { 2485 Label L_done, L_check_always_rtm1, L_check_always_rtm2; 2486 2487 if (RTMLockingCalculationDelay > 0) { 2488 // Delay calculation. 2489 ld(rtm_counters_Reg, (RegisterOrConstant)(intptr_t)RTMLockingCounters::rtm_calculation_flag_addr()); 2490 cmpdi(CCR0, rtm_counters_Reg, 0); 2491 beq(CCR0, L_done); 2492 load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload 2493 } 2494 // Abort ratio calculation only if abort_count > RTMAbortThreshold. 2495 // Aborted transactions = abort_count * 100 2496 // All transactions = total_count * RTMTotalCountIncrRate 2497 // Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio) 2498 ld(R0, RTMLockingCounters::abort_count_offset(), rtm_counters_Reg); 2499 if (is_simm(RTMAbortThreshold, 16)) { // cmpdi can handle 16bit immediate only. 2500 cmpdi(CCR0, R0, RTMAbortThreshold); 2501 blt(CCR0, L_check_always_rtm2); // reload of rtm_counters_Reg not necessary 2502 } else { 2503 load_const_optimized(rtm_counters_Reg, RTMAbortThreshold); 2504 cmpd(CCR0, R0, rtm_counters_Reg); 2505 blt(CCR0, L_check_always_rtm1); // reload of rtm_counters_Reg required 2506 } 2507 mulli(R0, R0, 100); 2508 2509 const Register tmpReg = rtm_counters_Reg; 2510 ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg); 2511 mulli(tmpReg, tmpReg, RTMTotalCountIncrRate); // allowable range: int16 2512 mulli(tmpReg, tmpReg, RTMAbortRatio); // allowable range: int16 2513 cmpd(CCR0, R0, tmpReg); 2514 blt(CCR0, L_check_always_rtm1); // jump to reload 2515 if (method_data != NULL) { 2516 // Set rtm_state to "no rtm" in MDO. 
2517 // Not using a metadata relocation. Method and Class Loader are kept alive anyway. 2518 // (See nmethod::metadata_do and CodeBuffer::finalize_oop_references.) 2519 load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg); 2520 atomic_ori_int(R0, tmpReg, NoRTM); 2521 } 2522 b(L_done); 2523 2524 bind(L_check_always_rtm1); 2525 load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload 2526 bind(L_check_always_rtm2); 2527 ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg); 2528 int64_t thresholdValue = RTMLockingThreshold / RTMTotalCountIncrRate; 2529 if (is_simm(thresholdValue, 16)) { // cmpdi can handle 16bit immediate only. 2530 cmpdi(CCR0, tmpReg, thresholdValue); 2531 } else { 2532 load_const_optimized(R0, thresholdValue); 2533 cmpd(CCR0, tmpReg, R0); 2534 } 2535 blt(CCR0, L_done); 2536 if (method_data != NULL) { 2537 // Set rtm_state to "always rtm" in MDO. 2538 // Not using a metadata relocation. See above. 2539 load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg); 2540 atomic_ori_int(R0, tmpReg, UseRTM); 2541 } 2542 bind(L_done); 2543 } 2544 2545 // Update counters and perform abort ratio calculation. 2546 // input: abort_status_Reg 2547 void MacroAssembler::rtm_profiling(Register abort_status_Reg, Register temp_Reg, 2548 RTMLockingCounters* rtm_counters, 2549 Metadata* method_data, 2550 bool profile_rtm) { 2551 2552 assert(rtm_counters != NULL, "should not be NULL when profiling RTM"); 2553 // Update rtm counters based on state at abort. 2554 // Reads abort_status_Reg, updates flags. 2555 assert_different_registers(abort_status_Reg, temp_Reg); 2556 load_const_optimized(temp_Reg, (address)rtm_counters, R0); 2557 rtm_counters_update(abort_status_Reg, temp_Reg); 2558 if (profile_rtm) { 2559 assert(rtm_counters != NULL, "should not be NULL when profiling RTM"); 2560 rtm_abort_ratio_calculation(temp_Reg, rtm_counters, method_data); 2561 } 2562 } 2563 2564 // Retry on abort if abort's status indicates non-persistent failure. 2565 // inputs: retry_count_Reg 2566 // : abort_status_Reg 2567 // output: retry_count_Reg decremented by 1 2568 void MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, 2569 Label& retryLabel, Label* checkRetry) { 2570 Label doneRetry; 2571 rldicr_(R0, abort_status_Reg, tm_failure_persistent, 0); 2572 bne(CCR0, doneRetry); 2573 if (checkRetry) { bind(*checkRetry); } 2574 addic_(retry_count_Reg, retry_count_Reg, -1); 2575 blt(CCR0, doneRetry); 2576 b(retryLabel); 2577 bind(doneRetry); 2578 } 2579 2580 // Spin and retry if lock is busy. 
2581 // inputs: owner_addr_Reg (monitor address) 2582 // : retry_count_Reg 2583 // output: retry_count_Reg decremented by 1 2584 // CTR is killed 2585 void MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register owner_addr_Reg, Label& retryLabel) { 2586 Label SpinLoop, doneRetry, doRetry; 2587 addic_(retry_count_Reg, retry_count_Reg, -1); 2588 blt(CCR0, doneRetry); 2589 2590 if (RTMSpinLoopCount > 1) { 2591 li(R0, RTMSpinLoopCount); 2592 mtctr(R0); 2593 } 2594 2595 // low thread priority 2596 smt_prio_low(); 2597 bind(SpinLoop); 2598 2599 if (RTMSpinLoopCount > 1) { 2600 bdz(doRetry); 2601 ld(R0, 0, owner_addr_Reg); 2602 cmpdi(CCR0, R0, 0); 2603 bne(CCR0, SpinLoop); 2604 } 2605 2606 bind(doRetry); 2607 2608 // restore thread priority to default in userspace 2609 #ifdef LINUX 2610 smt_prio_medium_low(); 2611 #else 2612 smt_prio_medium(); 2613 #endif 2614 2615 b(retryLabel); 2616 2617 bind(doneRetry); 2618 } 2619 2620 // Use RTM for normal stack locks. 2621 // Input: objReg (object to lock) 2622 void MacroAssembler::rtm_stack_locking(ConditionRegister flag, 2623 Register obj, Register mark_word, Register tmp, 2624 Register retry_on_abort_count_Reg, 2625 RTMLockingCounters* stack_rtm_counters, 2626 Metadata* method_data, bool profile_rtm, 2627 Label& DONE_LABEL, Label& IsInflated) { 2628 assert(UseRTMForStackLocks, "why call this otherwise?"); 2629 assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking"); 2630 Label L_rtm_retry, L_decrement_retry, L_on_abort; 2631 2632 if (RTMRetryCount > 0) { 2633 load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort 2634 bind(L_rtm_retry); 2635 } 2636 andi_(R0, mark_word, markOopDesc::monitor_value); // inflated vs stack-locked|neutral|biased 2637 bne(CCR0, IsInflated); 2638 2639 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 2640 Label L_noincrement; 2641 if (RTMTotalCountIncrRate > 1) { 2642 branch_on_random_using_tb(tmp, RTMTotalCountIncrRate, L_noincrement); 2643 } 2644 assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM"); 2645 load_const_optimized(tmp, (address)stack_rtm_counters->total_count_addr(), R0); 2646 //atomic_inc_ptr(tmp, /*temp, will be reloaded*/mark_word); We don't increment atomically 2647 ldx(mark_word, tmp); 2648 addi(mark_word, mark_word, 1); 2649 stdx(mark_word, tmp); 2650 bind(L_noincrement); 2651 } 2652 tbegin_(); 2653 beq(CCR0, L_on_abort); 2654 ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // Reload in transaction, conflicts need to be tracked. 2655 andi(R0, mark_word, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits 2656 cmpwi(flag, R0, markOopDesc::unlocked_value); // bits = 001 unlocked 2657 beq(flag, DONE_LABEL); // all done if unlocked 2658 2659 if (UseRTMXendForLockBusy) { 2660 tend_(); 2661 b(L_decrement_retry); 2662 } else { 2663 tabort_(); 2664 } 2665 bind(L_on_abort); 2666 const Register abort_status_Reg = tmp; 2667 mftexasr(abort_status_Reg); 2668 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 2669 rtm_profiling(abort_status_Reg, /*temp*/mark_word, stack_rtm_counters, method_data, profile_rtm); 2670 } 2671 ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // reload 2672 if (RTMRetryCount > 0) { 2673 // Retry on lock abort if abort status is not permanent. 
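    // Branches back to L_rtm_retry after decrementing the retry counter, unless the
    // persistent-failure bit is set or the counter is exhausted; L_decrement_retry is
    // bound inside so the lock-busy path below can reuse the same counter check.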
2674 rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry, &L_decrement_retry); 2675 } else { 2676 bind(L_decrement_retry); 2677 } 2678 } 2679 2680 // Use RTM for inflating locks 2681 // inputs: obj (object to lock) 2682 // mark_word (current header - KILLED) 2683 // boxReg (on-stack box address (displaced header location) - KILLED) 2684 void MacroAssembler::rtm_inflated_locking(ConditionRegister flag, 2685 Register obj, Register mark_word, Register boxReg, 2686 Register retry_on_busy_count_Reg, Register retry_on_abort_count_Reg, 2687 RTMLockingCounters* rtm_counters, 2688 Metadata* method_data, bool profile_rtm, 2689 Label& DONE_LABEL) { 2690 assert(UseRTMLocking, "why call this otherwise?"); 2691 Label L_rtm_retry, L_decrement_retry, L_on_abort; 2692 // Clean monitor_value bit to get valid pointer. 2693 int owner_offset = ObjectMonitor::owner_offset_in_bytes() - markOopDesc::monitor_value; 2694 2695 // Store non-null, using boxReg instead of (intptr_t)markOopDesc::unused_mark(). 2696 std(boxReg, BasicLock::displaced_header_offset_in_bytes(), boxReg); 2697 const Register tmpReg = boxReg; 2698 const Register owner_addr_Reg = mark_word; 2699 addi(owner_addr_Reg, mark_word, owner_offset); 2700 2701 if (RTMRetryCount > 0) { 2702 load_const_optimized(retry_on_busy_count_Reg, RTMRetryCount); // Retry on lock busy. 2703 load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort. 2704 bind(L_rtm_retry); 2705 } 2706 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 2707 Label L_noincrement; 2708 if (RTMTotalCountIncrRate > 1) { 2709 branch_on_random_using_tb(R0, RTMTotalCountIncrRate, L_noincrement); 2710 } 2711 assert(rtm_counters != NULL, "should not be NULL when profiling RTM"); 2712 load_const(R0, (address)rtm_counters->total_count_addr(), tmpReg); 2713 //atomic_inc_ptr(R0, tmpReg); We don't increment atomically 2714 ldx(tmpReg, R0); 2715 addi(tmpReg, tmpReg, 1); 2716 stdx(tmpReg, R0); 2717 bind(L_noincrement); 2718 } 2719 tbegin_(); 2720 beq(CCR0, L_on_abort); 2721 // We don't reload mark word. Will only be reset at safepoint. 2722 ld(R0, 0, owner_addr_Reg); // Load in transaction, conflicts need to be tracked. 2723 cmpdi(flag, R0, 0); 2724 beq(flag, DONE_LABEL); 2725 2726 if (UseRTMXendForLockBusy) { 2727 tend_(); 2728 b(L_decrement_retry); 2729 } else { 2730 tabort_(); 2731 } 2732 bind(L_on_abort); 2733 const Register abort_status_Reg = tmpReg; 2734 mftexasr(abort_status_Reg); 2735 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 2736 rtm_profiling(abort_status_Reg, /*temp*/ owner_addr_Reg, rtm_counters, method_data, profile_rtm); 2737 // Restore owner_addr_Reg 2738 ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); 2739 #ifdef ASSERT 2740 andi_(R0, mark_word, markOopDesc::monitor_value); 2741 asm_assert_ne("must be inflated", 0xa754); // Deflating only allowed at safepoint. 2742 #endif 2743 addi(owner_addr_Reg, mark_word, owner_offset); 2744 } 2745 if (RTMRetryCount > 0) { 2746 // Retry on lock abort if abort status is not permanent. 2747 rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry); 2748 } 2749 2750 // Appears unlocked - try to swing _owner from null to non-null. 
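  // Roughly: if (cmpxchg(&monitor->_owner, NULL, R16_thread) != NULL) goto L_decrement_retry;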
2751 cmpxchgd(flag, /*current val*/ R0, (intptr_t)0, /*new val*/ R16_thread, owner_addr_Reg, 2752 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq, 2753 MacroAssembler::cmpxchgx_hint_acquire_lock(), noreg, &L_decrement_retry, true); 2754 2755 if (RTMRetryCount > 0) { 2756 // success done else retry 2757 b(DONE_LABEL); 2758 bind(L_decrement_retry); 2759 // Spin and retry if lock is busy. 2760 rtm_retry_lock_on_busy(retry_on_busy_count_Reg, owner_addr_Reg, L_rtm_retry); 2761 } else { 2762 bind(L_decrement_retry); 2763 } 2764 } 2765 2766 #endif // INCLUDE_RTM_OPT 2767 2768 // "The box" is the space on the stack where we copy the object mark. 2769 void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register oop, Register box, 2770 Register temp, Register displaced_header, Register current_header, 2771 bool try_bias, 2772 RTMLockingCounters* rtm_counters, 2773 RTMLockingCounters* stack_rtm_counters, 2774 Metadata* method_data, 2775 bool use_rtm, bool profile_rtm) { 2776 assert_different_registers(oop, box, temp, displaced_header, current_header); 2777 assert(flag != CCR0, "bad condition register"); 2778 Label cont; 2779 Label object_has_monitor; 2780 Label cas_failed; 2781 2782 // Load markOop from object into displaced_header. 2783 ld(displaced_header, oopDesc::mark_offset_in_bytes(), oop); 2784 2785 2786 // Always do locking in runtime. 2787 if (EmitSync & 0x01) { 2788 cmpdi(flag, oop, 0); // Oop can't be 0 here => always false. 2789 return; 2790 } 2791 2792 if (try_bias) { 2793 biased_locking_enter(flag, oop, displaced_header, temp, current_header, cont); 2794 } 2795 2796 #if INCLUDE_RTM_OPT 2797 if (UseRTMForStackLocks && use_rtm) { 2798 rtm_stack_locking(flag, oop, displaced_header, temp, /*temp*/ current_header, 2799 stack_rtm_counters, method_data, profile_rtm, 2800 cont, object_has_monitor); 2801 } 2802 #endif // INCLUDE_RTM_OPT 2803 2804 // Handle existing monitor. 2805 if ((EmitSync & 0x02) == 0) { 2806 // The object has an existing monitor iff (mark & monitor_value) != 0. 2807 andi_(temp, displaced_header, markOopDesc::monitor_value); 2808 bne(CCR0, object_has_monitor); 2809 } 2810 2811 // Set displaced_header to be (markOop of object | UNLOCK_VALUE). 2812 ori(displaced_header, displaced_header, markOopDesc::unlocked_value); 2813 2814 // Load Compare Value application register. 2815 2816 // Initialize the box. (Must happen before we update the object mark!) 2817 std(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box); 2818 2819 // Must fence, otherwise, preceding store(s) may float below cmpxchg. 2820 // Compare object markOop with mark and if equal exchange scratch1 with object markOop. 2821 cmpxchgd(/*flag=*/flag, 2822 /*current_value=*/current_header, 2823 /*compare_value=*/displaced_header, 2824 /*exchange_value=*/box, 2825 /*where=*/oop, 2826 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq, 2827 MacroAssembler::cmpxchgx_hint_acquire_lock(), 2828 noreg, 2829 &cas_failed, 2830 /*check without membar and ldarx first*/true); 2831 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 2832 2833 // If the compare-and-exchange succeeded, then we found an unlocked 2834 // object and we have now locked it. 2835 b(cont); 2836 2837 bind(cas_failed); 2838 // We did not see an unlocked object so try the fast recursive case. 2839 2840 // Check if the owner is self by comparing the value in the markOop of object 2841 // (current_header) with the stack pointer. 
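  // That is: the lock is recursive iff ((mark - SP) & (~(page_size-1) | lock_mask)) == 0,
  // i.e. the displaced header lies within one page of our SP and the low lock bits are clear.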
2842 sub(current_header, current_header, R1_SP); 2843 load_const_optimized(temp, ~(os::vm_page_size()-1) | markOopDesc::lock_mask_in_place); 2844 2845 and_(R0/*==0?*/, current_header, temp); 2846 // If condition is true we are cont and hence we can store 0 as the 2847 // displaced header in the box, which indicates that it is a recursive lock. 2848 mcrf(flag,CCR0); 2849 std(R0/*==0, perhaps*/, BasicLock::displaced_header_offset_in_bytes(), box); 2850 2851 // Handle existing monitor. 2852 if ((EmitSync & 0x02) == 0) { 2853 b(cont); 2854 2855 bind(object_has_monitor); 2856 // The object's monitor m is unlocked iff m->owner == NULL, 2857 // otherwise m->owner may contain a thread or a stack address. 2858 2859 #if INCLUDE_RTM_OPT 2860 // Use the same RTM locking code in 32- and 64-bit VM. 2861 if (use_rtm) { 2862 rtm_inflated_locking(flag, oop, displaced_header, box, temp, /*temp*/ current_header, 2863 rtm_counters, method_data, profile_rtm, cont); 2864 } else { 2865 #endif // INCLUDE_RTM_OPT 2866 2867 // Try to CAS m->owner from NULL to current thread. 2868 addi(temp, displaced_header, ObjectMonitor::owner_offset_in_bytes()-markOopDesc::monitor_value); 2869 cmpxchgd(/*flag=*/flag, 2870 /*current_value=*/current_header, 2871 /*compare_value=*/(intptr_t)0, 2872 /*exchange_value=*/R16_thread, 2873 /*where=*/temp, 2874 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq, 2875 MacroAssembler::cmpxchgx_hint_acquire_lock()); 2876 2877 // Store a non-null value into the box. 2878 std(box, BasicLock::displaced_header_offset_in_bytes(), box); 2879 2880 # ifdef ASSERT 2881 bne(flag, cont); 2882 // We have acquired the monitor, check some invariants. 2883 addi(/*monitor=*/temp, temp, -ObjectMonitor::owner_offset_in_bytes()); 2884 // Invariant 1: _recursions should be 0. 2885 //assert(ObjectMonitor::recursions_size_in_bytes() == 8, "unexpected size"); 2886 asm_assert_mem8_is_zero(ObjectMonitor::recursions_offset_in_bytes(), temp, 2887 "monitor->_recursions should be 0", -1); 2888 # endif 2889 2890 #if INCLUDE_RTM_OPT 2891 } // use_rtm() 2892 #endif 2893 } 2894 2895 bind(cont); 2896 // flag == EQ indicates success 2897 // flag == NE indicates failure 2898 } 2899 2900 void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register oop, Register box, 2901 Register temp, Register displaced_header, Register current_header, 2902 bool try_bias, bool use_rtm) { 2903 assert_different_registers(oop, box, temp, displaced_header, current_header); 2904 assert(flag != CCR0, "bad condition register"); 2905 Label cont; 2906 Label object_has_monitor; 2907 2908 // Always do locking in runtime. 2909 if (EmitSync & 0x01) { 2910 cmpdi(flag, oop, 0); // Oop can't be 0 here => always false. 2911 return; 2912 } 2913 2914 if (try_bias) { 2915 biased_locking_exit(flag, oop, current_header, cont); 2916 } 2917 2918 #if INCLUDE_RTM_OPT 2919 if (UseRTMForStackLocks && use_rtm) { 2920 assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking"); 2921 Label L_regular_unlock; 2922 ld(current_header, oopDesc::mark_offset_in_bytes(), oop); // fetch markword 2923 andi(R0, current_header, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits 2924 cmpwi(flag, R0, markOopDesc::unlocked_value); // bits = 001 unlocked 2925 bne(flag, L_regular_unlock); // else RegularLock 2926 tend_(); // otherwise end... 2927 b(cont); // ... and we're done 2928 bind(L_regular_unlock); 2929 } 2930 #endif 2931 2932 // Find the lock address and load the displaced header from the stack. 
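  // Fast unlock, in outline:
  //   if (box->displaced_header == 0)                           -> recursive unlock, done
  //   else if (CAS(&obj->mark, box, displaced_header) succeeds) -> lightweight unlock, done
  //   else                                                      -> inflated monitor path below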
  ld(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);

  // If the displaced header is 0, we have a recursive unlock.
  cmpdi(flag, displaced_header, 0);
  beq(flag, cont);

  // Handle existing monitor.
  if ((EmitSync & 0x02) == 0) {
    // The object has an existing monitor iff (mark & monitor_value) != 0.
    RTM_OPT_ONLY( if (!(UseRTMForStackLocks && use_rtm)) ) // skip load if already done
    ld(current_header, oopDesc::mark_offset_in_bytes(), oop);
    andi_(R0, current_header, markOopDesc::monitor_value);
    bne(CCR0, object_has_monitor);
  }

  // Check if it is still a lightweight lock; this is true if we see
  // the stack address of the basicLock in the markOop of the object.
  // Cmpxchg sets flag to cmpd(current_header, box).
  cmpxchgd(/*flag=*/flag,
           /*current_value=*/current_header,
           /*compare_value=*/box,
           /*exchange_value=*/displaced_header,
           /*where=*/oop,
           MacroAssembler::MemBarRel,
           MacroAssembler::cmpxchgx_hint_release_lock(),
           noreg,
           &cont);

  assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

  // Handle existing monitor.
  if ((EmitSync & 0x02) == 0) {
    b(cont);

    bind(object_has_monitor);
    addi(current_header, current_header, -markOopDesc::monitor_value); // monitor
    ld(temp, ObjectMonitor::owner_offset_in_bytes(), current_header);

    // It's inflated.
#if INCLUDE_RTM_OPT
    if (use_rtm) {
      Label L_regular_inflated_unlock;
      // Clean monitor_value bit to get valid pointer
      cmpdi(flag, temp, 0);
      bne(flag, L_regular_inflated_unlock);
      tend_();
      b(cont);
      bind(L_regular_inflated_unlock);
    }
#endif

    ld(displaced_header, ObjectMonitor::recursions_offset_in_bytes(), current_header);
    xorr(temp, R16_thread, temp);      // Will be 0 if we are the owner.
    orr(temp, temp, displaced_header); // Will be 0 if there are 0 recursions.
    cmpdi(flag, temp, 0);
    bne(flag, cont);

    ld(temp, ObjectMonitor::EntryList_offset_in_bytes(), current_header);
    ld(displaced_header, ObjectMonitor::cxq_offset_in_bytes(), current_header);
    orr(temp, temp, displaced_header); // Will be 0 if both are 0.
    cmpdi(flag, temp, 0);
    bne(flag, cont);
    release();
    std(temp, ObjectMonitor::owner_offset_in_bytes(), current_header);
  }

  bind(cont);
  // flag == EQ indicates success
  // flag == NE indicates failure
}

// Write serialization page so VM thread can do a pseudo remote membar.
// We use the current thread pointer to calculate a thread-specific
// offset to write to within the page. This minimizes bus traffic
// due to cache line collision.
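// Conceptually (a sketch, not the literal emitted code):
//   store_word(serialize_page + ((thread >> serialize_page_shift) & (page_size - sizeof(int))));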
3008 void MacroAssembler::serialize_memory(Register thread, Register tmp1, Register tmp2) { 3009 srdi(tmp2, thread, os::get_serialize_page_shift_count()); 3010 3011 int mask = os::vm_page_size() - sizeof(int); 3012 if (Assembler::is_simm(mask, 16)) { 3013 andi(tmp2, tmp2, mask); 3014 } else { 3015 lis(tmp1, (int)((signed short) (mask >> 16))); 3016 ori(tmp1, tmp1, mask & 0x0000ffff); 3017 andr(tmp2, tmp2, tmp1); 3018 } 3019 3020 load_const(tmp1, (long) os::get_memory_serialize_page()); 3021 release(); 3022 stwx(R0, tmp1, tmp2); 3023 } 3024 3025 void MacroAssembler::safepoint_poll(Label& slow_path, Register temp_reg) { 3026 if (SafepointMechanism::uses_thread_local_poll()) { 3027 ld(temp_reg, in_bytes(Thread::polling_page_offset()), R16_thread); 3028 // Armed page has poll_bit set. 3029 andi_(temp_reg, temp_reg, SafepointMechanism::poll_bit()); 3030 } else { 3031 lwz(temp_reg, (RegisterOrConstant)(intptr_t)SafepointSynchronize::address_of_state()); 3032 cmpwi(CCR0, temp_reg, SafepointSynchronize::_not_synchronized); 3033 } 3034 bne(CCR0, slow_path); 3035 } 3036 3037 void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2, bool needs_frame) { 3038 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 3039 bs->resolve_jobject(this, value, tmp1, tmp2, needs_frame); 3040 } 3041 3042 // Values for last_Java_pc, and last_Java_sp must comply to the rules 3043 // in frame_ppc.hpp. 3044 void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) { 3045 // Always set last_Java_pc and flags first because once last_Java_sp 3046 // is visible has_last_Java_frame is true and users will look at the 3047 // rest of the fields. (Note: flags should always be zero before we 3048 // get here so doesn't need to be set.) 3049 3050 // Verify that last_Java_pc was zeroed on return to Java 3051 asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread, 3052 "last_Java_pc not zeroed before leaving Java", 0x200); 3053 3054 // When returning from calling out from Java mode the frame anchor's 3055 // last_Java_pc will always be set to NULL. It is set here so that 3056 // if we are doing a call to native (not VM) that we capture the 3057 // known pc and don't have to rely on the native call having a 3058 // standard frame linkage where we can find the pc. 3059 if (last_Java_pc != noreg) 3060 std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread); 3061 3062 // Set last_Java_sp last. 3063 std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread); 3064 } 3065 3066 void MacroAssembler::reset_last_Java_frame(void) { 3067 asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()), 3068 R16_thread, "SP was not set, still zero", 0x202); 3069 3070 BLOCK_COMMENT("reset_last_Java_frame {"); 3071 li(R0, 0); 3072 3073 // _last_Java_sp = 0 3074 std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread); 3075 3076 // _last_Java_pc = 0 3077 std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread); 3078 BLOCK_COMMENT("} reset_last_Java_frame"); 3079 } 3080 3081 void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1) { 3082 assert_different_registers(sp, tmp1); 3083 3084 // sp points to a TOP_IJAVA_FRAME, retrieve frame's PC via 3085 // TOP_IJAVA_FRAME_ABI. 3086 // FIXME: assert that we really have a TOP_IJAVA_FRAME here! 
3087 address entry = pc(); 3088 load_const_optimized(tmp1, entry); 3089 3090 set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1); 3091 } 3092 3093 void MacroAssembler::get_vm_result(Register oop_result) { 3094 // Read: 3095 // R16_thread 3096 // R16_thread->in_bytes(JavaThread::vm_result_offset()) 3097 // 3098 // Updated: 3099 // oop_result 3100 // R16_thread->in_bytes(JavaThread::vm_result_offset()) 3101 3102 verify_thread(); 3103 3104 ld(oop_result, in_bytes(JavaThread::vm_result_offset()), R16_thread); 3105 li(R0, 0); 3106 std(R0, in_bytes(JavaThread::vm_result_offset()), R16_thread); 3107 3108 verify_oop(oop_result); 3109 } 3110 3111 void MacroAssembler::get_vm_result_2(Register metadata_result) { 3112 // Read: 3113 // R16_thread 3114 // R16_thread->in_bytes(JavaThread::vm_result_2_offset()) 3115 // 3116 // Updated: 3117 // metadata_result 3118 // R16_thread->in_bytes(JavaThread::vm_result_2_offset()) 3119 3120 ld(metadata_result, in_bytes(JavaThread::vm_result_2_offset()), R16_thread); 3121 li(R0, 0); 3122 std(R0, in_bytes(JavaThread::vm_result_2_offset()), R16_thread); 3123 } 3124 3125 Register MacroAssembler::encode_klass_not_null(Register dst, Register src) { 3126 Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided. 3127 if (Universe::narrow_klass_base() != 0) { 3128 // Use dst as temp if it is free. 3129 sub_const_optimized(dst, current, Universe::narrow_klass_base(), R0); 3130 current = dst; 3131 } 3132 if (Universe::narrow_klass_shift() != 0) { 3133 srdi(dst, current, Universe::narrow_klass_shift()); 3134 current = dst; 3135 } 3136 return current; 3137 } 3138 3139 void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) { 3140 if (UseCompressedClassPointers) { 3141 Register compressedKlass = encode_klass_not_null(ck, klass); 3142 stw(compressedKlass, oopDesc::klass_offset_in_bytes(), dst_oop); 3143 } else { 3144 std(klass, oopDesc::klass_offset_in_bytes(), dst_oop); 3145 } 3146 } 3147 3148 void MacroAssembler::store_klass_gap(Register dst_oop, Register val) { 3149 if (UseCompressedClassPointers) { 3150 if (val == noreg) { 3151 val = R0; 3152 li(val, 0); 3153 } 3154 stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop); // klass gap if compressed 3155 } 3156 } 3157 3158 int MacroAssembler::instr_size_for_decode_klass_not_null() { 3159 if (!UseCompressedClassPointers) return 0; 3160 int num_instrs = 1; // shift or move 3161 if (Universe::narrow_klass_base() != 0) num_instrs = 7; // shift + load const + add 3162 return num_instrs * BytesPerInstWord; 3163 } 3164 3165 void MacroAssembler::decode_klass_not_null(Register dst, Register src) { 3166 assert(dst != R0, "Dst reg may not be R0, as R0 is used here."); 3167 if (src == noreg) src = dst; 3168 Register shifted_src = src; 3169 if (Universe::narrow_klass_shift() != 0 || 3170 Universe::narrow_klass_base() == 0 && src != dst) { // Move required. 3171 shifted_src = dst; 3172 sldi(shifted_src, src, Universe::narrow_klass_shift()); 3173 } 3174 if (Universe::narrow_klass_base() != 0) { 3175 add_const_optimized(dst, shifted_src, Universe::narrow_klass_base(), R0); 3176 } 3177 } 3178 3179 void MacroAssembler::load_klass(Register dst, Register src) { 3180 if (UseCompressedClassPointers) { 3181 lwz(dst, oopDesc::klass_offset_in_bytes(), src); 3182 // Attention: no null check here! 
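// Decompress in place: klass = ((uint64_t)narrow_klass << shift) + base.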
3183 decode_klass_not_null(dst, dst); 3184 } else { 3185 ld(dst, oopDesc::klass_offset_in_bytes(), src); 3186 } 3187 } 3188 3189 // ((OopHandle)result).resolve(); 3190 void MacroAssembler::resolve_oop_handle(Register result) { 3191 // OopHandle::resolve is an indirection. 3192 ld(result, 0, result); 3193 } 3194 3195 void MacroAssembler::load_mirror_from_const_method(Register mirror, Register const_method) { 3196 ld(mirror, in_bytes(ConstMethod::constants_offset()), const_method); 3197 ld(mirror, ConstantPool::pool_holder_offset_in_bytes(), mirror); 3198 ld(mirror, in_bytes(Klass::java_mirror_offset()), mirror); 3199 resolve_oop_handle(mirror); 3200 } 3201 3202 // Clear Array 3203 // For very short arrays. tmp == R0 is allowed. 3204 void MacroAssembler::clear_memory_unrolled(Register base_ptr, int cnt_dwords, Register tmp, int offset) { 3205 if (cnt_dwords > 0) { li(tmp, 0); } 3206 for (int i = 0; i < cnt_dwords; ++i) { std(tmp, offset + i * 8, base_ptr); } 3207 } 3208 3209 // Version for constant short array length. Kills base_ptr. tmp == R0 is allowed. 3210 void MacroAssembler::clear_memory_constlen(Register base_ptr, int cnt_dwords, Register tmp) { 3211 if (cnt_dwords < 8) { 3212 clear_memory_unrolled(base_ptr, cnt_dwords, tmp); 3213 return; 3214 } 3215 3216 Label loop; 3217 const long loopcnt = cnt_dwords >> 1, 3218 remainder = cnt_dwords & 1; 3219 3220 li(tmp, loopcnt); 3221 mtctr(tmp); 3222 li(tmp, 0); 3223 bind(loop); 3224 std(tmp, 0, base_ptr); 3225 std(tmp, 8, base_ptr); 3226 addi(base_ptr, base_ptr, 16); 3227 bdnz(loop); 3228 if (remainder) { std(tmp, 0, base_ptr); } 3229 } 3230 3231 // Kills both input registers. tmp == R0 is allowed. 3232 void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp, long const_cnt) { 3233 // Procedure for large arrays (uses data cache block zero instruction). 3234 Label startloop, fast, fastloop, small_rest, restloop, done; 3235 const int cl_size = VM_Version::L1_data_cache_line_size(), 3236 cl_dwords = cl_size >> 3, 3237 cl_dw_addr_bits = exact_log2(cl_dwords), 3238 dcbz_min = 1, // Min count of dcbz executions, needs to be >0. 3239 min_cnt = ((dcbz_min + 1) << cl_dw_addr_bits) - 1; 3240 3241 if (const_cnt >= 0) { 3242 // Constant case. 3243 if (const_cnt < min_cnt) { 3244 clear_memory_constlen(base_ptr, const_cnt, tmp); 3245 return; 3246 } 3247 load_const_optimized(cnt_dwords, const_cnt, tmp); 3248 } else { 3249 // cnt_dwords already loaded in register. Need to check size. 3250 cmpdi(CCR1, cnt_dwords, min_cnt); // Big enough? (ensure >= dcbz_min lines included). 3251 blt(CCR1, small_rest); 3252 } 3253 rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits); // Extract dword offset within first cache line. 3254 beq(CCR0, fast); // Already 128byte aligned. 3255 3256 subfic(tmp, tmp, cl_dwords); 3257 mtctr(tmp); // Set ctr to hit 128byte boundary (0<ctr<cl_dwords). 3258 subf(cnt_dwords, tmp, cnt_dwords); // rest. 3259 li(tmp, 0); 3260 3261 bind(startloop); // Clear at the beginning to reach 128byte boundary. 3262 std(tmp, 0, base_ptr); // Clear 8byte aligned block. 3263 addi(base_ptr, base_ptr, 8); 3264 bdnz(startloop); 3265 3266 bind(fast); // Clear 128byte blocks. 3267 srdi(tmp, cnt_dwords, cl_dw_addr_bits); // Loop count for 128byte loop (>0). 3268 andi(cnt_dwords, cnt_dwords, cl_dwords-1); // Rest in dwords. 3269 mtctr(tmp); // Load counter. 3270 3271 bind(fastloop); 3272 dcbz(base_ptr); // Clear 128byte aligned block. 
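// Advance to the next cache line.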
3273 addi(base_ptr, base_ptr, cl_size); 3274 bdnz(fastloop); 3275 3276 bind(small_rest); 3277 cmpdi(CCR0, cnt_dwords, 0); // size 0? 3278 beq(CCR0, done); // rest == 0 3279 li(tmp, 0); 3280 mtctr(cnt_dwords); // Load counter. 3281 3282 bind(restloop); // Clear rest. 3283 std(tmp, 0, base_ptr); // Clear 8byte aligned block. 3284 addi(base_ptr, base_ptr, 8); 3285 bdnz(restloop); 3286 3287 bind(done); 3288 } 3289 3290 /////////////////////////////////////////// String intrinsics //////////////////////////////////////////// 3291 3292 #ifdef COMPILER2 3293 // Intrinsics for CompactStrings 3294 3295 // Compress char[] to byte[] by compressing 16 bytes at once. 3296 void MacroAssembler::string_compress_16(Register src, Register dst, Register cnt, 3297 Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, 3298 Label& Lfailure) { 3299 3300 const Register tmp0 = R0; 3301 assert_different_registers(src, dst, cnt, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5); 3302 Label Lloop, Lslow; 3303 3304 // Check if cnt >= 8 (= 16 bytes) 3305 lis(tmp1, 0xFF); // tmp1 = 0x00FF00FF00FF00FF 3306 srwi_(tmp2, cnt, 3); 3307 beq(CCR0, Lslow); 3308 ori(tmp1, tmp1, 0xFF); 3309 rldimi(tmp1, tmp1, 32, 0); 3310 mtctr(tmp2); 3311 3312 // 2x unrolled loop 3313 bind(Lloop); 3314 ld(tmp2, 0, src); // _0_1_2_3 (Big Endian) 3315 ld(tmp4, 8, src); // _4_5_6_7 3316 3317 orr(tmp0, tmp2, tmp4); 3318 rldicl(tmp3, tmp2, 6*8, 64-24); // _____1_2 3319 rldimi(tmp2, tmp2, 2*8, 2*8); // _0_2_3_3 3320 rldicl(tmp5, tmp4, 6*8, 64-24); // _____5_6 3321 rldimi(tmp4, tmp4, 2*8, 2*8); // _4_6_7_7 3322 3323 andc_(tmp0, tmp0, tmp1); 3324 bne(CCR0, Lfailure); // Not latin1. 3325 addi(src, src, 16); 3326 3327 rlwimi(tmp3, tmp2, 0*8, 24, 31);// _____1_3 3328 srdi(tmp2, tmp2, 3*8); // ____0_2_ 3329 rlwimi(tmp5, tmp4, 0*8, 24, 31);// _____5_7 3330 srdi(tmp4, tmp4, 3*8); // ____4_6_ 3331 3332 orr(tmp2, tmp2, tmp3); // ____0123 3333 orr(tmp4, tmp4, tmp5); // ____4567 3334 3335 stw(tmp2, 0, dst); 3336 stw(tmp4, 4, dst); 3337 addi(dst, dst, 8); 3338 bdnz(Lloop); 3339 3340 bind(Lslow); // Fallback to slow version 3341 } 3342 3343 // Compress char[] to byte[]. cnt must be positive int. 3344 void MacroAssembler::string_compress(Register src, Register dst, Register cnt, Register tmp, Label& Lfailure) { 3345 Label Lloop; 3346 mtctr(cnt); 3347 3348 bind(Lloop); 3349 lhz(tmp, 0, src); 3350 cmplwi(CCR0, tmp, 0xff); 3351 bgt(CCR0, Lfailure); // Not latin1. 3352 addi(src, src, 2); 3353 stb(tmp, 0, dst); 3354 addi(dst, dst, 1); 3355 bdnz(Lloop); 3356 } 3357 3358 // Inflate byte[] to char[] by inflating 16 bytes at once. 
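// Semantically equivalent scalar loop (illustrative sketch):
//   for (int i = 0; i < cnt; i++) { dst[i] = (jchar)(src[i] & 0xff); }
// The unrolled version below expands 8 source bytes (16 destination bytes)
// per iteration using word loads and doubleword stores.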
3359 void MacroAssembler::string_inflate_16(Register src, Register dst, Register cnt, 3360 Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) { 3361 const Register tmp0 = R0; 3362 assert_different_registers(src, dst, cnt, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5); 3363 Label Lloop, Lslow; 3364 3365 // Check if cnt >= 8 3366 srwi_(tmp2, cnt, 3); 3367 beq(CCR0, Lslow); 3368 lis(tmp1, 0xFF); // tmp1 = 0x00FF00FF 3369 ori(tmp1, tmp1, 0xFF); 3370 mtctr(tmp2); 3371 3372 // 2x unrolled loop 3373 bind(Lloop); 3374 lwz(tmp2, 0, src); // ____0123 (Big Endian) 3375 lwz(tmp4, 4, src); // ____4567 3376 addi(src, src, 8); 3377 3378 rldicl(tmp3, tmp2, 7*8, 64-8); // _______2 3379 rlwimi(tmp2, tmp2, 3*8, 16, 23);// ____0113 3380 rldicl(tmp5, tmp4, 7*8, 64-8); // _______6 3381 rlwimi(tmp4, tmp4, 3*8, 16, 23);// ____4557 3382 3383 andc(tmp0, tmp2, tmp1); // ____0_1_ 3384 rlwimi(tmp2, tmp3, 2*8, 0, 23); // _____2_3 3385 andc(tmp3, tmp4, tmp1); // ____4_5_ 3386 rlwimi(tmp4, tmp5, 2*8, 0, 23); // _____6_7 3387 3388 rldimi(tmp2, tmp0, 3*8, 0*8); // _0_1_2_3 3389 rldimi(tmp4, tmp3, 3*8, 0*8); // _4_5_6_7 3390 3391 std(tmp2, 0, dst); 3392 std(tmp4, 8, dst); 3393 addi(dst, dst, 16); 3394 bdnz(Lloop); 3395 3396 bind(Lslow); // Fallback to slow version 3397 } 3398 3399 // Inflate byte[] to char[]. cnt must be positive int. 3400 void MacroAssembler::string_inflate(Register src, Register dst, Register cnt, Register tmp) { 3401 Label Lloop; 3402 mtctr(cnt); 3403 3404 bind(Lloop); 3405 lbz(tmp, 0, src); 3406 addi(src, src, 1); 3407 sth(tmp, 0, dst); 3408 addi(dst, dst, 2); 3409 bdnz(Lloop); 3410 } 3411 3412 void MacroAssembler::string_compare(Register str1, Register str2, 3413 Register cnt1, Register cnt2, 3414 Register tmp1, Register result, int ae) { 3415 const Register tmp0 = R0, 3416 diff = tmp1; 3417 3418 assert_different_registers(str1, str2, cnt1, cnt2, tmp0, tmp1, result); 3419 Label Ldone, Lslow, Lloop, Lreturn_diff; 3420 3421 // Note: Making use of the fact that compareTo(a, b) == -compareTo(b, a) 3422 // we interchange str1 and str2 in the UL case and negate the result. 3423 // Like this, str1 is always latin1 encoded, except for the UU case. 3424 // In addition, we need 0 (or sign which is 0) extend. 3425 3426 if (ae == StrIntrinsicNode::UU) { 3427 srwi(cnt1, cnt1, 1); 3428 } else { 3429 clrldi(cnt1, cnt1, 32); 3430 } 3431 3432 if (ae != StrIntrinsicNode::LL) { 3433 srwi(cnt2, cnt2, 1); 3434 } else { 3435 clrldi(cnt2, cnt2, 32); 3436 } 3437 3438 // See if the lengths are different, and calculate min in cnt1. 3439 // Save diff in case we need it for a tie-breaker. 3440 subf_(diff, cnt2, cnt1); // diff = cnt1 - cnt2 3441 // if (diff > 0) { cnt1 = cnt2; } 3442 if (VM_Version::has_isel()) { 3443 isel(cnt1, CCR0, Assembler::greater, /*invert*/ false, cnt2); 3444 } else { 3445 Label Lskip; 3446 blt(CCR0, Lskip); 3447 mr(cnt1, cnt2); 3448 bind(Lskip); 3449 } 3450 3451 // Rename registers 3452 Register chr1 = result; 3453 Register chr2 = tmp0; 3454 3455 // Compare multiple characters in fast loop (only implemented for same encoding). 3456 int stride1 = 8, stride2 = 8; 3457 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3458 int log2_chars_per_iter = (ae == StrIntrinsicNode::LL) ? 3 : 2; 3459 Label Lfastloop, Lskipfast; 3460 3461 srwi_(tmp0, cnt1, log2_chars_per_iter); 3462 beq(CCR0, Lskipfast); 3463 rldicl(cnt2, cnt1, 0, 64 - log2_chars_per_iter); // Remaining characters. 
3464 li(cnt1, 1 << log2_chars_per_iter); // Initialize for failure case: Rescan characters from current iteration. 3465 mtctr(tmp0); 3466 3467 bind(Lfastloop); 3468 ld(chr1, 0, str1); 3469 ld(chr2, 0, str2); 3470 cmpd(CCR0, chr1, chr2); 3471 bne(CCR0, Lslow); 3472 addi(str1, str1, stride1); 3473 addi(str2, str2, stride2); 3474 bdnz(Lfastloop); 3475 mr(cnt1, cnt2); // Remaining characters. 3476 bind(Lskipfast); 3477 } 3478 3479 // Loop which searches the first difference character by character. 3480 cmpwi(CCR0, cnt1, 0); 3481 beq(CCR0, Lreturn_diff); 3482 bind(Lslow); 3483 mtctr(cnt1); 3484 3485 switch (ae) { 3486 case StrIntrinsicNode::LL: stride1 = 1; stride2 = 1; break; 3487 case StrIntrinsicNode::UL: // fallthru (see comment above) 3488 case StrIntrinsicNode::LU: stride1 = 1; stride2 = 2; break; 3489 case StrIntrinsicNode::UU: stride1 = 2; stride2 = 2; break; 3490 default: ShouldNotReachHere(); break; 3491 } 3492 3493 bind(Lloop); 3494 if (stride1 == 1) { lbz(chr1, 0, str1); } else { lhz(chr1, 0, str1); } 3495 if (stride2 == 1) { lbz(chr2, 0, str2); } else { lhz(chr2, 0, str2); } 3496 subf_(result, chr2, chr1); // result = chr1 - chr2 3497 bne(CCR0, Ldone); 3498 addi(str1, str1, stride1); 3499 addi(str2, str2, stride2); 3500 bdnz(Lloop); 3501 3502 // If strings are equal up to min length, return the length difference. 3503 bind(Lreturn_diff); 3504 mr(result, diff); 3505 3506 // Otherwise, return the difference between the first mismatched chars. 3507 bind(Ldone); 3508 if (ae == StrIntrinsicNode::UL) { 3509 neg(result, result); // Negate result (see note above). 3510 } 3511 } 3512 3513 void MacroAssembler::array_equals(bool is_array_equ, Register ary1, Register ary2, 3514 Register limit, Register tmp1, Register result, bool is_byte) { 3515 const Register tmp0 = R0; 3516 assert_different_registers(ary1, ary2, limit, tmp0, tmp1, result); 3517 Label Ldone, Lskiploop, Lloop, Lfastloop, Lskipfast; 3518 bool limit_needs_shift = false; 3519 3520 if (is_array_equ) { 3521 const int length_offset = arrayOopDesc::length_offset_in_bytes(); 3522 const int base_offset = arrayOopDesc::base_offset_in_bytes(is_byte ? T_BYTE : T_CHAR); 3523 3524 // Return true if the same array. 3525 cmpd(CCR0, ary1, ary2); 3526 beq(CCR0, Lskiploop); 3527 3528 // Return false if one of them is NULL. 3529 cmpdi(CCR0, ary1, 0); 3530 cmpdi(CCR1, ary2, 0); 3531 li(result, 0); 3532 cror(CCR0, Assembler::equal, CCR1, Assembler::equal); 3533 beq(CCR0, Ldone); 3534 3535 // Load the lengths of arrays. 3536 lwz(limit, length_offset, ary1); 3537 lwz(tmp0, length_offset, ary2); 3538 3539 // Return false if the two arrays are not equal length. 3540 cmpw(CCR0, limit, tmp0); 3541 bne(CCR0, Ldone); 3542 3543 // Load array addresses. 3544 addi(ary1, ary1, base_offset); 3545 addi(ary2, ary2, base_offset); 3546 } else { 3547 limit_needs_shift = !is_byte; 3548 li(result, 0); // Assume not equal. 3549 } 3550 3551 // Rename registers 3552 Register chr1 = tmp0; 3553 Register chr2 = tmp1; 3554 3555 // Compare 8 bytes per iteration in fast loop. 3556 const int log2_chars_per_iter = is_byte ? 3 : 2; 3557 3558 srwi_(tmp0, limit, log2_chars_per_iter + (limit_needs_shift ? 1 : 0)); 3559 beq(CCR0, Lskipfast); 3560 mtctr(tmp0); 3561 3562 bind(Lfastloop); 3563 ld(chr1, 0, ary1); 3564 ld(chr2, 0, ary2); 3565 addi(ary1, ary1, 8); 3566 addi(ary2, ary2, 8); 3567 cmpd(CCR0, chr1, chr2); 3568 bne(CCR0, Ldone); 3569 bdnz(Lfastloop); 3570 3571 bind(Lskipfast); 3572 rldicl_(limit, limit, limit_needs_shift ? 
64 - 1 : 0, 64 - log2_chars_per_iter); // Remaining characters. 3573 beq(CCR0, Lskiploop); 3574 mtctr(limit); 3575 3576 // Character by character. 3577 bind(Lloop); 3578 if (is_byte) { 3579 lbz(chr1, 0, ary1); 3580 lbz(chr2, 0, ary2); 3581 addi(ary1, ary1, 1); 3582 addi(ary2, ary2, 1); 3583 } else { 3584 lhz(chr1, 0, ary1); 3585 lhz(chr2, 0, ary2); 3586 addi(ary1, ary1, 2); 3587 addi(ary2, ary2, 2); 3588 } 3589 cmpw(CCR0, chr1, chr2); 3590 bne(CCR0, Ldone); 3591 bdnz(Lloop); 3592 3593 bind(Lskiploop); 3594 li(result, 1); // All characters are equal. 3595 bind(Ldone); 3596 } 3597 3598 void MacroAssembler::string_indexof(Register result, Register haystack, Register haycnt, 3599 Register needle, ciTypeArray* needle_values, Register needlecnt, int needlecntval, 3600 Register tmp1, Register tmp2, Register tmp3, Register tmp4, int ae) { 3601 3602 // Ensure 0<needlecnt<=haycnt in ideal graph as prerequisite! 3603 Label L_TooShort, L_Found, L_NotFound, L_End; 3604 Register last_addr = haycnt, // Kill haycnt at the beginning. 3605 addr = tmp1, 3606 n_start = tmp2, 3607 ch1 = tmp3, 3608 ch2 = R0; 3609 3610 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 3611 const int h_csize = (ae == StrIntrinsicNode::LL) ? 1 : 2; 3612 const int n_csize = (ae == StrIntrinsicNode::UU) ? 2 : 1; 3613 3614 // ************************************************************************************************** 3615 // Prepare for main loop: optimized for needle count >=2, bail out otherwise. 3616 // ************************************************************************************************** 3617 3618 // Compute last haystack addr to use if no match gets found. 3619 clrldi(haycnt, haycnt, 32); // Ensure positive int is valid as 64 bit value. 3620 addi(addr, haystack, -h_csize); // Accesses use pre-increment. 3621 if (needlecntval == 0) { // variable needlecnt 3622 cmpwi(CCR6, needlecnt, 2); 3623 clrldi(needlecnt, needlecnt, 32); // Ensure positive int is valid as 64 bit value. 3624 blt(CCR6, L_TooShort); // Variable needlecnt: handle short needle separately. 3625 } 3626 3627 if (n_csize == 2) { lwz(n_start, 0, needle); } else { lhz(n_start, 0, needle); } // Load first 2 characters of needle. 3628 3629 if (needlecntval == 0) { // variable needlecnt 3630 subf(ch1, needlecnt, haycnt); // Last character index to compare is haycnt-needlecnt. 3631 addi(needlecnt, needlecnt, -2); // Rest of needle. 3632 } else { // constant needlecnt 3633 guarantee(needlecntval != 1, "IndexOf with single-character needle must be handled separately"); 3634 assert((needlecntval & 0x7fff) == needlecntval, "wrong immediate"); 3635 addi(ch1, haycnt, -needlecntval); // Last character index to compare is haycnt-needlecnt. 3636 if (needlecntval > 3) { li(needlecnt, needlecntval - 2); } // Rest of needle. 3637 } 3638 3639 if (h_csize == 2) { slwi(ch1, ch1, 1); } // Scale to number of bytes. 3640 3641 if (ae ==StrIntrinsicNode::UL) { 3642 srwi(tmp4, n_start, 1*8); // ___0 3643 rlwimi(n_start, tmp4, 2*8, 0, 23); // _0_1 3644 } 3645 3646 add(last_addr, haystack, ch1); // Point to last address to compare (haystack+2*(haycnt-needlecnt)). 3647 3648 // Main Loop (now we have at least 2 characters). 3649 Label L_OuterLoop, L_InnerLoop, L_FinalCheck, L_Comp1, L_Comp2; 3650 bind(L_OuterLoop); // Search for 1st 2 characters. 3651 Register addr_diff = tmp4; 3652 subf(addr_diff, addr, last_addr); // Difference between already checked address and last address to check. 3653 addi(addr, addr, h_csize); // This is the new address we want to use for comparing. 
3654 srdi_(ch2, addr_diff, h_csize); 3655 beq(CCR0, L_FinalCheck); // 2 characters left? 3656 mtctr(ch2); // num of characters / 2 3657 bind(L_InnerLoop); // Main work horse (2x unrolled search loop) 3658 if (h_csize == 2) { // Load 2 characters of haystack (ignore alignment). 3659 lwz(ch1, 0, addr); 3660 lwz(ch2, 2, addr); 3661 } else { 3662 lhz(ch1, 0, addr); 3663 lhz(ch2, 1, addr); 3664 } 3665 cmpw(CCR0, ch1, n_start); // Compare 2 characters (1 would be sufficient but try to reduce branches to CompLoop). 3666 cmpw(CCR1, ch2, n_start); 3667 beq(CCR0, L_Comp1); // Did we find the needle start? 3668 beq(CCR1, L_Comp2); 3669 addi(addr, addr, 2 * h_csize); 3670 bdnz(L_InnerLoop); 3671 bind(L_FinalCheck); 3672 andi_(addr_diff, addr_diff, h_csize); // Remaining characters not covered by InnerLoop: (num of characters) & 1. 3673 beq(CCR0, L_NotFound); 3674 if (h_csize == 2) { lwz(ch1, 0, addr); } else { lhz(ch1, 0, addr); } // One position left at which we have to compare. 3675 cmpw(CCR1, ch1, n_start); 3676 beq(CCR1, L_Comp1); 3677 bind(L_NotFound); 3678 li(result, -1); // not found 3679 b(L_End); 3680 3681 // ************************************************************************************************** 3682 // Special Case: unfortunately, the variable needle case can be called with needlecnt<2 3683 // ************************************************************************************************** 3684 if (needlecntval == 0) { // We have to handle these cases separately. 3685 Label L_OneCharLoop; 3686 bind(L_TooShort); 3687 mtctr(haycnt); 3688 if (n_csize == 2) { lhz(n_start, 0, needle); } else { lbz(n_start, 0, needle); } // First character of needle 3689 bind(L_OneCharLoop); 3690 if (h_csize == 2) { lhzu(ch1, 2, addr); } else { lbzu(ch1, 1, addr); } 3691 cmpw(CCR1, ch1, n_start); 3692 beq(CCR1, L_Found); // Did we find the one character needle? 3693 bdnz(L_OneCharLoop); 3694 li(result, -1); // Not found. 3695 b(L_End); 3696 } 3697 3698 // ************************************************************************************************** 3699 // Regular Case Part II: compare rest of needle (first 2 characters have been compared already) 3700 // ************************************************************************************************** 3701 3702 // Compare the rest 3703 bind(L_Comp2); 3704 addi(addr, addr, h_csize); // First comparison has failed, 2nd one hit. 3705 bind(L_Comp1); // Addr points to possible needle start. 3706 if (needlecntval != 2) { // Const needlecnt==2? 3707 if (needlecntval != 3) { 3708 if (needlecntval == 0) { beq(CCR6, L_Found); } // Variable needlecnt==2? 3709 Register n_ind = tmp4, 3710 h_ind = n_ind; 3711 li(n_ind, 2 * n_csize); // First 2 characters are already compared, use index 2. 3712 mtctr(needlecnt); // Decremented by 2, still > 0. 3713 Label L_CompLoop; 3714 bind(L_CompLoop); 3715 if (ae ==StrIntrinsicNode::UL) { 3716 h_ind = ch1; 3717 sldi(h_ind, n_ind, 1); 3718 } 3719 if (n_csize == 2) { lhzx(ch2, needle, n_ind); } else { lbzx(ch2, needle, n_ind); } 3720 if (h_csize == 2) { lhzx(ch1, addr, h_ind); } else { lbzx(ch1, addr, h_ind); } 3721 cmpw(CCR1, ch1, ch2); 3722 bne(CCR1, L_OuterLoop); 3723 addi(n_ind, n_ind, n_csize); 3724 bdnz(L_CompLoop); 3725 } else { // No loop required if there's only one needle character left. 
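// needlecnt == 3 here: the first 2 characters already matched, so compare
// the single remaining character at index 2 directly.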
3726 if (n_csize == 2) { lhz(ch2, 2 * 2, needle); } else { lbz(ch2, 2 * 1, needle); }
3727 if (h_csize == 2) { lhz(ch1, 2 * 2, addr); } else { lbz(ch1, 2 * 1, addr); }
3728 cmpw(CCR1, ch1, ch2);
3729 bne(CCR1, L_OuterLoop);
3730 }
3731 }
3732 // Return index ...
3733 bind(L_Found);
3734 subf(result, haystack, addr); // relative to haystack, ...
3735 if (h_csize == 2) { srdi(result, result, 1); } // in characters.
3736 bind(L_End);
3737 } // string_indexof
3738
3739 void MacroAssembler::string_indexof_char(Register result, Register haystack, Register haycnt,
3740 Register needle, jchar needleChar, Register tmp1, Register tmp2, bool is_byte) {
3741 assert_different_registers(haystack, haycnt, needle, tmp1, tmp2);
3742
3743 Label L_InnerLoop, L_FinalCheck, L_Found1, L_Found2, L_NotFound, L_End;
3744 Register addr = tmp1,
3745 ch1 = tmp2,
3746 ch2 = R0;
3747
3748 const int h_csize = is_byte ? 1 : 2;
3749
3750 //4:
3751 srwi_(tmp2, haycnt, 1); // Shift right by exact_log2(UNROLL_FACTOR).
3752 mr(addr, haystack);
3753 beq(CCR0, L_FinalCheck);
3754 mtctr(tmp2); // Move to count register.
3755 //8:
3756 bind(L_InnerLoop); // Main work horse (2x unrolled search loop).
3757 if (!is_byte) {
3758 lhz(ch1, 0, addr);
3759 lhz(ch2, 2, addr);
3760 } else {
3761 lbz(ch1, 0, addr);
3762 lbz(ch2, 1, addr);
3763 }
3764 (needle != R0) ? cmpw(CCR0, ch1, needle) : cmplwi(CCR0, ch1, (unsigned int)needleChar);
3765 (needle != R0) ? cmpw(CCR1, ch2, needle) : cmplwi(CCR1, ch2, (unsigned int)needleChar);
3766 beq(CCR0, L_Found1); // Did we find the needle?
3767 beq(CCR1, L_Found2);
3768 addi(addr, addr, 2 * h_csize);
3769 bdnz(L_InnerLoop);
3770 //16:
3771 bind(L_FinalCheck);
3772 andi_(R0, haycnt, 1);
3773 beq(CCR0, L_NotFound);
3774 if (!is_byte) { lhz(ch1, 0, addr); } else { lbz(ch1, 0, addr); } // One position left at which we have to compare.
3775 (needle != R0) ? cmpw(CCR1, ch1, needle) : cmplwi(CCR1, ch1, (unsigned int)needleChar);
3776 beq(CCR1, L_Found1);
3777 //21:
3778 bind(L_NotFound);
3779 li(result, -1); // Not found.
3780 b(L_End);
3781
3782 bind(L_Found2);
3783 addi(addr, addr, h_csize);
3784 //24:
3785 bind(L_Found1); // Return index ...
3786 subf(result, haystack, addr); // relative to haystack, ...
3787 if (!is_byte) { srdi(result, result, 1); } // in characters.
3788 bind(L_End);
3789 } // string_indexof_char
3790
3791
3792 void MacroAssembler::has_negatives(Register src, Register cnt, Register result,
3793 Register tmp1, Register tmp2) {
3794 const Register tmp0 = R0;
3795 assert_different_registers(src, result, cnt, tmp0, tmp1, tmp2);
3796 Label Lfastloop, Lslow, Lloop, Lnoneg, Ldone;
3797
3798 // Check if cnt >= 16 (bytes; the fast loop handles 16 bytes per iteration)
3799 lis(tmp1, (int)(short)0x8080); // tmp1 = 0x8080808080808080
3800 srwi_(tmp2, cnt, 4);
3801 li(result, 1); // Assume there's a negative byte.
3802 beq(CCR0, Lslow);
3803 ori(tmp1, tmp1, 0x8080);
3804 rldimi(tmp1, tmp1, 32, 0);
3805 mtctr(tmp2);
3806
3807 // 2x unrolled loop
3808 bind(Lfastloop);
3809 ld(tmp2, 0, src);
3810 ld(tmp0, 8, src);
3811
3812 orr(tmp0, tmp2, tmp0);
3813
3814 and_(tmp0, tmp0, tmp1);
3815 bne(CCR0, Ldone); // Found negative byte.
3816 addi(src, src, 16);
3817
3818 bdnz(Lfastloop);
3819
3820 bind(Lslow); // Fallback to slow version
3821 rldicl_(tmp0, cnt, 0, 64-4);
3822 beq(CCR0, Lnoneg);
3823 mtctr(tmp0);
3824 bind(Lloop);
3825 lbz(tmp0, 0, src);
3826 addi(src, src, 1);
3827 andi_(tmp0, tmp0, 0x80);
3828 bne(CCR0, Ldone); // Found negative byte.
3829 bdnz(Lloop);
3830 bind(Lnoneg);
3831 li(result, 0);
3832
3833 bind(Ldone);
3834 }
3835
3836 #endif // COMPILER2
3837
3838 // Helpers for Intrinsic Emitters
3839 //
3840 // Reverse the byte order of a 32-bit value in a register
3841 // src: 0x44556677
3842 // dst: 0x77665544
3843 // Three steps to obtain the result:
3844 // 1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word
3845 // into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared.
3846 // This value initializes dst.
3847 // 2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost
3848 // byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go.
3849 // This value is mask inserted into dst with a [0..23] mask of 1s.
3850 // 3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position.
3851 // This value is mask inserted into dst with a [8..15] mask of 1s.
3852 void MacroAssembler::load_reverse_32(Register dst, Register src) {
3853 assert_different_registers(dst, src);
3854
3855 rldicl(dst, src, (4+1)*8, 56); // Rotate byte 4 into position 7 (rightmost), clear all to the left.
3856 rlwimi(dst, src, 3*8, 0, 23); // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone.
3857 rlwimi(dst, src, 1*8, 8, 15); // Insert byte 6 into position 5, leave the rest alone.
3858 }
3859
3860 // Calculate the column addresses of the crc32 lookup table into distinct registers.
3861 // This loop-invariant calculation is moved out of the loop body, reducing the loop
3862 // body size from 20 to 16 instructions.
3863 // Returns the offset that was used to calculate the address of column tc3.
3864 // Due to register shortage, setting tc3 may overwrite table. With the return offset
3865 // at hand, the original table address can be easily reconstructed.
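// Typical use (sketch, mirroring the CRC kernels further below): compute the
// column addresses once before the main loop, then undo the clobbering of
// 'table' for the byte-wise tail loop:
//   int off = crc32_table_columns(table, tc0, tc1, tc2, tc3);
//   ... main loop using tc0..tc3 ...
//   if (off != 0) addi(table, table, -off); // restore original table address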
3866 int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) { 3867 3868 #ifdef VM_LITTLE_ENDIAN 3869 // This is what we implement (the DOLIT4 part): 3870 // ========================================================================= */ 3871 // #define DOLIT4 c ^= *buf4++; \ 3872 // c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^ \ 3873 // crc_table[1][(c >> 16) & 0xff] ^ crc_table[0][c >> 24] 3874 // #define DOLIT32 DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4 3875 // ========================================================================= */ 3876 const int ix0 = 3*(4*CRC32_COLUMN_SIZE); 3877 const int ix1 = 2*(4*CRC32_COLUMN_SIZE); 3878 const int ix2 = 1*(4*CRC32_COLUMN_SIZE); 3879 const int ix3 = 0*(4*CRC32_COLUMN_SIZE); 3880 #else 3881 // This is what we implement (the DOBIG4 part): 3882 // ========================================================================= 3883 // #define DOBIG4 c ^= *++buf4; \ 3884 // c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \ 3885 // crc_table[6][(c >> 16) & 0xff] ^ crc_table[7][c >> 24] 3886 // #define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4 3887 // ========================================================================= 3888 const int ix0 = 4*(4*CRC32_COLUMN_SIZE); 3889 const int ix1 = 5*(4*CRC32_COLUMN_SIZE); 3890 const int ix2 = 6*(4*CRC32_COLUMN_SIZE); 3891 const int ix3 = 7*(4*CRC32_COLUMN_SIZE); 3892 #endif 3893 assert_different_registers(table, tc0, tc1, tc2); 3894 assert(table == tc3, "must be!"); 3895 3896 addi(tc0, table, ix0); 3897 addi(tc1, table, ix1); 3898 addi(tc2, table, ix2); 3899 if (ix3 != 0) addi(tc3, table, ix3); 3900 3901 return ix3; 3902 } 3903 3904 /** 3905 * uint32_t crc; 3906 * timesXtoThe32[crc & 0xFF] ^ (crc >> 8); 3907 */ 3908 void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) { 3909 assert_different_registers(crc, table, tmp); 3910 assert_different_registers(val, table); 3911 3912 if (crc == val) { // Must rotate first to use the unmodified value. 3913 rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest. 3914 // As we use a word (4-byte) instruction, we have to adapt the mask bit positions. 3915 srwi(crc, crc, 8); // Unsigned shift, clear leftmost 8 bits. 3916 } else { 3917 srwi(crc, crc, 8); // Unsigned shift, clear leftmost 8 bits. 3918 rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest. 3919 } 3920 lwzx(tmp, table, tmp); 3921 xorr(crc, crc, tmp); 3922 } 3923 3924 /** 3925 * uint32_t crc; 3926 * timesXtoThe32[crc & 0xFF] ^ (crc >> 8); 3927 */ 3928 void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) { 3929 fold_byte_crc32(crc, crc, table, tmp); 3930 } 3931 3932 /** 3933 * Emits code to update CRC-32 with a byte value according to constants in table. 3934 * 3935 * @param [in,out]crc Register containing the crc. 3936 * @param [in]val Register containing the byte to fold into the CRC. 3937 * @param [in]table Register containing the table of crc constants. 
3938 *
3939 * uint32_t crc;
3940 * val = crc_table[(val ^ crc) & 0xFF];
3941 * crc = val ^ (crc >> 8);
3942 */
3943 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
3944 BLOCK_COMMENT("update_byte_crc32:");
3945 xorr(val, val, crc);
3946 fold_byte_crc32(crc, val, table, val);
3947 }
3948
3949 /**
3950 * @param crc register containing existing CRC (32-bit)
3951 * @param buf register pointing to input byte buffer (byte*)
3952 * @param len register containing number of bytes
3953 * @param table register pointing to CRC table
3954 */
3955 void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table,
3956 Register data, bool loopAlignment) {
3957 assert_different_registers(crc, buf, len, table, data);
3958
3959 Label L_mainLoop, L_done;
3960 const int mainLoop_stepping = 1;
3961 const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4;
3962
3963 // Process all bytes in a single-byte loop.
3964 clrldi_(len, len, 32); // Enforce 32 bit. Anything to do?
3965 beq(CCR0, L_done);
3966
3967 mtctr(len);
3968 align(mainLoop_alignment);
3969 BIND(L_mainLoop);
3970 lbz(data, 0, buf); // Byte from buffer, zero-extended.
3971 addi(buf, buf, mainLoop_stepping); // Advance buffer position.
3972 update_byte_crc32(crc, data, table);
3973 bdnz(L_mainLoop); // Iterate.
3974
3975 bind(L_done);
3976 }
3977
3978 /**
3979 * Emits code to update CRC-32 with a 4-byte value according to constants in table.
3980 * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c
3981 */
3982 // A note on the lookup table address(es):
3983 // The lookup table consists of two sets of four columns each.
3984 // The columns {0..3} are used for little-endian machines.
3985 // The columns {4..7} are used for big-endian machines.
3986 // To save the effort of adding the column offset to the table address each time
3987 // a table element is looked up, it is possible to pass the pre-calculated
3988 // column addresses.
3989 // Uses R9..R12 as work registers. Must be saved/restored by caller, if necessary.
3990 void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc,
3991 Register t0, Register t1, Register t2, Register t3,
3992 Register tc0, Register tc1, Register tc2, Register tc3) {
3993 assert_different_registers(crc, t3);
3994
3995 // XOR crc with next four bytes of buffer.
3996 lwz(t3, bufDisp, buf);
3997 if (bufInc != 0) {
3998 addi(buf, buf, bufInc);
3999 }
4000 xorr(t3, t3, crc);
4001
4002 // Chop the combined crc/data word (t3) into 4 single-byte pieces, shifted left 2 bits, to form the table indices.
4003 rlwinm(t0, t3, 2, 24-2, 31-2); // ((t3 >> 0) & 0xff) << 2
4004 rlwinm(t1, t3, 32+(2- 8), 24-2, 31-2); // ((t3 >> 8) & 0xff) << 2
4005 rlwinm(t2, t3, 32+(2-16), 24-2, 31-2); // ((t3 >> 16) & 0xff) << 2
4006 rlwinm(t3, t3, 32+(2-24), 24-2, 31-2); // ((t3 >> 24) & 0xff) << 2
4007
4008 // Use the pre-calculated column addresses.
4009 // Load pre-calculated table values.
4010 lwzx(t0, tc0, t0);
4011 lwzx(t1, tc1, t1);
4012 lwzx(t2, tc2, t2);
4013 lwzx(t3, tc3, t3);
4014
4015 // Calculate new crc from table values.
4016 xorr(t0, t0, t1);
4017 xorr(t2, t2, t3);
4018 xorr(crc, t0, t2); // Now crc contains the final checksum value.
4019 }
4020
4021 /**
4022 * @param crc register containing existing CRC (32-bit)
4023 * @param buf register pointing to input byte buffer (byte*)
4024 * @param len register containing number of bytes
4025 * @param table register pointing to CRC table
4026 *
4027 * Uses R9..R12 as work registers. Must be saved/restored by caller!
4028 */
4029 void MacroAssembler::kernel_crc32_2word(Register crc, Register buf, Register len, Register table,
4030 Register t0, Register t1, Register t2, Register t3,
4031 Register tc0, Register tc1, Register tc2, Register tc3,
4032 bool invertCRC) {
4033 assert_different_registers(crc, buf, len, table);
4034
4035 Label L_mainLoop, L_tail;
4036 Register tmp = t0;
4037 Register data = t0;
4038 Register tmp2 = t1;
4039 const int mainLoop_stepping = 8;
4040 const int tailLoop_stepping = 1;
4041 const int log_stepping = exact_log2(mainLoop_stepping);
4042 const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
4043 const int complexThreshold = 2*mainLoop_stepping;
4044
4045 // Don't test for len <= 0 here. This pathological case should not occur anyway.
4046 // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles
4047 // for all well-behaved cases. The situation itself is detected and handled correctly
4048 // within update_byteLoop_crc32.
4049 assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
4050
4051 BLOCK_COMMENT("kernel_crc32_2word {");
4052
4053 if (invertCRC) {
4054 nand(crc, crc, crc); // 1s complement of crc
4055 }
4056
4057 // Check for short (<mainLoop_stepping) buffer.
4058 cmpdi(CCR0, len, complexThreshold);
4059 blt(CCR0, L_tail);
4060
4061 // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
4062 // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
4063 {
4064 // Align buf addr to mainLoop_stepping boundary.
4065 neg(tmp2, buf); // Calculate # preLoop iterations for alignment.
4066 rldicl(tmp2, tmp2, 0, 64-log_stepping); // Keep only the low log_stepping bits (bytes to the next boundary).
4067
4068 if (complexThreshold > mainLoop_stepping) {
4069 sub(len, len, tmp2); // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4070 } else {
4071 sub(tmp, len, tmp2); // Remaining bytes for main loop.
4072 cmpdi(CCR0, tmp, mainLoop_stepping);
4073 blt(CCR0, L_tail); // For less than one mainloop_stepping left, do only tail processing
4074 mr(len, tmp); // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4075 }
4076 update_byteLoop_crc32(crc, buf, tmp2, table, data, false);
4077 }
4078
4079 srdi(tmp2, len, log_stepping); // #iterations for mainLoop
4080 andi(len, len, mainLoop_stepping-1); // remaining bytes for tailLoop
4081 mtctr(tmp2);
4082
4083 #ifdef VM_LITTLE_ENDIAN
4084 Register crc_rv = crc;
4085 #else
4086 Register crc_rv = tmp; // Load_reverse needs separate registers to work on.
4087 // Occupies tmp, but frees up crc.
4088 load_reverse_32(crc_rv, crc); // Reverse byte order because we are dealing with big-endian data.
4089 tmp = crc;
4090 #endif
4091
4092 int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
4093
4094 align(mainLoop_alignment); // Octoword-aligned loop address. Shows 2% improvement.
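// Main loop: fold 8 bytes (two words) per iteration using the pre-computed
// column addresses tc0..tc3; buf is advanced once, by the second call.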
4095 BIND(L_mainLoop);
4096 update_1word_crc32(crc_rv, buf, table, 0, 0, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
4097 update_1word_crc32(crc_rv, buf, table, 4, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
4098 bdnz(L_mainLoop);
4099
4100 #ifndef VM_LITTLE_ENDIAN
4101 load_reverse_32(crc, crc_rv); // Reverse byte order because we are dealing with big-endian data.
4102 tmp = crc_rv; // Tmp uses its original register again.
4103 #endif
4104
4105 // Restore original table address for tailLoop.
4106 if (reconstructTableOffset != 0) {
4107 addi(table, table, -reconstructTableOffset);
4108 }
4109
4110 // Process last few (<complexThreshold) bytes of buffer.
4111 BIND(L_tail);
4112 update_byteLoop_crc32(crc, buf, len, table, data, false);
4113
4114 if (invertCRC) {
4115 nand(crc, crc, crc); // 1s complement of crc
4116 }
4117 BLOCK_COMMENT("} kernel_crc32_2word");
4118 }
4119
4120 /**
4121 * @param crc register containing existing CRC (32-bit)
4122 * @param buf register pointing to input byte buffer (byte*)
4123 * @param len register containing number of bytes
4124 * @param table register pointing to CRC table
4125 *
4126 * Uses R9..R12 as work registers. Must be saved/restored by caller!
4127 */
4128 void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table,
4129 Register t0, Register t1, Register t2, Register t3,
4130 Register tc0, Register tc1, Register tc2, Register tc3,
4131 bool invertCRC) {
4132 assert_different_registers(crc, buf, len, table);
4133
4134 Label L_mainLoop, L_tail;
4135 Register tmp = t0;
4136 Register data = t0;
4137 Register tmp2 = t1;
4138 const int mainLoop_stepping = 4;
4139 const int tailLoop_stepping = 1;
4140 const int log_stepping = exact_log2(mainLoop_stepping);
4141 const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
4142 const int complexThreshold = 2*mainLoop_stepping;
4143
4144 // Don't test for len <= 0 here. This pathological case should not occur anyway.
4145 // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles
4146 // for all well-behaved cases. The situation itself is detected and handled correctly
4147 // within update_byteLoop_crc32.
4148 assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
4149
4150 BLOCK_COMMENT("kernel_crc32_1word {");
4151
4152 if (invertCRC) {
4153 nand(crc, crc, crc); // 1s complement of crc
4154 }
4155
4156 // Check for short (<mainLoop_stepping) buffer.
4157 cmpdi(CCR0, len, complexThreshold);
4158 blt(CCR0, L_tail);
4159
4160 // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
4161 // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
4162 {
4163 // Align buf addr to mainLoop_stepping boundary.
4164 neg(tmp2, buf); // Calculate # preLoop iterations for alignment.
4165 rldicl(tmp2, tmp2, 0, 64-log_stepping); // Keep only the low log_stepping bits (bytes to the next boundary).
4166
4167 if (complexThreshold > mainLoop_stepping) {
4168 sub(len, len, tmp2); // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4169 } else {
4170 sub(tmp, len, tmp2); // Remaining bytes for main loop.
4171 cmpdi(CCR0, tmp, mainLoop_stepping);
4172 blt(CCR0, L_tail); // For less than one mainloop_stepping left, do only tail processing
4173 mr(len, tmp); // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4174 }
4175 update_byteLoop_crc32(crc, buf, tmp2, table, data, false);
4176 }
4177
4178 srdi(tmp2, len, log_stepping); // #iterations for mainLoop
4179 andi(len, len, mainLoop_stepping-1); // remaining bytes for tailLoop
4180 mtctr(tmp2);
4181
4182 #ifdef VM_LITTLE_ENDIAN
4183 Register crc_rv = crc;
4184 #else
4185 Register crc_rv = tmp; // Load_reverse needs separate registers to work on.
4186 // Occupies tmp, but frees up crc.
4187 load_reverse_32(crc_rv, crc); // Reverse byte order because we are dealing with big-endian data.
4188 tmp = crc;
4189 #endif
4190
4191 int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
4192
4193 align(mainLoop_alignment); // Octoword-aligned loop address. Shows 2% improvement.
4194 BIND(L_mainLoop);
4195 update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
4196 bdnz(L_mainLoop);
4197
4198 #ifndef VM_LITTLE_ENDIAN
4199 load_reverse_32(crc, crc_rv); // Reverse byte order because we are dealing with big-endian data.
4200 tmp = crc_rv; // Tmp uses its original register again.
4201 #endif
4202
4203 // Restore original table address for tailLoop.
4204 if (reconstructTableOffset != 0) {
4205 addi(table, table, -reconstructTableOffset);
4206 }
4207
4208 // Process last few (<complexThreshold) bytes of buffer.
4209 BIND(L_tail);
4210 update_byteLoop_crc32(crc, buf, len, table, data, false);
4211
4212 if (invertCRC) {
4213 nand(crc, crc, crc); // 1s complement of crc
4214 }
4215 BLOCK_COMMENT("} kernel_crc32_1word");
4216 }
4217
4218 /**
4219 * @param crc register containing existing CRC (32-bit)
4220 * @param buf register pointing to input byte buffer (byte*)
4221 * @param len register containing number of bytes
4222 * @param table register pointing to CRC table
4223 *
4224 * Uses R7_ARG5, R8_ARG6 as work registers.
4225 */
4226 void MacroAssembler::kernel_crc32_1byte(Register crc, Register buf, Register len, Register table,
4227 Register t0, Register t1, Register t2, Register t3,
4228 bool invertCRC) {
4229 assert_different_registers(crc, buf, len, table);
4230
4231 Register data = t0; // Holds the current byte to be folded into crc.
4232
4233 BLOCK_COMMENT("kernel_crc32_1byte {");
4234
4235 if (invertCRC) {
4236 nand(crc, crc, crc); // 1s complement of crc
4237 }
4238
4239 // Process all bytes in a single-byte loop.
4240 update_byteLoop_crc32(crc, buf, len, table, data, true);
4241
4242 if (invertCRC) {
4243 nand(crc, crc, crc); // 1s complement of crc
4244 }
4245 BLOCK_COMMENT("} kernel_crc32_1byte");
4246 }
4247
4248 /**
4249 * @param crc register containing existing CRC (32-bit)
4250 * @param buf register pointing to input byte buffer (byte*)
4251 * @param len register containing number of bytes
4252 * @param table register pointing to CRC table
4253 * @param constants register pointing to CRC table for 128-bit aligned memory
4254 * @param barretConstants register pointing to table for Barrett reduction
4255 * @param t0-t4 temp registers
4256 */
4257 void MacroAssembler::kernel_crc32_1word_vpmsum(Register crc, Register buf, Register len, Register table,
4258 Register constants, Register barretConstants,
4259 Register t0, Register t1, Register t2, Register t3, Register t4,
4260 bool invertCRC) {
4261 assert_different_registers(crc, buf, len, table);
4262
4263 Label L_alignedHead, L_tail;
4264
4265 BLOCK_COMMENT("kernel_crc32_1word_vpmsum {");
4266
4267 // 1. ~c
4268 if (invertCRC) {
4269 nand(crc, crc, crc); // 1s complement of crc
4270 }
4271
4272 // 2. use kernel_crc32_1word for short len
4273 clrldi(len, len, 32);
4274 cmpdi(CCR0, len, 512);
4275 blt(CCR0, L_tail);
4276
4277 // 3. calculate from 0 to first aligned address
4278 const int alignment = 16;
4279 Register prealign = t0;
4280
4281 andi_(prealign, buf, alignment - 1);
4282 beq(CCR0, L_alignedHead);
4283 subfic(prealign, prealign, alignment);
4284
4285 subf(len, prealign, len);
4286 update_byteLoop_crc32(crc, buf, prealign, table, t2, false);
4287
4288 // 4. calculate from first aligned address as far as possible
4289 BIND(L_alignedHead);
4290 kernel_crc32_1word_aligned(crc, buf, len, constants, barretConstants, t0, t1, t2, t3, t4);
4291
4292 // 5. remaining bytes
4293 BIND(L_tail);
4294 Register tc0 = t4;
4295 Register tc1 = constants;
4296 Register tc2 = barretConstants;
4297 kernel_crc32_1word(crc, buf, len, table, t0, t1, t2, t3, tc0, tc1, tc2, table, false);
4298
4299 // 6. ~c
4300 if (invertCRC) {
4301 nand(crc, crc, crc); // 1s complement of crc
4302 }
4303
4304 BLOCK_COMMENT("} kernel_crc32_1word_vpmsum");
4305 }
4306
4307 /**
4308 * @param crc register containing existing CRC (32-bit)
4309 * @param buf register pointing to input byte buffer (byte*)
4310 * @param len register containing number of bytes (will get updated to remaining bytes)
4311 * @param constants register pointing to CRC table for 128-bit aligned memory
4312 * @param barretConstants register pointing to table for Barrett reduction
4313 * @param t0-t4 temp registers
4314 * Precondition: len should be >= 512. Otherwise, nothing will be done.
4315 */
4316 void MacroAssembler::kernel_crc32_1word_aligned(Register crc, Register buf, Register len,
4317 Register constants, Register barretConstants,
4318 Register t0, Register t1, Register t2, Register t3, Register t4) {
4319
4320 // Save non-volatile vector registers (frameless).
4321 Register offset = t1;
4322 int offsetInt = 0;
4323 offsetInt -= 16; li(offset, offsetInt); stvx(VR20, offset, R1_SP);
4324 offsetInt -= 16; li(offset, offsetInt); stvx(VR21, offset, R1_SP);
4325 offsetInt -= 16; li(offset, offsetInt); stvx(VR22, offset, R1_SP);
4326 offsetInt -= 16; li(offset, offsetInt); stvx(VR23, offset, R1_SP);
4327 offsetInt -= 16; li(offset, offsetInt); stvx(VR24, offset, R1_SP);
4328 offsetInt -= 16; li(offset, offsetInt); stvx(VR25, offset, R1_SP);
4329 #ifndef VM_LITTLE_ENDIAN
4330 offsetInt -= 16; li(offset, offsetInt); stvx(VR26, offset, R1_SP);
4331 #endif
4332 offsetInt -= 8; std(R14, offsetInt, R1_SP);
4333 offsetInt -= 8; std(R15, offsetInt, R1_SP);
4334 offsetInt -= 8; std(R16, offsetInt, R1_SP);
4335 offsetInt -= 8; std(R17, offsetInt, R1_SP);
4336
4337 // Implementation uses an inner loop which processes between 256 and 16 * unroll_factor
4338 // bytes per iteration. The basic scheme is:
4339 // lvx: load vector (Big Endian needs reversal)
4340 // vpmsumw: carry-less 32 bit multiplications with constant representing a large CRC shift
4341 // vxor: xor partial results together to get unroll_factor2 vectors
4342
4343 // Outer loop performs the CRC shifts needed to combine the unroll_factor2 vectors.
4344
4345 // Using 16 * unroll_factor / unroll_factor2 bytes for constants.
4346 const int unroll_factor = 2048;
4347 const int unroll_factor2 = 8;
4348
4349 // Support registers.
4350 Register offs[] = { noreg, t0, t1, t2, t3, t4, crc /* will live in VCRC */, R14 };
4351 Register num_bytes = R15,
4352 loop_count = R16,
4353 cur_const = R17;
4354 // Constant array for outer loop: unroll_factor2 - 1 registers,
4355 // Constant array for inner loop: unroll_factor / unroll_factor2 registers.
4356 VectorRegister consts0[] = { VR16, VR17, VR18, VR19, VR20, VR21, VR22 },
4357 consts1[] = { VR23, VR24 };
4358 // Data register arrays: 2 arrays with unroll_factor2 registers.
4359 VectorRegister data0[] = { VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7 },
4360 data1[] = { VR8, VR9, VR10, VR11, VR12, VR13, VR14, VR15 };
4361
4362 VectorRegister VCRC = data0[0];
4363 VectorRegister Vc = VR25;
4364 VectorRegister swap_bytes = VR26; // Only for Big Endian.
4365
4366 // We have at least 1 iteration (ensured by caller).
4367 Label L_outer_loop, L_inner_loop, L_last;
4368
4369 // If supported, set DSCR pre-fetch to deepest.
4370 if (VM_Version::has_mfdscr()) {
4371 load_const_optimized(t0, VM_Version::_dscr_val | 7);
4372 mtdscr(t0);
4373 }
4374
4375 mtvrwz(VCRC, crc); // crc now lives in VCRC.
4376
4377 for (int i = 1; i < unroll_factor2; ++i) {
4378 li(offs[i], 16 * i);
4379 }
4380
4381 // Load consts for outer loop
4382 lvx(consts0[0], constants);
4383 for (int i = 1; i < unroll_factor2 - 1; ++i) {
4384 lvx(consts0[i], offs[i], constants);
4385 }
4386 addi(constants, constants, (unroll_factor2 - 1) * 16);
4387
4388 load_const_optimized(num_bytes, 16 * unroll_factor);
4389 load_const_optimized(loop_count, unroll_factor / (2 * unroll_factor2) - 1); // One double-iteration peeled off.
4390
4391 // Reuse data registers outside of the loop.
4392 VectorRegister Vtmp = data1[0];
4393 VectorRegister Vtmp2 = data1[1];
4394 VectorRegister zeroes = data1[2];
4395
4396 vspltisb(Vtmp, 0);
4397 vsldoi(VCRC, Vtmp, VCRC, 8); // 96 bit zeroes, 32 bit CRC.
4398
4399 // Load vector for vpermxor (to xor both 64 bit parts together)
4400 lvsl(Vtmp, buf); // 000102030405060708090a0b0c0d0e0f
4401 vspltisb(Vc, 4);
4402 vsl(Vc, Vtmp, Vc); // 00102030405060708090a0b0c0d0e0f0
4403 xxspltd(Vc->to_vsr(), Vc->to_vsr(), 0);
4404 vor(Vc, Vtmp, Vc); // 001122334455667708192a3b4c5d6e7f
4405
4406 #ifdef VM_LITTLE_ENDIAN
4407 #define BE_swap_bytes(x)
4408 #else
4409 vspltisb(Vtmp2, 0xf);
4410 vxor(swap_bytes, Vtmp, Vtmp2);
4411 #define BE_swap_bytes(x) vperm(x, x, x, swap_bytes)
4412 #endif
4413
4414 cmpd(CCR0, len, num_bytes);
4415 blt(CCR0, L_last);
4416
4417 // ********** Main loop start **********
4418 align(32);
4419 bind(L_outer_loop);
4420
4421 // Begin of unrolled first iteration (no xor).
4422 lvx(data1[0], buf);
4423 mr(cur_const, constants);
4424 for (int i = 1; i < unroll_factor2 / 2; ++i) {
4425 lvx(data1[i], offs[i], buf);
4426 }
4427 vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
4428 lvx(consts1[0], cur_const);
4429 mtctr(loop_count);
4430 for (int i = 0; i < unroll_factor2 / 2; ++i) {
4431 BE_swap_bytes(data1[i]);
4432 if (i == 0) { vxor(data1[0], data1[0], VCRC); } // xor in previous CRC.
4433 lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf);
4434 vpmsumw(data0[i], data1[i], consts1[0]);
4435 }
4436 addi(buf, buf, 16 * unroll_factor2);
4437 subf(len, num_bytes, len);
4438 lvx(consts1[1], offs[1], cur_const);
4439 addi(cur_const, cur_const, 32);
4440 // Begin of unrolled second iteration (head).
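// The second iteration multiplies into data1 with consts1[1] while data0
// still holds the first iteration's partial products (modulo-scheduled).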
4441 for (int i = 0; i < unroll_factor2 / 2; ++i) { 4442 BE_swap_bytes(data1[i + unroll_factor2 / 2]); 4443 if (i == 0) { lvx(data1[0], buf); } else { lvx(data1[i], offs[i], buf); } 4444 vpmsumw(data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[0]); 4445 } 4446 for (int i = 0; i < unroll_factor2 / 2; ++i) { 4447 BE_swap_bytes(data1[i]); 4448 lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf); 4449 vpmsumw(data1[i], data1[i], consts1[1]); 4450 } 4451 addi(buf, buf, 16 * unroll_factor2); 4452 4453 // Generate most performance relevant code. Loads + half of the vpmsumw have been generated. 4454 // Double-iteration allows using the 2 constant registers alternatingly. 4455 align(32); 4456 bind(L_inner_loop); 4457 for (int j = 1; j < 3; ++j) { // j < unroll_factor / unroll_factor2 - 1 for complete unrolling. 4458 if (j & 1) { 4459 lvx(consts1[0], cur_const); 4460 } else { 4461 lvx(consts1[1], offs[1], cur_const); 4462 addi(cur_const, cur_const, 32); 4463 } 4464 for (int i = 0; i < unroll_factor2; ++i) { 4465 int idx = i + unroll_factor2 / 2, inc = 0; // For modulo-scheduled input. 4466 if (idx >= unroll_factor2) { idx -= unroll_factor2; inc = 1; } 4467 BE_swap_bytes(data1[idx]); 4468 vxor(data0[i], data0[i], data1[i]); 4469 if (i == 0) lvx(data1[0], buf); else lvx(data1[i], offs[i], buf); 4470 vpmsumw(data1[idx], data1[idx], consts1[(j + inc) & 1]); 4471 } 4472 addi(buf, buf, 16 * unroll_factor2); 4473 } 4474 bdnz(L_inner_loop); 4475 4476 // Tail of last iteration (no loads). 4477 for (int i = 0; i < unroll_factor2 / 2; ++i) { 4478 BE_swap_bytes(data1[i + unroll_factor2 / 2]); 4479 vxor(data0[i], data0[i], data1[i]); 4480 vpmsumw(data1[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[1]); 4481 } 4482 for (int i = 0; i < unroll_factor2 / 2; ++i) { 4483 vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]); // First half of fixup shifts. 4484 vxor(data0[i + unroll_factor2 / 2], data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2]); 4485 } 4486 4487 // Last data register is ok, other ones need fixup shift. 4488 for (int i = unroll_factor2 / 2; i < unroll_factor2 - 1; ++i) { 4489 vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]); 4490 } 4491 4492 // Combine to 128 bit result vector VCRC = data0[0]. 4493 for (int i = 1; i < unroll_factor2; i<<=1) { 4494 for (int j = 0; j <= unroll_factor2 - 2*i; j+=2*i) { 4495 vxor(data0[j], data0[j], data0[j+i]); 4496 } 4497 } 4498 cmpd(CCR0, len, num_bytes); 4499 bge(CCR0, L_outer_loop); 4500 4501 // Last chance with lower num_bytes. 4502 bind(L_last); 4503 srdi(loop_count, len, exact_log2(16 * 2 * unroll_factor2)); // Use double-iterations. 4504 add_const_optimized(constants, constants, 16 * (unroll_factor / unroll_factor2)); // Point behind last one. 4505 sldi(R0, loop_count, exact_log2(16 * 2)); // Bytes of constants to be used. 4506 clrrdi(num_bytes, len, exact_log2(16 * 2 * unroll_factor2)); 4507 subf(constants, R0, constants); // Point to constant to be used first. 4508 4509 addic_(loop_count, loop_count, -1); // One double-iteration peeled off. 4510 bgt(CCR0, L_outer_loop); 4511 // ********** Main loop end ********** 4512 #undef BE_swap_bytes 4513 4514 // Restore DSCR pre-fetch value. 4515 if (VM_Version::has_mfdscr()) { 4516 load_const_optimized(t0, VM_Version::_dscr_val); 4517 mtdscr(t0); 4518 } 4519 4520 vspltisb(zeroes, 0); 4521 4522 // Combine to 64 bit result. 4523 vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result. 
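// Barrett reduction, sketched in carry-less (GF(2)) polynomial arithmetic
// (illustrative only; the actual constants come from the barretConstants table):
//   q   = ((crc >> 32) * inv_poly) >> 32;  // estimate the quotient
//   crc = crc ^ (q * poly);                // subtract q*poly; remainder < 2^32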
4524 4525 // Reduce to 32 bit CRC: Remainder by multiply-high. 4526 lvx(Vtmp, barretConstants); 4527 vsldoi(Vtmp2, zeroes, VCRC, 12); // Extract high 32 bit. 4528 vpmsumd(Vtmp2, Vtmp2, Vtmp); // Multiply by inverse long poly. 4529 vsldoi(Vtmp2, zeroes, Vtmp2, 12); // Extract high 32 bit. 4530 vsldoi(Vtmp, zeroes, Vtmp, 8); 4531 vpmsumd(Vtmp2, Vtmp2, Vtmp); // Multiply quotient by long poly. 4532 vxor(VCRC, VCRC, Vtmp2); // Remainder fits into 32 bit. 4533 4534 // Move result. len is already updated. 4535 vsldoi(VCRC, VCRC, zeroes, 8); 4536 mfvrd(crc, VCRC); 4537 4538 // Restore non-volatile Vector registers (frameless). 4539 offsetInt = 0; 4540 offsetInt -= 16; li(offset, offsetInt); lvx(VR20, offset, R1_SP); 4541 offsetInt -= 16; li(offset, offsetInt); lvx(VR21, offset, R1_SP); 4542 offsetInt -= 16; li(offset, offsetInt); lvx(VR22, offset, R1_SP); 4543 offsetInt -= 16; li(offset, offsetInt); lvx(VR23, offset, R1_SP); 4544 offsetInt -= 16; li(offset, offsetInt); lvx(VR24, offset, R1_SP); 4545 offsetInt -= 16; li(offset, offsetInt); lvx(VR25, offset, R1_SP); 4546 #ifndef VM_LITTLE_ENDIAN 4547 offsetInt -= 16; li(offset, offsetInt); lvx(VR26, offset, R1_SP); 4548 #endif 4549 offsetInt -= 8; ld(R14, offsetInt, R1_SP); 4550 offsetInt -= 8; ld(R15, offsetInt, R1_SP); 4551 offsetInt -= 8; ld(R16, offsetInt, R1_SP); 4552 offsetInt -= 8; ld(R17, offsetInt, R1_SP); 4553 } 4554 4555 void MacroAssembler::kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp, bool invertCRC) { 4556 assert_different_registers(crc, buf, /* len, not used!! */ table, tmp); 4557 4558 BLOCK_COMMENT("kernel_crc32_singleByte:"); 4559 if (invertCRC) { 4560 nand(crc, crc, crc); // 1s complement of crc 4561 } 4562 4563 lbz(tmp, 0, buf); // Byte from buffer, zero-extended. 4564 update_byte_crc32(crc, tmp, table); 4565 4566 if (invertCRC) { 4567 nand(crc, crc, crc); // 1s complement of crc 4568 } 4569 } 4570 4571 void MacroAssembler::kernel_crc32_singleByteReg(Register crc, Register val, Register table, bool invertCRC) { 4572 assert_different_registers(crc, val, table); 4573 4574 BLOCK_COMMENT("kernel_crc32_singleByteReg:"); 4575 if (invertCRC) { 4576 nand(crc, crc, crc); // 1s complement of crc 4577 } 4578 4579 update_byte_crc32(crc, val, table); 4580 4581 if (invertCRC) { 4582 nand(crc, crc, crc); // 1s complement of crc 4583 } 4584 } 4585 4586 // dest_lo += src1 + src2 4587 // dest_hi += carry1 + carry2 4588 void MacroAssembler::add2_with_carry(Register dest_hi, 4589 Register dest_lo, 4590 Register src1, Register src2) { 4591 li(R0, 0); 4592 addc(dest_lo, dest_lo, src1); 4593 adde(dest_hi, dest_hi, R0); 4594 addc(dest_lo, dest_lo, src2); 4595 adde(dest_hi, dest_hi, R0); 4596 } 4597 4598 // Multiply 64 bit by 64 bit first loop. 4599 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, 4600 Register x_xstart, 4601 Register y, Register y_idx, 4602 Register z, 4603 Register carry, 4604 Register product_high, Register product, 4605 Register idx, Register kdx, 4606 Register tmp) { 4607 // jlong carry, x[], y[], z[]; 4608 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) { 4609 // huge_128 product = y[idx] * x[xstart] + carry; 4610 // z[kdx] = (jlong)product; 4611 // carry = (jlong)(product >>> 64); 4612 // } 4613 // z[xstart] = carry; 4614 4615 Label L_first_loop, L_first_loop_exit; 4616 Label L_one_x, L_one_y, L_multiply; 4617 4618 addic_(xstart, xstart, -1); 4619 blt(CCR0, L_one_x); // Special case: length of x is 1. 

  // Load next two integers of x.
  sldi(tmp, xstart, LogBytesPerInt);
  ldx(x_xstart, x, tmp);
#ifdef VM_LITTLE_ENDIAN
  rldicl(x_xstart, x_xstart, 32, 0);
#endif

  align(32, 16);
  bind(L_first_loop);

  cmpdi(CCR0, idx, 1);
  blt(CCR0, L_first_loop_exit);
  addi(idx, idx, -2);
  beq(CCR0, L_one_y);

  // Load next two integers of y.
  sldi(tmp, idx, LogBytesPerInt);
  ldx(y_idx, y, tmp);
#ifdef VM_LITTLE_ENDIAN
  rldicl(y_idx, y_idx, 32, 0);
#endif

  bind(L_multiply);
  multiply64(product_high, product, x_xstart, y_idx);

  li(tmp, 0);
  addc(product, product, carry);         // Add carry to result.
  adde(product_high, product_high, tmp); // Add carry of the last addition.
  addi(kdx, kdx, -2);

  // Store result.
#ifdef VM_LITTLE_ENDIAN
  rldicl(product, product, 32, 0);
#endif
  sldi(tmp, kdx, LogBytesPerInt);
  stdx(product, z, tmp);
  mr_if_needed(carry, product_high);
  b(L_first_loop);

  bind(L_one_y); // Load one 32 bit portion of y as (0,value).
  lwz(y_idx, 0, y);
  b(L_multiply);

  bind(L_one_x); // Load one 32 bit portion of x as (0,value).
  lwz(x_xstart, 0, x);
  b(L_first_loop);

  bind(L_first_loop_exit);
}

// Multiply 64 bit by 64 bit and add 128 bit.
void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y,
                                            Register z, Register yz_idx,
                                            Register idx, Register carry,
                                            Register product_high, Register product,
                                            Register tmp, int offset) {

  // huge_128 product = (y[idx] * x_xstart) + z[idx] + carry;
  // z[idx] = (jlong)product;

  sldi(tmp, idx, LogBytesPerInt);
  if (offset) {
    addi(tmp, tmp, offset);
  }
  ldx(yz_idx, y, tmp);
#ifdef VM_LITTLE_ENDIAN
  rldicl(yz_idx, yz_idx, 32, 0);
#endif

  multiply64(product_high, product, x_xstart, yz_idx);
  ldx(yz_idx, z, tmp);
#ifdef VM_LITTLE_ENDIAN
  rldicl(yz_idx, yz_idx, 32, 0);
#endif

  add2_with_carry(product_high, product, carry, yz_idx);

  sldi(tmp, idx, LogBytesPerInt);
  if (offset) {
    addi(tmp, tmp, offset);
  }
#ifdef VM_LITTLE_ENDIAN
  rldicl(product, product, 32, 0);
#endif
  stdx(product, z, tmp);
}

// Multiply 128 bit by 128 bit. Unrolled inner loop.
void MacroAssembler::multiply_128_x_128_loop(Register x_xstart,
                                             Register y, Register z,
                                             Register yz_idx, Register idx, Register carry,
                                             Register product_high, Register product,
                                             Register carry2, Register tmp) {

  // jlong carry, x[], y[], z[];
  // int kdx = ystart+1;
  // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
  //   huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
  //   z[kdx+idx+1] = (jlong)product;
  //   jlong carry2 = (jlong)(product >>> 64);
  //   product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
  //   z[kdx+idx] = (jlong)product;
  //   carry = (jlong)(product >>> 64);
  // }
  // idx += 2;
  // if (idx > 0) {
  //   product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
  //   z[kdx+idx] = (jlong)product;
  //   carry = (jlong)(product >>> 64);
  // }

  Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
  const Register jdx = R0;

  // Scale the index.
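  // Each pass of the unrolled loop below consumes four 32-bit limbs
  // (two 64x64-bit multiply-add steps), hence jdx = idx / 4.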
  srdi_(jdx, idx, 2);
  beq(CCR0, L_third_loop_exit);
  mtctr(jdx);

  align(32, 16);
  bind(L_third_loop);

  addi(idx, idx, -4);

  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8);
  mr_if_needed(carry2, product_high);

  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0);
  mr_if_needed(carry, product_high);
  bdnz(L_third_loop);

  bind(L_third_loop_exit); // Handle any left-over operand parts.

  andi_(idx, idx, 0x3);
  beq(CCR0, L_post_third_loop_done);

  Label L_check_1;

  addic_(idx, idx, -2);
  blt(CCR0, L_check_1);

  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0);
  mr_if_needed(carry, product_high);

  bind(L_check_1);

  addi(idx, idx, 0x2);
  andi_(idx, idx, 0x1);
  addic_(idx, idx, -1);
  blt(CCR0, L_post_third_loop_done);

  sldi(tmp, idx, LogBytesPerInt);
  lwzx(yz_idx, y, tmp);
  multiply64(product_high, product, x_xstart, yz_idx);
  lwzx(yz_idx, z, tmp);

  add2_with_carry(product_high, product, yz_idx, carry);

  sldi(tmp, idx, LogBytesPerInt);
  stwx(product, z, tmp);
  srdi(product, product, 32);

  sldi(product_high, product_high, 32);
  orr(product, product, product_high);
  mr_if_needed(carry, product);

  bind(L_post_third_loop_done);
} // multiply_128_x_128_loop

// out[] += in[] * k over 32-bit limbs; 'carry' propagates the high word
// from limb to limb.
void MacroAssembler::muladd(Register out, Register in,
                            Register offset, Register len, Register k,
                            Register tmp1, Register tmp2, Register carry) {

  // Labels
  Label LOOP, SKIP;

  // Make sure length is positive.
  cmpdi(CCR0, len, 0);

  // Prepare variables
  subi(offset, offset, 4);
  li(carry, 0);
  ble(CCR0, SKIP);

  mtctr(len);
  subi(len, len, 1);
  sldi(len, len, 2);

  // Main loop
  bind(LOOP);
  lwzx(tmp1, len, in);     // in[len]
  lwzx(tmp2, offset, out); // out[offset]
  mulld(tmp1, tmp1, k);    // in[len] * k
  add(tmp2, carry, tmp2);  // + carry
  add(tmp2, tmp1, tmp2);   // + out[offset]
  stwx(tmp2, offset, out); // Store low 32 bits.
  srdi(carry, tmp2, 32);   // carry = high 32 bits.
  subi(offset, offset, 4);
  subi(len, len, 4);
  bdnz(LOOP);
  bind(SKIP);
}

void MacroAssembler::multiply_to_len(Register x, Register xlen,
                                     Register y, Register ylen,
                                     Register z, Register zlen,
                                     Register tmp1, Register tmp2,
                                     Register tmp3, Register tmp4,
                                     Register tmp5, Register tmp6,
                                     Register tmp7, Register tmp8,
                                     Register tmp9, Register tmp10,
                                     Register tmp11, Register tmp12,
                                     Register tmp13) {

  ShortBranchVerifier sbv(this);

  assert_different_registers(x, xlen, y, ylen, z, zlen,
                             tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
  assert_different_registers(x, xlen, y, ylen, z, zlen,
                             tmp1, tmp2, tmp3, tmp4, tmp5, tmp7);
  assert_different_registers(x, xlen, y, ylen, z, zlen,
                             tmp1, tmp2, tmp3, tmp4, tmp5, tmp8);

  const Register idx = tmp1;
  const Register kdx = tmp2;
  const Register xstart = tmp3;

  const Register y_idx = tmp4;
  const Register carry = tmp5;
  const Register product = tmp6;
  const Register product_high = tmp7;
  const Register x_xstart = tmp8;
  const Register tmp = tmp9;

  // First Loop.
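  // The Java sketch below mirrors the first loop of BigInteger.multiplyToLen;
  // the generated code processes two jints per iteration using the 64x64-bit
  // multiply in multiply_64_x_64_loop above.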
  //
  // final static long LONG_MASK = 0xffffffffL;
  // int xstart = xlen - 1;
  // int ystart = ylen - 1;
  // long carry = 0;
  // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //   long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
  //   z[kdx] = (int)product;
  //   carry = product >>> 32;
  // }
  // z[xstart] = (int)carry;

  mr_if_needed(idx, ylen); // idx = ylen
  mr_if_needed(kdx, zlen); // kdx = xlen + ylen
  li(carry, 0);            // carry = 0

  Label L_done;

  addic_(xstart, xlen, -1);
  blt(CCR0, L_done);

  multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z,
                        carry, product_high, product, idx, kdx, tmp);

  Label L_second_loop;

  cmpdi(CCR0, kdx, 0);
  beq(CCR0, L_second_loop);

  Label L_carry;

  addic_(kdx, kdx, -1);
  beq(CCR0, L_carry);

  // Store lower 32 bits of carry.
  sldi(tmp, kdx, LogBytesPerInt);
  stwx(carry, z, tmp);
  srdi(carry, carry, 32);
  addi(kdx, kdx, -1);

  bind(L_carry);

  // Store upper 32 bits of carry.
  sldi(tmp, kdx, LogBytesPerInt);
  stwx(carry, z, tmp);

  // Second and third (nested) loops.
  //
  // for (int i = xstart-1; i >= 0; i--) { // Second loop
  //   carry = 0;
  //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
  //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
  //                    (z[k] & LONG_MASK) + carry;
  //     z[k] = (int)product;
  //     carry = product >>> 32;
  //   }
  //   z[i] = (int)carry;
  // }
  //
  // i = xstart, jdx = tmp1 (idx), k = tmp2 (kdx), carry = tmp5, x[i] = x_xstart

  bind(L_second_loop);

  li(carry, 0); // carry = 0;

  addic_(xstart, xstart, -1); // i = xstart-1;
  blt(CCR0, L_done);

  Register zsave = tmp10;

  mr(zsave, z);

  Label L_last_x;

  sldi(tmp, xstart, LogBytesPerInt);
  add(z, z, tmp); // z = z + k - j
  addi(z, z, 4);
  addic_(xstart, xstart, -1); // i = xstart-1;
  blt(CCR0, L_last_x);

  sldi(tmp, xstart, LogBytesPerInt);
  ldx(x_xstart, x, tmp);
#ifdef VM_LITTLE_ENDIAN
  rldicl(x_xstart, x_xstart, 32, 0);
#endif

  Label L_third_loop_prologue;

  bind(L_third_loop_prologue);

  Register xsave = tmp11;
  Register xlensave = tmp12;
  Register ylensave = tmp13;

  mr(xsave, x);
  mr(xlensave, xstart);
  mr(ylensave, ylen);

  multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen,
                          carry, product_high, product, x, tmp);

  mr(z, zsave);
  mr(x, xsave);
  mr(xlen, xlensave); // This is the decrement of the loop counter!
  mr(ylen, ylensave);

  addi(tmp3, xlen, 1);
  sldi(tmp, tmp3, LogBytesPerInt);
  stwx(carry, z, tmp);
  addic_(tmp3, tmp3, -1);
  blt(CCR0, L_done);

  srdi(carry, carry, 32);
  sldi(tmp, tmp3, LogBytesPerInt);
  stwx(carry, z, tmp);
  b(L_second_loop);

  // Next infrequent code is moved outside loops.
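  // A single remaining 32-bit limb of x is loaded zero-extended, so the
  // 64-bit multiply path above can treat it as (0, value).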
  bind(L_last_x);

  lwz(x_xstart, 0, x);
  b(L_third_loop_prologue);

  bind(L_done);
} // multiply_to_len

void MacroAssembler::asm_assert(bool check_equal, const char *msg, int id) {
#ifdef ASSERT
  Label ok;
  if (check_equal) {
    beq(CCR0, ok);
  } else {
    bne(CCR0, ok);
  }
  stop(msg, id);
  bind(ok);
#endif
}

void MacroAssembler::asm_assert_mems_zero(bool check_equal, int size, int mem_offset,
                                          Register mem_base, const char* msg, int id) {
#ifdef ASSERT
  switch (size) {
    case 4:
      lwz(R0, mem_offset, mem_base);
      cmpwi(CCR0, R0, 0);
      break;
    case 8:
      ld(R0, mem_offset, mem_base);
      cmpdi(CCR0, R0, 0);
      break;
    default:
      ShouldNotReachHere();
  }
  asm_assert(check_equal, msg, id);
#endif // ASSERT
}

void MacroAssembler::verify_thread() {
  if (VerifyThread) {
    unimplemented("'VerifyThread' currently not implemented on PPC");
  }
}

// Reads oop; kills R0 and possibly the volatile float registers.
void MacroAssembler::verify_oop(Register oop, const char* msg) {
  if (!VerifyOops) {
    return;
  }

  address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
  const Register tmp = R11; // Will be preserved.
  const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
  save_volatile_gprs(R1_SP, -nbytes_save); // except R0

  mr_if_needed(R4_ARG2, oop);
  save_LR_CR(tmp); // save in old frame
  push_frame_reg_args(nbytes_save, tmp);
  // load FunctionDescriptor** / entry_address *
  load_const_optimized(tmp, fd, R0);
  // load FunctionDescriptor* / entry_address
  ld(tmp, 0, tmp);
  load_const_optimized(R3_ARG1, (address)msg, R0);
  // Call destination for its side effect.
  call_c(tmp);

  pop_frame();
  restore_LR_CR(tmp);
  restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
}

void MacroAssembler::verify_oop_addr(RegisterOrConstant offs, Register base, const char* msg) {
  if (!VerifyOops) {
    return;
  }

  address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
  const Register tmp = R11; // Will be preserved.
  const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
  save_volatile_gprs(R1_SP, -nbytes_save); // except R0

  ld(R4_ARG2, offs, base);
  save_LR_CR(tmp); // save in old frame
  push_frame_reg_args(nbytes_save, tmp);
  // load FunctionDescriptor** / entry_address *
  load_const_optimized(tmp, fd, R0);
  // load FunctionDescriptor* / entry_address
  ld(tmp, 0, tmp);
  load_const_optimized(R3_ARG1, (address)msg, R0);
  // Call destination for its side effect.
  call_c(tmp);

  pop_frame();
  restore_LR_CR(tmp);
  restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
}

const char* stop_types[] = {
  "stop",
  "untested",
  "unimplemented",
  "shouldnotreachhere"
};

static void stop_on_request(int tp, const char* msg) {
  tty->print("PPC assembly code requires stop: (%s) %s\n", stop_types[tp % /*stop_end*/ 4], msg);
  guarantee(false, "PPC assembly code requires stop: %s", msg);
}

// Call a C-function that prints output.
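// The illtrap() below serves as a backstop in case stop_on_request() ever
// returns; the stop id is emitted as a data word directly behind it, so the
// site can be identified in the instruction stream.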
void MacroAssembler::stop(int type, const char* msg, int id) {
#ifndef PRODUCT
  block_comment(err_msg("stop: %s %s {", stop_types[type % stop_end], msg));
#else
  block_comment("stop {");
#endif

  // setup arguments
  load_const_optimized(R3_ARG1, type);
  load_const_optimized(R4_ARG2, (void *)msg, /*tmp=*/R0);
  call_VM_leaf(CAST_FROM_FN_PTR(address, stop_on_request), R3_ARG1, R4_ARG2);
  illtrap();
  emit_int32(id);
  block_comment("} stop;");
}

#ifndef PRODUCT
// Write pattern 0x0101010101010101 in memory region [low-before, high+after].
// Val, addr are temp registers.
// If low == addr, addr is killed.
// High is preserved.
void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) {
  if (!ZapMemory) return;

  assert_different_registers(low, val);

  BLOCK_COMMENT("zap memory region {");
  load_const_optimized(val, 0x0101010101010101);
  int size = before + after;
  if (low == high && size < 5 && size > 0) {
    int offset = -before * BytesPerWord;
    for (int i = 0; i < size; ++i) {
      std(val, offset, low);
      offset += BytesPerWord;
    }
  } else {
    addi(addr, low, -before * BytesPerWord);
    assert_different_registers(high, val);
    if (after) addi(high, high, after * BytesPerWord);
    Label loop;
    bind(loop);
    std(val, 0, addr);
    addi(addr, addr, 8);
    cmpd(CCR6, addr, high);
    ble(CCR6, loop);
    if (after) addi(high, high, -after * BytesPerWord); // Correct back to old value.
  }
  BLOCK_COMMENT("} zap memory region");
}

#endif // !PRODUCT

void SkipIfEqualZero::skip_to_label_if_equal_zero(MacroAssembler* masm, Register temp,
                                                  const bool* flag_addr, Label& label) {
  int simm16_offset = masm->load_const_optimized(temp, (address)flag_addr, R0, true);
  assert(sizeof(bool) == 1, "PowerPC ABI");
  masm->lbz(temp, simm16_offset, temp);
  masm->cmpwi(CCR0, temp, 0);
  masm->beq(CCR0, label);
}

SkipIfEqualZero::SkipIfEqualZero(MacroAssembler* masm, Register temp, const bool* flag_addr) : _masm(masm), _label() {
  skip_to_label_if_equal_zero(masm, temp, flag_addr, _label);
}

SkipIfEqualZero::~SkipIfEqualZero() {
  _masm->bind(_label);
}
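
// Usage sketch for SkipIfEqualZero (the call site, flag, and scratch register
// below are illustrative only, not taken from this file). Construct the guard
// on the stack; the code emitted inside the scope is branched over at run
// time whenever *flag_addr reads as zero, and the destructor binds the skip
// label at the end of the scope:
//
//   {
//     SkipIfEqualZero skip(masm, R11_scratch1, &SomeBoolFlag); // hypothetical flag
//     // ... code emitted here executes only when SomeBoolFlag != 0 ...
//   } // ~SkipIfEqualZero binds _label here.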