1 /* 2 * Copyright (c) 1997, 2019, Oracle and/or its affiliates. All rights reserved. 3 * Copyright (c) 2012, 2019, SAP SE. All rights reserved. 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. 9 * 10 * This code is distributed in the hope that it will be useful, but WITHOUT 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 * version 2 for more details (a copy is included in the LICENSE file that 14 * accompanied this code). 15 * 16 * You should have received a copy of the GNU General Public License version 17 * 2 along with this work; if not, write to the Free Software Foundation, 18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 19 * 20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 21 * or visit www.oracle.com if you need additional information or have any 22 * questions. 
23 * 24 */ 25 26 #include "precompiled.hpp" 27 #include "asm/macroAssembler.inline.hpp" 28 #include "compiler/disassembler.hpp" 29 #include "gc/shared/collectedHeap.inline.hpp" 30 #include "gc/shared/barrierSet.hpp" 31 #include "gc/shared/barrierSetAssembler.hpp" 32 #include "interpreter/interpreter.hpp" 33 #include "memory/resourceArea.hpp" 34 #include "nativeInst_ppc.hpp" 35 #include "prims/methodHandles.hpp" 36 #include "runtime/biasedLocking.hpp" 37 #include "runtime/icache.hpp" 38 #include "runtime/interfaceSupport.inline.hpp" 39 #include "runtime/objectMonitor.hpp" 40 #include "runtime/os.hpp" 41 #include "runtime/safepoint.hpp" 42 #include "runtime/safepointMechanism.hpp" 43 #include "runtime/sharedRuntime.hpp" 44 #include "runtime/stubRoutines.hpp" 45 #include "utilities/macros.hpp" 46 #ifdef COMPILER2 47 #include "opto/intrinsicnode.hpp" 48 #endif 49 50 #ifdef PRODUCT 51 #define BLOCK_COMMENT(str) // nothing 52 #else 53 #define BLOCK_COMMENT(str) block_comment(str) 54 #endif 55 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 56 57 #ifdef ASSERT 58 // On RISC, there's no benefit to verifying instruction boundaries. 
bool AbstractAssembler::pd_check_instruction_mark() { return false; }
#endif

// Load a 64-bit value from a+si31 into d. si31 must be a non-negative
// 31-bit offset. If it fits a signed 16-bit displacement, a single ld is
// emitted (optionally followed by a filler nop so the sequence keeps a
// fixed size); otherwise the offset is split into hi/lo 16-bit halves and
// emitted as addis + ld, using d as scratch for the intermediate sum.
void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) {
  assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range");
  if (Assembler::is_simm(si31, 16)) {
    ld(d, si31, a);
    if (emit_filler_nop) nop();
  } else {
    const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31);
    const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31);
    addis(d, a, hi);
    ld(d, lo, d);
  }
}

// Checked variant of the above: d is used as scratch in the
// two-instruction form, so it must be distinct from the base register a.
void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) {
  assert_different_registers(d, a);
  ld_largeoffset_unchecked(d, si31, a, emit_filler_nop);
}

// Load a value of size_in_bytes (1, 2, 4 or 8) from base+offs into dst,
// sign- or zero-extending according to is_signed.
void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base,
                                      size_t size_in_bytes, bool is_signed) {
  switch (size_in_bytes) {
  case  8:              ld(dst, offs, base);                         break;
  case  4:  is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break;
  case  2:  is_signed ? lha(dst, offs, base) : lhz(dst, offs, base); break;
  case  1:  lbz(dst, offs, base); if (is_signed) extsb(dst, dst);    break; // lba doesn't exist :(
  default:  ShouldNotReachHere();
  }
}

// Store the low size_in_bytes bytes of register dst at base+offs.
// (dst names the value register here, not a destination memory operand.)
void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base,
                                       size_t size_in_bytes) {
  switch (size_in_bytes) {
  case  8:  std(dst, offs, base); break;
  case  4:  stw(dst, offs, base); break;
  case  2:  sth(dst, offs, base); break;
  case  1:  stb(dst, offs, base); break;
  default:  ShouldNotReachHere();
  }
}

// Emit nops until the current code offset is congruent to rem modulo
// modulus. If that would take more than max bytes of padding, emit
// nothing at all.
void MacroAssembler::align(int modulus, int max, int rem) {
  int padding = (rem + modulus - (offset() % modulus)) % modulus;
  if (padding > max) return;
  // One nop per 4 bytes of padding (padding is a multiple of the
  // instruction word size by construction of offset()).
  for (int c = (padding >> 2); c > 0; --c) { nop(); }
}

// Issue instructions that calculate given TOC from global TOC.
// Materialize 'addr' into dst as an offset from the global TOC (R29_TOC):
//   addis dst, R29_TOC, offset_hi   (if hi16)
//   addi  dst, dst,     offset_lo   (if lo16)
// With emit_dummy_addr, a placeholder offset (-128) is encoded for later
// patching; addr == (address)-1 likewise leaves the dummy offset -1.
void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16,
                                                       bool add_relocation, bool emit_dummy_addr) {
  int offset = -1;
  if (emit_dummy_addr) {
    offset = -128; // dummy address
  } else if (addr != (address)(intptr_t)-1) {
    offset = MacroAssembler::offset_to_global_toc(addr);
  }

  if (hi16) {
    addis(dst, R29_TOC, MacroAssembler::largeoffset_si16_si16_hi(offset));
  }
  if (lo16) {
    if (add_relocation) {
      // Relocate at the addi to avoid confusion with a load from the method's TOC.
      relocate(internal_word_Relocation::spec(addr));
    }
    addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset));
  }
}

// Patch an addis/addi pair emitted by calculate_address_from_global_toc so
// that it computes 'addr'. 'a' points at the addi (where the relocation
// sits); the matching addis is located by scanning backwards, not past
// 'bound'. Returns the address of the addis. Does not flush the icache.
address MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) {
  const int offset = MacroAssembler::offset_to_global_toc(addr);

  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the addi,
  // and the addi reads and writes the same register dst.
  const int dst = inv_rt_field(inst2);
  assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");

  // Now, find the preceding addis which writes to dst.
  // If none is found before 'bound', inst1 stays 0 and the assert below fires.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  while (inst1_addr >= bound) {
    inst1 = *(int *) inst1_addr;
    if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
      // Stop, found the addis which writes dst.
      break;
    }
    inst1_addr -= BytesPerInstWord;
  }

  assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
  set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset));
  set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset));
  return inst1_addr;
}

// Inverse of the patching above: decode the address computed by an
// addis/addi pair. 'a' points at the addi; the addis is found by scanning
// backwards, not past 'bound'. An encoded offset of -1 is the special
// "no address" marker and yields (address)-1.
address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) {
  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the addi,
  // and the addi reads and writes the same register dst.
  const int dst = inv_rt_field(inst2);
  assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");

  // Now, find the preceding addis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  while (inst1_addr >= bound) {
    inst1 = *(int *) inst1_addr;
    if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
      // stop, found the addis which writes dst
      break;
    }
    inst1_addr -= BytesPerInstWord;
  }

  assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");

  int offset = (get_imm(inst1_addr, 0) << 16) + get_imm(inst2_addr, 0);
  // -1 is a special case
  if (offset == -1) {
    return (address)(intptr_t)-1;
  } else {
    return global_toc() + offset;
  }
}

#ifdef _LP64
// Patch compressed oops or klass constants.
// Assembler sequence is
// 1) compressed oops:
//    lis  rx = const.hi
//    ori rx  = rx | const.lo
// 2) compressed klass:
//    lis rx = const.hi
//    clrldi rx = rx & 0xFFFFffff // clearMS32b, optional
//    ori rx  = rx | const.lo
// Clrldi will be passed by.
// Patch the narrow-oop/klass constant encoded in a lis/ori pair so it
// holds 'data'. 'a' points at the ori; the matching lis is found by a
// backward scan (not past 'bound'), which naturally steps over the
// optional clrldi of the compressed-klass form. Returns the lis address.
address MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) {
  assert(UseCompressedOops, "Should only patch compressed oops");

  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the ori,
  // and the ori reads and writes the same register dst.
  const int dst = inv_rta_field(inst2);
  assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
  // Now, find the preceding addis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  bool inst1_found = false;
  while (inst1_addr >= bound) {
    inst1 = *(int *)inst1_addr;
    if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
    inst1_addr -= BytesPerInstWord;
  }
  assert(inst1_found, "inst is not lis");

  // Split the 32-bit narrow value into the two 16-bit immediates.
  int xc = (data >> 16) & 0xffff;
  int xd = (data >>  0) & 0xffff;

  set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo
  set_imm((int *)inst2_addr, (xd)); // unsigned int
  return inst1_addr;
}

// Get compressed oop or klass constant.
// Inverse of patch_set_narrow_oop: decode the 32-bit narrow value from
// the lis/ori pair. 'a' points at the ori.
narrowOop MacroAssembler::get_narrow_oop(address a, address bound) {
  assert(UseCompressedOops, "Should only patch compressed oops");

  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the ori,
  // and the ori reads and writes the same register dst.
  const int dst = inv_rta_field(inst2);
  assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
  // Now, find the preceding lis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  bool inst1_found = false;

  while (inst1_addr >= bound) {
    inst1 = *(int *) inst1_addr;
    if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break;}
    inst1_addr -= BytesPerInstWord;
  }
  assert(inst1_found, "inst is not lis");

  uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff));
  uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16);

  return (int) (xl | xh);
}
#endif // _LP64

// Load the constant of AddressLiteral 'a' via the method's TOC.
// Allocates a constant-pool entry for the value and emits a (possibly
// fixed-size) TOC-relative load into dst.
// Returns true if successful, false on constant-pool allocation failure.
bool MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a,
                                                Register toc, bool fixed_size) {
  int toc_offset = 0;
  // Use RelocationHolder::none for the constant pool entry, otherwise
  // we will end up with a failing NativeCall::verify(x) where x is
  // the address of the constant pool entry.
  // FIXME: We should insert relocation information for oops at the constant
  // pool entries instead of inserting it at the loads; patching of a constant
  // pool entry should be less expensive.
  address const_address = address_constant((address)a.value(), RelocationHolder::none);
  if (const_address == NULL) { return false; } // allocation failure
  // Relocate at the pc of the load.
  relocate(a.rspec());
  toc_offset = (int)(const_address - code()->consts()->start());
  ld_largeoffset_unchecked(dst, toc_offset, toc, fixed_size);
  return true;
}

// Is the instruction at 'a' the start of a load_const_from_method_toc
// sequence (either a direct ld, or the addis of the large-offset form)?
bool MacroAssembler::is_load_const_from_method_toc_at(address a) {
  const address inst1_addr = a;
  const int inst1 = *(int *)inst1_addr;

  // The relocation points to the ld or the addis.
  return (is_ld(inst1)) ||
         (is_addis(inst1) && inv_ra_field(inst1) != 0);
}

// Decode the TOC offset used by a load_const_from_method_toc sequence:
// either the displacement of a single ld, or hi16 of the addis combined
// with the displacement of the following ld on the same register.
int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) {
  assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc");

  const address inst1_addr = a;
  const int inst1 = *(int *)inst1_addr;

  if (is_ld(inst1)) {
    return inv_d1_field(inst1);
  } else if (is_addis(inst1)) {
    const int dst = inv_rt_field(inst1);

    // Now, find the succeeding ld which reads and writes to dst.
    address inst2_addr = inst1_addr + BytesPerInstWord;
    int inst2 = 0;
    while (true) {
      inst2 = *(int *) inst2_addr;
      if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) {
        // Stop, found the ld which reads and writes dst.
        break;
      }
      inst2_addr += BytesPerInstWord;
    }
    return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2);
  }
  ShouldNotReachHere();
  return 0;
}

// Get the constant from a `load_const' sequence.
// Two instruction layouts are recognized by looking at the second
// instruction: the lis/ori/sldi/ori/ori form (ori second) and the
// interleaved lis/lis/ori/ori form (lis second); the immediates are
// reassembled into the 64-bit value accordingly.
long MacroAssembler::get_const(address a) {
  assert(is_load_const_at(a), "not a load of a constant");
  const int *p = (const int*) a;
  unsigned long x = (((unsigned long) (get_imm(a,0) & 0xffff)) << 48);
  if (is_ori(*(p+1))) {
    x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 32);
    x |= (((unsigned long) (get_imm(a,3) & 0xffff)) << 16);
    x |= (((unsigned long) (get_imm(a,4) & 0xffff)));
  } else if (is_lis(*(p+1))) {
    x |= (((unsigned long) (get_imm(a,2) & 0xffff)) << 32);
    x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 16);
    x |= (((unsigned long) (get_imm(a,3) & 0xffff)));
  } else {
    ShouldNotReachHere();
    return (long) 0;
  }
  return (long) x;
}

// Patch the 64 bit constant of a `load_const' sequence. This is a low
// level procedure. It neither flushes the instruction cache nor is it
// mt safe.
// Rewrite the four 16-bit immediates of a `load_const' sequence in place
// so it loads x. The instruction slot per immediate depends on which of
// the two load_const layouts is present (see get_const above).
void MacroAssembler::patch_const(address a, long x) {
  assert(is_load_const_at(a), "not a load of a constant");
  int *p = (int*) a;
  if (is_ori(*(p+1))) {
    set_imm(0 + p, (x >> 48) & 0xffff);
    set_imm(1 + p, (x >> 32) & 0xffff);
    set_imm(3 + p, (x >> 16) & 0xffff);
    set_imm(4 + p, x & 0xffff);
  } else if (is_lis(*(p+1))) {
    set_imm(0 + p, (x >> 48) & 0xffff);
    set_imm(2 + p, (x >> 32) & 0xffff);
    set_imm(1 + p, (x >> 16) & 0xffff);
    set_imm(3 + p, x & 0xffff);
  } else {
    ShouldNotReachHere();
  }
}

// Allocate a new metadata index for obj and return an AddressLiteral
// carrying the matching metadata relocation.
AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) {
  assert(oop_recorder() != NULL, "this assembler needs a Recorder");
  int index = oop_recorder()->allocate_metadata_index(obj);
  RelocationHolder rspec = metadata_Relocation::spec(index);
  return AddressLiteral((address)obj, rspec);
}

// Like above, but reuse the recorder's existing index for obj.
AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) {
  assert(oop_recorder() != NULL, "this assembler needs a Recorder");
  int index = oop_recorder()->find_index(obj);
  RelocationHolder rspec = metadata_Relocation::spec(index);
  return AddressLiteral((address)obj, rspec);
}

// Allocate a new oop index for obj and return an AddressLiteral with an
// oop relocation.
AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) {
  assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int oop_index = oop_recorder()->allocate_oop_index(obj);
  return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
}

// Like above, but reuse the recorder's existing oop index for obj.
AddressLiteral MacroAssembler::constant_oop_address(jobject obj) {
  assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int oop_index = oop_recorder()->find_index(obj);
  return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
}

// If *delayed_value_addr is already known (non-zero), fold it into a
// constant; otherwise emit code that loads it through tmp at runtime and
// adds 'offset'.
RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
                                                      Register tmp, int offset) {
  intptr_t value = *delayed_value_addr;
  if (value != 0) {
    return RegisterOrConstant(value + offset);
  }

  // Load indirectly to solve generation ordering problem.
  // static address, no relocation
  int simm16_offset = load_const_optimized(tmp, delayed_value_addr, noreg, true);
  ld(tmp, simm16_offset, tmp); // must be aligned ((xa & 3) == 0)

  if (offset != 0) {
    addi(tmp, tmp, offset);
  }

  return RegisterOrConstant(tmp);
}

#ifndef PRODUCT
void MacroAssembler::pd_print_patched_instruction(address branch) {
  Unimplemented(); // TODO: PPC port
}
#endif // ndef PRODUCT

// Conditional far branch for destinations encodable in 24+2 bits.
// Emits a fixed two-instruction sequence: an inverted short conditional
// branch that skips over an unconditional branch to dest (variant 2).
void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) {

  // If requested by flag optimize, relocate the bc_far as a
  // runtime_call and prepare for optimizing it when the code gets
  // relocated.
  if (optimize == bc_far_optimize_on_relocate) {
    relocate(relocInfo::runtime_call_type);
  }

  // variant 2:
  //
  //    b!cxx SKIP
  //    bxx   DEST
  //  SKIP:
  //

  const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
                                                opposite_bcond(inv_boint_bcond(boint)));

  // We emit two branches.
  // First, a conditional branch which jumps around the far branch.
  const address not_taken_pc = pc() + 2 * BytesPerInstWord;
  const address bc_pc        = pc();
  bc(opposite_boint, biint, not_taken_pc);

  const int bc_instr = *(int*)bc_pc;
  assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition");
  assert(opposite_boint == inv_bo_field(bc_instr), "postcondition");
  assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))),
                                     opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))),
         "postcondition");
  assert(biint == inv_bi_field(bc_instr), "postcondition");

  // Second, an unconditional far branch which jumps to dest.
  // Note: target(dest) remembers the current pc (see CodeSection::target)
  //       and returns the current pc if the label is not bound yet; when
  //       the label gets bound, the unconditional far branch will be patched.
  const address target_pc = target(dest);
  const address b_pc  = pc();
  b(target_pc);

  assert(not_taken_pc == pc(),                     "postcondition");
  assert(dest.is_bound() || target_pc == b_pc, "postcondition");
}

// 1 or 2 instructions
// Use a short bc when the bound destination is in range, else bc_far.
void MacroAssembler::bc_far_optimized(int boint, int biint, Label& dest) {
  if (dest.is_bound() && is_within_range_of_bcxx(target(dest), pc())) {
    bc(boint, biint, dest);
  } else {
    bc_far(boint, biint, dest, MacroAssembler::bc_far_optimize_on_relocate);
  }
}

// Does instruction_addr start any of the three recognized bc_far forms?
bool MacroAssembler::is_bc_far_at(address instruction_addr) {
  return is_bc_far_variant1_at(instruction_addr) ||
         is_bc_far_variant2_at(instruction_addr) ||
         is_bc_far_variant3_at(instruction_addr);
}

// Decode the destination of a bc_far sequence, by variant:
//   1: bcxx DEST; nop                 -> branch displacement of insn 0
//   2: b!cxx SKIP; bxx DEST           -> destination of insn 1
//   3: nop; endgroup (branch-to-next) -> the following instruction
address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) {
  if (is_bc_far_variant1_at(instruction_addr)) {
    const address instruction_1_addr = instruction_addr;
    const int instruction_1 = *(int*)instruction_1_addr;
    return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr);
  } else if (is_bc_far_variant2_at(instruction_addr)) {
    const address instruction_2_addr = instruction_addr + 4;
    return bxx_destination(instruction_2_addr);
  } else if (is_bc_far_variant3_at(instruction_addr)) {
    return instruction_addr + 8;
  }
  // variant 4 ???
  ShouldNotReachHere();
  return NULL;
}

// Re-target an existing two-instruction bc_far site at instruction_addr
// to 'dest', choosing the densest encoding (variant 1, 2 or 3) and
// flushing the icache. NOT mt-safe.
void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) {

  if (is_bc_far_variant3_at(instruction_addr)) {
    // variant 3, far cond branch to the next instruction, already patched to nops:
    //
    //    nop
    //    endgroup
    //  SKIP/DEST:
    //
    return;
  }

  // first, extract boint and biint from the current branch
  int boint = 0;
  int biint = 0;

  ResourceMark rm;
  const int code_size = 2 * BytesPerInstWord;
  CodeBuffer buf(instruction_addr, code_size);
  MacroAssembler masm(&buf);
  if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) {
    // Far branch to next instruction: Optimize it by patching nops (produce variant 3).
    masm.nop();
    masm.endgroup();
  } else {
    if (is_bc_far_variant1_at(instruction_addr)) {
      // variant 1, the 1st instruction contains the destination address:
      //
      //    bcxx DEST
      //    nop
      //
      const int instruction_1 = *(int*)(instruction_addr);
      boint = inv_bo_field(instruction_1);
      biint = inv_bi_field(instruction_1);
    } else if (is_bc_far_variant2_at(instruction_addr)) {
      // variant 2, the 2nd instruction contains the destination address:
      //
      //    b!cxx SKIP
      //    bxx   DEST
      //  SKIP:
      //
      const int instruction_1 = *(int*)(instruction_addr);
      boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))),
                                 opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1))));
      biint = inv_bi_field(instruction_1);
    } else {
      // variant 4???
      ShouldNotReachHere();
    }

    // second, set the new branch destination and optimize the code
    if (dest != instruction_addr + 4 && // the bc_far is still unbound!
        masm.is_within_range_of_bcxx(dest, instruction_addr)) {
      // variant 1:
      //
      //    bcxx DEST
      //    nop
      //
      masm.bc(boint, biint, dest);
      masm.nop();
    } else {
      // variant 2:
      //
      //    b!cxx SKIP
      //    bxx   DEST
      //  SKIP:
      //
      const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
                                                    opposite_bcond(inv_boint_bcond(boint)));
      const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord;
      masm.bc(opposite_boint, biint, not_taken_pc);
      masm.b(dest);
    }
  }
  ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
}

// Emit a NOT mt-safe patchable 64 bit absolute call/jump.
// Always occupies 7 instruction words so the site can later be
// re-patched between the pc-relative (variant 2) and TOC-based
// (variant 1b) encodings.
void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) {
  // get current pc
  uint64_t start_pc = (uint64_t) pc();

  const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last
  const address pc_of_b  = (address) (start_pc + (0*BytesPerInstWord)); // b is first

  // relocate here
  if (rt != relocInfo::none) {
    relocate(rt);
  }

  if ( ReoptimizeCallSequences &&
       (( link && is_within_range_of_b(dest, pc_of_bl)) ||
        (!link && is_within_range_of_b(dest, pc_of_b)))) {
    // variant 2:
    // Emit an optimized, pc-relative call/jump.

    if (link) {
      // some padding
      nop();
      nop();
      nop();
      nop();
      nop();
      nop();

      // do the call
      assert(pc() == pc_of_bl, "just checking");
      bl(dest, relocInfo::none);
    } else {
      // do the jump
      assert(pc() == pc_of_b, "just checking");
      b(dest, relocInfo::none);

      // some padding
      nop();
      nop();
      nop();
      nop();
      nop();
      nop();
    }

    // Assert that we can identify the emitted call/jump.
    assert(is_bxx64_patchable_variant2_at((address)start_pc, link),
           "can't identify emitted call");
  } else {
    // variant 1:
    mr(R0, R11);  // spill R11 -> R0.

    // Load the destination address into CTR,
    // calculate destination relative to global toc.
    calculate_address_from_global_toc(R11, dest, true, true, false);

    mtctr(R11);
    mr(R11, R0);  // spill R11 <- R0.
    nop();

    // do the call/jump
    if (link) {
      bctrl();
    } else{
      bctr();
    }
    // Assert that we can identify the emitted call/jump.
    assert(is_bxx64_patchable_variant1b_at((address)start_pc, link),
           "can't identify emitted call");
  }

  // Assert that we can identify the emitted call/jump.
  assert(is_bxx64_patchable_at((address)start_pc, link),
         "can't identify emitted call");
  assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest,
         "wrong encoding of dest address");
}

// Identify a bxx64_patchable instruction.
bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) {
  return is_bxx64_patchable_variant1b_at(instruction_addr, link)
    //|| is_bxx64_patchable_variant1_at(instruction_addr, link)
      || is_bxx64_patchable_variant2_at(instruction_addr, link);
}

// Does the call64_patchable instruction use a pc-relative encoding of
// the call destination?
bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) {
  // variant 2 is pc-relative
  return is_bxx64_patchable_variant2_at(instruction_addr, link);
}

// Identify variant 1: load_const into R, mtctr, bctr[l].
bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) {
  unsigned int* instr = (unsigned int*) instruction_addr;
  return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
      && is_mtctr(instr[5]) // mtctr
      && is_load_const_at(instruction_addr);
}

// Identify variant 1b: load destination relative to global toc.
// Variant 1b: spill, addis/addi from global TOC (at word 1), mtctr at
// word 3, bctr[l] last.
bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) {
  unsigned int* instr = (unsigned int*) instruction_addr;
  return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
         && is_mtctr(instr[3]) // mtctr
         && is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr);
}

// Identify variant 2: pc-relative bl/b padded with nops to 7 words.
// For a call the bl is last; for a jump the b is first.
bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) {
  unsigned int* instr = (unsigned int*) instruction_addr;
  if (link) {
    return is_bl (instr[6])  // bl dest is last
        && is_nop(instr[0])  // nop
        && is_nop(instr[1])  // nop
        && is_nop(instr[2])  // nop
        && is_nop(instr[3])  // nop
        && is_nop(instr[4])  // nop
        && is_nop(instr[5]); // nop
  } else {
    return is_b  (instr[0])  // b dest is first
        && is_nop(instr[1])  // nop
        && is_nop(instr[2])  // nop
        && is_nop(instr[3])  // nop
        && is_nop(instr[4])  // nop
        && is_nop(instr[5])  // nop
        && is_nop(instr[6]); // nop
  }
}

// Set dest address of a bxx64_patchable instruction.
// Re-emits the whole 7-word sequence over the old one (re-selecting the
// best variant for the new dest) and flushes the icache. NOT mt-safe.
void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) {
  ResourceMark rm;
  int code_size = MacroAssembler::bxx64_patchable_size;
  CodeBuffer buf(instruction_addr, code_size);
  MacroAssembler masm(&buf);
  masm.bxx64_patchable(dest, relocInfo::none, link);
  ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
}

// Get dest address of a bxx64_patchable instruction.
// Decode the destination of a bxx64_patchable site, dispatching on the
// variant actually present (const-load, pc-relative, or global-TOC form).
address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) {
  if (is_bxx64_patchable_variant1_at(instruction_addr, link)) {
    return (address) (unsigned long) get_const(instruction_addr);
  } else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) {
    unsigned int* instr = (unsigned int*) instruction_addr;
    if (link) {
      const int instr_idx = 6; // bl is last
      int branchoffset = branch_destination(instr[instr_idx], 0);
      return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
    } else {
      const int instr_idx = 0; // b is first
      int branchoffset = branch_destination(instr[instr_idx], 0);
      return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
    }
  // Load dest relative to global toc.
  } else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) {
    return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord,
                                                               instruction_addr);
  } else {
    ShouldNotReachHere();
    return NULL;
  }
}

// Save all nonvolatile GPRs (R14..R31) then FPRs (F14..F31) at
// consecutive 8-byte slots starting at dst+offset.
// Uses ordering which corresponds to ABI:
//    _savegpr0_14:  std  r14,-144(r1)
//    _savegpr0_15:  std  r15,-136(r1)
//    _savegpr0_16:  std  r16,-128(r1)
void MacroAssembler::save_nonvolatile_gprs(Register dst, int offset) {
  std(R14, offset, dst);   offset += 8;
  std(R15, offset, dst);   offset += 8;
  std(R16, offset, dst);   offset += 8;
  std(R17, offset, dst);   offset += 8;
  std(R18, offset, dst);   offset += 8;
  std(R19, offset, dst);   offset += 8;
  std(R20, offset, dst);   offset += 8;
  std(R21, offset, dst);   offset += 8;
  std(R22, offset, dst);   offset += 8;
  std(R23, offset, dst);   offset += 8;
  std(R24, offset, dst);   offset += 8;
  std(R25, offset, dst);   offset += 8;
  std(R26, offset, dst);   offset += 8;
  std(R27, offset, dst);   offset += 8;
  std(R28, offset, dst);   offset += 8;
  std(R29, offset, dst);   offset += 8;
  std(R30, offset, dst);   offset += 8;
  std(R31, offset, dst);   offset += 8;

  stfd(F14, offset, dst);   offset += 8;
  stfd(F15, offset, dst);   offset += 8;
  stfd(F16, offset, dst);   offset += 8;
  stfd(F17, offset, dst);   offset += 8;
  stfd(F18, offset, dst);   offset += 8;
  stfd(F19, offset, dst);   offset += 8;
  stfd(F20, offset, dst);   offset += 8;
  stfd(F21, offset, dst);   offset += 8;
  stfd(F22, offset, dst);   offset += 8;
  stfd(F23, offset, dst);   offset += 8;
  stfd(F24, offset, dst);   offset += 8;
  stfd(F25, offset, dst);   offset += 8;
  stfd(F26, offset, dst);   offset += 8;
  stfd(F27, offset, dst);   offset += 8;
  stfd(F28, offset, dst);   offset += 8;
  stfd(F29, offset, dst);   offset += 8;
  stfd(F30, offset, dst);   offset += 8;
  stfd(F31, offset, dst);
}

// Restore all nonvolatile GPRs/FPRs from the layout written by
// save_nonvolatile_gprs above.
// Uses ordering which corresponds to ABI:
//    _restgpr0_14:  ld   r14,-144(r1)
//    _restgpr0_15:  ld   r15,-136(r1)
//    _restgpr0_16:  ld   r16,-128(r1)
void MacroAssembler::restore_nonvolatile_gprs(Register src, int offset) {
  ld(R14, offset, src);   offset += 8;
  ld(R15, offset, src);   offset += 8;
  ld(R16, offset, src);   offset += 8;
  ld(R17, offset, src);   offset += 8;
  ld(R18, offset, src);   offset += 8;
  ld(R19, offset, src);   offset += 8;
  ld(R20, offset, src);   offset += 8;
  ld(R21, offset, src);   offset += 8;
  ld(R22, offset, src);   offset += 8;
  ld(R23, offset, src);   offset += 8;
  ld(R24, offset, src);   offset += 8;
  ld(R25, offset, src);   offset += 8;
  ld(R26, offset, src);   offset += 8;
  ld(R27, offset, src);   offset += 8;
  ld(R28, offset, src);   offset += 8;
  ld(R29, offset, src);   offset += 8;
  ld(R30, offset, src);   offset += 8;
  ld(R31, offset, src);   offset += 8;

  // FP registers
  lfd(F14, offset, src);   offset += 8;
  lfd(F15, offset, src);   offset += 8;
  lfd(F16, offset, src);   offset += 8;
  lfd(F17, offset, src);   offset += 8;
  lfd(F18, offset, src);   offset += 8;
  lfd(F19, offset, src);   offset += 8;
  lfd(F20, offset, src);   offset += 8;
  lfd(F21, offset, src);   offset += 8;
  lfd(F22, offset, src);   offset += 8;
  lfd(F23, offset, src);   offset += 8;
  lfd(F24, offset, src);   offset += 8;
  lfd(F25, offset, src);   offset += 8;
  lfd(F26, offset, src);   offset += 8;
  lfd(F27, offset, src);   offset += 8;
  lfd(F28, offset, src);   offset += 8;
  lfd(F29, offset, src);   offset += 8;
  lfd(F30, offset, src);   offset += 8;
  lfd(F31, offset, src);
}

// For verify_oops.
// Save the volatile GPRs (R2..R12) and FPRs (F0..F13) at consecutive
// 8-byte slots starting at dst+offset.
void MacroAssembler::save_volatile_gprs(Register dst, int offset) {
  std(R2,  offset, dst);   offset += 8;
  std(R3,  offset, dst);   offset += 8;
  std(R4,  offset, dst);   offset += 8;
  std(R5,  offset, dst);   offset += 8;
  std(R6,  offset, dst);   offset += 8;
  std(R7,  offset, dst);   offset += 8;
  std(R8,  offset, dst);   offset += 8;
  std(R9,  offset, dst);   offset += 8;
  std(R10, offset, dst);   offset += 8;
  std(R11, offset, dst);   offset += 8;
  std(R12, offset, dst);   offset += 8;

  stfd(F0, offset, dst);   offset += 8;
  stfd(F1, offset, dst);   offset += 8;
  stfd(F2, offset, dst);   offset += 8;
  stfd(F3, offset, dst);   offset += 8;
  stfd(F4, offset, dst);   offset += 8;
  stfd(F5, offset, dst);   offset += 8;
  stfd(F6, offset, dst);   offset += 8;
  stfd(F7, offset, dst);   offset += 8;
  stfd(F8, offset, dst);   offset += 8;
  stfd(F9, offset, dst);   offset += 8;
  stfd(F10, offset, dst);  offset += 8;
  stfd(F11, offset, dst);  offset += 8;
  stfd(F12, offset, dst);  offset += 8;
  stfd(F13, offset, dst);
}

// For verify_oops.
// Reload the volatile GPRs (R2..R12) and volatile FPRs (F0..F13) from the
// slot layout written by save_volatile_gprs() above.
void MacroAssembler::restore_volatile_gprs(Register src, int offset) {
  ld(R2, offset, src); offset += 8;
  ld(R3, offset, src); offset += 8;
  ld(R4, offset, src); offset += 8;
  ld(R5, offset, src); offset += 8;
  ld(R6, offset, src); offset += 8;
  ld(R7, offset, src); offset += 8;
  ld(R8, offset, src); offset += 8;
  ld(R9, offset, src); offset += 8;
  ld(R10, offset, src); offset += 8;
  ld(R11, offset, src); offset += 8;
  ld(R12, offset, src); offset += 8;

  lfd(F0, offset, src); offset += 8;
  lfd(F1, offset, src); offset += 8;
  lfd(F2, offset, src); offset += 8;
  lfd(F3, offset, src); offset += 8;
  lfd(F4, offset, src); offset += 8;
  lfd(F5, offset, src); offset += 8;
  lfd(F6, offset, src); offset += 8;
  lfd(F7, offset, src); offset += 8;
  lfd(F8, offset, src); offset += 8;
  lfd(F9, offset, src); offset += 8;
  lfd(F10, offset, src); offset += 8;
  lfd(F11, offset, src); offset += 8;
  lfd(F12, offset, src); offset += 8;
  lfd(F13, offset, src);
}

// Save CR and LR into the caller's ABI save slots of the current frame.
void MacroAssembler::save_LR_CR(Register tmp) {
  mfcr(tmp);
  std(tmp, _abi(cr), R1_SP);
  mflr(tmp);
  std(tmp, _abi(lr), R1_SP);
  // Tmp must contain lr on exit! (see return_addr and prolog in ppc64.ad)
}

// Restore LR and CR from the caller's ABI save slots of the current frame.
void MacroAssembler::restore_LR_CR(Register tmp) {
  assert(tmp != R1_SP, "must be distinct");
  ld(tmp, _abi(lr), R1_SP);
  mtlr(tmp);
  ld(tmp, _abi(cr), R1_SP);
  mtcr(tmp);
}

// Emit bl-to-next-instruction + mflr to obtain the current PC in `result`.
// Clobbers LR (hence "trash_LR"). Returns the code address that LR will
// contain at runtime.
address MacroAssembler::get_PC_trash_LR(Register result) {
  Label L;
  bl(L);
  bind(L);
  address lr_pc = pc();
  mflr(result);
  return lr_pc;
}

// Resize current frame by a (register-held, aligned) byte offset while
// atomically keeping the back link (caller's SP) at the new frame top.
void MacroAssembler::resize_frame(Register offset, Register tmp) {
#ifdef ASSERT
  assert_different_registers(offset, tmp, R1_SP);
  andi_(tmp, offset, frame::alignment_in_bytes-1);
  asm_assert_eq("resize_frame: unaligned", 0x204);
#endif

  // tmp <- *(SP)
  ld(tmp, _abi(callers_sp), R1_SP);
  // addr <- SP + offset;
  // *(addr) <- tmp;
  // SP <- addr
  stdux(tmp, R1_SP, offset);
}

// Same as above but with a compile-time constant offset (must fit simm16).
void MacroAssembler::resize_frame(int offset, Register tmp) {
  assert(is_simm(offset, 16), "too big an offset");
  assert_different_registers(tmp, R1_SP);
  assert((offset & (frame::alignment_in_bytes-1))==0, "resize_frame: unaligned");
  // tmp <- *(SP)
  ld(tmp, _abi(callers_sp), R1_SP);
  // addr <- SP + offset;
  // *(addr) <- tmp;
  // SP <- addr
  stdu(tmp, offset, R1_SP);
}

// Set SP to an absolute address, keeping the back link.
void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) {
  // (addr == tmp1) || (addr == tmp2) is allowed here!
  assert(tmp1 != tmp2, "must be distinct");

  // compute offset w.r.t. current stack pointer
  // tmp_1 <- addr - SP (!)
  subf(tmp1, R1_SP, addr);

  // atomically update SP keeping back link.
  resize_frame(tmp1/* offset */, tmp2/* tmp */);
}

// Push a frame of register-held size `bytes` (must be aligned); the stdux
// stores the old SP as the new frame's back link in the same instruction.
void MacroAssembler::push_frame(Register bytes, Register tmp) {
#ifdef ASSERT
  assert(bytes != R0, "r0 not allowed here");
  andi_(R0, bytes, frame::alignment_in_bytes-1);
  asm_assert_eq("push_frame(Reg, Reg): unaligned", 0x203);
#endif
  neg(tmp, bytes);
  stdux(R1_SP, R1_SP, tmp);
}

// Push a frame of size `bytes'. Size is aligned up; tmp is only clobbered
// when the aligned size does not fit a signed 16-bit displacement.
void MacroAssembler::push_frame(unsigned int bytes, Register tmp) {
  long offset = align_addr(bytes, frame::alignment_in_bytes);
  if (is_simm(-offset, 16)) {
    stdu(R1_SP, -offset, R1_SP);
  } else {
    load_const_optimized(tmp, -offset);
    stdux(R1_SP, R1_SP, tmp);
  }
}

// Push a frame of size `bytes' plus abi_reg_args on top.
void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) {
  push_frame(bytes + frame::abi_reg_args_size, tmp);
}

// Set up a new C frame with a spill area for non-volatile GPRs and
// additional space for local variables.
void MacroAssembler::push_frame_reg_args_nonvolatiles(unsigned int bytes,
                                                      Register tmp) {
  push_frame(bytes + frame::abi_reg_args_size + frame::spill_nonvolatiles_size, tmp);
}

// Pop current C frame by following the back link.
void MacroAssembler::pop_frame() {
  ld(R1_SP, _abi(callers_sp), R1_SP);
}

#if defined(ABI_ELFv2)
// ELFv2: branch (optionally with link) to the entry point via CTR.
// The entry is moved to R12 first; updates and returns _last_calls_return_pc.
address MacroAssembler::branch_to(Register r_function_entry, bool and_link) {
  // TODO(asmundak): make sure the caller uses R12 as function descriptor
  // most of the times.
  if (R12 != r_function_entry) {
    mr(R12, r_function_entry);
  }
  mtctr(R12);
  // Do a call or a branch.
  if (and_link) {
    bctrl();
  } else {
    bctr();
  }
  _last_calls_return_pc = pc();

  return _last_calls_return_pc;
}

// Call a C function via a function descriptor and use full C
// calling conventions. Updates and returns _last_calls_return_pc.
address MacroAssembler::call_c(Register r_function_entry) {
  return branch_to(r_function_entry, /*and_link=*/true);
}

// For tail calls: only branch, don't link, so callee returns to caller of this function.
address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) {
  return branch_to(r_function_entry, /*and_link=*/false);
}

// Call a C function at a constant entry point; `rt` is currently unused on
// this ELFv2 path (the load_const/branch pair is emitted unconditionally).
address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) {
  load_const(R12, function_entry, R0);
  return branch_to(R12, /*and_link=*/true);
}

#else
// Generic version of a call to C function via a function descriptor
// with variable support for C calling conventions (TOC, ENV, etc.).
// Updates and returns _last_calls_return_pc.
address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call,
                                  bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) {
  // we emit standard ptrgl glue code here
  assert((function_descriptor != R0), "function_descriptor cannot be R0");

  // retrieve necessary entries from the function descriptor
  ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor);
  mtctr(R0);

  if (load_toc_of_callee) {
    ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor);
  }
  if (load_env_of_callee) {
    ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor);
  } else if (load_toc_of_callee) {
    // Callee expects a defined environment pointer; clear it when only the
    // TOC is loaded.
    li(R11, 0);
  }

  // do a call or a branch
  if (and_link) {
    bctrl();
  } else {
    bctr();
  }
  _last_calls_return_pc = pc();

  return _last_calls_return_pc;
}

// Call a C function via a function descriptor and use full C calling
// conventions.
// We don't use the TOC in generated code, so there is no need to save
// and restore its value.
address MacroAssembler::call_c(Register fd) {
  return branch_to(fd, /*and_link=*/true,
                       /*save toc=*/false,
                       /*restore toc=*/false,
                       /*load toc=*/true,
                       /*load env=*/true);
}

// Tail-call variant: branch without linking so the callee returns to our caller.
address MacroAssembler::call_c_and_return_to_caller(Register fd) {
  return branch_to(fd, /*and_link=*/false,
                       /*save toc=*/false,
                       /*restore toc=*/false,
                       /*load toc=*/true,
                       /*load env=*/true);
}

// Call through a constant function descriptor. Friend functions (same TOC,
// no env) can be called with an optimized direct-branch sequence; otherwise
// a full descriptor-based call is emitted. Relocatable and non-relocatable
// cases differ in how aggressively the sequence may be shortened.
address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) {
  if (rt != relocInfo::none) {
    // this call needs to be relocatable
    if (!ReoptimizeCallSequences
        || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
        || fd == NULL // support code-size estimation
        || !fd->is_friend_function()
        || fd->entry() == NULL) {
      // it's not a friend function as defined by class FunctionDescriptor,
      // so do a full call-c here.
      load_const(R11, (address)fd, R0);

      bool has_env = (fd != NULL && fd->env() != NULL);
      return branch_to(R11, /*and_link=*/true,
                            /*save toc=*/false,
                            /*restore toc=*/false,
                            /*load toc=*/true,
                            /*load env=*/has_env);
    } else {
      // It's a friend function. Load the entry point and don't care about
      // toc and env. Use an optimizable call instruction, but ensure the
      // same code-size as in the case of a non-friend function.
      nop();
      nop();
      nop();
      bl64_patchable(fd->entry(), rt);
      _last_calls_return_pc = pc();
      return _last_calls_return_pc;
    }
  } else {
    // This call does not need to be relocatable, do more aggressive
    // optimizations.
    if (!ReoptimizeCallSequences
      || !fd->is_friend_function()) {
      // It's not a friend function as defined by class FunctionDescriptor,
      // so do a full call-c here.
      load_const(R11, (address)fd, R0);
      return branch_to(R11, /*and_link=*/true,
                            /*save toc=*/false,
                            /*restore toc=*/false,
                            /*load toc=*/true,
                            /*load env=*/true);
    } else {
      // it's a friend function, load the entry point and don't care about
      // toc and env.
      address dest = fd->entry();
      if (is_within_range_of_b(dest, pc())) {
        // Short relative branch reaches the target directly.
        bl(dest);
      } else {
        bl64_patchable(dest, rt);
      }
      _last_calls_return_pc = pc();
      return _last_calls_return_pc;
    }
  }
}

// Call a C function. All constants needed reside in TOC.
//
// Read the address to call from the TOC.
// Read env from TOC, if fd specifies an env.
// Read new TOC from TOC.
// Returns NULL if any TOC constant could not be materialized.
address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd,
                                         relocInfo::relocType rt, Register toc) {
  if (!ReoptimizeCallSequences
    || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
    || !fd->is_friend_function()) {
    // It's not a friend function as defined by class FunctionDescriptor,
    // so do a full call-c here.
    assert(fd->entry() != NULL, "function must be linked");

    AddressLiteral fd_entry(fd->entry());
    bool success = load_const_from_method_toc(R11, fd_entry, toc, /*fixed_size*/ true);
    mtctr(R11);
    if (fd->env() == NULL) {
      li(R11, 0);
      // nop keeps the sequence the same size as the env-loading variant.
      nop();
    } else {
      AddressLiteral fd_env(fd->env());
      success = success && load_const_from_method_toc(R11, fd_env, toc, /*fixed_size*/ true);
    }
    AddressLiteral fd_toc(fd->toc());
    // Set R2_TOC (load from toc)
    success = success && load_const_from_method_toc(R2_TOC, fd_toc, toc, /*fixed_size*/ true);
    bctrl();
    _last_calls_return_pc = pc();
    if (!success) { return NULL; }
  } else {
    // It's a friend function, load the entry point and don't care about
    // toc and env.
    // Use an optimizable call instruction, but ensure the
    // same code-size as in the case of a non-friend function.
    nop();
    bl64_patchable(fd->entry(), rt);
    _last_calls_return_pc = pc();
  }
  return _last_calls_return_pc;
}
#endif // ABI_ELFv2

// Common tail of the call_VM entry points: set up the last Java frame,
// pass the thread in ARG1, call the VM entry point, and fetch the oop
// result (if any). Callers marshal the remaining arguments beforehand.
void MacroAssembler::call_VM_base(Register oop_result,
                                  Register last_java_sp,
                                  address entry_point,
                                  bool check_exceptions) {
  BLOCK_COMMENT("call_VM {");
  // Determine last_java_sp register.
  if (!last_java_sp->is_valid()) {
    last_java_sp = R1_SP;
  }
  set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1);

  // ARG1 must hold thread address.
  mr(R3_ARG1, R16_thread);
#if defined(ABI_ELFv2)
  address return_pc = call_c(entry_point, relocInfo::none);
#else
  address return_pc = call_c((FunctionDescriptor*)entry_point, relocInfo::none);
#endif

  reset_last_Java_frame();

  // Check for pending exceptions.
  if (check_exceptions) {
    // We don't check for exceptions here.
    ShouldNotReachHere();
  }

  // Get oop result if there is one and reset the value in the thread.
  if (oop_result->is_valid()) {
    get_vm_result(oop_result);
  }

  _last_calls_return_pc = return_pc;
  BLOCK_COMMENT("} call_VM");
}

// Call a C runtime leaf routine (no last-Java-frame bookkeeping).
void MacroAssembler::call_VM_leaf_base(address entry_point) {
  BLOCK_COMMENT("call_VM_leaf {");
#if defined(ABI_ELFv2)
  call_c(entry_point, relocInfo::none);
#else
  call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, entry_point), relocInfo::none);
#endif
  BLOCK_COMMENT("} call_VM_leaf");
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions) {
  call_VM_base(oop_result, noreg, entry_point, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread.
  mr_if_needed(R4_ARG2, arg_1);
  call_VM(oop_result, entry_point, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread
  mr_if_needed(R4_ARG2, arg_1);
  assert(arg_2 != R4_ARG2, "smashed argument");
  mr_if_needed(R5_ARG3, arg_2);
  call_VM(oop_result, entry_point, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread
  mr_if_needed(R4_ARG2, arg_1);
  assert(arg_2 != R4_ARG2, "smashed argument");
  mr_if_needed(R5_ARG3, arg_2);
  mr_if_needed(R6_ARG4, arg_3);
  call_VM(oop_result, entry_point, check_exceptions);
}

void MacroAssembler::call_VM_leaf(address entry_point) {
  call_VM_leaf_base(entry_point);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) {
  mr_if_needed(R3_ARG1, arg_1);
  call_VM_leaf(entry_point);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) {
  mr_if_needed(R3_ARG1, arg_1);
  assert(arg_2 != R3_ARG1, "smashed argument");
  mr_if_needed(R4_ARG2, arg_2);
  call_VM_leaf(entry_point);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) {
  mr_if_needed(R3_ARG1, arg_1);
  assert(arg_2 != R3_ARG1, "smashed argument");
  mr_if_needed(R4_ARG2, arg_2);
  assert(arg_3 != R3_ARG1 && arg_3 != R4_ARG2, "smashed argument");
  mr_if_needed(R5_ARG3, arg_3);
  call_VM_leaf(entry_point);
}

// Check whether instruction is a read access to the polling page
// which was emitted by load_from_polling_page(..).
bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext,
                                               address* polling_address_ptr) {
  if (!is_ld(instruction))
    return false; // It's not a ld. Fail.

  int rt = inv_rt_field(instruction);
  int ra = inv_ra_field(instruction);
  int ds = inv_ds_field(instruction);
  if (!(ds == 0 && ra != 0 && rt == 0)) {
    return false; // It's not a ld(r0, X, ra). Fail.
  }

  if (!ucontext) {
    // Set polling address.
    if (polling_address_ptr != NULL) {
      *polling_address_ptr = NULL;
    }
    return true; // No ucontext given. Can't check value of ra. Assume true.
  }

#ifdef LINUX
  // Ucontext given. Check that register ra contains the address of
  // the safepoint polling page.
  ucontext_t* uc = (ucontext_t*) ucontext;
  // Set polling address.
  address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds;
  if (polling_address_ptr != NULL) {
    *polling_address_ptr = addr;
  }
  return os::is_poll_address(addr);
#else
  // Not on Linux, ucontext must be NULL.
  ShouldNotReachHere();
  return false;
#endif
}

// Touch the stack page at SP-offset so the OS maps/extends the stack
// before the frame is actually pushed.
void MacroAssembler::bang_stack_with_offset(int offset) {
  // When increasing the stack, the old stack pointer will be written
  // to the new top of stack according to the PPC64 abi.
  // Therefore, stack banging is not necessary when increasing
  // the stack by <= os::vm_page_size() bytes.
  // When increasing the stack by a larger amount, this method is
  // called repeatedly to bang the intermediate pages.

  // Stack grows down, caller passes positive offset.
  assert(offset > 0, "must bang with positive offset");

  long stdoffset = -offset;

  if (is_simm(stdoffset, 16)) {
    // Signed 16 bit offset, a simple std is ok.
    if (UseLoadInstructionsForStackBangingPPC64) {
      ld(R0, (int)(signed short)stdoffset, R1_SP);
    } else {
      std(R0,(int)(signed short)stdoffset, R1_SP);
    }
  } else if (is_simm(stdoffset, 31)) {
    // Offset needs addis+ld/std: split into hi/lo 16-bit halves.
    const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset);
    const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset);

    Register tmp = R11;
    addis(tmp, R1_SP, hi);
    if (UseLoadInstructionsForStackBangingPPC64) {
      ld(R0, lo, tmp);
    } else {
      std(R0, lo, tmp);
    }
  } else {
    ShouldNotReachHere();
  }
}

// If instruction is a stack bang of the form
//   std   R0,    x(Ry),     (see bang_stack_with_offset())
//   stdu  R1_SP, x(R1_SP),  (see push_frame(), resize_frame())
// or stdux R1_SP, Rx, R1_SP (see push_frame(), resize_frame())
// return the banged address. Otherwise, return 0.
address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) {
#ifdef LINUX
  ucontext_t* uc = (ucontext_t*) ucontext;
  int rs = inv_rs_field(instruction);
  int ra = inv_ra_field(instruction);
  if (   (is_ld(instruction)   && rs == 0 &&  UseLoadInstructionsForStackBangingPPC64)
      || (is_std(instruction)  && rs == 0 && !UseLoadInstructionsForStackBangingPPC64)
      || (is_stdu(instruction) && rs == 1)) {
    int ds = inv_ds_field(instruction);
    // return banged address
    return ds+(address)uc->uc_mcontext.regs->gpr[ra];
  } else if (is_stdux(instruction) && rs == 1) {
    int rb = inv_rb_field(instruction);
    address sp = (address)uc->uc_mcontext.regs->gpr[1];
    long rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
    // stdux only banged if it pushed the frame (ra==SP, negative offset).
    return ra != 1 || rb_val >= 0 ? NULL         // not a stack bang
                                  : sp + rb_val; // banged address
  }
  return NULL; // not a stack bang
#else
  // workaround not needed on !LINUX :-)
  ShouldNotCallThis();
  return NULL;
#endif
}

// Check whether SP has grown into the reserved stack zone; if so,
// re-enable the zone and throw a delayed StackOverflowError.
void MacroAssembler::reserved_stack_check(Register return_pc) {
  // Test if reserved zone needs to be enabled.
  Label no_reserved_zone_enabling;

  ld_ptr(R0, JavaThread::reserved_stack_activation_offset(), R16_thread);
  cmpld(CCR0, R1_SP, R0);
  blt_predict_taken(CCR0, no_reserved_zone_enabling);

  // Enable reserved zone again, throw stack overflow exception.
  push_frame_reg_args(0, R0);
  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), R16_thread);
  pop_frame();
  mtlr(return_pc);
  load_const_optimized(R0, StubRoutines::throw_delayed_StackOverflowError_entry());
  mtctr(R0);
  bctr();

  should_not_reach_here();

  bind(no_reserved_zone_enabling);
}

// Atomic 64-bit exchange: dest_current_value <- *addr_base,
// *addr_base <- exchange_value, via a ldarx/stdcx_ retry loop.
void MacroAssembler::getandsetd(Register dest_current_value, Register exchange_value, Register addr_base,
                                bool cmpxchgx_hint) {
  Label retry;
  bind(retry);
  ldarx(dest_current_value, addr_base, cmpxchgx_hint);
  stdcx_(exchange_value, addr_base);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
  } else {
    bne(                  CCR0, retry); // StXcx_ sets CCR0.
  }
}

// Atomic 64-bit fetch-and-add: dest_current_value <- *addr_base,
// *addr_base <- *addr_base + inc_value. Kills tmp.
void MacroAssembler::getandaddd(Register dest_current_value, Register inc_value, Register addr_base,
                                Register tmp, bool cmpxchgx_hint) {
  Label retry;
  bind(retry);
  ldarx(dest_current_value, addr_base, cmpxchgx_hint);
  add(tmp, dest_current_value, inc_value);
  stdcx_(tmp, addr_base);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
  } else {
    bne(                  CCR0, retry); // StXcx_ sets CCR0.
  }
}

// Word/sub-word atomic helper functions

// Temps and addr_base are killed if size < 4 and processor does not support respective instructions.
// Only signed types are supported with size < 4.
// Atomic add always kills tmp1.
void MacroAssembler::atomic_get_and_modify_generic(Register dest_current_value, Register exchange_value,
                                                   Register addr_base, Register tmp1, Register tmp2, Register tmp3,
                                                   bool cmpxchgx_hint, bool is_add, int size) {
  // Sub-word instructions are available since Power 8.
  // For older processors, instruction_type != size holds, and we
  // emulate the sub-word instructions by constructing a 4-byte value
  // that leaves the other bytes unchanged.
  const int instruction_type = VM_Version::has_lqarx() ? size : 4;

  Label retry;
  Register shift_amount = noreg,
           val32 = dest_current_value,
           modval = is_add ? tmp1 : exchange_value;

  if (instruction_type != size) {
    assert_different_registers(tmp1, tmp2, tmp3, dest_current_value, exchange_value, addr_base);
    modval = tmp1;
    shift_amount = tmp2;
    val32 = tmp3;
    // Need some preparation: Compute shift amount, align address. Note: shorts must be 2 byte aligned.
#ifdef VM_LITTLE_ENDIAN
    rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
    clrrdi(addr_base, addr_base, 2);
#else
    xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
    clrrdi(addr_base, addr_base, 2);
    rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
#endif
  }

  // atomic emulation loop
  bind(retry);

  switch (instruction_type) {
    case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
    case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
    case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
    default: ShouldNotReachHere();
  }

  if (instruction_type != size) {
    // Move the addressed sub-word into the low bits.
    srw(dest_current_value, val32, shift_amount);
  }

  if (is_add) { add(modval, dest_current_value, exchange_value); }

  if (instruction_type != size) {
    // Transform exchange value such that the replacement can be done by one xor instruction.
    xorr(modval, dest_current_value, is_add ? modval : exchange_value);
    clrldi(modval, modval, (size == 1) ? 56 : 48);
    slw(modval, modval, shift_amount);
    xorr(modval, val32, modval);
  }

  switch (instruction_type) {
    case 4: stwcx_(modval, addr_base); break;
    case 2: sthcx_(modval, addr_base); break;
    case 1: stbcx_(modval, addr_base); break;
    default: ShouldNotReachHere();
  }

  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
  } else {
    bne(                  CCR0, retry); // StXcx_ sets CCR0.
  }

  // l?arx zero-extends, but Java wants byte/short values sign-extended.
  if (size == 1) {
    extsb(dest_current_value, dest_current_value);
  } else if (size == 2) {
    extsh(dest_current_value, dest_current_value);
  };
}

// Temps, addr_base and exchange_value are killed if size < 4 and processor does not support respective instructions.
// Only signed types are supported with size < 4.
void MacroAssembler::cmpxchg_loop_body(ConditionRegister flag, Register dest_current_value,
                                       Register compare_value, Register exchange_value,
                                       Register addr_base, Register tmp1, Register tmp2,
                                       Label &retry, Label &failed, bool cmpxchgx_hint, int size) {
  // Sub-word instructions are available since Power 8.
  // For older processors, instruction_type != size holds, and we
  // emulate the sub-word instructions by constructing a 4-byte value
  // that leaves the other bytes unchanged.
  const int instruction_type = VM_Version::has_lqarx() ? size : 4;

  Register shift_amount = noreg,
           val32 = dest_current_value,
           modval = exchange_value;

  if (instruction_type != size) {
    assert_different_registers(tmp1, tmp2, dest_current_value, compare_value, exchange_value, addr_base);
    shift_amount = tmp1;
    val32 = tmp2;
    modval = tmp2;
    // Need some preparation: Compute shift amount, align address. Note: shorts must be 2 byte aligned.
#ifdef VM_LITTLE_ENDIAN
    rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
    clrrdi(addr_base, addr_base, 2);
#else
    xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
    clrrdi(addr_base, addr_base, 2);
    rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
#endif
    // Transform exchange value such that the replacement can be done by one xor instruction.
    xorr(exchange_value, compare_value, exchange_value);
    clrldi(exchange_value, exchange_value, (size == 1) ? 56 : 48);
    slw(exchange_value, exchange_value, shift_amount);
  }

  // atomic emulation loop
  bind(retry);

  switch (instruction_type) {
    case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
    case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
    case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
    default: ShouldNotReachHere();
  }

  if (instruction_type != size) {
    // Move the addressed sub-word into the low bits.
    srw(dest_current_value, val32, shift_amount);
  }
  // Sign-extend to match Java byte/short semantics before comparing.
  if (size == 1) {
    extsb(dest_current_value, dest_current_value);
  } else if (size == 2) {
    extsh(dest_current_value, dest_current_value);
  };

  cmpw(flag, dest_current_value, compare_value);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(flag, failed);
  } else {
    bne(                  flag, failed);
  }
  // branch to done  => (flag == ne), (dest_current_value != compare_value)
  // fall through    => (flag == eq), (dest_current_value == compare_value)

  if (instruction_type != size) {
    xorr(modval, val32, exchange_value);
  }

  switch (instruction_type) {
    case 4: stwcx_(modval, addr_base); break;
    case 2: sthcx_(modval, addr_base); break;
    case 1: stbcx_(modval, addr_base); break;
    default: ShouldNotReachHere();
  }
}

// CmpxchgX sets condition register to cmpX(current, compare).
// Generic word/sub-word compare-and-exchange with configurable memory
// semantics (MemBarRel / MemBarAcq / MemBarFenceAfter), optional success
// flag register, optional contention guard, and optional weak semantics.
void MacroAssembler::cmpxchg_generic(ConditionRegister flag, Register dest_current_value,
                                     Register compare_value, Register exchange_value,
                                     Register addr_base, Register tmp1, Register tmp2,
                                     int semantics, bool cmpxchgx_hint,
                                     Register int_flag_success, bool contention_hint, bool weak, int size) {
  Label retry;
  Label failed;
  Label done;

  // Save one branch if result is returned via register and
  // result register is different from the other ones.
  bool use_result_reg    = (int_flag_success != noreg);
  bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value &&
                            int_flag_success != exchange_value && int_flag_success != addr_base &&
                            int_flag_success != tmp1 && int_flag_success != tmp2);
  assert(!weak || flag == CCR0, "weak only supported with CCR0");
  assert(size == 1 || size == 2 || size == 4, "unsupported");

  if (use_result_reg && preset_result_reg) {
    li(int_flag_success, 0); // preset (assume cas failed)
  }

  // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
  if (contention_hint) { // Don't try to reserve if cmp fails.
    switch (size) {
      case 1: lbz(dest_current_value, 0, addr_base); extsb(dest_current_value, dest_current_value); break;
      case 2: lha(dest_current_value, 0, addr_base); break;
      case 4: lwz(dest_current_value, 0, addr_base); break;
      default: ShouldNotReachHere();
    }
    cmpw(flag, dest_current_value, compare_value);
    bne(flag, failed);
  }

  // release/fence semantics
  if (semantics & MemBarRel) {
    release();
  }

  cmpxchg_loop_body(flag, dest_current_value, compare_value, exchange_value, addr_base, tmp1, tmp2,
                    retry, failed, cmpxchgx_hint, size);
  if (!weak || use_result_reg) {
    if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
      bne_predict_not_taken(CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
    } else {
      bne(                  CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
    }
  }
  // fall through    => (flag == eq), (dest_current_value == compare_value), (swapped)

  // Result in register (must do this at the end because int_flag_success can be the
  // same register as one above).
  if (use_result_reg) {
    li(int_flag_success, 1);
  }

  if (semantics & MemBarFenceAfter) {
    fence();
  } else if (semantics & MemBarAcq) {
    isync();
  }

  if (use_result_reg && !preset_result_reg) {
    b(done);
  }

  bind(failed);
  if (use_result_reg && !preset_result_reg) {
    li(int_flag_success, 0);
  }

  bind(done);
  // (flag == ne) => (dest_current_value != compare_value), (!swapped)
  // (flag == eq) => (dest_current_value == compare_value), ( swapped)
}

// Performs atomic compare exchange:
//   if (compare_value == *addr_base)
//     *addr_base = exchange_value
//     int_flag_success = 1;
//   else
//     int_flag_success = 0;
//
// ConditionRegister flag       = cmp(compare_value, *addr_base)
// Register dest_current_value  = *addr_base
// Register compare_value       Used to compare with value in memory
// Register exchange_value      Written to memory if compare_value == *addr_base
// Register addr_base           The memory location to compareXChange
// Register int_flag_success    Set to 1 if exchange_value was written to *addr_base
//
// To avoid the costly compare exchange the value is tested beforehand.
// Several special cases exist to avoid that unnecessary information is generated.
//
void MacroAssembler::cmpxchgd(ConditionRegister flag,
                              Register dest_current_value, RegisterOrConstant compare_value, Register exchange_value,
                              Register addr_base, int semantics, bool cmpxchgx_hint,
                              Register int_flag_success, Label* failed_ext, bool contention_hint, bool weak) {
  Label retry;
  Label failed_int;
  Label& failed = (failed_ext != NULL) ? *failed_ext : failed_int;
  Label done;

  // Save one branch if result is returned via register and result register is different from the other ones.
  bool use_result_reg    = (int_flag_success!=noreg);
  bool preset_result_reg = (int_flag_success!=dest_current_value && int_flag_success!=compare_value.register_or_noreg() &&
                            int_flag_success!=exchange_value && int_flag_success!=addr_base);
  assert(!weak || flag == CCR0, "weak only supported with CCR0");
  assert(int_flag_success == noreg || failed_ext == NULL, "cannot have both");

  if (use_result_reg && preset_result_reg) {
    li(int_flag_success, 0); // preset (assume cas failed)
  }

  // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
  if (contention_hint) { // Don't try to reserve if cmp fails.
    ld(dest_current_value, 0, addr_base);
    cmpd(flag, compare_value, dest_current_value);
    bne(flag, failed);
  }

  // release/fence semantics
  if (semantics & MemBarRel) {
    release();
  }

  // atomic emulation loop
  bind(retry);

  ldarx(dest_current_value, addr_base, cmpxchgx_hint);
  cmpd(flag, compare_value, dest_current_value);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(flag, failed);
  } else {
    bne(                  flag, failed);
  }

  stdcx_(exchange_value, addr_base);
  if (!weak || use_result_reg || failed_ext) {
    if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
      bne_predict_not_taken(CCR0, weak ? failed : retry); // stXcx_ sets CCR0
    } else {
      bne(                  CCR0, weak ? failed : retry); // stXcx_ sets CCR0
    }
  }

  // result in register (must do this at the end because int_flag_success can be the same register as one above)
  if (use_result_reg) {
    li(int_flag_success, 1);
  }

  if (semantics & MemBarFenceAfter) {
    fence();
  } else if (semantics & MemBarAcq) {
    isync();
  }

  if (use_result_reg && !preset_result_reg) {
    b(done);
  }

  bind(failed_int);
  if (use_result_reg && !preset_result_reg) {
    li(int_flag_success, 0);
  }

  bind(done);
  // (flag == ne) => (dest_current_value != compare_value), (!swapped)
  // (flag == eq) => (dest_current_value == compare_value), ( swapped)
}

// Look up the method for a megamorphic invokeinterface call.
// The target method is determined by <intf_klass, itable_index>.
// The receiver klass is in recv_klass.
// On success, the result will be in method_result, and execution falls through.
// On failure, execution transfers to the given label.
void MacroAssembler::lookup_interface_method(Register recv_klass,
                                             Register intf_klass,
                                             RegisterOrConstant itable_index,
                                             Register method_result,
                                             Register scan_temp,
                                             Register temp2,
                                             Label& L_no_such_interface,
                                             bool return_method) {
  assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);

  // Compute start of first itableOffsetEntry (which is at the end of the vtable).
1764 int vtable_base = in_bytes(Klass::vtable_start_offset()); 1765 int itentry_off = itableMethodEntry::method_offset_in_bytes(); 1766 int logMEsize = exact_log2(itableMethodEntry::size() * wordSize); 1767 int scan_step = itableOffsetEntry::size() * wordSize; 1768 int log_vte_size= exact_log2(vtableEntry::size_in_bytes()); 1769 1770 lwz(scan_temp, in_bytes(Klass::vtable_length_offset()), recv_klass); 1771 // %%% We should store the aligned, prescaled offset in the klassoop. 1772 // Then the next several instructions would fold away. 1773 1774 sldi(scan_temp, scan_temp, log_vte_size); 1775 addi(scan_temp, scan_temp, vtable_base); 1776 add(scan_temp, recv_klass, scan_temp); 1777 1778 // Adjust recv_klass by scaled itable_index, so we can free itable_index. 1779 if (return_method) { 1780 if (itable_index.is_register()) { 1781 Register itable_offset = itable_index.as_register(); 1782 sldi(method_result, itable_offset, logMEsize); 1783 if (itentry_off) { addi(method_result, method_result, itentry_off); } 1784 add(method_result, method_result, recv_klass); 1785 } else { 1786 long itable_offset = (long)itable_index.as_constant(); 1787 // static address, no relocation 1788 add_const_optimized(method_result, recv_klass, (itable_offset << logMEsize) + itentry_off, temp2); 1789 } 1790 } 1791 1792 // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) { 1793 // if (scan->interface() == intf) { 1794 // result = (klass + scan->offset() + itable_index); 1795 // } 1796 // } 1797 Label search, found_method; 1798 1799 for (int peel = 1; peel >= 0; peel--) { 1800 // %%%% Could load both offset and interface in one ldx, if they were 1801 // in the opposite order. This would save a load. 1802 ld(temp2, itableOffsetEntry::interface_offset_in_bytes(), scan_temp); 1803 1804 // Check that this entry is non-null. A null entry means that 1805 // the receiver class doesn't implement the interface, and wasn't the 1806 // same as when the caller was compiled. 
1807 cmpd(CCR0, temp2, intf_klass); 1808 1809 if (peel) { 1810 beq(CCR0, found_method); 1811 } else { 1812 bne(CCR0, search); 1813 // (invert the test to fall through to found_method...) 1814 } 1815 1816 if (!peel) break; 1817 1818 bind(search); 1819 1820 cmpdi(CCR0, temp2, 0); 1821 beq(CCR0, L_no_such_interface); 1822 addi(scan_temp, scan_temp, scan_step); 1823 } 1824 1825 bind(found_method); 1826 1827 // Got a hit. 1828 if (return_method) { 1829 int ito_offset = itableOffsetEntry::offset_offset_in_bytes(); 1830 lwz(scan_temp, ito_offset, scan_temp); 1831 ldx(method_result, scan_temp, method_result); 1832 } 1833 } 1834 1835 // virtual method calling 1836 void MacroAssembler::lookup_virtual_method(Register recv_klass, 1837 RegisterOrConstant vtable_index, 1838 Register method_result) { 1839 1840 assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg()); 1841 1842 const int base = in_bytes(Klass::vtable_start_offset()); 1843 assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below"); 1844 1845 if (vtable_index.is_register()) { 1846 sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord); 1847 add(recv_klass, vtable_index.as_register(), recv_klass); 1848 } else { 1849 addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord); 1850 } 1851 ld(R19_method, base + vtableEntry::method_offset_in_bytes(), recv_klass); 1852 } 1853 1854 /////////////////////////////////////////// subtype checking //////////////////////////////////////////// 1855 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass, 1856 Register super_klass, 1857 Register temp1_reg, 1858 Register temp2_reg, 1859 Label* L_success, 1860 Label* L_failure, 1861 Label* L_slow_path, 1862 RegisterOrConstant super_check_offset) { 1863 1864 const Register check_cache_offset = temp1_reg; 1865 const Register cached_super = temp2_reg; 1866 1867 assert_different_registers(sub_klass, super_klass, 
check_cache_offset, cached_super); 1868 1869 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1870 int sc_offset = in_bytes(Klass::secondary_super_cache_offset()); 1871 1872 bool must_load_sco = (super_check_offset.constant_or_zero() == -1); 1873 bool need_slow_path = (must_load_sco || super_check_offset.constant_or_zero() == sco_offset); 1874 1875 Label L_fallthrough; 1876 int label_nulls = 0; 1877 if (L_success == NULL) { L_success = &L_fallthrough; label_nulls++; } 1878 if (L_failure == NULL) { L_failure = &L_fallthrough; label_nulls++; } 1879 if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; } 1880 assert(label_nulls <= 1 || 1881 (L_slow_path == &L_fallthrough && label_nulls <= 2 && !need_slow_path), 1882 "at most one NULL in the batch, usually"); 1883 1884 // If the pointers are equal, we are done (e.g., String[] elements). 1885 // This self-check enables sharing of secondary supertype arrays among 1886 // non-primary types such as array-of-interface. Otherwise, each such 1887 // type would need its own customized SSA. 1888 // We move this check to the front of the fast path because many 1889 // type checks are in fact trivially successful in this manner, 1890 // so we get a nicely predicted branch right at the start of the check. 1891 cmpd(CCR0, sub_klass, super_klass); 1892 beq(CCR0, *L_success); 1893 1894 // Check the supertype display: 1895 if (must_load_sco) { 1896 // The super check offset is always positive... 1897 lwz(check_cache_offset, sco_offset, super_klass); 1898 super_check_offset = RegisterOrConstant(check_cache_offset); 1899 // super_check_offset is register. 1900 assert_different_registers(sub_klass, super_klass, cached_super, super_check_offset.as_register()); 1901 } 1902 // The loaded value is the offset from KlassOopDesc. 1903 1904 ld(cached_super, super_check_offset, sub_klass); 1905 cmpd(CCR0, cached_super, super_klass); 1906 1907 // This check has worked decisively for primary supers. 
1908 // Secondary supers are sought in the super_cache ('super_cache_addr'). 1909 // (Secondary supers are interfaces and very deeply nested subtypes.) 1910 // This works in the same check above because of a tricky aliasing 1911 // between the super_cache and the primary super display elements. 1912 // (The 'super_check_addr' can address either, as the case requires.) 1913 // Note that the cache is updated below if it does not help us find 1914 // what we need immediately. 1915 // So if it was a primary super, we can just fail immediately. 1916 // Otherwise, it's the slow path for us (no success at this point). 1917 1918 #define FINAL_JUMP(label) if (&(label) != &L_fallthrough) { b(label); } 1919 1920 if (super_check_offset.is_register()) { 1921 beq(CCR0, *L_success); 1922 cmpwi(CCR0, super_check_offset.as_register(), sc_offset); 1923 if (L_failure == &L_fallthrough) { 1924 beq(CCR0, *L_slow_path); 1925 } else { 1926 bne(CCR0, *L_failure); 1927 FINAL_JUMP(*L_slow_path); 1928 } 1929 } else { 1930 if (super_check_offset.as_constant() == sc_offset) { 1931 // Need a slow path; fast failure is impossible. 1932 if (L_slow_path == &L_fallthrough) { 1933 beq(CCR0, *L_success); 1934 } else { 1935 bne(CCR0, *L_slow_path); 1936 FINAL_JUMP(*L_success); 1937 } 1938 } else { 1939 // No slow path; it's a fast decision. 
1940 if (L_failure == &L_fallthrough) { 1941 beq(CCR0, *L_success); 1942 } else { 1943 bne(CCR0, *L_failure); 1944 FINAL_JUMP(*L_success); 1945 } 1946 } 1947 } 1948 1949 bind(L_fallthrough); 1950 #undef FINAL_JUMP 1951 } 1952 1953 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass, 1954 Register super_klass, 1955 Register temp1_reg, 1956 Register temp2_reg, 1957 Label* L_success, 1958 Register result_reg) { 1959 const Register array_ptr = temp1_reg; // current value from cache array 1960 const Register temp = temp2_reg; 1961 1962 assert_different_registers(sub_klass, super_klass, array_ptr, temp); 1963 1964 int source_offset = in_bytes(Klass::secondary_supers_offset()); 1965 int target_offset = in_bytes(Klass::secondary_super_cache_offset()); 1966 1967 int length_offset = Array<Klass*>::length_offset_in_bytes(); 1968 int base_offset = Array<Klass*>::base_offset_in_bytes(); 1969 1970 Label hit, loop, failure, fallthru; 1971 1972 ld(array_ptr, source_offset, sub_klass); 1973 1974 // TODO: PPC port: assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated."); 1975 lwz(temp, length_offset, array_ptr); 1976 cmpwi(CCR0, temp, 0); 1977 beq(CCR0, result_reg!=noreg ? failure : fallthru); // length 0 1978 1979 mtctr(temp); // load ctr 1980 1981 bind(loop); 1982 // Oops in table are NO MORE compressed. 
1983 ld(temp, base_offset, array_ptr); 1984 cmpd(CCR0, temp, super_klass); 1985 beq(CCR0, hit); 1986 addi(array_ptr, array_ptr, BytesPerWord); 1987 bdnz(loop); 1988 1989 bind(failure); 1990 if (result_reg!=noreg) li(result_reg, 1); // load non-zero result (indicates a miss) 1991 b(fallthru); 1992 1993 bind(hit); 1994 std(super_klass, target_offset, sub_klass); // save result to cache 1995 if (result_reg != noreg) { li(result_reg, 0); } // load zero result (indicates a hit) 1996 if (L_success != NULL) { b(*L_success); } 1997 else if (result_reg == noreg) { blr(); } // return with CR0.eq if neither label nor result reg provided 1998 1999 bind(fallthru); 2000 } 2001 2002 // Try fast path, then go to slow one if not successful 2003 void MacroAssembler::check_klass_subtype(Register sub_klass, 2004 Register super_klass, 2005 Register temp1_reg, 2006 Register temp2_reg, 2007 Label& L_success) { 2008 Label L_failure; 2009 check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success, &L_failure); 2010 check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success); 2011 bind(L_failure); // Fallthru if not successful. 2012 } 2013 2014 void MacroAssembler::check_method_handle_type(Register mtype_reg, Register mh_reg, 2015 Register temp_reg, 2016 Label& wrong_method_type) { 2017 assert_different_registers(mtype_reg, mh_reg, temp_reg); 2018 // Compare method type against that of the receiver. 2019 load_heap_oop(temp_reg, delayed_value(java_lang_invoke_MethodHandle::type_offset_in_bytes, temp_reg), mh_reg, 2020 noreg, noreg, false, IS_NOT_NULL); 2021 cmpd(CCR0, temp_reg, mtype_reg); 2022 bne(CCR0, wrong_method_type); 2023 } 2024 2025 RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot, 2026 Register temp_reg, 2027 int extra_slot_offset) { 2028 // cf. TemplateTable::prepare_invoke(), if (load_receiver). 
2029 int stackElementSize = Interpreter::stackElementSize; 2030 int offset = extra_slot_offset * stackElementSize; 2031 if (arg_slot.is_constant()) { 2032 offset += arg_slot.as_constant() * stackElementSize; 2033 return offset; 2034 } else { 2035 assert(temp_reg != noreg, "must specify"); 2036 sldi(temp_reg, arg_slot.as_register(), exact_log2(stackElementSize)); 2037 if (offset != 0) 2038 addi(temp_reg, temp_reg, offset); 2039 return temp_reg; 2040 } 2041 } 2042 2043 // Supports temp2_reg = R0. 2044 void MacroAssembler::biased_locking_enter(ConditionRegister cr_reg, Register obj_reg, 2045 Register mark_reg, Register temp_reg, 2046 Register temp2_reg, Label& done, Label* slow_case) { 2047 assert(UseBiasedLocking, "why call this otherwise?"); 2048 2049 #ifdef ASSERT 2050 assert_different_registers(obj_reg, mark_reg, temp_reg, temp2_reg); 2051 #endif 2052 2053 Label cas_label; 2054 2055 // Branch to done if fast path fails and no slow_case provided. 2056 Label *slow_case_int = (slow_case != NULL) ? 
slow_case : &done; 2057 2058 // Biased locking 2059 // See whether the lock is currently biased toward our thread and 2060 // whether the epoch is still valid 2061 // Note that the runtime guarantees sufficient alignment of JavaThread 2062 // pointers to allow age to be placed into low bits 2063 assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, 2064 "biased locking makes assumptions about bit layout"); 2065 2066 if (PrintBiasedLockingStatistics) { 2067 load_const(temp2_reg, (address) BiasedLocking::total_entry_count_addr(), temp_reg); 2068 lwzx(temp_reg, temp2_reg); 2069 addi(temp_reg, temp_reg, 1); 2070 stwx(temp_reg, temp2_reg); 2071 } 2072 2073 andi(temp_reg, mark_reg, markOopDesc::biased_lock_mask_in_place); 2074 cmpwi(cr_reg, temp_reg, markOopDesc::biased_lock_pattern); 2075 bne(cr_reg, cas_label); 2076 2077 load_klass(temp_reg, obj_reg); 2078 2079 load_const_optimized(temp2_reg, ~((int) markOopDesc::age_mask_in_place)); 2080 ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg); 2081 orr(temp_reg, R16_thread, temp_reg); 2082 xorr(temp_reg, mark_reg, temp_reg); 2083 andr(temp_reg, temp_reg, temp2_reg); 2084 cmpdi(cr_reg, temp_reg, 0); 2085 if (PrintBiasedLockingStatistics) { 2086 Label l; 2087 bne(cr_reg, l); 2088 load_const(temp2_reg, (address) BiasedLocking::biased_lock_entry_count_addr()); 2089 lwzx(mark_reg, temp2_reg); 2090 addi(mark_reg, mark_reg, 1); 2091 stwx(mark_reg, temp2_reg); 2092 // restore mark_reg 2093 ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg); 2094 bind(l); 2095 } 2096 beq(cr_reg, done); 2097 2098 Label try_revoke_bias; 2099 Label try_rebias; 2100 2101 // At this point we know that the header has the bias pattern and 2102 // that we are not the bias owner in the current epoch. We need to 2103 // figure out more details about the state of the header in order to 2104 // know what operations can be legally performed on the object's 2105 // header. 
2106 2107 // If the low three bits in the xor result aren't clear, that means 2108 // the prototype header is no longer biased and we have to revoke 2109 // the bias on this object. 2110 andi(temp2_reg, temp_reg, markOopDesc::biased_lock_mask_in_place); 2111 cmpwi(cr_reg, temp2_reg, 0); 2112 bne(cr_reg, try_revoke_bias); 2113 2114 // Biasing is still enabled for this data type. See whether the 2115 // epoch of the current bias is still valid, meaning that the epoch 2116 // bits of the mark word are equal to the epoch bits of the 2117 // prototype header. (Note that the prototype header's epoch bits 2118 // only change at a safepoint.) If not, attempt to rebias the object 2119 // toward the current thread. Note that we must be absolutely sure 2120 // that the current epoch is invalid in order to do this because 2121 // otherwise the manipulations it performs on the mark word are 2122 // illegal. 2123 2124 int shift_amount = 64 - markOopDesc::epoch_shift; 2125 // rotate epoch bits to right (little) end and set other bits to 0 2126 // [ big part | epoch | little part ] -> [ 0..0 | epoch ] 2127 rldicl_(temp2_reg, temp_reg, shift_amount, 64 - markOopDesc::epoch_bits); 2128 // branch if epoch bits are != 0, i.e. they differ, because the epoch has been incremented 2129 bne(CCR0, try_rebias); 2130 2131 // The epoch of the current bias is still valid but we know nothing 2132 // about the owner; it might be set or it might be clear. Try to 2133 // acquire the bias of the object using an atomic operation. If this 2134 // fails we will go in to the runtime to revoke the object's bias. 2135 // Note that we first construct the presumed unbiased header so we 2136 // don't accidentally blow away another thread's valid bias. 
2137 andi(mark_reg, mark_reg, (markOopDesc::biased_lock_mask_in_place | 2138 markOopDesc::age_mask_in_place | 2139 markOopDesc::epoch_mask_in_place)); 2140 orr(temp_reg, R16_thread, mark_reg); 2141 2142 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 2143 2144 // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg). 2145 cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg, 2146 /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg, 2147 /*where=*/obj_reg, 2148 MacroAssembler::MemBarAcq, 2149 MacroAssembler::cmpxchgx_hint_acquire_lock(), 2150 noreg, slow_case_int); // bail out if failed 2151 2152 // If the biasing toward our thread failed, this means that 2153 // another thread succeeded in biasing it toward itself and we 2154 // need to revoke that bias. The revocation will occur in the 2155 // interpreter runtime in the slow case. 2156 if (PrintBiasedLockingStatistics) { 2157 load_const(temp2_reg, (address) BiasedLocking::anonymously_biased_lock_entry_count_addr(), temp_reg); 2158 lwzx(temp_reg, temp2_reg); 2159 addi(temp_reg, temp_reg, 1); 2160 stwx(temp_reg, temp2_reg); 2161 } 2162 b(done); 2163 2164 bind(try_rebias); 2165 // At this point we know the epoch has expired, meaning that the 2166 // current "bias owner", if any, is actually invalid. Under these 2167 // circumstances _only_, we are allowed to use the current header's 2168 // value as the comparison value when doing the cas to acquire the 2169 // bias in the current epoch. In other words, we allow transfer of 2170 // the bias from one thread to another directly in this situation. 
2171 load_klass(temp_reg, obj_reg); 2172 andi(temp2_reg, mark_reg, markOopDesc::age_mask_in_place); 2173 orr(temp2_reg, R16_thread, temp2_reg); 2174 ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg); 2175 orr(temp_reg, temp2_reg, temp_reg); 2176 2177 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 2178 2179 cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg, 2180 /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg, 2181 /*where=*/obj_reg, 2182 MacroAssembler::MemBarAcq, 2183 MacroAssembler::cmpxchgx_hint_acquire_lock(), 2184 noreg, slow_case_int); // bail out if failed 2185 2186 // If the biasing toward our thread failed, this means that 2187 // another thread succeeded in biasing it toward itself and we 2188 // need to revoke that bias. The revocation will occur in the 2189 // interpreter runtime in the slow case. 2190 if (PrintBiasedLockingStatistics) { 2191 load_const(temp2_reg, (address) BiasedLocking::rebiased_lock_entry_count_addr(), temp_reg); 2192 lwzx(temp_reg, temp2_reg); 2193 addi(temp_reg, temp_reg, 1); 2194 stwx(temp_reg, temp2_reg); 2195 } 2196 b(done); 2197 2198 bind(try_revoke_bias); 2199 // The prototype mark in the klass doesn't have the bias bit set any 2200 // more, indicating that objects of this data type are not supposed 2201 // to be biased any more. We are going to try to reset the mark of 2202 // this object to the prototype value and fall through to the 2203 // CAS-based locking scheme. Note that if our CAS fails, it means 2204 // that another thread raced us for the privilege of revoking the 2205 // bias of this particular object, so it's okay to continue in the 2206 // normal locking code. 
2207 load_klass(temp_reg, obj_reg); 2208 ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg); 2209 andi(temp2_reg, mark_reg, markOopDesc::age_mask_in_place); 2210 orr(temp_reg, temp_reg, temp2_reg); 2211 2212 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 2213 2214 // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg). 2215 cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg, 2216 /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg, 2217 /*where=*/obj_reg, 2218 MacroAssembler::MemBarAcq, 2219 MacroAssembler::cmpxchgx_hint_acquire_lock()); 2220 2221 // reload markOop in mark_reg before continuing with lightweight locking 2222 ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg); 2223 2224 // Fall through to the normal CAS-based lock, because no matter what 2225 // the result of the above CAS, some thread must have succeeded in 2226 // removing the bias bit from the object's header. 2227 if (PrintBiasedLockingStatistics) { 2228 Label l; 2229 bne(cr_reg, l); 2230 load_const(temp2_reg, (address) BiasedLocking::revoked_lock_entry_count_addr(), temp_reg); 2231 lwzx(temp_reg, temp2_reg); 2232 addi(temp_reg, temp_reg, 1); 2233 stwx(temp_reg, temp2_reg); 2234 bind(l); 2235 } 2236 2237 bind(cas_label); 2238 } 2239 2240 void MacroAssembler::biased_locking_exit (ConditionRegister cr_reg, Register mark_addr, Register temp_reg, Label& done) { 2241 // Check for biased locking unlock case, which is a no-op 2242 // Note: we do not have to check the thread ID for two reasons. 2243 // First, the interpreter checks for IllegalMonitorStateException at 2244 // a higher level. Second, if the bias was revoked while we held the 2245 // lock, the object could not be rebiased toward another thread, so 2246 // the bias bit would be clear. 
2247 2248 ld(temp_reg, 0, mark_addr); 2249 andi(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place); 2250 2251 cmpwi(cr_reg, temp_reg, markOopDesc::biased_lock_pattern); 2252 beq(cr_reg, done); 2253 } 2254 2255 // allocation (for C1) 2256 void MacroAssembler::eden_allocate( 2257 Register obj, // result: pointer to object after successful allocation 2258 Register var_size_in_bytes, // object size in bytes if unknown at compile time; invalid otherwise 2259 int con_size_in_bytes, // object size in bytes if known at compile time 2260 Register t1, // temp register 2261 Register t2, // temp register 2262 Label& slow_case // continuation point if fast allocation fails 2263 ) { 2264 b(slow_case); 2265 } 2266 2267 void MacroAssembler::tlab_allocate( 2268 Register obj, // result: pointer to object after successful allocation 2269 Register var_size_in_bytes, // object size in bytes if unknown at compile time; invalid otherwise 2270 int con_size_in_bytes, // object size in bytes if known at compile time 2271 Register t1, // temp register 2272 Label& slow_case // continuation point if fast allocation fails 2273 ) { 2274 // make sure arguments make sense 2275 assert_different_registers(obj, var_size_in_bytes, t1); 2276 assert(0 <= con_size_in_bytes && is_simm16(con_size_in_bytes), "illegal object size"); 2277 assert((con_size_in_bytes & MinObjAlignmentInBytesMask) == 0, "object size is not multiple of alignment"); 2278 2279 const Register new_top = t1; 2280 //verify_tlab(); not implemented 2281 2282 ld(obj, in_bytes(JavaThread::tlab_top_offset()), R16_thread); 2283 ld(R0, in_bytes(JavaThread::tlab_end_offset()), R16_thread); 2284 if (var_size_in_bytes == noreg) { 2285 addi(new_top, obj, con_size_in_bytes); 2286 } else { 2287 add(new_top, obj, var_size_in_bytes); 2288 } 2289 cmpld(CCR0, new_top, R0); 2290 bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_case); 2291 2292 #ifdef ASSERT 2293 // make sure new free pointer is properly aligned 2294 
{ 2295 Label L; 2296 andi_(R0, new_top, MinObjAlignmentInBytesMask); 2297 beq(CCR0, L); 2298 stop("updated TLAB free is not properly aligned", 0x934); 2299 bind(L); 2300 } 2301 #endif // ASSERT 2302 2303 // update the tlab top pointer 2304 std(new_top, in_bytes(JavaThread::tlab_top_offset()), R16_thread); 2305 //verify_tlab(); not implemented 2306 } 2307 void MacroAssembler::incr_allocated_bytes(RegisterOrConstant size_in_bytes, Register t1, Register t2) { 2308 unimplemented("incr_allocated_bytes"); 2309 } 2310 2311 address MacroAssembler::emit_trampoline_stub(int destination_toc_offset, 2312 int insts_call_instruction_offset, Register Rtoc) { 2313 // Start the stub. 2314 address stub = start_a_stub(64); 2315 if (stub == NULL) { return NULL; } // CodeCache full: bail out 2316 2317 // Create a trampoline stub relocation which relates this trampoline stub 2318 // with the call instruction at insts_call_instruction_offset in the 2319 // instructions code-section. 2320 relocate(trampoline_stub_Relocation::spec(code()->insts()->start() + insts_call_instruction_offset)); 2321 const int stub_start_offset = offset(); 2322 2323 // For java_to_interp stubs we use R11_scratch1 as scratch register 2324 // and in call trampoline stubs we use R12_scratch2. This way we 2325 // can distinguish them (see is_NativeCallTrampolineStub_at()). 2326 Register reg_scratch = R12_scratch2; 2327 2328 // Now, create the trampoline stub's code: 2329 // - load the TOC 2330 // - load the call target from the constant pool 2331 // - call 2332 if (Rtoc == noreg) { 2333 calculate_address_from_global_toc(reg_scratch, method_toc()); 2334 Rtoc = reg_scratch; 2335 } 2336 2337 ld_largeoffset_unchecked(reg_scratch, destination_toc_offset, Rtoc, false); 2338 mtctr(reg_scratch); 2339 bctr(); 2340 2341 const address stub_start_addr = addr_at(stub_start_offset); 2342 2343 // Assert that the encoded destination_toc_offset can be identified and that it is correct. 
2344 assert(destination_toc_offset == NativeCallTrampolineStub_at(stub_start_addr)->destination_toc_offset(), 2345 "encoded offset into the constant pool must match"); 2346 // Trampoline_stub_size should be good. 2347 assert((uint)(offset() - stub_start_offset) <= trampoline_stub_size, "should be good size"); 2348 assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline"); 2349 2350 // End the stub. 2351 end_a_stub(); 2352 return stub; 2353 } 2354 2355 // TM on PPC64. 2356 void MacroAssembler::atomic_inc_ptr(Register addr, Register result, int simm16) { 2357 Label retry; 2358 bind(retry); 2359 ldarx(result, addr, /*hint*/ false); 2360 addi(result, result, simm16); 2361 stdcx_(result, addr); 2362 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 2363 bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0 2364 } else { 2365 bne( CCR0, retry); // stXcx_ sets CCR0 2366 } 2367 } 2368 2369 void MacroAssembler::atomic_ori_int(Register addr, Register result, int uimm16) { 2370 Label retry; 2371 bind(retry); 2372 lwarx(result, addr, /*hint*/ false); 2373 ori(result, result, uimm16); 2374 stwcx_(result, addr); 2375 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 2376 bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0 2377 } else { 2378 bne( CCR0, retry); // stXcx_ sets CCR0 2379 } 2380 } 2381 2382 #if INCLUDE_RTM_OPT 2383 2384 // Update rtm_counters based on abort status 2385 // input: abort_status 2386 // rtm_counters_Reg (RTMLockingCounters*) 2387 void MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters_Reg) { 2388 // Mapping to keep PreciseRTMLockingStatistics similar to x86. 2389 // x86 ppc (! means inverted, ? means not the same) 2390 // 0 31 Set if abort caused by XABORT instruction. 2391 // 1 ! 7 If set, the transaction may succeed on a retry. This bit is always clear if bit 0 is set. 
2392 // 2 13 Set if another logical processor conflicted with a memory address that was part of the transaction that aborted. 2393 // 3 10 Set if an internal buffer overflowed. 2394 // 4 ?12 Set if a debug breakpoint was hit. 2395 // 5 ?32 Set if an abort occurred during execution of a nested transaction. 2396 const int failure_bit[] = {tm_tabort, // Signal handler will set this too. 2397 tm_failure_persistent, 2398 tm_non_trans_cf, 2399 tm_trans_cf, 2400 tm_footprint_of, 2401 tm_failure_code, 2402 tm_transaction_level}; 2403 2404 const int num_failure_bits = sizeof(failure_bit) / sizeof(int); 2405 const int num_counters = RTMLockingCounters::ABORT_STATUS_LIMIT; 2406 2407 const int bit2counter_map[][num_counters] = 2408 // 0 = no map; 1 = mapped, no inverted logic; -1 = mapped, inverted logic 2409 // Inverted logic means that if a bit is set don't count it, or vice-versa. 2410 // Care must be taken when mapping bits to counters as bits for a given 2411 // counter must be mutually exclusive. Otherwise, the counter will be 2412 // incremented more than once. 2413 // counters: 2414 // 0 1 2 3 4 5 2415 // abort , persist, conflict, overflow, debug , nested bits: 2416 {{ 1 , 0 , 0 , 0 , 0 , 0 }, // abort 2417 { 0 , -1 , 0 , 0 , 0 , 0 }, // failure_persistent 2418 { 0 , 0 , 1 , 0 , 0 , 0 }, // non_trans_cf 2419 { 0 , 0 , 1 , 0 , 0 , 0 }, // trans_cf 2420 { 0 , 0 , 0 , 1 , 0 , 0 }, // footprint_of 2421 { 0 , 0 , 0 , 0 , -1 , 0 }, // failure_code = 0xD4 2422 { 0 , 0 , 0 , 0 , 0 , 1 }}; // transaction_level > 1 2423 // ... 2424 2425 // Move abort_status value to R0 and use abort_status register as a 2426 // temporary register because R0 as third operand in ld/std is treated 2427 // as base address zero (value). Likewise, R0 as second operand in addi 2428 // is problematic because it amounts to li. 2429 const Register temp_Reg = abort_status; 2430 const Register abort_status_R0 = R0; 2431 mr(abort_status_R0, abort_status); 2432 2433 // Increment total abort counter. 
2434 int counters_offs = RTMLockingCounters::abort_count_offset(); 2435 ld(temp_Reg, counters_offs, rtm_counters_Reg); 2436 addi(temp_Reg, temp_Reg, 1); 2437 std(temp_Reg, counters_offs, rtm_counters_Reg); 2438 2439 // Increment specific abort counters. 2440 if (PrintPreciseRTMLockingStatistics) { 2441 2442 // #0 counter offset. 2443 int abortX_offs = RTMLockingCounters::abortX_count_offset(); 2444 2445 for (int nbit = 0; nbit < num_failure_bits; nbit++) { 2446 for (int ncounter = 0; ncounter < num_counters; ncounter++) { 2447 if (bit2counter_map[nbit][ncounter] != 0) { 2448 Label check_abort; 2449 int abort_counter_offs = abortX_offs + (ncounter << 3); 2450 2451 if (failure_bit[nbit] == tm_transaction_level) { 2452 // Don't check outer transaction, TL = 1 (bit 63). Hence only 2453 // 11 bits in the TL field are checked to find out if failure 2454 // occured in a nested transaction. This check also matches 2455 // the case when nesting_of = 1 (nesting overflow). 2456 rldicr_(temp_Reg, abort_status_R0, failure_bit[nbit], 10); 2457 } else if (failure_bit[nbit] == tm_failure_code) { 2458 // Check failure code for trap or illegal caught in TM. 2459 // Bits 0:7 are tested as bit 7 (persistent) is copied from 2460 // tabort or treclaim source operand. 2461 // On Linux: trap or illegal is TM_CAUSE_SIGNAL (0xD4). 2462 rldicl(temp_Reg, abort_status_R0, 8, 56); 2463 cmpdi(CCR0, temp_Reg, 0xD4); 2464 } else { 2465 rldicr_(temp_Reg, abort_status_R0, failure_bit[nbit], 0); 2466 } 2467 2468 if (bit2counter_map[nbit][ncounter] == 1) { 2469 beq(CCR0, check_abort); 2470 } else { 2471 bne(CCR0, check_abort); 2472 } 2473 2474 // We don't increment atomically. 2475 ld(temp_Reg, abort_counter_offs, rtm_counters_Reg); 2476 addi(temp_Reg, temp_Reg, 1); 2477 std(temp_Reg, abort_counter_offs, rtm_counters_Reg); 2478 2479 bind(check_abort); 2480 } 2481 } 2482 } 2483 } 2484 // Restore abort_status. 
2485 mr(abort_status, abort_status_R0); 2486 } 2487 2488 // Branch if (random & (count-1) != 0), count is 2^n 2489 // tmp and CR0 are killed 2490 void MacroAssembler::branch_on_random_using_tb(Register tmp, int count, Label& brLabel) { 2491 mftb(tmp); 2492 andi_(tmp, tmp, count-1); 2493 bne(CCR0, brLabel); 2494 } 2495 2496 // Perform abort ratio calculation, set no_rtm bit if high ratio. 2497 // input: rtm_counters_Reg (RTMLockingCounters* address) - KILLED 2498 void MacroAssembler::rtm_abort_ratio_calculation(Register rtm_counters_Reg, 2499 RTMLockingCounters* rtm_counters, 2500 Metadata* method_data) { 2501 Label L_done, L_check_always_rtm1, L_check_always_rtm2; 2502 2503 if (RTMLockingCalculationDelay > 0) { 2504 // Delay calculation. 2505 ld(rtm_counters_Reg, (RegisterOrConstant)(intptr_t)RTMLockingCounters::rtm_calculation_flag_addr()); 2506 cmpdi(CCR0, rtm_counters_Reg, 0); 2507 beq(CCR0, L_done); 2508 load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload 2509 } 2510 // Abort ratio calculation only if abort_count > RTMAbortThreshold. 2511 // Aborted transactions = abort_count * 100 2512 // All transactions = total_count * RTMTotalCountIncrRate 2513 // Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio) 2514 ld(R0, RTMLockingCounters::abort_count_offset(), rtm_counters_Reg); 2515 if (is_simm(RTMAbortThreshold, 16)) { // cmpdi can handle 16bit immediate only. 
2516 cmpdi(CCR0, R0, RTMAbortThreshold); 2517 blt(CCR0, L_check_always_rtm2); // reload of rtm_counters_Reg not necessary 2518 } else { 2519 load_const_optimized(rtm_counters_Reg, RTMAbortThreshold); 2520 cmpd(CCR0, R0, rtm_counters_Reg); 2521 blt(CCR0, L_check_always_rtm1); // reload of rtm_counters_Reg required 2522 } 2523 mulli(R0, R0, 100); 2524 2525 const Register tmpReg = rtm_counters_Reg; 2526 ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg); 2527 mulli(tmpReg, tmpReg, RTMTotalCountIncrRate); // allowable range: int16 2528 mulli(tmpReg, tmpReg, RTMAbortRatio); // allowable range: int16 2529 cmpd(CCR0, R0, tmpReg); 2530 blt(CCR0, L_check_always_rtm1); // jump to reload 2531 if (method_data != NULL) { 2532 // Set rtm_state to "no rtm" in MDO. 2533 // Not using a metadata relocation. Method and Class Loader are kept alive anyway. 2534 // (See nmethod::metadata_do and CodeBuffer::finalize_oop_references.) 2535 load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg); 2536 atomic_ori_int(R0, tmpReg, NoRTM); 2537 } 2538 b(L_done); 2539 2540 bind(L_check_always_rtm1); 2541 load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload 2542 bind(L_check_always_rtm2); 2543 ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg); 2544 int64_t thresholdValue = RTMLockingThreshold / RTMTotalCountIncrRate; 2545 if (is_simm(thresholdValue, 16)) { // cmpdi can handle 16bit immediate only. 2546 cmpdi(CCR0, tmpReg, thresholdValue); 2547 } else { 2548 load_const_optimized(R0, thresholdValue); 2549 cmpd(CCR0, tmpReg, R0); 2550 } 2551 blt(CCR0, L_done); 2552 if (method_data != NULL) { 2553 // Set rtm_state to "always rtm" in MDO. 2554 // Not using a metadata relocation. See above. 
    load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg);
    atomic_ori_int(R0, tmpReg, UseRTM);
  }
  bind(L_done);
}

// Update counters and perform abort ratio calculation.
// input: abort_status_Reg (TEXASR contents captured at abort)
// temp_Reg is killed; called only when RTM profiling is active.
void MacroAssembler::rtm_profiling(Register abort_status_Reg, Register temp_Reg,
                                   RTMLockingCounters* rtm_counters,
                                   Metadata* method_data,
                                   bool profile_rtm) {

  assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
  // Update rtm counters based on state at abort.
  // Reads abort_status_Reg, updates flags.
  assert_different_registers(abort_status_Reg, temp_Reg);
  load_const_optimized(temp_Reg, (address)rtm_counters, R0);
  rtm_counters_update(abort_status_Reg, temp_Reg);
  if (profile_rtm) {
    assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
    rtm_abort_ratio_calculation(temp_Reg, rtm_counters, method_data);
  }
}

// Retry on abort if abort's status indicates non-persistent failure.
// inputs: retry_count_Reg
//       : abort_status_Reg
// output: retry_count_Reg decremented by 1
// Branches to retryLabel while retries remain; falls through when retries
// are exhausted or the failure is permanent.
void MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg,
                                             Label& retryLabel, Label* checkRetry) {
  Label doneRetry;

  // Don't retry if failure is persistent.
  // The persistent bit is set when a (A) Disallowed operation is performed in
  // transactional state, like for instance trying to write the TFHAR after a
  // transaction is started; or when there is (B) a Nesting Overflow (too many
  // nested transactions); or when (C) the Footprint overflows (too many
  // addresses touched in TM state so there is no more space in the footprint
  // area to track them); or in case of (D) a Self-Induced Conflict, i.e. a
  // store is performed to a given address in TM state, then once in suspended
  // state the same address is accessed. Failure (A) is very unlikely to occur
  // in the JVM. Failure (D) will never occur because Suspended state is never
  // used in the JVM. Thus mostly (B) a Nesting Overflow or (C) a Footprint
  // Overflow will set the persistent bit.
  rldicr_(R0, abort_status_Reg, tm_failure_persistent, 0); // Isolate persistent bit into CR0.
  bne(CCR0, doneRetry);

  // Don't retry if transaction was deliberately aborted, i.e. caused by a
  // tabort instruction.
  rldicr_(R0, abort_status_Reg, tm_tabort, 0);
  bne(CCR0, doneRetry);

  // Retry if transaction aborted due to a conflict with another thread.
  if (checkRetry) { bind(*checkRetry); } // Optional entry point that skips the status checks.
  addic_(retry_count_Reg, retry_count_Reg, -1);
  blt(CCR0, doneRetry);
  b(retryLabel);
  bind(doneRetry);
}

// Spin and retry if lock is busy.
// inputs: owner_addr_Reg (monitor address)
//       : retry_count_Reg
// output: retry_count_Reg decremented by 1
// CTR is killed
// Spins at low SMT thread priority while the owner field is non-zero,
// then restores priority and branches to retryLabel; falls through when
// the retry budget is exhausted.
void MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register owner_addr_Reg, Label& retryLabel) {
  Label SpinLoop, doneRetry, doRetry;
  addic_(retry_count_Reg, retry_count_Reg, -1);
  blt(CCR0, doneRetry);

  if (RTMSpinLoopCount > 1) {
    li(R0, RTMSpinLoopCount);
    mtctr(R0); // Bound the spin with CTR so we cannot spin forever.
  }

  // low thread priority
  smt_prio_low();
  bind(SpinLoop);

  if (RTMSpinLoopCount > 1) {
    bdz(doRetry);              // Spin budget used up.
    ld(R0, 0, owner_addr_Reg); // Re-read monitor owner.
    cmpdi(CCR0, R0, 0);
    bne(CCR0, SpinLoop);       // Keep spinning while still owned.
  }

  bind(doRetry);

  // restore thread priority to default in userspace
#ifdef LINUX
  smt_prio_medium_low();
#else
  smt_prio_medium();
#endif

  b(retryLabel);

  bind(doneRetry);
}

// Use RTM for normal stack locks.
// Input: objReg (object to lock)
// Tries to elide a stack lock with a hardware transaction: begin a
// transaction, verify the mark word is unlocked, and fall through to
// DONE_LABEL still inside the transaction. Branches to IsInflated if the
// object already has a monitor. On abort, optionally profiles and retries.
void MacroAssembler::rtm_stack_locking(ConditionRegister flag,
                                       Register obj, Register mark_word, Register tmp,
                                       Register retry_on_abort_count_Reg,
                                       RTMLockingCounters* stack_rtm_counters,
                                       Metadata* method_data, bool profile_rtm,
                                       Label& DONE_LABEL, Label& IsInflated) {
  assert(UseRTMForStackLocks, "why call this otherwise?");
  assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
  Label L_rtm_retry, L_decrement_retry, L_on_abort;

  if (RTMRetryCount > 0) {
    load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
    bind(L_rtm_retry);
  }
  andi_(R0, mark_word, markOopDesc::monitor_value);  // inflated vs stack-locked|neutral|biased
  bne(CCR0, IsInflated);

  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    Label L_noincrement;
    if (RTMTotalCountIncrRate > 1) {
      // Only count a 1/RTMTotalCountIncrRate sample of transactions.
      branch_on_random_using_tb(tmp, RTMTotalCountIncrRate, L_noincrement);
    }
    assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
    load_const_optimized(tmp, (address)stack_rtm_counters->total_count_addr(), R0);
    //atomic_inc_ptr(tmp, /*temp, will be reloaded*/mark_word); We don't increment atomically
    ldx(mark_word, tmp);
    addi(mark_word, mark_word, 1);
    stdx(mark_word, tmp);
    bind(L_noincrement);
  }
  tbegin_();             // Start the hardware transaction.
  beq(CCR0, L_on_abort); // Transaction failure path (also taken on abort).
  ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // Reload in transaction, conflicts need to be tracked.
  andi(R0, mark_word, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
  cmpwi(flag, R0, markOopDesc::unlocked_value);                // bits = 001 unlocked
  beq(flag, DONE_LABEL);                                       // all done if unlocked

  // Mark word is locked: leave the transaction.
  if (UseRTMXendForLockBusy) {
    tend_();              // Commit the (empty) transaction...
    b(L_decrement_retry); // ...and treat "busy" as a retryable case.
  } else {
    tabort_();            // Abort; lands on L_on_abort.
  }
  bind(L_on_abort);
  const Register abort_status_Reg = tmp;
  mftexasr(abort_status_Reg); // Capture abort reason from TEXASR.
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    rtm_profiling(abort_status_Reg, /*temp*/mark_word, stack_rtm_counters, method_data, profile_rtm);
  }
  ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // reload
  if (RTMRetryCount > 0) {
    // Retry on lock abort if abort status is not permanent.
    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry, &L_decrement_retry);
  } else {
    bind(L_decrement_retry);
  }
}

// Use RTM for inflating locks
// inputs: obj       (object to lock)
//         mark_word (current header - KILLED)
//         boxReg    (on-stack box address (displaced header location) - KILLED)
void MacroAssembler::rtm_inflated_locking(ConditionRegister flag,
                                          Register obj, Register mark_word, Register boxReg,
                                          Register retry_on_busy_count_Reg, Register retry_on_abort_count_Reg,
                                          RTMLockingCounters* rtm_counters,
                                          Metadata* method_data, bool profile_rtm,
                                          Label& DONE_LABEL) {
  assert(UseRTMLocking, "why call this otherwise?");
  Label L_rtm_retry, L_decrement_retry, L_on_abort;
  // Clean monitor_value bit to get valid pointer.
  int owner_offset = ObjectMonitor::owner_offset_in_bytes() - markOopDesc::monitor_value;

  // Store non-null, using boxReg instead of (intptr_t)markOopDesc::unused_mark().
  std(boxReg, BasicLock::displaced_header_offset_in_bytes(), boxReg);
  const Register tmpReg = boxReg;
  const Register owner_addr_Reg = mark_word;
  addi(owner_addr_Reg, mark_word, owner_offset); // Address of monitor's _owner field.

  if (RTMRetryCount > 0) {
    load_const_optimized(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy.
    load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort.
    bind(L_rtm_retry);
  }
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    Label L_noincrement;
    if (RTMTotalCountIncrRate > 1) {
      // Sample only 1/RTMTotalCountIncrRate of transactions.
      branch_on_random_using_tb(R0, RTMTotalCountIncrRate, L_noincrement);
    }
    assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
    load_const(R0, (address)rtm_counters->total_count_addr(), tmpReg);
    //atomic_inc_ptr(R0, tmpReg); We don't increment atomically
    ldx(tmpReg, R0);
    addi(tmpReg, tmpReg, 1);
    stdx(tmpReg, R0);
    bind(L_noincrement);
  }
  tbegin_();             // Start the hardware transaction.
  beq(CCR0, L_on_abort); // Taken on transaction failure/abort.
  // We don't reload mark word. Will only be reset at safepoint.
  ld(R0, 0, owner_addr_Reg); // Load in transaction, conflicts need to be tracked.
  cmpdi(flag, R0, 0);
  beq(flag, DONE_LABEL); // Monitor unowned: stay in transaction, lock elided.

  // Monitor is owned: leave the transaction.
  if (UseRTMXendForLockBusy) {
    tend_();
    b(L_decrement_retry); // Busy lock -> spin/retry path.
  } else {
    tabort_();
  }
  bind(L_on_abort);
  const Register abort_status_Reg = tmpReg;
  mftexasr(abort_status_Reg); // Capture abort reason from TEXASR.
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    rtm_profiling(abort_status_Reg, /*temp*/ owner_addr_Reg, rtm_counters, method_data, profile_rtm);
    // Restore owner_addr_Reg (profiling killed it).
    ld(mark_word, oopDesc::mark_offset_in_bytes(), obj);
#ifdef ASSERT
    andi_(R0, mark_word, markOopDesc::monitor_value);
    asm_assert_ne("must be inflated", 0xa754); // Deflating only allowed at safepoint.
#endif
    addi(owner_addr_Reg, mark_word, owner_offset);
  }
  if (RTMRetryCount > 0) {
    // Retry on lock abort if abort status is not permanent.
    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
  }

  // Appears unlocked - try to swing _owner from null to non-null.
  cmpxchgd(flag, /*current val*/ R0, (intptr_t)0, /*new val*/ R16_thread, owner_addr_Reg,
           MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
           MacroAssembler::cmpxchgx_hint_acquire_lock(), noreg, &L_decrement_retry, true);

  if (RTMRetryCount > 0) {
    // success done else retry
    b(DONE_LABEL);
    bind(L_decrement_retry);
    // Spin and retry if lock is busy.
    rtm_retry_lock_on_busy(retry_on_busy_count_Reg, owner_addr_Reg, L_rtm_retry);
  } else {
    bind(L_decrement_retry);
  }
}

#endif // INCLUDE_RTM_OPT

// "The box" is the space on the stack where we copy the object mark.
// On exit, flag == EQ indicates success (lock acquired or elided),
// flag == NE indicates failure (caller must take the slow path).
void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register oop, Register box,
                                               Register temp, Register displaced_header, Register current_header,
                                               bool try_bias,
                                               RTMLockingCounters* rtm_counters,
                                               RTMLockingCounters* stack_rtm_counters,
                                               Metadata* method_data,
                                               bool use_rtm, bool profile_rtm) {
  assert_different_registers(oop, box, temp, displaced_header, current_header);
  assert(flag != CCR0, "bad condition register");
  Label cont;
  Label object_has_monitor;
  Label cas_failed;

  // Load markOop from object into displaced_header.
  ld(displaced_header, oopDesc::mark_offset_in_bytes(), oop);


  if (try_bias) {
    biased_locking_enter(flag, oop, displaced_header, temp, current_header, cont);
  }

#if INCLUDE_RTM_OPT
  if (UseRTMForStackLocks && use_rtm) {
    // Try to elide the stack lock with a hardware transaction first.
    rtm_stack_locking(flag, oop, displaced_header, temp, /*temp*/ current_header,
                      stack_rtm_counters, method_data, profile_rtm,
                      cont, object_has_monitor);
  }
#endif // INCLUDE_RTM_OPT

  // Handle existing monitor.
  // The object has an existing monitor iff (mark & monitor_value) != 0.
  andi_(temp, displaced_header, markOopDesc::monitor_value);
  bne(CCR0, object_has_monitor);

  // Set displaced_header to be (markOop of object | UNLOCK_VALUE).
  ori(displaced_header, displaced_header, markOopDesc::unlocked_value);

  // Load Compare Value application register.

  // Initialize the box. (Must happen before we update the object mark!)
  std(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);

  // Must fence, otherwise, preceding store(s) may float below cmpxchg.
  // Compare object markOop with mark and if equal exchange scratch1 with object markOop.
  cmpxchgd(/*flag=*/flag,
           /*current_value=*/current_header,
           /*compare_value=*/displaced_header,
           /*exchange_value=*/box,
           /*where=*/oop,
           MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
           MacroAssembler::cmpxchgx_hint_acquire_lock(),
           noreg,
           &cas_failed,
           /*check without membar and ldarx first*/true);
  assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

  // If the compare-and-exchange succeeded, then we found an unlocked
  // object and we have now locked it.
  b(cont);

  bind(cas_failed);
  // We did not see an unlocked object so try the fast recursive case.

  // Check if the owner is self by comparing the value in the markOop of object
  // (current_header) with the stack pointer.
  sub(current_header, current_header, R1_SP);
  load_const_optimized(temp, ~(os::vm_page_size()-1) | markOopDesc::lock_mask_in_place);

  and_(R0/*==0?*/, current_header, temp);
  // If condition is true we are cont and hence we can store 0 as the
  // displaced header in the box, which indicates that it is a recursive lock.
  mcrf(flag,CCR0); // Copy recursion check result into caller's flag register.
  std(R0/*==0, perhaps*/, BasicLock::displaced_header_offset_in_bytes(), box);

  // Handle existing monitor.
  b(cont);

  bind(object_has_monitor);
  // The object's monitor m is unlocked iff m->owner == NULL,
  // otherwise m->owner may contain a thread or a stack address.

#if INCLUDE_RTM_OPT
  // Use the same RTM locking code in 32- and 64-bit VM.
  if (use_rtm) {
    rtm_inflated_locking(flag, oop, displaced_header, box, temp, /*temp*/ current_header,
                         rtm_counters, method_data, profile_rtm, cont);
  } else {
#endif // INCLUDE_RTM_OPT

  // Try to CAS m->owner from NULL to current thread.
  addi(temp, displaced_header, ObjectMonitor::owner_offset_in_bytes()-markOopDesc::monitor_value);
  cmpxchgd(/*flag=*/flag,
           /*current_value=*/current_header,
           /*compare_value=*/(intptr_t)0,
           /*exchange_value=*/R16_thread,
           /*where=*/temp,
           MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
           MacroAssembler::cmpxchgx_hint_acquire_lock());

  // Store a non-null value into the box.
  std(box, BasicLock::displaced_header_offset_in_bytes(), box);

# ifdef ASSERT
  bne(flag, cont);
  // We have acquired the monitor, check some invariants.
  addi(/*monitor=*/temp, temp, -ObjectMonitor::owner_offset_in_bytes());
  // Invariant 1: _recursions should be 0.
  //assert(ObjectMonitor::recursions_size_in_bytes() == 8, "unexpected size");
  asm_assert_mem8_is_zero(ObjectMonitor::recursions_offset_in_bytes(), temp,
                            "monitor->_recursions should be 0", -1);
# endif

#if INCLUDE_RTM_OPT
  } // use_rtm()
#endif

  bind(cont);
  // flag == EQ indicates success
  // flag == NE indicates failure
}

// Fast-path monitor exit, the inverse of compiler_fast_lock_object.
// On exit, flag == EQ indicates success, flag == NE requires the slow path.
void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register oop, Register box,
                                                 Register temp, Register displaced_header, Register current_header,
                                                 bool try_bias, bool use_rtm) {
  assert_different_registers(oop, box, temp, displaced_header, current_header);
  assert(flag != CCR0, "bad condition register");
  Label cont;
  Label object_has_monitor;

  if (try_bias) {
    biased_locking_exit(flag, oop, current_header, cont);
  }

#if INCLUDE_RTM_OPT
  if (UseRTMForStackLocks && use_rtm) {
    assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
    Label L_regular_unlock;
    ld(current_header, oopDesc::mark_offset_in_bytes(), oop);   // fetch markword
    andi(R0, current_header, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
    cmpwi(flag, R0, markOopDesc::unlocked_value);               // bits = 001 unlocked
    bne(flag, L_regular_unlock);                                // else RegularLock
    tend_();                                                    // otherwise end...
    b(cont);                                                    // ... and we're done
    bind(L_regular_unlock);
  }
#endif

  // Find the lock address and load the displaced header from the stack.
  ld(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);

  // If the displaced header is 0, we have a recursive unlock.
  cmpdi(flag, displaced_header, 0);
  beq(flag, cont);

  // Handle existing monitor.
  // The object has an existing monitor iff (mark & monitor_value) != 0.
  RTM_OPT_ONLY( if (!(UseRTMForStackLocks && use_rtm)) ) // skip load if already done
  ld(current_header, oopDesc::mark_offset_in_bytes(), oop);
  andi_(R0, current_header, markOopDesc::monitor_value);
  bne(CCR0, object_has_monitor);

  // Check if it is still a light weight lock, this is true if we see
  // the stack address of the basicLock in the markOop of the object.
  // Cmpxchg sets flag to cmpd(current_header, box).
  cmpxchgd(/*flag=*/flag,
           /*current_value=*/current_header,
           /*compare_value=*/box,
           /*exchange_value=*/displaced_header,
           /*where=*/oop,
           MacroAssembler::MemBarRel,
           MacroAssembler::cmpxchgx_hint_release_lock(),
           noreg,
           &cont);

  assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

  // Handle existing monitor.
  b(cont);

  bind(object_has_monitor);
  addi(current_header, current_header, -markOopDesc::monitor_value); // monitor
  ld(temp, ObjectMonitor::owner_offset_in_bytes(), current_header);

  // It's inflated.
#if INCLUDE_RTM_OPT
  if (use_rtm) {
    Label L_regular_inflated_unlock;
    // Clean monitor_value bit to get valid pointer
    cmpdi(flag, temp, 0);
    bne(flag, L_regular_inflated_unlock);
    tend_(); // Owner is NULL: lock was elided, just end the transaction.
    b(cont);
    bind(L_regular_inflated_unlock);
  }
#endif

  ld(displaced_header, ObjectMonitor::recursions_offset_in_bytes(), current_header);
  xorr(temp, R16_thread, temp);      // Will be 0 if we are the owner.
  orr(temp, temp, displaced_header); // Will be 0 if there are 0 recursions.
  cmpdi(flag, temp, 0);
  bne(flag, cont);

  ld(temp, ObjectMonitor::EntryList_offset_in_bytes(), current_header);
  ld(displaced_header, ObjectMonitor::cxq_offset_in_bytes(), current_header);
  orr(temp, temp, displaced_header); // Will be 0 if both are 0.
  cmpdi(flag, temp, 0);
  bne(flag, cont); // Waiters present: must go to the slow path.
  release();
  // temp is known to be 0 here; releasing the owner field unlocks the monitor.
  std(temp, ObjectMonitor::owner_offset_in_bytes(), current_header);

  bind(cont);
  // flag == EQ indicates success
  // flag == NE indicates failure
}

// Check whether a safepoint is pending; branches to slow_path if so.
// Kills temp_reg and CR0.
void MacroAssembler::safepoint_poll(Label& slow_path, Register temp_reg) {
  if (SafepointMechanism::uses_thread_local_poll()) {
    ld(temp_reg, in_bytes(Thread::polling_page_offset()), R16_thread);
    // Armed page has poll_bit set.
    andi_(temp_reg, temp_reg, SafepointMechanism::poll_bit());
  } else {
    // Global polling: compare the safepoint state word directly.
    lwz(temp_reg, (RegisterOrConstant)(intptr_t)SafepointSynchronize::address_of_state());
    cmpwi(CCR0, temp_reg, SafepointSynchronize::_not_synchronized);
  }
  bne(CCR0, slow_path);
}

// Resolve a jobject handle in place; delegates to the GC's barrier set.
void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2, bool needs_frame) {
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->resolve_jobject(this, value, tmp1, tmp2, needs_frame);
}

// Values for last_Java_pc, and last_Java_sp must comply to the rules
// in frame_ppc.hpp.
void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) {
  // Always set last_Java_pc and flags first because once last_Java_sp
  // is visible has_last_Java_frame is true and users will look at the
  // rest of the fields. (Note: flags should always be zero before we
  // get here so doesn't need to be set.)

  // Verify that last_Java_pc was zeroed on return to Java
  asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread,
                          "last_Java_pc not zeroed before leaving Java", 0x200);

  // When returning from calling out from Java mode the frame anchor's
  // last_Java_pc will always be set to NULL. It is set here so that
  // if we are doing a call to native (not VM) that we capture the
  // known pc and don't have to rely on the native call having a
  // standard frame linkage where we can find the pc.
  if (last_Java_pc != noreg)
    std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);

  // Set last_Java_sp last.
  std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
}

// Clear the frame anchor. Ordering of the two stores is not significant here
// (both are zeroed); kills R0.
void MacroAssembler::reset_last_Java_frame(void) {
  asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()),
                             R16_thread, "SP was not set, still zero", 0x202);

  BLOCK_COMMENT("reset_last_Java_frame {");
  li(R0, 0);

  // _last_Java_sp = 0
  std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);

  // _last_Java_pc = 0
  std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
  BLOCK_COMMENT("} reset_last_Java_frame");
}

void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1) {
  assert_different_registers(sp, tmp1);

  // sp points to a TOP_IJAVA_FRAME, retrieve frame's PC via
  // TOP_IJAVA_FRAME_ABI.
  // FIXME: assert that we really have a TOP_IJAVA_FRAME here!
  // Use the current emission pc as the anchor pc.
  address entry = pc();
  load_const_optimized(tmp1, entry);

  set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1);
}

// Fetch and clear the thread's pending oop result. Kills R0.
void MacroAssembler::get_vm_result(Register oop_result) {
  // Read:
  //   R16_thread
  //   R16_thread->in_bytes(JavaThread::vm_result_offset())
  //
  // Updated:
  //   oop_result
  //   R16_thread->in_bytes(JavaThread::vm_result_offset())

  verify_thread();

  ld(oop_result, in_bytes(JavaThread::vm_result_offset()), R16_thread);
  li(R0, 0);
  std(R0, in_bytes(JavaThread::vm_result_offset()), R16_thread);

  verify_oop(oop_result);
}

// Fetch and clear the thread's pending metadata result. Kills R0.
void MacroAssembler::get_vm_result_2(Register metadata_result) {
  // Read:
  //   R16_thread
  //   R16_thread->in_bytes(JavaThread::vm_result_2_offset())
  //
  // Updated:
  //   metadata_result
  //   R16_thread->in_bytes(JavaThread::vm_result_2_offset())

  ld(metadata_result, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
  li(R0, 0);
  std(R0, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
}

// Compress a klass pointer: subtract the encoding base (if any), then shift.
// Returns the register that holds the narrow klass (src if no transformation
// was needed, dst otherwise). Kills R0 when a base subtraction is emitted.
Register MacroAssembler::encode_klass_not_null(Register dst, Register src) {
  Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided.
  if (Universe::narrow_klass_base() != 0) {
    // Use dst as temp if it is free.
    sub_const_optimized(dst, current, Universe::narrow_klass_base(), R0);
    current = dst;
  }
  if (Universe::narrow_klass_shift() != 0) {
    srdi(dst, current, Universe::narrow_klass_shift());
    current = dst;
  }
  return current;
}

// Store the klass field of dst_oop, compressed via ck when class pointers
// are compressed.
void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) {
  if (UseCompressedClassPointers) {
    Register compressedKlass = encode_klass_not_null(ck, klass);
    stw(compressedKlass, oopDesc::klass_offset_in_bytes(), dst_oop);
  } else {
    std(klass, oopDesc::klass_offset_in_bytes(), dst_oop);
  }
}

// Zero (or store val into) the klass gap that exists when class pointers
// are compressed. No-op otherwise.
void MacroAssembler::store_klass_gap(Register dst_oop, Register val) {
  if (UseCompressedClassPointers) {
    if (val == noreg) {
      val = R0;
      li(val, 0);
    }
    stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop); // klass gap if compressed
  }
}

// Worst-case code size (in bytes) of decode_klass_not_null, for code
// buffer sizing.
int MacroAssembler::instr_size_for_decode_klass_not_null() {
  if (!UseCompressedClassPointers) return 0;
  int num_instrs = 1;  // shift or move
  if (Universe::narrow_klass_base() != 0) num_instrs = 7;  // shift + load const + add
  return num_instrs * BytesPerInstWord;
}

// Decompress a narrow klass: shift left, then add the encoding base (if any).
void MacroAssembler::decode_klass_not_null(Register dst, Register src) {
  assert(dst != R0, "Dst reg may not be R0, as R0 is used here.");
  if (src == noreg) src = dst;
  Register shifted_src = src;
  // NOTE: && binds tighter than ||; this reads as
  // shift != 0 || (base == 0 && src != dst), i.e. emit the sldi either to
  // apply a real shift or as a plain move (shift == 0) into dst.
  if (Universe::narrow_klass_shift() != 0 ||
      Universe::narrow_klass_base() == 0 && src != dst) {  // Move required.
    shifted_src = dst;
    sldi(shifted_src, src, Universe::narrow_klass_shift());
  }
  if (Universe::narrow_klass_base() != 0) {
    add_const_optimized(dst, shifted_src, Universe::narrow_klass_base(), R0);
  }
}

// Load (and if necessary decompress) the klass of src into dst.
void MacroAssembler::load_klass(Register dst, Register src) {
  if (UseCompressedClassPointers) {
    lwz(dst, oopDesc::klass_offset_in_bytes(), src);
    // Attention: no null check here!
    decode_klass_not_null(dst, dst);
  } else {
    ld(dst, oopDesc::klass_offset_in_bytes(), src);
  }
}

// ((OopHandle)result).resolve();
void MacroAssembler::resolve_oop_handle(Register result) {
  // OopHandle::resolve is an indirection.
  ld(result, 0, result);
}

// Load the java mirror of the holder class of const_method into mirror:
// const_method -> constants -> pool holder klass -> mirror handle -> mirror.
void MacroAssembler::load_mirror_from_const_method(Register mirror, Register const_method) {
  ld(mirror, in_bytes(ConstMethod::constants_offset()), const_method);
  ld(mirror, ConstantPool::pool_holder_offset_in_bytes(), mirror);
  ld(mirror, in_bytes(Klass::java_mirror_offset()), mirror);
  resolve_oop_handle(mirror);
}

// Clear Array
// For very short arrays. tmp == R0 is allowed.
// Emits cnt_dwords fully unrolled 8-byte stores starting at base_ptr+offset.
void MacroAssembler::clear_memory_unrolled(Register base_ptr, int cnt_dwords, Register tmp, int offset) {
  if (cnt_dwords > 0) { li(tmp, 0); }
  for (int i = 0; i < cnt_dwords; ++i) { std(tmp, offset + i * 8, base_ptr); }
}

// Version for constant short array length. Kills base_ptr. tmp == R0 is allowed.
void MacroAssembler::clear_memory_constlen(Register base_ptr, int cnt_dwords, Register tmp) {
  if (cnt_dwords < 8) {
    // Short enough to unroll completely.
    clear_memory_unrolled(base_ptr, cnt_dwords, tmp);
    return;
  }

  // 2x unrolled loop over pairs of dwords, plus an optional trailing store.
  Label loop;
  const long loopcnt   = cnt_dwords >> 1,
             remainder = cnt_dwords & 1;

  li(tmp, loopcnt);
  mtctr(tmp);
  li(tmp, 0);
  bind(loop);
  std(tmp, 0, base_ptr);
  std(tmp, 8, base_ptr);
  addi(base_ptr, base_ptr, 16);
  bdnz(loop);
  if (remainder) { std(tmp, 0, base_ptr); }
}

// Kills both input registers. tmp == R0 is allowed.
void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp, long const_cnt) {
  // Procedure for large arrays (uses data cache block zero instruction).
  Label startloop, fast, fastloop, small_rest, restloop, done;
  const int cl_size         = VM_Version::L1_data_cache_line_size(),
            cl_dwords       = cl_size >> 3,
            cl_dw_addr_bits = exact_log2(cl_dwords),
            dcbz_min        = 1,  // Min count of dcbz executions, needs to be >0.
            min_cnt         = ((dcbz_min + 1) << cl_dw_addr_bits) - 1;

  if (const_cnt >= 0) {
    // Constant case.
    if (const_cnt < min_cnt) {
      clear_memory_constlen(base_ptr, const_cnt, tmp);
      return;
    }
    load_const_optimized(cnt_dwords, const_cnt, tmp);
  } else {
    // cnt_dwords already loaded in register. Need to check size.
    cmpdi(CCR1, cnt_dwords, min_cnt); // Big enough? (ensure >= dcbz_min lines included).
    blt(CCR1, small_rest);
  }
  rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits); // Extract dword offset within first cache line.
  beq(CCR0, fast);                                  // Already 128byte aligned.

  subfic(tmp, tmp, cl_dwords);
  mtctr(tmp);                        // Set ctr to hit 128byte boundary (0<ctr<cl_dwords).
  subf(cnt_dwords, tmp, cnt_dwords); // rest.
  li(tmp, 0);

  bind(startloop);                   // Clear at the beginning to reach 128byte boundary.
  std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
  addi(base_ptr, base_ptr, 8);
  bdnz(startloop);

  bind(fast);                                  // Clear 128byte blocks.
  srdi(tmp, cnt_dwords, cl_dw_addr_bits);      // Loop count for 128byte loop (>0).
  andi(cnt_dwords, cnt_dwords, cl_dwords-1);   // Rest in dwords.
  mtctr(tmp);                                  // Load counter.

  bind(fastloop);
  dcbz(base_ptr);                    // Clear 128byte aligned block.
  addi(base_ptr, base_ptr, cl_size);
  bdnz(fastloop);

  bind(small_rest);
  cmpdi(CCR0, cnt_dwords, 0);        // size 0?
  beq(CCR0, done);                   // rest == 0
  li(tmp, 0);
  mtctr(cnt_dwords);                 // Load counter.

  bind(restloop);                    // Clear rest.
  std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
3279 addi(base_ptr, base_ptr, 8); 3280 bdnz(restloop); 3281 3282 bind(done); 3283 } 3284 3285 /////////////////////////////////////////// String intrinsics //////////////////////////////////////////// 3286 3287 #ifdef COMPILER2 3288 // Intrinsics for CompactStrings 3289 3290 // Compress char[] to byte[] by compressing 16 bytes at once. 3291 void MacroAssembler::string_compress_16(Register src, Register dst, Register cnt, 3292 Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, 3293 Label& Lfailure) { 3294 3295 const Register tmp0 = R0; 3296 assert_different_registers(src, dst, cnt, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5); 3297 Label Lloop, Lslow; 3298 3299 // Check if cnt >= 8 (= 16 bytes) 3300 lis(tmp1, 0xFF); // tmp1 = 0x00FF00FF00FF00FF 3301 srwi_(tmp2, cnt, 3); 3302 beq(CCR0, Lslow); 3303 ori(tmp1, tmp1, 0xFF); 3304 rldimi(tmp1, tmp1, 32, 0); 3305 mtctr(tmp2); 3306 3307 // 2x unrolled loop 3308 bind(Lloop); 3309 ld(tmp2, 0, src); // _0_1_2_3 (Big Endian) 3310 ld(tmp4, 8, src); // _4_5_6_7 3311 3312 orr(tmp0, tmp2, tmp4); 3313 rldicl(tmp3, tmp2, 6*8, 64-24); // _____1_2 3314 rldimi(tmp2, tmp2, 2*8, 2*8); // _0_2_3_3 3315 rldicl(tmp5, tmp4, 6*8, 64-24); // _____5_6 3316 rldimi(tmp4, tmp4, 2*8, 2*8); // _4_6_7_7 3317 3318 andc_(tmp0, tmp0, tmp1); 3319 bne(CCR0, Lfailure); // Not latin1. 3320 addi(src, src, 16); 3321 3322 rlwimi(tmp3, tmp2, 0*8, 24, 31);// _____1_3 3323 srdi(tmp2, tmp2, 3*8); // ____0_2_ 3324 rlwimi(tmp5, tmp4, 0*8, 24, 31);// _____5_7 3325 srdi(tmp4, tmp4, 3*8); // ____4_6_ 3326 3327 orr(tmp2, tmp2, tmp3); // ____0123 3328 orr(tmp4, tmp4, tmp5); // ____4567 3329 3330 stw(tmp2, 0, dst); 3331 stw(tmp4, 4, dst); 3332 addi(dst, dst, 8); 3333 bdnz(Lloop); 3334 3335 bind(Lslow); // Fallback to slow version 3336 } 3337 3338 // Compress char[] to byte[]. cnt must be positive int. 
3339 void MacroAssembler::string_compress(Register src, Register dst, Register cnt, Register tmp, Label& Lfailure) { 3340 Label Lloop; 3341 mtctr(cnt); 3342 3343 bind(Lloop); 3344 lhz(tmp, 0, src); 3345 cmplwi(CCR0, tmp, 0xff); 3346 bgt(CCR0, Lfailure); // Not latin1. 3347 addi(src, src, 2); 3348 stb(tmp, 0, dst); 3349 addi(dst, dst, 1); 3350 bdnz(Lloop); 3351 } 3352 3353 // Inflate byte[] to char[] by inflating 16 bytes at once. 3354 void MacroAssembler::string_inflate_16(Register src, Register dst, Register cnt, 3355 Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) { 3356 const Register tmp0 = R0; 3357 assert_different_registers(src, dst, cnt, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5); 3358 Label Lloop, Lslow; 3359 3360 // Check if cnt >= 8 3361 srwi_(tmp2, cnt, 3); 3362 beq(CCR0, Lslow); 3363 lis(tmp1, 0xFF); // tmp1 = 0x00FF00FF 3364 ori(tmp1, tmp1, 0xFF); 3365 mtctr(tmp2); 3366 3367 // 2x unrolled loop 3368 bind(Lloop); 3369 lwz(tmp2, 0, src); // ____0123 (Big Endian) 3370 lwz(tmp4, 4, src); // ____4567 3371 addi(src, src, 8); 3372 3373 rldicl(tmp3, tmp2, 7*8, 64-8); // _______2 3374 rlwimi(tmp2, tmp2, 3*8, 16, 23);// ____0113 3375 rldicl(tmp5, tmp4, 7*8, 64-8); // _______6 3376 rlwimi(tmp4, tmp4, 3*8, 16, 23);// ____4557 3377 3378 andc(tmp0, tmp2, tmp1); // ____0_1_ 3379 rlwimi(tmp2, tmp3, 2*8, 0, 23); // _____2_3 3380 andc(tmp3, tmp4, tmp1); // ____4_5_ 3381 rlwimi(tmp4, tmp5, 2*8, 0, 23); // _____6_7 3382 3383 rldimi(tmp2, tmp0, 3*8, 0*8); // _0_1_2_3 3384 rldimi(tmp4, tmp3, 3*8, 0*8); // _4_5_6_7 3385 3386 std(tmp2, 0, dst); 3387 std(tmp4, 8, dst); 3388 addi(dst, dst, 16); 3389 bdnz(Lloop); 3390 3391 bind(Lslow); // Fallback to slow version 3392 } 3393 3394 // Inflate byte[] to char[]. cnt must be positive int. 
// Byte-by-byte slow path: zero-extends cnt bytes into chars. cnt must be > 0
// (mtctr/bdnz executes at least once). Clobbers tmp and CTR; advances src by
// cnt and dst by 2*cnt.
void MacroAssembler::string_inflate(Register src, Register dst, Register cnt, Register tmp) {
  Label Lloop;
  mtctr(cnt);

  bind(Lloop);
  lbz(tmp, 0, src);             // Load byte, zero-extended.
  addi(src, src, 1);
  sth(tmp, 0, dst);             // Store as char.
  addi(dst, dst, 2);
  bdnz(Lloop);
}

// Lexicographic comparison of two strings (compareTo semantics).
// ae encodes the argument encodings (LL/UU/LU/UL); cnt1/cnt2 are byte counts
// on entry and are converted to char counts below. result receives the signed
// difference of the first mismatching chars, or the length difference if one
// string is a prefix of the other. Clobbers cnt1, cnt2, tmp1, R0 and CTR.
void MacroAssembler::string_compare(Register str1, Register str2,
                                    Register cnt1, Register cnt2,
                                    Register tmp1, Register result, int ae) {
  const Register tmp0 = R0,
                 diff = tmp1;

  assert_different_registers(str1, str2, cnt1, cnt2, tmp0, tmp1, result);
  Label Ldone, Lslow, Lloop, Lreturn_diff;

  // Note: Making use of the fact that compareTo(a, b) == -compareTo(b, a)
  // we interchange str1 and str2 in the UL case and negate the result.
  // Like this, str1 is always latin1 encoded, except for the UU case.
  // In addition, we need 0 (or sign which is 0) extend.

  // Convert byte counts to char counts (UTF16 chars are 2 bytes each).
  if (ae == StrIntrinsicNode::UU) {
    srwi(cnt1, cnt1, 1);
  } else {
    clrldi(cnt1, cnt1, 32);
  }

  if (ae != StrIntrinsicNode::LL) {
    srwi(cnt2, cnt2, 1);
  } else {
    clrldi(cnt2, cnt2, 32);
  }

  // See if the lengths are different, and calculate min in cnt1.
  // Save diff in case we need it for a tie-breaker.
  subf_(diff, cnt2, cnt1); // diff = cnt1 - cnt2
  // if (diff > 0) { cnt1 = cnt2; }
  if (VM_Version::has_isel()) {
    isel(cnt1, CCR0, Assembler::greater, /*invert*/ false, cnt2);
  } else {
    Label Lskip;
    blt(CCR0, Lskip);
    mr(cnt1, cnt2);
    bind(Lskip);
  }

  // Rename registers
  Register chr1 = result;
  Register chr2 = tmp0;

  // Compare multiple characters in fast loop (only implemented for same encoding).
  int stride1 = 8, stride2 = 8;
  if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
    int log2_chars_per_iter = (ae == StrIntrinsicNode::LL) ? 3 : 2; // 8 bytes = 8 latin1 chars or 4 UTF16 chars.
    Label Lfastloop, Lskipfast;

    srwi_(tmp0, cnt1, log2_chars_per_iter);
    beq(CCR0, Lskipfast);
    rldicl(cnt2, cnt1, 0, 64 - log2_chars_per_iter); // Remaining characters.
    li(cnt1, 1 << log2_chars_per_iter); // Initialize for failure case: Rescan characters from current iteration.
    mtctr(tmp0);

    bind(Lfastloop);
    ld(chr1, 0, str1);
    ld(chr2, 0, str2);
    cmpd(CCR0, chr1, chr2);
    bne(CCR0, Lslow);                   // Mismatch somewhere in the 8 bytes: rescan char by char.
    addi(str1, str1, stride1);
    addi(str2, str2, stride2);
    bdnz(Lfastloop);
    mr(cnt1, cnt2);                     // Remaining characters.
    bind(Lskipfast);
  }

  // Loop which searches the first difference character by character.
  cmpwi(CCR0, cnt1, 0);
  beq(CCR0, Lreturn_diff);
  bind(Lslow);
  mtctr(cnt1);

  switch (ae) {
    case StrIntrinsicNode::LL: stride1 = 1; stride2 = 1; break;
    case StrIntrinsicNode::UL: // fallthru (see comment above)
    case StrIntrinsicNode::LU: stride1 = 1; stride2 = 2; break;
    case StrIntrinsicNode::UU: stride1 = 2; stride2 = 2; break;
    default: ShouldNotReachHere(); break;
  }

  bind(Lloop);
  if (stride1 == 1) { lbz(chr1, 0, str1); } else { lhz(chr1, 0, str1); }
  if (stride2 == 1) { lbz(chr2, 0, str2); } else { lhz(chr2, 0, str2); }
  subf_(result, chr2, chr1); // result = chr1 - chr2
  bne(CCR0, Ldone);
  addi(str1, str1, stride1);
  addi(str2, str2, stride2);
  bdnz(Lloop);

  // If strings are equal up to min length, return the length difference.
  bind(Lreturn_diff);
  mr(result, diff);

  // Otherwise, return the difference between the first mismatched chars.
  bind(Ldone);
  if (ae == StrIntrinsicNode::UL) {
    neg(result, result); // Negate result (see note above).
  }
}

// Compare two arrays (or two strings, if is_array_equ is false) for equality.
// result is set to 1 if equal, 0 otherwise. For arrays, identity, nullness and
// length are checked first and limit receives the element count; for strings
// the caller passes the length in limit (in bytes; shifted below for char
// strings). Clobbers tmp1, R0 and CTR; ary1/ary2/limit are consumed.
void MacroAssembler::array_equals(bool is_array_equ, Register ary1, Register ary2,
                                  Register limit, Register tmp1, Register result, bool is_byte) {
  const Register tmp0 = R0;
  assert_different_registers(ary1, ary2, limit, tmp0, tmp1, result);
  Label Ldone, Lskiploop, Lloop, Lfastloop, Lskipfast;
  bool limit_needs_shift = false;

  if (is_array_equ) {
    const int length_offset = arrayOopDesc::length_offset_in_bytes();
    const int base_offset   = arrayOopDesc::base_offset_in_bytes(is_byte ? T_BYTE : T_CHAR);

    // Return true if the same array.
    cmpd(CCR0, ary1, ary2);
    beq(CCR0, Lskiploop);

    // Return false if one of them is NULL.
    cmpdi(CCR0, ary1, 0);
    cmpdi(CCR1, ary2, 0);
    li(result, 0);
    cror(CCR0, Assembler::equal, CCR1, Assembler::equal);
    beq(CCR0, Ldone);

    // Load the lengths of arrays.
    lwz(limit, length_offset, ary1);
    lwz(tmp0, length_offset, ary2);

    // Return false if the two arrays are not equal length.
    cmpw(CCR0, limit, tmp0);
    bne(CCR0, Ldone);

    // Load array addresses.
    addi(ary1, ary1, base_offset);
    addi(ary2, ary2, base_offset);
  } else {
    limit_needs_shift = !is_byte;   // String limit is in bytes; convert to chars below.
    li(result, 0);                  // Assume not equal.
  }

  // Rename registers
  Register chr1 = tmp0;
  Register chr2 = tmp1;

  // Compare 8 bytes per iteration in fast loop.
  const int log2_chars_per_iter = is_byte ? 3 : 2;

  srwi_(tmp0, limit, log2_chars_per_iter + (limit_needs_shift ? 1 : 0));
  beq(CCR0, Lskipfast);
  mtctr(tmp0);

  bind(Lfastloop);
  ld(chr1, 0, ary1);
  ld(chr2, 0, ary2);
  addi(ary1, ary1, 8);
  addi(ary2, ary2, 8);
  cmpd(CCR0, chr1, chr2);
  bne(CCR0, Ldone);               // result is already 0 (not equal).
  bdnz(Lfastloop);

  bind(Lskipfast);
  rldicl_(limit, limit, limit_needs_shift ? 64 - 1 : 0, 64 - log2_chars_per_iter); // Remaining characters.
  beq(CCR0, Lskiploop);
  mtctr(limit);

  // Character by character.
  bind(Lloop);
  if (is_byte) {
    lbz(chr1, 0, ary1);
    lbz(chr2, 0, ary2);
    addi(ary1, ary1, 1);
    addi(ary2, ary2, 1);
  } else {
    lhz(chr1, 0, ary1);
    lhz(chr2, 0, ary2);
    addi(ary1, ary1, 2);
    addi(ary2, ary2, 2);
  }
  cmpw(CCR0, chr1, chr2);
  bne(CCR0, Ldone);
  bdnz(Lloop);

  bind(Lskiploop);
  li(result, 1); // All characters are equal.
  bind(Ldone);
}

// Search for needle in haystack; result receives the char index of the first
// occurrence relative to haystack, or -1 if not found. needlecntval == 0 means
// the needle length is variable and lives in needlecnt; otherwise it is the
// compile-time constant needlecntval (never 1, see guarantee below).
// ae gives the encodings (LU is excluded by assert). Clobbers haycnt (reused
// as last_addr), tmp1..tmp4, R0, CCR0/CCR1/CCR6 and CTR.
void MacroAssembler::string_indexof(Register result, Register haystack, Register haycnt,
                                    Register needle, ciTypeArray* needle_values, Register needlecnt, int needlecntval,
                                    Register tmp1, Register tmp2, Register tmp3, Register tmp4, int ae) {

  // Ensure 0<needlecnt<=haycnt in ideal graph as prerequisite!
  Label L_TooShort, L_Found, L_NotFound, L_End;
  Register last_addr = haycnt, // Kill haycnt at the beginning.
           addr      = tmp1,
           n_start   = tmp2,
           ch1       = tmp3,
           ch2       = R0;

  assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
  const int h_csize = (ae == StrIntrinsicNode::LL) ? 1 : 2;  // Haystack char size in bytes.
  const int n_csize = (ae == StrIntrinsicNode::UU) ? 2 : 1;  // Needle char size in bytes.

  // **************************************************************************************************
  // Prepare for main loop: optimized for needle count >=2, bail out otherwise.
  // **************************************************************************************************

  // Compute last haystack addr to use if no match gets found.
  clrldi(haycnt, haycnt, 32);         // Ensure positive int is valid as 64 bit value.
  addi(addr, haystack, -h_csize);     // Accesses use pre-increment.
  if (needlecntval == 0) { // variable needlecnt
    cmpwi(CCR6, needlecnt, 2);
    clrldi(needlecnt, needlecnt, 32); // Ensure positive int is valid as 64 bit value.
    blt(CCR6, L_TooShort);            // Variable needlecnt: handle short needle separately.
  }

  if (n_csize == 2) { lwz(n_start, 0, needle); } else { lhz(n_start, 0, needle); } // Load first 2 characters of needle.

  if (needlecntval == 0) { // variable needlecnt
    subf(ch1, needlecnt, haycnt);     // Last character index to compare is haycnt-needlecnt.
    addi(needlecnt, needlecnt, -2);   // Rest of needle.
  } else { // constant needlecnt
    guarantee(needlecntval != 1, "IndexOf with single-character needle must be handled separately");
    assert((needlecntval & 0x7fff) == needlecntval, "wrong immediate");
    addi(ch1, haycnt, -needlecntval); // Last character index to compare is haycnt-needlecnt.
    if (needlecntval > 3) { li(needlecnt, needlecntval - 2); } // Rest of needle.
  }

  if (h_csize == 2) { slwi(ch1, ch1, 1); } // Scale to number of bytes.

  if (ae == StrIntrinsicNode::UL) {
    // Needle is latin1, haystack is UTF16: widen the 2 loaded needle bytes to chars.
    srwi(tmp4, n_start, 1*8);          // ___0
    rlwimi(n_start, tmp4, 2*8, 0, 23); // _0_1
  }

  add(last_addr, haystack, ch1);       // Point to last address to compare (haystack+2*(haycnt-needlecnt)).

  // Main Loop (now we have at least 2 characters).
  Label L_OuterLoop, L_InnerLoop, L_FinalCheck, L_Comp1, L_Comp2;
  bind(L_OuterLoop); // Search for 1st 2 characters.
  Register addr_diff = tmp4;
  subf(addr_diff, addr, last_addr);    // Difference between already checked address and last address to check.
  addi(addr, addr, h_csize);           // This is the new address we want to use for comparing.
  srdi_(ch2, addr_diff, h_csize);
  beq(CCR0, L_FinalCheck);             // 2 characters left?
  mtctr(ch2);                          // num of characters / 2
  bind(L_InnerLoop);                   // Main work horse (2x unrolled search loop)
  if (h_csize == 2) {                  // Load 2 characters of haystack (ignore alignment).
    lwz(ch1, 0, addr);
    lwz(ch2, 2, addr);
  } else {
    lhz(ch1, 0, addr);
    lhz(ch2, 1, addr);
  }
  cmpw(CCR0, ch1, n_start);            // Compare 2 characters (1 would be sufficient but try to reduce branches to CompLoop).
  cmpw(CCR1, ch2, n_start);
  beq(CCR0, L_Comp1);                  // Did we find the needle start?
  beq(CCR1, L_Comp2);
  addi(addr, addr, 2 * h_csize);
  bdnz(L_InnerLoop);
  bind(L_FinalCheck);
  andi_(addr_diff, addr_diff, h_csize); // Remaining characters not covered by InnerLoop: (num of characters) & 1.
  beq(CCR0, L_NotFound);
  if (h_csize == 2) { lwz(ch1, 0, addr); } else { lhz(ch1, 0, addr); } // One position left at which we have to compare.
  cmpw(CCR1, ch1, n_start);
  beq(CCR1, L_Comp1);
  bind(L_NotFound);
  li(result, -1);                      // not found
  b(L_End);

  // **************************************************************************************************
  // Special Case: unfortunately, the variable needle case can be called with needlecnt<2
  // **************************************************************************************************
  if (needlecntval == 0) { // We have to handle these cases separately.
    Label L_OneCharLoop;
    bind(L_TooShort);
    mtctr(haycnt);
    if (n_csize == 2) { lhz(n_start, 0, needle); } else { lbz(n_start, 0, needle); } // First character of needle
    bind(L_OneCharLoop);
    if (h_csize == 2) { lhzu(ch1, 2, addr); } else { lbzu(ch1, 1, addr); }
    cmpw(CCR1, ch1, n_start);
    beq(CCR1, L_Found);                // Did we find the one character needle?
    bdnz(L_OneCharLoop);
    li(result, -1);                    // Not found.
    b(L_End);
  }

  // **************************************************************************************************
  // Regular Case Part II: compare rest of needle (first 2 characters have been compared already)
  // **************************************************************************************************

  // Compare the rest
  bind(L_Comp2);
  addi(addr, addr, h_csize);           // First comparison has failed, 2nd one hit.
  bind(L_Comp1);                       // Addr points to possible needle start.
  if (needlecntval != 2) {             // Const needlecnt==2?
    if (needlecntval != 3) {
      if (needlecntval == 0) { beq(CCR6, L_Found); } // Variable needlecnt==2?
      Register n_ind = tmp4,
               h_ind = n_ind;
      li(n_ind, 2 * n_csize);          // First 2 characters are already compared, use index 2.
      mtctr(needlecnt);                // Decremented by 2, still > 0.
      Label L_CompLoop;
      bind(L_CompLoop);
      if (ae == StrIntrinsicNode::UL) {
        // Haystack index must be scaled: needle chars are 1 byte, haystack chars 2 bytes.
        h_ind = ch1;
        sldi(h_ind, n_ind, 1);
      }
      if (n_csize == 2) { lhzx(ch2, needle, n_ind); } else { lbzx(ch2, needle, n_ind); }
      if (h_csize == 2) { lhzx(ch1, addr, h_ind); } else { lbzx(ch1, addr, h_ind); }
      cmpw(CCR1, ch1, ch2);
      bne(CCR1, L_OuterLoop);          // Mismatch: resume searching for the needle start.
      addi(n_ind, n_ind, n_csize);
      bdnz(L_CompLoop);
    } else { // No loop required if there's only one needle character left.
      if (n_csize == 2) { lhz(ch2, 2 * 2, needle); } else { lbz(ch2, 2 * 1, needle); }
      if (h_csize == 2) { lhz(ch1, 2 * 2, addr); } else { lbz(ch1, 2 * 1, addr); }
      cmpw(CCR1, ch1, ch2);
      bne(CCR1, L_OuterLoop);
    }
  }
  // Return index ...
  bind(L_Found);
  subf(result, haystack, addr);        // relative to haystack, ...
  if (h_csize == 2) { srdi(result, result, 1); } // in characters.
  bind(L_End);
} // string_indexof

// Search for a single char in haystack; result receives the char index of the
// first occurrence or -1. The char is either in register needle, or (when
// needle == R0) the immediate needleChar. Clobbers tmp1, tmp2, R0, CCR0/CCR1
// and CTR. Comments like //4: record approximate emitted instruction counts.
void MacroAssembler::string_indexof_char(Register result, Register haystack, Register haycnt,
                                         Register needle, jchar needleChar, Register tmp1, Register tmp2, bool is_byte) {
  assert_different_registers(haystack, haycnt, needle, tmp1, tmp2);

  Label L_InnerLoop, L_FinalCheck, L_Found1, L_Found2, L_NotFound, L_End;
  Register addr = tmp1,
           ch1 = tmp2,
           ch2 = R0;

  const int h_csize = is_byte ? 1 : 2;

//4:
  srwi_(tmp2, haycnt, 1);   // Shift right by exact_log2(UNROLL_FACTOR).
  mr(addr, haystack);
  beq(CCR0, L_FinalCheck);
  mtctr(tmp2);              // Move to count register.
//8:
  bind(L_InnerLoop);        // Main work horse (2x unrolled search loop).
  if (!is_byte) {
    lhz(ch1, 0, addr);
    lhz(ch2, 2, addr);
  } else {
    lbz(ch1, 0, addr);
    lbz(ch2, 1, addr);
  }
  (needle != R0) ? cmpw(CCR0, ch1, needle) : cmplwi(CCR0, ch1, (unsigned int)needleChar);
  (needle != R0) ? cmpw(CCR1, ch2, needle) : cmplwi(CCR1, ch2, (unsigned int)needleChar);
  beq(CCR0, L_Found1);      // Did we find the needle?
  beq(CCR1, L_Found2);
  addi(addr, addr, 2 * h_csize);
  bdnz(L_InnerLoop);
//16:
  bind(L_FinalCheck);
  andi_(R0, haycnt, 1);     // Odd element count: one position left to check.
  beq(CCR0, L_NotFound);
  if (!is_byte) { lhz(ch1, 0, addr); } else { lbz(ch1, 0, addr); } // One position left at which we have to compare.
  (needle != R0) ? cmpw(CCR1, ch1, needle) : cmplwi(CCR1, ch1, (unsigned int)needleChar);
  beq(CCR1, L_Found1);
//21:
  bind(L_NotFound);
  li(result, -1);           // Not found.
  b(L_End);

  bind(L_Found2);
  addi(addr, addr, h_csize);
//24:
  bind(L_Found1);           // Return index ...
  subf(result, haystack, addr); // relative to haystack, ...
  if (!is_byte) { srdi(result, result, 1); } // in characters.
  bind(L_End);
} // string_indexof_char

// Set result to 1 if any of the cnt bytes at src has its sign bit (0x80) set,
// 0 otherwise. Fast path tests 16 bytes per iteration against an 0x80-
// replicated mask; the slow path handles cnt % 16 bytes individually.
// Clobbers tmp1, tmp2, R0 and CTR; src is advanced.
void MacroAssembler::has_negatives(Register src, Register cnt, Register result,
                                   Register tmp1, Register tmp2) {
  const Register tmp0 = R0;
  assert_different_registers(src, result, cnt, tmp0, tmp1, tmp2);
  Label Lfastloop, Lslow, Lloop, Lnoneg, Ldone;

  // Check if cnt >= 16 (the 2x unrolled fast loop consumes 16 bytes per iteration).
  lis(tmp1, (int)(short)0x8080);  // tmp1 = 0x8080808080808080 (built below)
  srwi_(tmp2, cnt, 4);
  li(result, 1);                  // Assume there's a negative byte.
  beq(CCR0, Lslow);
  ori(tmp1, tmp1, 0x8080);
  rldimi(tmp1, tmp1, 32, 0);
  mtctr(tmp2);

  // 2x unrolled loop
  bind(Lfastloop);
  ld(tmp2, 0, src);
  ld(tmp0, 8, src);

  orr(tmp0, tmp2, tmp0);          // Fold both doublewords for a single mask test.

  and_(tmp0, tmp0, tmp1);
  bne(CCR0, Ldone);               // Found negative byte.
  addi(src, src, 16);

  bdnz(Lfastloop);

  bind(Lslow);                    // Fallback to slow version
  rldicl_(tmp0, cnt, 0, 64-4);    // Remaining bytes: cnt & 15.
  beq(CCR0, Lnoneg);
  mtctr(tmp0);
  bind(Lloop);
  lbz(tmp0, 0, src);
  addi(src, src, 1);
  andi_(tmp0, tmp0, 0x80);
  bne(CCR0, Ldone);               // Found negative byte.
  bdnz(Lloop);
  bind(Lnoneg);
  li(result, 0);

  bind(Ldone);
}

#endif // COMPILER2

// Helpers for Intrinsic Emitters
//
// Revert the byte order of a 32bit value in a register
//   src: 0x44556677
//   dst: 0x77665544
// Three steps to obtain the result:
//  1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word
//     into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared.
//     This value initializes dst.
//  2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost
//     byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go.
//     This value is mask inserted into dst with a [0..23] mask of 1s.
//  3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position.
//     This value is mask inserted into dst with a [8..15] mask of 1s.
// Emits a 3-instruction byte swap of the low 32 bits of src into dst.
// dst and src must be distinct registers; src is left unchanged.
void MacroAssembler::load_reverse_32(Register dst, Register src) {
  assert_different_registers(dst, src);

  rldicl(dst, src, (4+1)*8, 56);       // Rotate byte 4 into position 7 (rightmost), clear all to the left.
  rlwimi(dst, src,     3*8,  0, 23);   // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone.
  rlwimi(dst, src,     1*8,  8, 15);   // Insert byte 6 into position 5, leave the rest alone.
}

// Calculate the column addresses of the crc32 lookup table into distinct registers.
// This loop-invariant calculation is moved out of the loop body, reducing the loop
// body size from 20 to 16 instructions.
// Returns the offset that was used to calculate the address of column tc3.
// Due to register shortage, setting tc3 may overwrite table. With the return offset
// at hand, the original table address can be easily reconstructed.
// Precomputes the base address of each of the four CRC table columns used by
// update_1word_crc32. Column order depends on endianness (see the DOLIT4 /
// DOBIG4 schemes from zlib's crc32.c quoted below). tc3 must alias table;
// the returned ix3 offset lets the caller restore the original table address.
int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) {

#ifdef VM_LITTLE_ENDIAN
  // This is what we implement (the DOLIT4 part):
  // ========================================================================= */
  // #define DOLIT4 c ^= *buf4++; \
  //        c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^ \
  //            crc_table[1][(c >> 16) & 0xff] ^ crc_table[0][c >> 24]
  // #define DOLIT32 DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4
  // ========================================================================= */
  const int ix0 = 3*(4*CRC32_COLUMN_SIZE);
  const int ix1 = 2*(4*CRC32_COLUMN_SIZE);
  const int ix2 = 1*(4*CRC32_COLUMN_SIZE);
  const int ix3 = 0*(4*CRC32_COLUMN_SIZE);
#else
  // This is what we implement (the DOBIG4 part):
  // =========================================================================
  // #define DOBIG4 c ^= *++buf4; \
  //        c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \
  //            crc_table[6][(c >> 16) & 0xff] ^ crc_table[7][c >> 24]
  // #define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4
  // =========================================================================
  const int ix0 = 4*(4*CRC32_COLUMN_SIZE);
  const int ix1 = 5*(4*CRC32_COLUMN_SIZE);
  const int ix2 = 6*(4*CRC32_COLUMN_SIZE);
  const int ix3 = 7*(4*CRC32_COLUMN_SIZE);
#endif
  assert_different_registers(table, tc0, tc1, tc2);
  assert(table == tc3, "must be!");

  addi(tc0, table, ix0);
  addi(tc1, table, ix1);
  addi(tc2, table, ix2);
  if (ix3 != 0) addi(tc3, table, ix3); // tc3 aliases table; only rewrite it if the offset is nonzero.

  return ix3;
}

/**
 * uint32_t crc;
 * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
 *
 * Folds one byte (val's low byte) into crc via a table lookup. crc and val
 * may be the same register (handled by the ordering below); tmp receives the
 * scaled table index and the loaded table entry.
 */
void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) {
  assert_different_registers(crc, table, tmp);
  assert_different_registers(val, table);

  if (crc == val) {                      // Must rotate first to use the unmodified value.
    rlwinm(tmp, val, 2, 24-2, 31-2);     // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
                                         // As we use a word (4-byte) instruction, we have to adapt the mask bit positions.
    srwi(crc, crc, 8);                   // Unsigned shift, clear leftmost 8 bits.
  } else {
    srwi(crc, crc, 8);                   // Unsigned shift, clear leftmost 8 bits.
    rlwinm(tmp, val, 2, 24-2, 31-2);     // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
  }
  lwzx(tmp, table, tmp);                 // Load table entry at byte offset tmp.
  xorr(crc, crc, tmp);
}

/**
 * uint32_t crc;
 * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
 *
 * Convenience wrapper: folds crc's own low byte back into crc.
 */
void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
  fold_byte_crc32(crc, crc, table, tmp);
}

/**
 * Emits code to update CRC-32 with a byte value according to constants in table.
 *
 * @param [in,out]crc   Register containing the crc.
 * @param [in]val       Register containing the byte to fold into the CRC.
 * @param [in]table     Register containing the table of crc constants.
 *
 * uint32_t crc;
 * val = crc_table[(val ^ crc) & 0xFF];
 * crc = val ^ (crc >> 8);
 */
void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
  BLOCK_COMMENT("update_byte_crc32:");
  xorr(val, val, crc);                  // val is clobbered; it also serves as tmp for the fold.
  fold_byte_crc32(crc, val, table, val);
}

/**
 * Emits a byte-at-a-time CRC-32 loop over len bytes starting at buf.
 *
 * @param crc   register containing existing CRC (32-bit); updated in place
 * @param buf   register pointing to input byte buffer (byte*); advanced past the processed bytes
 * @param len   register containing number of bytes; forced to 32 bit below
 * @param table register pointing to CRC table
 * @param data  scratch register holding the current byte
 * @param loopAlignment align the loop head to 32 bytes (true) or 4 bytes (false)
 *
 * len == 0 is handled (falls straight through to L_done). Clobbers CTR.
 */
void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table,
                                           Register data, bool loopAlignment) {
  assert_different_registers(crc, buf, len, table, data);

  Label L_mainLoop, L_done;
  const int mainLoop_stepping  = 1;
  const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4;

  // Process all bytes in a single-byte loop.
  clrldi_(len, len, 32);                // Enforce 32 bit. Anything to do?
  beq(CCR0, L_done);

  mtctr(len);
  align(mainLoop_alignment);
  BIND(L_mainLoop);
  lbz(data, 0, buf);                    // Byte from buffer, zero-extended.
  addi(buf, buf, mainLoop_stepping);    // Advance buffer position.
  update_byte_crc32(crc, data, table);
  bdnz(L_mainLoop);                     // Iterate.

  bind(L_done);
}

/**
 * Emits code to update CRC-32 with a 4-byte value according to constants in table
 * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c
 */
// A note on the lookup table address(es):
// The lookup table consists of two sets of four columns each.
// The columns {0..3} are used for little-endian machines.
// The columns {4..7} are used for big-endian machines.
// To save the effort of adding the column offset to the table address each time
// a table element is looked up, it is possible to pass the pre-calculated
// column addresses.
// Uses R9..R12 as work register. Must be saved/restored by caller, if necessary.
void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc,
                                        Register t0,  Register t1,  Register t2,  Register t3,
                                        Register tc0, Register tc1, Register tc2, Register tc3) {
  assert_different_registers(crc, t3);

  // XOR crc with next four bytes of buffer.
  lwz(t3, bufDisp, buf);
  if (bufInc != 0) {
    addi(buf, buf, bufInc);
  }
  xorr(t3, t3, crc);

  // Chop crc into 4 single-byte pieces, shifted left 2 bits, to form the table indices.
  rlwinm(t0, t3,  2,         24-2, 31-2);  // ((t1 >>  0) & 0xff) << 2
  rlwinm(t1, t3,  32+(2- 8), 24-2, 31-2);  // ((t1 >>  8) & 0xff) << 2
  rlwinm(t2, t3,  32+(2-16), 24-2, 31-2);  // ((t1 >> 16) & 0xff) << 2
  rlwinm(t3, t3,  32+(2-24), 24-2, 31-2);  // ((t1 >> 24) & 0xff) << 2

  // Use the pre-calculated column addresses.
  // Load pre-calculated table values.
  lwzx(t0, tc0, t0);
  lwzx(t1, tc1, t1);
  lwzx(t2, tc2, t2);
  lwzx(t3, tc3, t3);

  // Calculate new crc from table values.
  xorr(t0,  t0, t1);
  xorr(t2,  t2, t3);
  xorr(crc, t0, t2);  // Now crc contains the final checksum value.
}

/**
 * @param crc   register containing existing CRC (32-bit)
 * @param buf   register pointing to input byte buffer (byte*)
 * @param len   register containing number of bytes
 * @param table register pointing to CRC table
 *
 * uses R9..R12 as work register. Must be saved/restored by caller!
 */
void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table,
                                        Register t0,  Register t1,  Register t2,  Register t3,
                                        Register tc0, Register tc1, Register tc2, Register tc3,
                                        bool invertCRC) {
  assert_different_registers(crc, buf, len, table);

  Label L_mainLoop, L_tail;
  Register  tmp          = t0;
  Register  data         = t0;
  Register  tmp2         = t1;
  const int mainLoop_stepping  = 4;
  const int tailLoop_stepping  = 1;
  const int log_stepping       = exact_log2(mainLoop_stepping);
  const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
  const int complexThreshold   = 2*mainLoop_stepping;

  // Don't test for len <= 0 here. This pathological case should not occur anyway.
  // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles
  // for all well-behaved cases. The situation itself is detected and handled correctly
  // within update_byteLoop_crc32.
  assert(tailLoop_stepping == 1, "check tailLoop_stepping!");

  BLOCK_COMMENT("kernel_crc32_1word {");

  if (invertCRC) {
    nand(crc, crc, crc);                      // 1s complement of crc
  }

  // Check for short (<mainLoop_stepping) buffer.
  cmpdi(CCR0, len, complexThreshold);
  blt(CCR0, L_tail);

  // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
  // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
  {
    // Align buf addr to mainLoop_stepping boundary.
    neg(tmp2, buf);                           // Calculate # preLoop iterations for alignment.
    rldicl(tmp2, tmp2, 0, 64-log_stepping);   // Rotate tmp2 0 bits, insert into tmp2, anding with mask with 1s from 62..63.

    if (complexThreshold > mainLoop_stepping) {
      sub(len, len, tmp2);                    // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
    } else {
      sub(tmp, len, tmp2);                    // Remaining bytes for main loop.
      cmpdi(CCR0, tmp, mainLoop_stepping);
      blt(CCR0, L_tail);                      // For less than one mainloop_stepping left, do only tail processing
      mr(len, tmp);                           // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
    }
    update_byteLoop_crc32(crc, buf, tmp2, table, data, false);
  }

  srdi(tmp2, len, log_stepping);              // #iterations for mainLoop
  andi(len, len, mainLoop_stepping-1);        // remaining bytes for tailLoop
  mtctr(tmp2);

#ifdef VM_LITTLE_ENDIAN
  Register crc_rv = crc;
#else
  Register crc_rv = tmp;                      // Load_reverse needs separate registers to work on.
                                              // Occupies tmp, but frees up crc.
  load_reverse_32(crc_rv, crc);               // Revert byte order because we are dealing with big-endian data.
  tmp = crc;
#endif

  int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);

  align(mainLoop_alignment);                  // Octoword-aligned loop address. Shows 2% improvement.
  BIND(L_mainLoop);
  update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
  bdnz(L_mainLoop);

#ifndef VM_LITTLE_ENDIAN
  load_reverse_32(crc, crc_rv);               // Revert byte order because we are dealing with big-endian data.
  tmp = crc_rv;                               // Tmp uses it's original register again.
#endif

  // Restore original table address for tailLoop.
  if (reconstructTableOffset != 0) {
    addi(table, table, -reconstructTableOffset);
  }

  // Process last few (<complexThreshold) bytes of buffer.
  BIND(L_tail);
  update_byteLoop_crc32(crc, buf, len, table, data, false);

  if (invertCRC) {
    nand(crc, crc, crc);                      // 1s complement of crc
  }
  BLOCK_COMMENT("} kernel_crc32_1word");
}

/**
 * Plain byte-at-a-time CRC-32 kernel (no word-wise or vector fast path).
 *
 * @param crc   register containing existing CRC (32-bit)
 * @param buf   register pointing to input byte buffer (byte*)
 * @param len   register containing number of bytes
 * @param table register pointing to CRC table
 *
 * Uses R7_ARG5, R8_ARG6 as work registers.
 */
void MacroAssembler::kernel_crc32_1byte(Register crc, Register buf, Register len, Register table,
                                        Register t0,  Register t1,  Register t2,  Register t3,
                                        bool invertCRC) {
  assert_different_registers(crc, buf, len, table);

  Register data = t0;                   // Holds the current byte to be folded into crc.

  BLOCK_COMMENT("kernel_crc32_1byte {");

  if (invertCRC) {
    nand(crc, crc, crc);                // 1s complement of crc
  }

  // Process all bytes in a single-byte loop.
  update_byteLoop_crc32(crc, buf, len, table, data, true);

  if (invertCRC) {
    nand(crc, crc, crc);                // 1s complement of crc
  }
  BLOCK_COMMENT("} kernel_crc32_1byte");
}

/**
 * Vector (vpmsum) CRC-32 kernel. Byte-processes up to the first 16-byte
 * aligned address, runs the aligned vector kernel while enough bytes remain,
 * then byte-processes the tail. Buffers shorter than the threshold are
 * handled entirely by the byte loop.
 *
 * @param crc             register containing existing CRC (32-bit)
 * @param buf             register pointing to input byte buffer (byte*)
 * @param len             register containing number of bytes
 * @param table           register pointing to CRC table
 * @param constants       register pointing to CRC table for 128-bit aligned memory
 * @param t0-t5           temp registers
 */
void MacroAssembler::kernel_crc32_vpmsum(Register crc, Register buf, Register len, Register table,
                                         Register constants, Register t0, Register t1, Register t2,
                                         Register t3, Register t4, Register t5, bool invertCRC) {
  assert_different_registers(crc, buf, len, table);

  Label L_tail;

  BLOCK_COMMENT("kernel_crc32_vpmsum {");

  if (invertCRC) {
    nand(crc, crc, crc);                      // 1s complement of crc
  }

  // Enforce 32 bit.
  clrldi(len, len, 32);

  // Align if we have enough bytes for the fast version.
  const int alignment = 16,
            threshold = 32;
  Register prealign = t0;

  neg(prealign, buf);
  addi(t1, len, -threshold);
  andi(prealign, prealign, alignment - 1);    // Bytes needed to reach 16-byte alignment.
  cmpw(CCR0, t1, prealign);
  blt(CCR0, L_tail);                          // len - prealign < threshold?

  subf(len, prealign, len);
  update_byteLoop_crc32(crc, buf, prealign, table, t2, false);

  // Calculate from first aligned address as far as possible.
  kernel_crc32_vpmsum_aligned(crc, buf, len, constants, t0, t1, t2, t3, t4, t5);

  // Remaining bytes.
  BIND(L_tail);
  update_byteLoop_crc32(crc, buf, len, table, t2, false);

  if (invertCRC) {
    nand(crc, crc, crc);                      // 1s complement of crc
  }

  BLOCK_COMMENT("} kernel_crc32_vpmsum");
}

/**
 * @param crc             register containing existing CRC (32-bit)
 * @param buf             register pointing to input byte buffer (byte*)
 * @param len             register containing number of bytes (will get updated to remaining bytes)
 * @param constants       register pointing to CRC table for 128-bit aligned memory
 * @param t0-t5           temp registers
 */
void MacroAssembler::kernel_crc32_vpmsum_aligned(Register crc, Register buf, Register len,
    Register constants, Register t0, Register t1, Register t2, Register t3, Register t4, Register t5) {

  // Save non-volatile vector registers (frameless).
  Register offset = t1;
  int offsetInt = 0;
  offsetInt -= 16; li(offset, offsetInt); stvx(VR20, offset, R1_SP);
  offsetInt -= 16; li(offset, offsetInt); stvx(VR21, offset, R1_SP);
  offsetInt -= 16; li(offset, offsetInt); stvx(VR22, offset, R1_SP);
  offsetInt -= 16; li(offset, offsetInt); stvx(VR23, offset, R1_SP);
  offsetInt -= 16; li(offset, offsetInt); stvx(VR24, offset, R1_SP);
  offsetInt -= 16; li(offset, offsetInt); stvx(VR25, offset, R1_SP);
#ifndef VM_LITTLE_ENDIAN
  offsetInt -= 16; li(offset, offsetInt); stvx(VR26, offset, R1_SP);
#endif
  offsetInt -= 8; std(R14, offsetInt, R1_SP);
  offsetInt -= 8; std(R15, offsetInt, R1_SP);
  offsetInt -= 8; std(R16, offsetInt, R1_SP);

  // Implementation uses an inner loop which uses between 256 and 16 * unroll_factor
  // bytes per iteration. The basic scheme is:
  // lvx: load vector (Big Endian needs reversal)
  // vpmsumw: carry-less 32 bit multiplications with constant representing a large CRC shift
  // vxor: xor partial results together to get unroll_factor2 vectors

  // Outer loop performs the CRC shifts needed to combine the unroll_factor2 vectors.

  // Using 16 * unroll_factor / unroll_factor_2 bytes for constants.
  const int unroll_factor = CRC32_UNROLL_FACTOR,
            unroll_factor2 = CRC32_UNROLL_FACTOR2;

  const int outer_consts_size = (unroll_factor2 - 1) * 16,
            inner_consts_size = (unroll_factor / unroll_factor2) * 16;

  // Support registers.
  Register offs[] = { noreg, t0, t1, t2, t3, t4, t5, crc /* will live in VCRC */ };
  Register num_bytes = R14,
           loop_count = R15,
           cur_const = R16;
  // Constant array for outer loop: unroll_factor2 - 1 registers,
  // Constant array for inner loop: unroll_factor / unroll_factor2 registers.
  VectorRegister consts0[] = { VR16, VR17, VR18, VR19, VR20, VR21, VR22 },
                 consts1[] = { VR23, VR24 };
  // Data register arrays: 2 arrays with unroll_factor2 registers.
  VectorRegister data0[] = { VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7 },
                 data1[] = { VR8, VR9, VR10, VR11, VR12, VR13, VR14, VR15 };

  VectorRegister VCRC = data0[0];
  VectorRegister Vc = VR25;
  VectorRegister swap_bytes = VR26; // Only for Big Endian.

  // We have at least 1 iteration (ensured by caller).
  Label L_outer_loop, L_inner_loop, L_last;

  // If supported set DSCR pre-fetch to deepest.
  if (VM_Version::has_mfdscr()) {
    load_const_optimized(t0, VM_Version::_dscr_val | 7);
    mtdscr(t0);
  }

  mtvrwz(VCRC, crc); // crc lives in VCRC, now

  for (int i = 1; i < unroll_factor2; ++i) {
    li(offs[i], 16 * i);                      // Precompute vector load offsets (multiples of 16).
  }

  // Load consts for outer loop
  lvx(consts0[0], constants);
  for (int i = 1; i < unroll_factor2 - 1; ++i) {
    lvx(consts0[i], offs[i], constants);
  }

  load_const_optimized(num_bytes, 16 * unroll_factor);

  // Reuse data registers outside of the loop.
  VectorRegister Vtmp = data1[0];
  VectorRegister Vtmp2 = data1[1];
  VectorRegister zeroes = data1[2];

  vspltisb(Vtmp, 0);
  vsldoi(VCRC, Vtmp, VCRC, 8); // 96 bit zeroes, 32 bit CRC.

  // Load vector for vpermxor (to xor both 64 bit parts together)
  lvsl(Vtmp, buf);   // 000102030405060708090a0b0c0d0e0f
  vspltisb(Vc, 4);
  vsl(Vc, Vtmp, Vc); // 00102030405060708090a0b0c0d0e0f0
  xxspltd(Vc->to_vsr(), Vc->to_vsr(), 0);
  vor(Vc, Vtmp, Vc); // 001122334455667708192a3b4c5d6e7f

#ifdef VM_LITTLE_ENDIAN
#define BE_swap_bytes(x)
#else
  vspltisb(Vtmp2, 0xf);
  vxor(swap_bytes, Vtmp, Vtmp2);
#define BE_swap_bytes(x) vperm(x, x, x, swap_bytes)
#endif

  cmpd(CCR0, len, num_bytes);
  blt(CCR0, L_last);

  addi(cur_const, constants, outer_consts_size); // Point to consts for inner loop
  load_const_optimized(loop_count, unroll_factor / (2 * unroll_factor2) - 1); // One double-iteration peeled off.

  // ********** Main loop start **********
  align(32);
  bind(L_outer_loop);

  // Begin of unrolled first iteration (no xor).
  lvx(data1[0], buf);
  for (int i = 1; i < unroll_factor2 / 2; ++i) {
    lvx(data1[i], offs[i], buf);
  }
  vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
4316 lvx(consts1[0], cur_const); 4317 mtctr(loop_count); 4318 for (int i = 0; i < unroll_factor2 / 2; ++i) { 4319 BE_swap_bytes(data1[i]); 4320 if (i == 0) { vxor(data1[0], data1[0], VCRC); } // xor in previous CRC. 4321 lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf); 4322 vpmsumw(data0[i], data1[i], consts1[0]); 4323 } 4324 addi(buf, buf, 16 * unroll_factor2); 4325 subf(len, num_bytes, len); 4326 lvx(consts1[1], offs[1], cur_const); 4327 addi(cur_const, cur_const, 32); 4328 // Begin of unrolled second iteration (head). 4329 for (int i = 0; i < unroll_factor2 / 2; ++i) { 4330 BE_swap_bytes(data1[i + unroll_factor2 / 2]); 4331 if (i == 0) { lvx(data1[0], buf); } else { lvx(data1[i], offs[i], buf); } 4332 vpmsumw(data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[0]); 4333 } 4334 for (int i = 0; i < unroll_factor2 / 2; ++i) { 4335 BE_swap_bytes(data1[i]); 4336 lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf); 4337 vpmsumw(data1[i], data1[i], consts1[1]); 4338 } 4339 addi(buf, buf, 16 * unroll_factor2); 4340 4341 // Generate most performance relevant code. Loads + half of the vpmsumw have been generated. 4342 // Double-iteration allows using the 2 constant registers alternatingly. 4343 align(32); 4344 bind(L_inner_loop); 4345 for (int j = 1; j < 3; ++j) { // j < unroll_factor / unroll_factor2 - 1 for complete unrolling. 4346 if (j & 1) { 4347 lvx(consts1[0], cur_const); 4348 } else { 4349 lvx(consts1[1], offs[1], cur_const); 4350 addi(cur_const, cur_const, 32); 4351 } 4352 for (int i = 0; i < unroll_factor2; ++i) { 4353 int idx = i + unroll_factor2 / 2, inc = 0; // For modulo-scheduled input. 
4354 if (idx >= unroll_factor2) { idx -= unroll_factor2; inc = 1; } 4355 BE_swap_bytes(data1[idx]); 4356 vxor(data0[i], data0[i], data1[i]); 4357 if (i == 0) lvx(data1[0], buf); else lvx(data1[i], offs[i], buf); 4358 vpmsumw(data1[idx], data1[idx], consts1[(j + inc) & 1]); 4359 } 4360 addi(buf, buf, 16 * unroll_factor2); 4361 } 4362 bdnz(L_inner_loop); 4363 4364 addi(cur_const, constants, outer_consts_size); // Reset 4365 4366 // Tail of last iteration (no loads). 4367 for (int i = 0; i < unroll_factor2 / 2; ++i) { 4368 BE_swap_bytes(data1[i + unroll_factor2 / 2]); 4369 vxor(data0[i], data0[i], data1[i]); 4370 vpmsumw(data1[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[1]); 4371 } 4372 for (int i = 0; i < unroll_factor2 / 2; ++i) { 4373 vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]); // First half of fixup shifts. 4374 vxor(data0[i + unroll_factor2 / 2], data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2]); 4375 } 4376 4377 // Last data register is ok, other ones need fixup shift. 4378 for (int i = unroll_factor2 / 2; i < unroll_factor2 - 1; ++i) { 4379 vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]); 4380 } 4381 4382 // Combine to 128 bit result vector VCRC = data0[0]. 4383 for (int i = 1; i < unroll_factor2; i<<=1) { 4384 for (int j = 0; j <= unroll_factor2 - 2*i; j+=2*i) { 4385 vxor(data0[j], data0[j], data0[j+i]); 4386 } 4387 } 4388 cmpd(CCR0, len, num_bytes); 4389 bge(CCR0, L_outer_loop); 4390 4391 // Last chance with lower num_bytes. 4392 bind(L_last); 4393 srdi(loop_count, len, exact_log2(16 * 2 * unroll_factor2)); // Use double-iterations. 4394 // Point behind last const for inner loop. 4395 add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size); 4396 sldi(R0, loop_count, exact_log2(16 * 2)); // Bytes of constants to be used. 4397 clrrdi(num_bytes, len, exact_log2(16 * 2 * unroll_factor2)); 4398 subf(cur_const, R0, cur_const); // Point to constant to be used first. 
4399 4400 addic_(loop_count, loop_count, -1); // One double-iteration peeled off. 4401 bgt(CCR0, L_outer_loop); 4402 // ********** Main loop end ********** 4403 4404 // Restore DSCR pre-fetch value. 4405 if (VM_Version::has_mfdscr()) { 4406 load_const_optimized(t0, VM_Version::_dscr_val); 4407 mtdscr(t0); 4408 } 4409 4410 // ********** Simple loop for remaining 16 byte blocks ********** 4411 { 4412 Label L_loop, L_done; 4413 4414 srdi_(t0, len, 4); // 16 bytes per iteration 4415 clrldi(len, len, 64-4); 4416 beq(CCR0, L_done); 4417 4418 // Point to const (same as last const for inner loop). 4419 add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size - 16); 4420 mtctr(t0); 4421 lvx(Vtmp2, cur_const); 4422 4423 align(32); 4424 bind(L_loop); 4425 4426 lvx(Vtmp, buf); 4427 addi(buf, buf, 16); 4428 vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result. 4429 BE_swap_bytes(Vtmp); 4430 vxor(VCRC, VCRC, Vtmp); 4431 vpmsumw(VCRC, VCRC, Vtmp2); 4432 bdnz(L_loop); 4433 4434 bind(L_done); 4435 } 4436 // ********** Simple loop end ********** 4437 #undef BE_swap_bytes 4438 4439 // Point to Barrett constants 4440 add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size); 4441 4442 vspltisb(zeroes, 0); 4443 4444 // Combine to 64 bit result. 4445 vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result. 4446 4447 // Reduce to 32 bit CRC: Remainder by multiply-high. 4448 lvx(Vtmp, cur_const); 4449 vsldoi(Vtmp2, zeroes, VCRC, 12); // Extract high 32 bit. 4450 vpmsumd(Vtmp2, Vtmp2, Vtmp); // Multiply by inverse long poly. 4451 vsldoi(Vtmp2, zeroes, Vtmp2, 12); // Extract high 32 bit. 4452 vsldoi(Vtmp, zeroes, Vtmp, 8); 4453 vpmsumd(Vtmp2, Vtmp2, Vtmp); // Multiply quotient by long poly. 4454 vxor(VCRC, VCRC, Vtmp2); // Remainder fits into 32 bit. 4455 4456 // Move result. len is already updated. 
4457 vsldoi(VCRC, VCRC, zeroes, 8); 4458 mfvrd(crc, VCRC); 4459 4460 // Restore non-volatile Vector registers (frameless). 4461 offsetInt = 0; 4462 offsetInt -= 16; li(offset, offsetInt); lvx(VR20, offset, R1_SP); 4463 offsetInt -= 16; li(offset, offsetInt); lvx(VR21, offset, R1_SP); 4464 offsetInt -= 16; li(offset, offsetInt); lvx(VR22, offset, R1_SP); 4465 offsetInt -= 16; li(offset, offsetInt); lvx(VR23, offset, R1_SP); 4466 offsetInt -= 16; li(offset, offsetInt); lvx(VR24, offset, R1_SP); 4467 offsetInt -= 16; li(offset, offsetInt); lvx(VR25, offset, R1_SP); 4468 #ifndef VM_LITTLE_ENDIAN 4469 offsetInt -= 16; li(offset, offsetInt); lvx(VR26, offset, R1_SP); 4470 #endif 4471 offsetInt -= 8; ld(R14, offsetInt, R1_SP); 4472 offsetInt -= 8; ld(R15, offsetInt, R1_SP); 4473 offsetInt -= 8; ld(R16, offsetInt, R1_SP); 4474 } 4475 4476 void MacroAssembler::crc32(Register crc, Register buf, Register len, Register t0, Register t1, Register t2, 4477 Register t3, Register t4, Register t5, Register t6, Register t7, bool is_crc32c) { 4478 load_const_optimized(t0, is_crc32c ? StubRoutines::crc32c_table_addr() 4479 : StubRoutines::crc_table_addr() , R0); 4480 4481 if (VM_Version::has_vpmsumb()) { 4482 load_const_optimized(t1, is_crc32c ? StubRoutines::ppc64::crc32c_constants() 4483 : StubRoutines::ppc64::crc_constants() , R0); 4484 kernel_crc32_vpmsum(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, !is_crc32c); 4485 } else { 4486 kernel_crc32_1word(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, t0, !is_crc32c); 4487 } 4488 } 4489 4490 void MacroAssembler::kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp, bool invertCRC) { 4491 assert_different_registers(crc, buf, /* len, not used!! */ table, tmp); 4492 4493 BLOCK_COMMENT("kernel_crc32_singleByte:"); 4494 if (invertCRC) { 4495 nand(crc, crc, crc); // 1s complement of crc 4496 } 4497 4498 lbz(tmp, 0, buf); // Byte from buffer, zero-extended. 
4499 update_byte_crc32(crc, tmp, table); 4500 4501 if (invertCRC) { 4502 nand(crc, crc, crc); // 1s complement of crc 4503 } 4504 } 4505 4506 void MacroAssembler::kernel_crc32_singleByteReg(Register crc, Register val, Register table, bool invertCRC) { 4507 assert_different_registers(crc, val, table); 4508 4509 BLOCK_COMMENT("kernel_crc32_singleByteReg:"); 4510 if (invertCRC) { 4511 nand(crc, crc, crc); // 1s complement of crc 4512 } 4513 4514 update_byte_crc32(crc, val, table); 4515 4516 if (invertCRC) { 4517 nand(crc, crc, crc); // 1s complement of crc 4518 } 4519 } 4520 4521 // dest_lo += src1 + src2 4522 // dest_hi += carry1 + carry2 4523 void MacroAssembler::add2_with_carry(Register dest_hi, 4524 Register dest_lo, 4525 Register src1, Register src2) { 4526 li(R0, 0); 4527 addc(dest_lo, dest_lo, src1); 4528 adde(dest_hi, dest_hi, R0); 4529 addc(dest_lo, dest_lo, src2); 4530 adde(dest_hi, dest_hi, R0); 4531 } 4532 4533 // Multiply 64 bit by 64 bit first loop. 4534 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, 4535 Register x_xstart, 4536 Register y, Register y_idx, 4537 Register z, 4538 Register carry, 4539 Register product_high, Register product, 4540 Register idx, Register kdx, 4541 Register tmp) { 4542 // jlong carry, x[], y[], z[]; 4543 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) { 4544 // huge_128 product = y[idx] * x[xstart] + carry; 4545 // z[kdx] = (jlong)product; 4546 // carry = (jlong)(product >>> 64); 4547 // } 4548 // z[xstart] = carry; 4549 4550 Label L_first_loop, L_first_loop_exit; 4551 Label L_one_x, L_one_y, L_multiply; 4552 4553 addic_(xstart, xstart, -1); 4554 blt(CCR0, L_one_x); // Special case: length of x is 1. 4555 4556 // Load next two integers of x. 
4557 sldi(tmp, xstart, LogBytesPerInt); 4558 ldx(x_xstart, x, tmp); 4559 #ifdef VM_LITTLE_ENDIAN 4560 rldicl(x_xstart, x_xstart, 32, 0); 4561 #endif 4562 4563 align(32, 16); 4564 bind(L_first_loop); 4565 4566 cmpdi(CCR0, idx, 1); 4567 blt(CCR0, L_first_loop_exit); 4568 addi(idx, idx, -2); 4569 beq(CCR0, L_one_y); 4570 4571 // Load next two integers of y. 4572 sldi(tmp, idx, LogBytesPerInt); 4573 ldx(y_idx, y, tmp); 4574 #ifdef VM_LITTLE_ENDIAN 4575 rldicl(y_idx, y_idx, 32, 0); 4576 #endif 4577 4578 4579 bind(L_multiply); 4580 multiply64(product_high, product, x_xstart, y_idx); 4581 4582 li(tmp, 0); 4583 addc(product, product, carry); // Add carry to result. 4584 adde(product_high, product_high, tmp); // Add carry of the last addition. 4585 addi(kdx, kdx, -2); 4586 4587 // Store result. 4588 #ifdef VM_LITTLE_ENDIAN 4589 rldicl(product, product, 32, 0); 4590 #endif 4591 sldi(tmp, kdx, LogBytesPerInt); 4592 stdx(product, z, tmp); 4593 mr_if_needed(carry, product_high); 4594 b(L_first_loop); 4595 4596 4597 bind(L_one_y); // Load one 32 bit portion of y as (0,value). 4598 4599 lwz(y_idx, 0, y); 4600 b(L_multiply); 4601 4602 4603 bind(L_one_x); // Load one 32 bit portion of x as (0,value). 4604 4605 lwz(x_xstart, 0, x); 4606 b(L_first_loop); 4607 4608 bind(L_first_loop_exit); 4609 } 4610 4611 // Multiply 64 bit by 64 bit and add 128 bit. 
void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y,
                                            Register z, Register yz_idx,
                                            Register idx, Register carry,
                                            Register product_high, Register product,
                                            Register tmp, int offset) {

  //  huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
  //  z[kdx] = (jlong)product;

  // Index of the 64-bit element: idx ints scaled, plus optional byte offset.
  sldi(tmp, idx, LogBytesPerInt);
  if (offset) {
    addi(tmp, tmp, offset);
  }
  ldx(yz_idx, y, tmp);
#ifdef VM_LITTLE_ENDIAN
  rldicl(yz_idx, yz_idx, 32, 0); // Swap 32-bit halves to get big-endian int order.
#endif

  multiply64(product_high, product, x_xstart, yz_idx);
  ldx(yz_idx, z, tmp);
#ifdef VM_LITTLE_ENDIAN
  rldicl(yz_idx, yz_idx, 32, 0);
#endif

  add2_with_carry(product_high, product, carry, yz_idx);

  // Recompute the store index in case tmp aliases R0 (add2_with_carry kills R0).
  sldi(tmp, idx, LogBytesPerInt);
  if (offset) {
    addi(tmp, tmp, offset);
  }
#ifdef VM_LITTLE_ENDIAN
  rldicl(product, product, 32, 0); // Swap halves back before storing.
#endif
  stdx(product, z, tmp);
}

// Multiply 128 bit by 128 bit. Unrolled inner loop.
void MacroAssembler::multiply_128_x_128_loop(Register x_xstart,
                                             Register y, Register z,
                                             Register yz_idx, Register idx, Register carry,
                                             Register product_high, Register product,
                                             Register carry2, Register tmp) {

  //  jlong carry, x[], y[], z[];
  //  int kdx = ystart+1;
  //  for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
  //    huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
  //    z[kdx+idx+1] = (jlong)product;
  //    jlong carry2 = (jlong)(product >>> 64);
  //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
  //    z[kdx+idx] = (jlong)product;
  //    carry = (jlong)(product >>> 64);
  //  }
  //  idx += 2;
  //  if (idx > 0) {
  //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
  //    z[kdx+idx] = (jlong)product;
  //    carry = (jlong)(product >>> 64);
  //  }

  Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
  const Register jdx = R0;

  // Scale the index (4 ints per unrolled iteration).
  srdi_(jdx, idx, 2);
  beq(CCR0, L_third_loop_exit);
  mtctr(jdx);

  align(32, 16);
  bind(L_third_loop);

  addi(idx, idx, -4);

  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8);
  mr_if_needed(carry2, product_high);

  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0);
  mr_if_needed(carry, product_high);
  bdnz(L_third_loop);

  bind(L_third_loop_exit);  // Handle any left-over operand parts.

  andi_(idx, idx, 0x3);
  beq(CCR0, L_post_third_loop_done);

  Label L_check_1;

  addic_(idx, idx, -2);
  blt(CCR0, L_check_1);

  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0);
  mr_if_needed(carry, product_high);

  bind(L_check_1);

  addi(idx, idx, 0x2);
  andi_(idx, idx, 0x1);
  addic_(idx, idx, -1);
  blt(CCR0, L_post_third_loop_done);

  // One last 32-bit element.
  sldi(tmp, idx, LogBytesPerInt);
  lwzx(yz_idx, y, tmp);
  multiply64(product_high, product, x_xstart, yz_idx);
  lwzx(yz_idx, z, tmp);

  add2_with_carry(product_high, product, yz_idx, carry);

  sldi(tmp, idx, LogBytesPerInt);
  stwx(product, z, tmp);
  srdi(product, product, 32);

  sldi(product_high, product_high, 32);
  orr(product, product, product_high);
  mr_if_needed(carry, product);

  bind(L_post_third_loop_done);
}   // multiply_128_x_128_loop

// out[] += in[] * k, 32-bit elements, processed from the high end.
// Returns the final carry in 'carry'. Uses CTR as loop counter.
void MacroAssembler::muladd(Register out, Register in,
                            Register offset, Register len, Register k,
                            Register tmp1, Register tmp2, Register carry) {

  // Labels
  Label LOOP, SKIP;

  // Make sure length is positive.
  cmpdi (CCR0, len, 0);

  // Prepare variables
  subi (offset, offset, 4);
  li (carry, 0);
  ble (CCR0, SKIP);

  mtctr (len);
  subi (len, len, 1 );
  sldi (len, len, 2 );

  // Main loop
  bind(LOOP);
  lwzx (tmp1, len, in );
  lwzx (tmp2, offset, out );
  mulld (tmp1, tmp1, k );
  add (tmp2, carry, tmp2 );
  add (tmp2, tmp1, tmp2 );
  stwx (tmp2, offset, out );
  srdi (carry, tmp2, 32 );
  subi (offset, offset, 4 );
  subi (len, len, 4 );
  bdnz (LOOP);
  bind(SKIP);
}

// Full multi-precision multiplication z = x * y (BigInteger::multiplyToLen intrinsic).
void MacroAssembler::multiply_to_len(Register x, Register xlen,
                                     Register y, Register ylen,
                                     Register z, Register zlen,
                                     Register tmp1, Register tmp2,
                                     Register tmp3, Register tmp4,
                                     Register tmp5, Register tmp6,
                                     Register tmp7, Register tmp8,
                                     Register tmp9, Register tmp10,
                                     Register tmp11, Register tmp12,
                                     Register tmp13) {

  ShortBranchVerifier sbv(this);

  assert_different_registers(x, xlen, y, ylen, z, zlen,
                             tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
  assert_different_registers(x, xlen, y, ylen, z, zlen,
                             tmp1, tmp2, tmp3, tmp4, tmp5, tmp7);
  assert_different_registers(x, xlen, y, ylen, z, zlen,
                             tmp1, tmp2, tmp3, tmp4, tmp5, tmp8);

  const Register idx = tmp1;
  const Register kdx = tmp2;
  const Register xstart = tmp3;

  const Register y_idx = tmp4;
  const Register carry = tmp5;
  const Register product = tmp6;
  const Register product_high = tmp7;
  const Register x_xstart = tmp8;
  const Register tmp = tmp9;

  // First Loop.
  //
  //  final static long LONG_MASK = 0xffffffffL;
  //  int xstart = xlen - 1;
  //  int ystart = ylen - 1;
  //  long carry = 0;
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
  //    z[kdx] = (int)product;
  //    carry = product >>> 32;
  //  }
  //  z[xstart] = (int)carry;

  mr_if_needed(idx, ylen);        // idx = ylen
  mr_if_needed(kdx, zlen);        // kdx = xlen + ylen
  li(carry, 0);                   // carry = 0

  Label L_done;

  addic_(xstart, xlen, -1);
  blt(CCR0, L_done);

  multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z,
                        carry, product_high, product, idx, kdx, tmp);

  Label L_second_loop;

  cmpdi(CCR0, kdx, 0);
  beq(CCR0, L_second_loop);

  Label L_carry;

  addic_(kdx, kdx, -1);
  beq(CCR0, L_carry);

  // Store lower 32 bits of carry.
  sldi(tmp, kdx, LogBytesPerInt);
  stwx(carry, z, tmp);
  srdi(carry, carry, 32);
  addi(kdx, kdx, -1);


  bind(L_carry);

  // Store upper 32 bits of carry.
  sldi(tmp, kdx, LogBytesPerInt);
  stwx(carry, z, tmp);

  // Second and third (nested) loops.
  //
  //  for (int i = xstart-1; i >= 0; i--) { // Second loop
  //    carry = 0;
  //    for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
  //      long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
  //                     (z[k] & LONG_MASK) + carry;
  //      z[k] = (int)product;
  //      carry = product >>> 32;
  //    }
  //    z[i] = (int)carry;
  //  }
  //
  //  i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx

  bind(L_second_loop);

  li(carry, 0);                   // carry = 0;

  addic_(xstart, xstart, -1);     // i = xstart-1;
  blt(CCR0, L_done);

  Register zsave = tmp10;

  mr(zsave, z);


  Label L_last_x;

  sldi(tmp, xstart, LogBytesPerInt);
  add(z, z, tmp);                 // z = z + k - j
  addi(z, z, 4);
  addic_(xstart, xstart, -1);     // i = xstart-1;
  blt(CCR0, L_last_x);

  sldi(tmp, xstart, LogBytesPerInt);
  ldx(x_xstart, x, tmp);
#ifdef VM_LITTLE_ENDIAN
  rldicl(x_xstart, x_xstart, 32, 0);
#endif


  Label L_third_loop_prologue;

  bind(L_third_loop_prologue);

  // Save registers clobbered by the inner loop call below.
  Register xsave = tmp11;
  Register xlensave = tmp12;
  Register ylensave = tmp13;

  mr(xsave, x);
  mr(xlensave, xstart);
  mr(ylensave, ylen);


  multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen,
                          carry, product_high, product, x, tmp);

  mr(z, zsave);
  mr(x, xsave);
  mr(xlen, xlensave);   // This is the decrement of the loop counter!
  mr(ylen, ylensave);

  addi(tmp3, xlen, 1);
  sldi(tmp, tmp3, LogBytesPerInt);
  stwx(carry, z, tmp);
  addic_(tmp3, tmp3, -1);
  blt(CCR0, L_done);

  srdi(carry, carry, 32);
  sldi(tmp, tmp3, LogBytesPerInt);
  stwx(carry, z, tmp);
  b(L_second_loop);

  // Next infrequent code is moved outside loops.
  bind(L_last_x);

  lwz(x_xstart, 0, x);
  b(L_third_loop_prologue);

  bind(L_done);
}   // multiply_to_len

// Emit a conditional stop. Branches on the condition in CCR0, which the caller
// must have set before calling; no-op in product builds.
void MacroAssembler::asm_assert(bool check_equal, const char *msg, int id) {
#ifdef ASSERT
  Label ok;
  if (check_equal) {
    beq(CCR0, ok);
  } else {
    bne(CCR0, ok);
  }
  stop(msg, id);
  bind(ok);
#endif
}

// Assert that the 4- or 8-byte memory word at mem_base+mem_offset is zero
// (or non-zero, if check_equal is false). Kills R0 and CCR0. Debug builds only.
void MacroAssembler::asm_assert_mems_zero(bool check_equal, int size, int mem_offset,
                                          Register mem_base, const char* msg, int id) {
#ifdef ASSERT
  switch (size) {
    case 4:
      lwz(R0, mem_offset, mem_base);
      cmpwi(CCR0, R0, 0);
      break;
    case 8:
      ld(R0, mem_offset, mem_base);
      cmpdi(CCR0, R0, 0);
      break;
    default:
      ShouldNotReachHere();
  }
  asm_assert(check_equal, msg, id);
#endif // ASSERT
}

void MacroAssembler::verify_thread() {
  if (VerifyThread) {
    unimplemented("'VerifyThread' currently not implemented on PPC");
  }
}

// READ: oop. KILL: R0. Volatile floats perhaps.
void MacroAssembler::verify_oop(Register oop, const char* msg) {
  if (!VerifyOops) {
    return;
  }

  address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
  const Register tmp = R11; // Will be preserved.
  const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
  save_volatile_gprs(R1_SP, -nbytes_save); // except R0

  mr_if_needed(R4_ARG2, oop);
  save_LR_CR(tmp); // save in old frame
  push_frame_reg_args(nbytes_save, tmp);
  // load FunctionDescriptor** / entry_address *
  load_const_optimized(tmp, fd, R0);
  // load FunctionDescriptor* / entry_address
  ld(tmp, 0, tmp);
  load_const_optimized(R3_ARG1, (address)msg, R0);
  // Call destination for its side effect.
  call_c(tmp);

  pop_frame();
  restore_LR_CR(tmp);
  restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
}

// Like verify_oop, but the oop is loaded from memory at offs(base).
void MacroAssembler::verify_oop_addr(RegisterOrConstant offs, Register base, const char* msg) {
  if (!VerifyOops) {
    return;
  }

  address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
  const Register tmp = R11; // Will be preserved.
  const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
  save_volatile_gprs(R1_SP, -nbytes_save); // except R0

  ld(R4_ARG2, offs, base);
  save_LR_CR(tmp); // save in old frame
  push_frame_reg_args(nbytes_save, tmp);
  // load FunctionDescriptor** / entry_address *
  load_const_optimized(tmp, fd, R0);
  // load FunctionDescriptor* / entry_address
  ld(tmp, 0, tmp);
  load_const_optimized(R3_ARG1, (address)msg, R0);
  // Call destination for its side effect.
  call_c(tmp);

  pop_frame();
  restore_LR_CR(tmp);
  restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
}

// Printable names for the stop types passed to stop_on_request.
const char* stop_types[] = {
  "stop",
  "untested",
  "unimplemented",
  "shouldnotreachhere"
};

// Runtime target of MacroAssembler::stop: print the message and abort the VM.
static void stop_on_request(int tp, const char* msg) {
  tty->print("PPC assembly code requires stop: (%s) %s\n", stop_types[tp%/*stop_end*/4], msg);
  guarantee(false, "PPC assembly code requires stop: %s", msg);
}

// Call a C-function that prints output.
// Emit code that calls stop_on_request (which aborts) and then traps. The id is
// emitted into the instruction stream right after the trap instruction.
void MacroAssembler::stop(int type, const char* msg, int id) {
#ifndef PRODUCT
  block_comment(err_msg("stop: %s %s {", stop_types[type%stop_end], msg));
#else
  block_comment("stop {");
#endif

  // setup arguments
  load_const_optimized(R3_ARG1, type);
  load_const_optimized(R4_ARG2, (void *)msg, /*tmp=*/R0);
  call_VM_leaf(CAST_FROM_FN_PTR(address, stop_on_request), R3_ARG1, R4_ARG2);
  illtrap();
  emit_int32(id);
  block_comment("} stop;");
}

#ifndef PRODUCT
// Write pattern 0x0101010101010101 in memory region [low-before, high+after].
// Val, addr are temp registers.
// If low == addr, addr is killed.
// High is preserved.
void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) {
  if (!ZapMemory) return;

  assert_different_registers(low, val);

  BLOCK_COMMENT("zap memory region {");
  load_const_optimized(val, 0x0101010101010101);
  int size = before + after;
  if (low == high && size < 5 && size > 0) {
    // Small fixed region: emit straight-line stores.
    int offset = -before*BytesPerWord;
    for (int i = 0; i < size; ++i) {
      std(val, offset, low);
      offset += (1*BytesPerWord);
    }
  } else {
    // General case: loop from low-before up to high+after.
    addi(addr, low, -before*BytesPerWord);
    assert_different_registers(high, val);
    if (after) addi(high, high, after * BytesPerWord);
    Label loop;
    bind(loop);
    std(val, 0, addr);
    addi(addr, addr, 8);
    cmpd(CCR6, addr, high);
    ble(CCR6, loop);
    if (after) addi(high, high, -after * BytesPerWord);  // Correct back to old value.
  }
  BLOCK_COMMENT("} zap memory region");
}

#endif // !PRODUCT

// Emit a compare of *flag_addr against zero and a branch to label when equal.
void SkipIfEqualZero::skip_to_label_if_equal_zero(MacroAssembler* masm, Register temp,
                                                  const bool* flag_addr, Label& label) {
  int simm16_offset = masm->load_const_optimized(temp, (address)flag_addr, R0, true);
  assert(sizeof(bool) == 1, "PowerPC ABI");
  masm->lbz(temp, simm16_offset, temp);
  masm->cmpwi(CCR0, temp, 0);
  masm->beq(CCR0, label);
}

// RAII helper: constructor emits the skip-branch, destructor binds its target,
// so the code emitted in between is skipped when *flag_addr is zero.
SkipIfEqualZero::SkipIfEqualZero(MacroAssembler* masm, Register temp, const bool* flag_addr) : _masm(masm), _label() {
  skip_to_label_if_equal_zero(masm, temp, flag_addr, _label);
}

SkipIfEqualZero::~SkipIfEqualZero() {
  _masm->bind(_label);
}