1 /* 2 * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved. 3 * Copyright (c) 2012, 2018, SAP SE. All rights reserved. 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. 9 * 10 * This code is distributed in the hope that it will be useful, but WITHOUT 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 * version 2 for more details (a copy is included in the LICENSE file that 14 * accompanied this code). 15 * 16 * You should have received a copy of the GNU General Public License version 17 * 2 along with this work; if not, write to the Free Software Foundation, 18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 19 * 20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 21 * or visit www.oracle.com if you need additional information or have any 22 * questions. 23 * 24 */ 25 26 #include "precompiled.hpp" 27 #include "asm/macroAssembler.inline.hpp" 28 #include "compiler/disassembler.hpp" 29 #include "gc/shared/collectedHeap.inline.hpp" 30 #include "gc/shared/barrierSet.hpp" 31 #include "gc/shared/barrierSetAssembler.hpp" 32 #include "interpreter/interpreter.hpp" 33 #include "memory/resourceArea.hpp" 34 #include "nativeInst_ppc.hpp" 35 #include "prims/methodHandles.hpp" 36 #include "runtime/biasedLocking.hpp" 37 #include "runtime/icache.hpp" 38 #include "runtime/interfaceSupport.inline.hpp" 39 #include "runtime/objectMonitor.hpp" 40 #include "runtime/os.hpp" 41 #include "runtime/safepoint.hpp" 42 #include "runtime/safepointMechanism.hpp" 43 #include "runtime/sharedRuntime.hpp" 44 #include "runtime/stubRoutines.hpp" 45 #include "utilities/macros.hpp" 46 #ifdef COMPILER2 47 #include "opto/intrinsicnode.hpp" 48 #endif 49 50 #ifdef PRODUCT 51 #define BLOCK_COMMENT(str) // nothing 52 #else 53 #define BLOCK_COMMENT(str) block_comment(str) 54 #endif 55 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 56 57 #ifdef ASSERT 58 // On RISC, there's no benefit to verifying instruction boundaries. 59 bool AbstractAssembler::pd_check_instruction_mark() { return false; } 60 #endif 61 62 void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) { 63 assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range"); 64 if (Assembler::is_simm(si31, 16)) { 65 ld(d, si31, a); 66 if (emit_filler_nop) nop(); 67 } else { 68 const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31); 69 const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31); 70 addis(d, a, hi); 71 ld(d, lo, d); 72 } 73 } 74 75 void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) { 76 assert_different_registers(d, a); 77 ld_largeoffset_unchecked(d, si31, a, emit_filler_nop); 78 } 79 80 void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base, 81 size_t size_in_bytes, bool is_signed) { 82 switch (size_in_bytes) { 83 case 8: ld(dst, offs, base); break; 84 case 4: is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break; 85 case 2: is_signed ? lha(dst, offs, base) : lhz(dst, offs, base); break; 86 case 1: lbz(dst, offs, base); if (is_signed) extsb(dst, dst); break; // lba doesn't exist :( 87 default: ShouldNotReachHere(); 88 } 89 } 90 91 void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base, 92 size_t size_in_bytes) { 93 switch (size_in_bytes) { 94 case 8: std(dst, offs, base); break; 95 case 4: stw(dst, offs, base); break; 96 case 2: sth(dst, offs, base); break; 97 case 1: stb(dst, offs, base); break; 98 default: ShouldNotReachHere(); 99 } 100 } 101 102 void MacroAssembler::align(int modulus, int max, int rem) { 103 int padding = (rem + modulus - (offset() % modulus)) % modulus; 104 if (padding > max) return; 105 for (int c = (padding >> 2); c > 0; --c) { nop(); } 106 } 107 108 // Issue instructions that calculate given TOC from global TOC. 109 void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16, 110 bool add_relocation, bool emit_dummy_addr) { 111 int offset = -1; 112 if (emit_dummy_addr) { 113 offset = -128; // dummy address 114 } else if (addr != (address)(intptr_t)-1) { 115 offset = MacroAssembler::offset_to_global_toc(addr); 116 } 117 118 if (hi16) { 119 addis(dst, R29_TOC, MacroAssembler::largeoffset_si16_si16_hi(offset)); 120 } 121 if (lo16) { 122 if (add_relocation) { 123 // Relocate at the addi to avoid confusion with a load from the method's TOC. 124 relocate(internal_word_Relocation::spec(addr)); 125 } 126 addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset)); 127 } 128 } 129 130 address MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) { 131 const int offset = MacroAssembler::offset_to_global_toc(addr); 132 133 const address inst2_addr = a; 134 const int inst2 = *(int *)inst2_addr; 135 136 // The relocation points to the second instruction, the addi, 137 // and the addi reads and writes the same register dst. 138 const int dst = inv_rt_field(inst2); 139 assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst"); 140 141 // Now, find the preceding addis which writes to dst. 142 int inst1 = 0; 143 address inst1_addr = inst2_addr - BytesPerInstWord; 144 while (inst1_addr >= bound) { 145 inst1 = *(int *) inst1_addr; 146 if (is_addis(inst1) && inv_rt_field(inst1) == dst) { 147 // Stop, found the addis which writes dst. 148 break; 149 } 150 inst1_addr -= BytesPerInstWord; 151 } 152 153 assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC"); 154 set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset)); 155 set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset)); 156 return inst1_addr; 157 } 158 159 address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) { 160 const address inst2_addr = a; 161 const int inst2 = *(int *)inst2_addr; 162 163 // The relocation points to the second instruction, the addi, 164 // and the addi reads and writes the same register dst. 165 const int dst = inv_rt_field(inst2); 166 assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst"); 167 168 // Now, find the preceding addis which writes to dst. 169 int inst1 = 0; 170 address inst1_addr = inst2_addr - BytesPerInstWord; 171 while (inst1_addr >= bound) { 172 inst1 = *(int *) inst1_addr; 173 if (is_addis(inst1) && inv_rt_field(inst1) == dst) { 174 // stop, found the addis which writes dst 175 break; 176 } 177 inst1_addr -= BytesPerInstWord; 178 } 179 180 assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC"); 181 182 int offset = (get_imm(inst1_addr, 0) << 16) + get_imm(inst2_addr, 0); 183 // -1 is a special case 184 if (offset == -1) { 185 return (address)(intptr_t)-1; 186 } else { 187 return global_toc() + offset; 188 } 189 } 190 191 #ifdef _LP64 192 // Patch compressed oops or klass constants. 193 // Assembler sequence is 194 // 1) compressed oops: 195 // lis rx = const.hi 196 // ori rx = rx | const.lo 197 // 2) compressed klass: 198 // lis rx = const.hi 199 // clrldi rx = rx & 0xFFFFffff // clearMS32b, optional 200 // ori rx = rx | const.lo 201 // Clrldi will be passed by. 202 address MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) { 203 assert(UseCompressedOops, "Should only patch compressed oops"); 204 205 const address inst2_addr = a; 206 const int inst2 = *(int *)inst2_addr; 207 208 // The relocation points to the second instruction, the ori, 209 // and the ori reads and writes the same register dst. 210 const int dst = inv_rta_field(inst2); 211 assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst"); 212 // Now, find the preceding addis which writes to dst. 213 int inst1 = 0; 214 address inst1_addr = inst2_addr - BytesPerInstWord; 215 bool inst1_found = false; 216 while (inst1_addr >= bound) { 217 inst1 = *(int *)inst1_addr; 218 if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; } 219 inst1_addr -= BytesPerInstWord; 220 } 221 assert(inst1_found, "inst is not lis"); 222 223 int xc = (data >> 16) & 0xffff; 224 int xd = (data >> 0) & 0xffff; 225 226 set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo 227 set_imm((int *)inst2_addr, (xd)); // unsigned int 228 return inst1_addr; 229 } 230 231 // Get compressed oop or klass constant. 232 narrowOop MacroAssembler::get_narrow_oop(address a, address bound) { 233 assert(UseCompressedOops, "Should only patch compressed oops"); 234 235 const address inst2_addr = a; 236 const int inst2 = *(int *)inst2_addr; 237 238 // The relocation points to the second instruction, the ori, 239 // and the ori reads and writes the same register dst. 240 const int dst = inv_rta_field(inst2); 241 assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst"); 242 // Now, find the preceding lis which writes to dst. 243 int inst1 = 0; 244 address inst1_addr = inst2_addr - BytesPerInstWord; 245 bool inst1_found = false; 246 247 while (inst1_addr >= bound) { 248 inst1 = *(int *) inst1_addr; 249 if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break;} 250 inst1_addr -= BytesPerInstWord; 251 } 252 assert(inst1_found, "inst is not lis"); 253 254 uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff)); 255 uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16); 256 257 return (int) (xl | xh); 258 } 259 #endif // _LP64 260 261 // Returns true if successful. 262 bool MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a, 263 Register toc, bool fixed_size) { 264 int toc_offset = 0; 265 // Use RelocationHolder::none for the constant pool entry, otherwise 266 // we will end up with a failing NativeCall::verify(x) where x is 267 // the address of the constant pool entry. 268 // FIXME: We should insert relocation information for oops at the constant 269 // pool entries instead of inserting it at the loads; patching of a constant 270 // pool entry should be less expensive. 271 address const_address = address_constant((address)a.value(), RelocationHolder::none); 272 if (const_address == NULL) { return false; } // allocation failure 273 // Relocate at the pc of the load. 274 relocate(a.rspec()); 275 toc_offset = (int)(const_address - code()->consts()->start()); 276 ld_largeoffset_unchecked(dst, toc_offset, toc, fixed_size); 277 return true; 278 } 279 280 bool MacroAssembler::is_load_const_from_method_toc_at(address a) { 281 const address inst1_addr = a; 282 const int inst1 = *(int *)inst1_addr; 283 284 // The relocation points to the ld or the addis. 285 return (is_ld(inst1)) || 286 (is_addis(inst1) && inv_ra_field(inst1) != 0); 287 } 288 289 int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) { 290 assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc"); 291 292 const address inst1_addr = a; 293 const int inst1 = *(int *)inst1_addr; 294 295 if (is_ld(inst1)) { 296 return inv_d1_field(inst1); 297 } else if (is_addis(inst1)) { 298 const int dst = inv_rt_field(inst1); 299 300 // Now, find the succeeding ld which reads and writes to dst. 301 address inst2_addr = inst1_addr + BytesPerInstWord; 302 int inst2 = 0; 303 while (true) { 304 inst2 = *(int *) inst2_addr; 305 if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) { 306 // Stop, found the ld which reads and writes dst. 307 break; 308 } 309 inst2_addr += BytesPerInstWord; 310 } 311 return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2); 312 } 313 ShouldNotReachHere(); 314 return 0; 315 } 316 317 // Get the constant from a `load_const' sequence. 318 long MacroAssembler::get_const(address a) { 319 assert(is_load_const_at(a), "not a load of a constant"); 320 const int *p = (const int*) a; 321 unsigned long x = (((unsigned long) (get_imm(a,0) & 0xffff)) << 48); 322 if (is_ori(*(p+1))) { 323 x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 32); 324 x |= (((unsigned long) (get_imm(a,3) & 0xffff)) << 16); 325 x |= (((unsigned long) (get_imm(a,4) & 0xffff))); 326 } else if (is_lis(*(p+1))) { 327 x |= (((unsigned long) (get_imm(a,2) & 0xffff)) << 32); 328 x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 16); 329 x |= (((unsigned long) (get_imm(a,3) & 0xffff))); 330 } else { 331 ShouldNotReachHere(); 332 return (long) 0; 333 } 334 return (long) x; 335 } 336 337 // Patch the 64 bit constant of a `load_const' sequence. This is a low 338 // level procedure. It neither flushes the instruction cache nor is it 339 // mt safe. 340 void MacroAssembler::patch_const(address a, long x) { 341 assert(is_load_const_at(a), "not a load of a constant"); 342 int *p = (int*) a; 343 if (is_ori(*(p+1))) { 344 set_imm(0 + p, (x >> 48) & 0xffff); 345 set_imm(1 + p, (x >> 32) & 0xffff); 346 set_imm(3 + p, (x >> 16) & 0xffff); 347 set_imm(4 + p, x & 0xffff); 348 } else if (is_lis(*(p+1))) { 349 set_imm(0 + p, (x >> 48) & 0xffff); 350 set_imm(2 + p, (x >> 32) & 0xffff); 351 set_imm(1 + p, (x >> 16) & 0xffff); 352 set_imm(3 + p, x & 0xffff); 353 } else { 354 ShouldNotReachHere(); 355 } 356 } 357 358 AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) { 359 assert(oop_recorder() != NULL, "this assembler needs a Recorder"); 360 int index = oop_recorder()->allocate_metadata_index(obj); 361 RelocationHolder rspec = metadata_Relocation::spec(index); 362 return AddressLiteral((address)obj, rspec); 363 } 364 365 AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) { 366 assert(oop_recorder() != NULL, "this assembler needs a Recorder"); 367 int index = oop_recorder()->find_index(obj); 368 RelocationHolder rspec = metadata_Relocation::spec(index); 369 return AddressLiteral((address)obj, rspec); 370 } 371 372 AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) { 373 assert(oop_recorder() != NULL, "this assembler needs an OopRecorder"); 374 int oop_index = oop_recorder()->allocate_oop_index(obj); 375 return AddressLiteral(address(obj), oop_Relocation::spec(oop_index)); 376 } 377 378 AddressLiteral MacroAssembler::constant_oop_address(jobject obj) { 379 assert(oop_recorder() != NULL, "this assembler needs an OopRecorder"); 380 int oop_index = oop_recorder()->find_index(obj); 381 return AddressLiteral(address(obj), oop_Relocation::spec(oop_index)); 382 } 383 384 RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr, 385 Register tmp, int offset) { 386 intptr_t value = *delayed_value_addr; 387 if (value != 0) { 388 return RegisterOrConstant(value + offset); 389 } 390 391 // Load indirectly to solve generation ordering problem. 392 // static address, no relocation 393 int simm16_offset = load_const_optimized(tmp, delayed_value_addr, noreg, true); 394 ld(tmp, simm16_offset, tmp); // must be aligned ((xa & 3) == 0) 395 396 if (offset != 0) { 397 addi(tmp, tmp, offset); 398 } 399 400 return RegisterOrConstant(tmp); 401 } 402 403 #ifndef PRODUCT 404 void MacroAssembler::pd_print_patched_instruction(address branch) { 405 Unimplemented(); // TODO: PPC port 406 } 407 #endif // ndef PRODUCT 408 409 // Conditional far branch for destinations encodable in 24+2 bits. 410 void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) { 411 412 // If requested by flag optimize, relocate the bc_far as a 413 // runtime_call and prepare for optimizing it when the code gets 414 // relocated. 415 if (optimize == bc_far_optimize_on_relocate) { 416 relocate(relocInfo::runtime_call_type); 417 } 418 419 // variant 2: 420 // 421 // b!cxx SKIP 422 // bxx DEST 423 // SKIP: 424 // 425 426 const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)), 427 opposite_bcond(inv_boint_bcond(boint))); 428 429 // We emit two branches. 430 // First, a conditional branch which jumps around the far branch. 431 const address not_taken_pc = pc() + 2 * BytesPerInstWord; 432 const address bc_pc = pc(); 433 bc(opposite_boint, biint, not_taken_pc); 434 435 const int bc_instr = *(int*)bc_pc; 436 assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition"); 437 assert(opposite_boint == inv_bo_field(bc_instr), "postcondition"); 438 assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))), 439 opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))), 440 "postcondition"); 441 assert(biint == inv_bi_field(bc_instr), "postcondition"); 442 443 // Second, an unconditional far branch which jumps to dest. 444 // Note: target(dest) remembers the current pc (see CodeSection::target) 445 // and returns the current pc if the label is not bound yet; when 446 // the label gets bound, the unconditional far branch will be patched. 447 const address target_pc = target(dest); 448 const address b_pc = pc(); 449 b(target_pc); 450 451 assert(not_taken_pc == pc(), "postcondition"); 452 assert(dest.is_bound() || target_pc == b_pc, "postcondition"); 453 } 454 455 // 1 or 2 instructions 456 void MacroAssembler::bc_far_optimized(int boint, int biint, Label& dest) { 457 if (dest.is_bound() && is_within_range_of_bcxx(target(dest), pc())) { 458 bc(boint, biint, dest); 459 } else { 460 bc_far(boint, biint, dest, MacroAssembler::bc_far_optimize_on_relocate); 461 } 462 } 463 464 bool MacroAssembler::is_bc_far_at(address instruction_addr) { 465 return is_bc_far_variant1_at(instruction_addr) || 466 is_bc_far_variant2_at(instruction_addr) || 467 is_bc_far_variant3_at(instruction_addr); 468 } 469 470 address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) { 471 if (is_bc_far_variant1_at(instruction_addr)) { 472 const address instruction_1_addr = instruction_addr; 473 const int instruction_1 = *(int*)instruction_1_addr; 474 return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr); 475 } else if (is_bc_far_variant2_at(instruction_addr)) { 476 const address instruction_2_addr = instruction_addr + 4; 477 return bxx_destination(instruction_2_addr); 478 } else if (is_bc_far_variant3_at(instruction_addr)) { 479 return instruction_addr + 8; 480 } 481 // variant 4 ??? 482 ShouldNotReachHere(); 483 return NULL; 484 } 485 void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) { 486 487 if (is_bc_far_variant3_at(instruction_addr)) { 488 // variant 3, far cond branch to the next instruction, already patched to nops: 489 // 490 // nop 491 // endgroup 492 // SKIP/DEST: 493 // 494 return; 495 } 496 497 // first, extract boint and biint from the current branch 498 int boint = 0; 499 int biint = 0; 500 501 ResourceMark rm; 502 const int code_size = 2 * BytesPerInstWord; 503 CodeBuffer buf(instruction_addr, code_size); 504 MacroAssembler masm(&buf); 505 if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) { 506 // Far branch to next instruction: Optimize it by patching nops (produce variant 3). 507 masm.nop(); 508 masm.endgroup(); 509 } else { 510 if (is_bc_far_variant1_at(instruction_addr)) { 511 // variant 1, the 1st instruction contains the destination address: 512 // 513 // bcxx DEST 514 // nop 515 // 516 const int instruction_1 = *(int*)(instruction_addr); 517 boint = inv_bo_field(instruction_1); 518 biint = inv_bi_field(instruction_1); 519 } else if (is_bc_far_variant2_at(instruction_addr)) { 520 // variant 2, the 2nd instruction contains the destination address: 521 // 522 // b!cxx SKIP 523 // bxx DEST 524 // SKIP: 525 // 526 const int instruction_1 = *(int*)(instruction_addr); 527 boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))), 528 opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1)))); 529 biint = inv_bi_field(instruction_1); 530 } else { 531 // variant 4??? 532 ShouldNotReachHere(); 533 } 534 535 // second, set the new branch destination and optimize the code 536 if (dest != instruction_addr + 4 && // the bc_far is still unbound! 537 masm.is_within_range_of_bcxx(dest, instruction_addr)) { 538 // variant 1: 539 // 540 // bcxx DEST 541 // nop 542 // 543 masm.bc(boint, biint, dest); 544 masm.nop(); 545 } else { 546 // variant 2: 547 // 548 // b!cxx SKIP 549 // bxx DEST 550 // SKIP: 551 // 552 const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)), 553 opposite_bcond(inv_boint_bcond(boint))); 554 const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord; 555 masm.bc(opposite_boint, biint, not_taken_pc); 556 masm.b(dest); 557 } 558 } 559 ICache::ppc64_flush_icache_bytes(instruction_addr, code_size); 560 } 561 562 // Emit a NOT mt-safe patchable 64 bit absolute call/jump. 563 void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) { 564 // get current pc 565 uint64_t start_pc = (uint64_t) pc(); 566 567 const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last 568 const address pc_of_b = (address) (start_pc + (0*BytesPerInstWord)); // b is first 569 570 // relocate here 571 if (rt != relocInfo::none) { 572 relocate(rt); 573 } 574 575 if ( ReoptimizeCallSequences && 576 (( link && is_within_range_of_b(dest, pc_of_bl)) || 577 (!link && is_within_range_of_b(dest, pc_of_b)))) { 578 // variant 2: 579 // Emit an optimized, pc-relative call/jump. 580 581 if (link) { 582 // some padding 583 nop(); 584 nop(); 585 nop(); 586 nop(); 587 nop(); 588 nop(); 589 590 // do the call 591 assert(pc() == pc_of_bl, "just checking"); 592 bl(dest, relocInfo::none); 593 } else { 594 // do the jump 595 assert(pc() == pc_of_b, "just checking"); 596 b(dest, relocInfo::none); 597 598 // some padding 599 nop(); 600 nop(); 601 nop(); 602 nop(); 603 nop(); 604 nop(); 605 } 606 607 // Assert that we can identify the emitted call/jump. 608 assert(is_bxx64_patchable_variant2_at((address)start_pc, link), 609 "can't identify emitted call"); 610 } else { 611 // variant 1: 612 mr(R0, R11); // spill R11 -> R0. 613 614 // Load the destination address into CTR, 615 // calculate destination relative to global toc. 616 calculate_address_from_global_toc(R11, dest, true, true, false); 617 618 mtctr(R11); 619 mr(R11, R0); // spill R11 <- R0. 620 nop(); 621 622 // do the call/jump 623 if (link) { 624 bctrl(); 625 } else{ 626 bctr(); 627 } 628 // Assert that we can identify the emitted call/jump. 629 assert(is_bxx64_patchable_variant1b_at((address)start_pc, link), 630 "can't identify emitted call"); 631 } 632 633 // Assert that we can identify the emitted call/jump. 634 assert(is_bxx64_patchable_at((address)start_pc, link), 635 "can't identify emitted call"); 636 assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest, 637 "wrong encoding of dest address"); 638 } 639 640 // Identify a bxx64_patchable instruction. 641 bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) { 642 return is_bxx64_patchable_variant1b_at(instruction_addr, link) 643 //|| is_bxx64_patchable_variant1_at(instruction_addr, link) 644 || is_bxx64_patchable_variant2_at(instruction_addr, link); 645 } 646 647 // Does the call64_patchable instruction use a pc-relative encoding of 648 // the call destination? 649 bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) { 650 // variant 2 is pc-relative 651 return is_bxx64_patchable_variant2_at(instruction_addr, link); 652 } 653 654 // Identify variant 1. 655 bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) { 656 unsigned int* instr = (unsigned int*) instruction_addr; 657 return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l] 658 && is_mtctr(instr[5]) // mtctr 659 && is_load_const_at(instruction_addr); 660 } 661 662 // Identify variant 1b: load destination relative to global toc. 663 bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) { 664 unsigned int* instr = (unsigned int*) instruction_addr; 665 return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l] 666 && is_mtctr(instr[3]) // mtctr 667 && is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr); 668 } 669 670 // Identify variant 2. 671 bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) { 672 unsigned int* instr = (unsigned int*) instruction_addr; 673 if (link) { 674 return is_bl (instr[6]) // bl dest is last 675 && is_nop(instr[0]) // nop 676 && is_nop(instr[1]) // nop 677 && is_nop(instr[2]) // nop 678 && is_nop(instr[3]) // nop 679 && is_nop(instr[4]) // nop 680 && is_nop(instr[5]); // nop 681 } else { 682 return is_b (instr[0]) // b dest is first 683 && is_nop(instr[1]) // nop 684 && is_nop(instr[2]) // nop 685 && is_nop(instr[3]) // nop 686 && is_nop(instr[4]) // nop 687 && is_nop(instr[5]) // nop 688 && is_nop(instr[6]); // nop 689 } 690 } 691 692 // Set dest address of a bxx64_patchable instruction. 693 void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) { 694 ResourceMark rm; 695 int code_size = MacroAssembler::bxx64_patchable_size; 696 CodeBuffer buf(instruction_addr, code_size); 697 MacroAssembler masm(&buf); 698 masm.bxx64_patchable(dest, relocInfo::none, link); 699 ICache::ppc64_flush_icache_bytes(instruction_addr, code_size); 700 } 701 702 // Get dest address of a bxx64_patchable instruction. 703 address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) { 704 if (is_bxx64_patchable_variant1_at(instruction_addr, link)) { 705 return (address) (unsigned long) get_const(instruction_addr); 706 } else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) { 707 unsigned int* instr = (unsigned int*) instruction_addr; 708 if (link) { 709 const int instr_idx = 6; // bl is last 710 int branchoffset = branch_destination(instr[instr_idx], 0); 711 return instruction_addr + branchoffset + instr_idx*BytesPerInstWord; 712 } else { 713 const int instr_idx = 0; // b is first 714 int branchoffset = branch_destination(instr[instr_idx], 0); 715 return instruction_addr + branchoffset + instr_idx*BytesPerInstWord; 716 } 717 // Load dest relative to global toc. 718 } else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) { 719 return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, 720 instruction_addr); 721 } else { 722 ShouldNotReachHere(); 723 return NULL; 724 } 725 } 726 727 // Uses ordering which corresponds to ABI: 728 // _savegpr0_14: std r14,-144(r1) 729 // _savegpr0_15: std r15,-136(r1) 730 // _savegpr0_16: std r16,-128(r1) 731 void MacroAssembler::save_nonvolatile_gprs(Register dst, int offset) { 732 std(R14, offset, dst); offset += 8; 733 std(R15, offset, dst); offset += 8; 734 std(R16, offset, dst); offset += 8; 735 std(R17, offset, dst); offset += 8; 736 std(R18, offset, dst); offset += 8; 737 std(R19, offset, dst); offset += 8; 738 std(R20, offset, dst); offset += 8; 739 std(R21, offset, dst); offset += 8; 740 std(R22, offset, dst); offset += 8; 741 std(R23, offset, dst); offset += 8; 742 std(R24, offset, dst); offset += 8; 743 std(R25, offset, dst); offset += 8; 744 std(R26, offset, dst); offset += 8; 745 std(R27, offset, dst); offset += 8; 746 std(R28, offset, dst); offset += 8; 747 std(R29, offset, dst); offset += 8; 748 std(R30, offset, dst); offset += 8; 749 std(R31, offset, dst); offset += 8; 750 751 stfd(F14, offset, dst); offset += 8; 752 stfd(F15, offset, dst); offset += 8; 753 stfd(F16, offset, dst); offset += 8; 754 stfd(F17, offset, dst); offset += 8; 755 stfd(F18, offset, dst); offset += 8; 756 stfd(F19, offset, dst); offset += 8; 757 stfd(F20, offset, dst); offset += 8; 758 stfd(F21, offset, dst); offset += 8; 759 stfd(F22, offset, dst); offset += 8; 760 stfd(F23, offset, dst); offset += 8; 761 stfd(F24, offset, dst); offset += 8; 762 stfd(F25, offset, dst); offset += 8; 763 stfd(F26, offset, dst); offset += 8; 764 stfd(F27, offset, dst); offset += 8; 765 stfd(F28, offset, dst); offset += 8; 766 stfd(F29, offset, dst); offset += 8; 767 stfd(F30, offset, dst); offset += 8; 768 stfd(F31, offset, dst); 769 } 770 771 // Uses ordering which corresponds to ABI: 772 // _restgpr0_14: ld r14,-144(r1) 773 // _restgpr0_15: ld r15,-136(r1) 774 // _restgpr0_16: ld r16,-128(r1) 775 void MacroAssembler::restore_nonvolatile_gprs(Register src, int offset) { 776 ld(R14, offset, src); offset += 8; 777 ld(R15, offset, src); offset += 8; 778 ld(R16, offset, src); offset += 8; 779 ld(R17, offset, src); offset += 8; 780 ld(R18, offset, src); offset += 8; 781 ld(R19, offset, src); offset += 8; 782 ld(R20, offset, src); offset += 8; 783 ld(R21, offset, src); offset += 8; 784 ld(R22, offset, src); offset += 8; 785 ld(R23, offset, src); offset += 8; 786 ld(R24, offset, src); offset += 8; 787 ld(R25, offset, src); offset += 8; 788 ld(R26, offset, src); offset += 8; 789 ld(R27, offset, src); offset += 8; 790 ld(R28, offset, src); offset += 8; 791 ld(R29, offset, src); offset += 8; 792 ld(R30, offset, src); offset += 8; 793 ld(R31, offset, src); offset += 8; 794 795 // FP registers 796 lfd(F14, offset, src); offset += 8; 797 lfd(F15, offset, src); offset += 8; 798 lfd(F16, offset, src); offset += 8; 799 lfd(F17, offset, src); offset += 8; 800 lfd(F18, offset, src); offset += 8; 801 lfd(F19, offset, src); offset += 8; 802 lfd(F20, offset, src); offset += 8; 803 lfd(F21, offset, src); offset += 8; 804 lfd(F22, offset, src); offset += 8; 805 lfd(F23, offset, src); offset += 8; 806 lfd(F24, offset, src); offset += 8; 807 lfd(F25, offset, src); offset += 8; 808 lfd(F26, offset, src); offset += 8; 809 lfd(F27, offset, src); offset += 8; 810 lfd(F28, offset, src); offset += 8; 811 lfd(F29, offset, src); offset += 8; 812 lfd(F30, offset, src); offset += 8; 813 lfd(F31, offset, src); 814 } 815 816 // For verify_oops. 817 void MacroAssembler::save_volatile_gprs(Register dst, int offset) { 818 std(R2, offset, dst); offset += 8; 819 std(R3, offset, dst); offset += 8; 820 std(R4, offset, dst); offset += 8; 821 std(R5, offset, dst); offset += 8; 822 std(R6, offset, dst); offset += 8; 823 std(R7, offset, dst); offset += 8; 824 std(R8, offset, dst); offset += 8; 825 std(R9, offset, dst); offset += 8; 826 std(R10, offset, dst); offset += 8; 827 std(R11, offset, dst); offset += 8; 828 std(R12, offset, dst); offset += 8; 829 830 stfd(F0, offset, dst); offset += 8; 831 stfd(F1, offset, dst); offset += 8; 832 stfd(F2, offset, dst); offset += 8; 833 stfd(F3, offset, dst); offset += 8; 834 stfd(F4, offset, dst); offset += 8; 835 stfd(F5, offset, dst); offset += 8; 836 stfd(F6, offset, dst); offset += 8; 837 stfd(F7, offset, dst); offset += 8; 838 stfd(F8, offset, dst); offset += 8; 839 stfd(F9, offset, dst); offset += 8; 840 stfd(F10, offset, dst); offset += 8; 841 stfd(F11, offset, dst); offset += 8; 842 stfd(F12, offset, dst); offset += 8; 843 stfd(F13, offset, dst); 844 } 845 846 // For verify_oops. 847 void MacroAssembler::restore_volatile_gprs(Register src, int offset) { 848 ld(R2, offset, src); offset += 8; 849 ld(R3, offset, src); offset += 8; 850 ld(R4, offset, src); offset += 8; 851 ld(R5, offset, src); offset += 8; 852 ld(R6, offset, src); offset += 8; 853 ld(R7, offset, src); offset += 8; 854 ld(R8, offset, src); offset += 8; 855 ld(R9, offset, src); offset += 8; 856 ld(R10, offset, src); offset += 8; 857 ld(R11, offset, src); offset += 8; 858 ld(R12, offset, src); offset += 8; 859 860 lfd(F0, offset, src); offset += 8; 861 lfd(F1, offset, src); offset += 8; 862 lfd(F2, offset, src); offset += 8; 863 lfd(F3, offset, src); offset += 8; 864 lfd(F4, offset, src); offset += 8; 865 lfd(F5, offset, src); offset += 8; 866 lfd(F6, offset, src); offset += 8; 867 lfd(F7, offset, src); offset += 8; 868 lfd(F8, offset, src); offset += 8; 869 lfd(F9, offset, src); offset += 8; 870 lfd(F10, offset, src); offset += 8; 871 lfd(F11, offset, src); offset += 8; 872 lfd(F12, offset, src); offset += 8; 873 lfd(F13, offset, src); 874 } 875 876 void MacroAssembler::save_LR_CR(Register tmp) { 877 mfcr(tmp); 878 std(tmp, _abi(cr), R1_SP); 879 mflr(tmp); 880 std(tmp, _abi(lr), R1_SP); 881 // Tmp must contain lr on exit! (see return_addr and prolog in ppc64.ad) 882 } 883 884 void MacroAssembler::restore_LR_CR(Register tmp) { 885 assert(tmp != R1_SP, "must be distinct"); 886 ld(tmp, _abi(lr), R1_SP); 887 mtlr(tmp); 888 ld(tmp, _abi(cr), R1_SP); 889 mtcr(tmp); 890 } 891 892 address MacroAssembler::get_PC_trash_LR(Register result) { 893 Label L; 894 bl(L); 895 bind(L); 896 address lr_pc = pc(); 897 mflr(result); 898 return lr_pc; 899 } 900 901 void MacroAssembler::resize_frame(Register offset, Register tmp) { 902 #ifdef ASSERT 903 assert_different_registers(offset, tmp, R1_SP); 904 andi_(tmp, offset, frame::alignment_in_bytes-1); 905 asm_assert_eq("resize_frame: unaligned", 0x204); 906 #endif 907 908 // tmp <- *(SP) 909 ld(tmp, _abi(callers_sp), R1_SP); 910 // addr <- SP + offset; 911 // *(addr) <- tmp; 912 // SP <- addr 913 stdux(tmp, R1_SP, offset); 914 } 915 916 void MacroAssembler::resize_frame(int offset, Register tmp) { 917 assert(is_simm(offset, 16), "too big an offset"); 918 assert_different_registers(tmp, R1_SP); 919 assert((offset & (frame::alignment_in_bytes-1))==0, "resize_frame: unaligned"); 920 // tmp <- *(SP) 921 ld(tmp, _abi(callers_sp), R1_SP); 922 // addr <- SP + offset; 923 // *(addr) <- tmp; 924 // SP <- addr 925 stdu(tmp, offset, R1_SP); 926 } 927 928 void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) { 929 // (addr == tmp1) || (addr == tmp2) is allowed here! 930 assert(tmp1 != tmp2, "must be distinct"); 931 932 // compute offset w.r.t. current stack pointer 933 // tmp_1 <- addr - SP (!) 934 subf(tmp1, R1_SP, addr); 935 936 // atomically update SP keeping back link. 937 resize_frame(tmp1/* offset */, tmp2/* tmp */); 938 } 939 940 void MacroAssembler::push_frame(Register bytes, Register tmp) { 941 #ifdef ASSERT 942 assert(bytes != R0, "r0 not allowed here"); 943 andi_(R0, bytes, frame::alignment_in_bytes-1); 944 asm_assert_eq("push_frame(Reg, Reg): unaligned", 0x203); 945 #endif 946 neg(tmp, bytes); 947 stdux(R1_SP, R1_SP, tmp); 948 } 949 950 // Push a frame of size `bytes'. 951 void MacroAssembler::push_frame(unsigned int bytes, Register tmp) { 952 long offset = align_addr(bytes, frame::alignment_in_bytes); 953 if (is_simm(-offset, 16)) { 954 stdu(R1_SP, -offset, R1_SP); 955 } else { 956 load_const_optimized(tmp, -offset); 957 stdux(R1_SP, R1_SP, tmp); 958 } 959 } 960 961 // Push a frame of size `bytes' plus abi_reg_args on top. 962 void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) { 963 push_frame(bytes + frame::abi_reg_args_size, tmp); 964 } 965 966 // Setup up a new C frame with a spill area for non-volatile GPRs and 967 // additional space for local variables. 968 void MacroAssembler::push_frame_reg_args_nonvolatiles(unsigned int bytes, 969 Register tmp) { 970 push_frame(bytes + frame::abi_reg_args_size + frame::spill_nonvolatiles_size, tmp); 971 } 972 973 // Pop current C frame. 974 void MacroAssembler::pop_frame() { 975 ld(R1_SP, _abi(callers_sp), R1_SP); 976 } 977 978 #if defined(ABI_ELFv2) 979 address MacroAssembler::branch_to(Register r_function_entry, bool and_link) { 980 // TODO(asmundak): make sure the caller uses R12 as function descriptor 981 // most of the times. 982 if (R12 != r_function_entry) { 983 mr(R12, r_function_entry); 984 } 985 mtctr(R12); 986 // Do a call or a branch. 987 if (and_link) { 988 bctrl(); 989 } else { 990 bctr(); 991 } 992 _last_calls_return_pc = pc(); 993 994 return _last_calls_return_pc; 995 } 996 997 // Call a C function via a function descriptor and use full C 998 // calling conventions. Updates and returns _last_calls_return_pc. 999 address MacroAssembler::call_c(Register r_function_entry) { 1000 return branch_to(r_function_entry, /*and_link=*/true); 1001 } 1002 1003 // For tail calls: only branch, don't link, so callee returns to caller of this function. 1004 address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) { 1005 return branch_to(r_function_entry, /*and_link=*/false); 1006 } 1007 1008 address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) { 1009 load_const(R12, function_entry, R0); 1010 return branch_to(R12, /*and_link=*/true); 1011 } 1012 1013 #else 1014 // Generic version of a call to C function via a function descriptor 1015 // with variable support for C calling conventions (TOC, ENV, etc.). 1016 // Updates and returns _last_calls_return_pc. 1017 address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call, 1018 bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) { 1019 // we emit standard ptrgl glue code here 1020 assert((function_descriptor != R0), "function_descriptor cannot be R0"); 1021 1022 // retrieve necessary entries from the function descriptor 1023 ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor); 1024 mtctr(R0); 1025 1026 if (load_toc_of_callee) { 1027 ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor); 1028 } 1029 if (load_env_of_callee) { 1030 ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor); 1031 } else if (load_toc_of_callee) { 1032 li(R11, 0); 1033 } 1034 1035 // do a call or a branch 1036 if (and_link) { 1037 bctrl(); 1038 } else { 1039 bctr(); 1040 } 1041 _last_calls_return_pc = pc(); 1042 1043 return _last_calls_return_pc; 1044 } 1045 1046 // Call a C function via a function descriptor and use full C calling 1047 // conventions. 1048 // We don't use the TOC in generated code, so there is no need to save 1049 // and restore its value. 1050 address MacroAssembler::call_c(Register fd) { 1051 return branch_to(fd, /*and_link=*/true, 1052 /*save toc=*/false, 1053 /*restore toc=*/false, 1054 /*load toc=*/true, 1055 /*load env=*/true); 1056 } 1057 1058 address MacroAssembler::call_c_and_return_to_caller(Register fd) { 1059 return branch_to(fd, /*and_link=*/false, 1060 /*save toc=*/false, 1061 /*restore toc=*/false, 1062 /*load toc=*/true, 1063 /*load env=*/true); 1064 } 1065 1066 address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) { 1067 if (rt != relocInfo::none) { 1068 // this call needs to be relocatable 1069 if (!ReoptimizeCallSequences 1070 || (rt != relocInfo::runtime_call_type && rt != relocInfo::none) 1071 || fd == NULL // support code-size estimation 1072 || !fd->is_friend_function() 1073 || fd->entry() == NULL) { 1074 // it's not a friend function as defined by class FunctionDescriptor, 1075 // so do a full call-c here. 1076 load_const(R11, (address)fd, R0); 1077 1078 bool has_env = (fd != NULL && fd->env() != NULL); 1079 return branch_to(R11, /*and_link=*/true, 1080 /*save toc=*/false, 1081 /*restore toc=*/false, 1082 /*load toc=*/true, 1083 /*load env=*/has_env); 1084 } else { 1085 // It's a friend function. Load the entry point and don't care about 1086 // toc and env. Use an optimizable call instruction, but ensure the 1087 // same code-size as in the case of a non-friend function. 1088 nop(); 1089 nop(); 1090 nop(); 1091 bl64_patchable(fd->entry(), rt); 1092 _last_calls_return_pc = pc(); 1093 return _last_calls_return_pc; 1094 } 1095 } else { 1096 // This call does not need to be relocatable, do more aggressive 1097 // optimizations. 1098 if (!ReoptimizeCallSequences 1099 || !fd->is_friend_function()) { 1100 // It's not a friend function as defined by class FunctionDescriptor, 1101 // so do a full call-c here. 1102 load_const(R11, (address)fd, R0); 1103 return branch_to(R11, /*and_link=*/true, 1104 /*save toc=*/false, 1105 /*restore toc=*/false, 1106 /*load toc=*/true, 1107 /*load env=*/true); 1108 } else { 1109 // it's a friend function, load the entry point and don't care about 1110 // toc and env. 1111 address dest = fd->entry(); 1112 if (is_within_range_of_b(dest, pc())) { 1113 bl(dest); 1114 } else { 1115 bl64_patchable(dest, rt); 1116 } 1117 _last_calls_return_pc = pc(); 1118 return _last_calls_return_pc; 1119 } 1120 } 1121 } 1122 1123 // Call a C function. All constants needed reside in TOC. 1124 // 1125 // Read the address to call from the TOC. 1126 // Read env from TOC, if fd specifies an env. 1127 // Read new TOC from TOC. 1128 address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd, 1129 relocInfo::relocType rt, Register toc) { 1130 if (!ReoptimizeCallSequences 1131 || (rt != relocInfo::runtime_call_type && rt != relocInfo::none) 1132 || !fd->is_friend_function()) { 1133 // It's not a friend function as defined by class FunctionDescriptor, 1134 // so do a full call-c here. 1135 assert(fd->entry() != NULL, "function must be linked"); 1136 1137 AddressLiteral fd_entry(fd->entry()); 1138 bool success = load_const_from_method_toc(R11, fd_entry, toc, /*fixed_size*/ true); 1139 mtctr(R11); 1140 if (fd->env() == NULL) { 1141 li(R11, 0); 1142 nop(); 1143 } else { 1144 AddressLiteral fd_env(fd->env()); 1145 success = success && load_const_from_method_toc(R11, fd_env, toc, /*fixed_size*/ true); 1146 } 1147 AddressLiteral fd_toc(fd->toc()); 1148 // Set R2_TOC (load from toc) 1149 success = success && load_const_from_method_toc(R2_TOC, fd_toc, toc, /*fixed_size*/ true); 1150 bctrl(); 1151 _last_calls_return_pc = pc(); 1152 if (!success) { return NULL; } 1153 } else { 1154 // It's a friend function, load the entry point and don't care about 1155 // toc and env. Use an optimizable call instruction, but ensure the 1156 // same code-size as in the case of a non-friend function. 1157 nop(); 1158 bl64_patchable(fd->entry(), rt); 1159 _last_calls_return_pc = pc(); 1160 } 1161 return _last_calls_return_pc; 1162 } 1163 #endif // ABI_ELFv2 1164 1165 void MacroAssembler::call_VM_base(Register oop_result, 1166 Register last_java_sp, 1167 address entry_point, 1168 bool check_exceptions) { 1169 BLOCK_COMMENT("call_VM {"); 1170 // Determine last_java_sp register. 1171 if (!last_java_sp->is_valid()) { 1172 last_java_sp = R1_SP; 1173 } 1174 set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1); 1175 1176 // ARG1 must hold thread address. 1177 mr(R3_ARG1, R16_thread); 1178 #if defined(ABI_ELFv2) 1179 address return_pc = call_c(entry_point, relocInfo::none); 1180 #else 1181 address return_pc = call_c((FunctionDescriptor*)entry_point, relocInfo::none); 1182 #endif 1183 1184 reset_last_Java_frame(); 1185 1186 // Check for pending exceptions. 1187 if (check_exceptions) { 1188 // We don't check for exceptions here. 1189 ShouldNotReachHere(); 1190 } 1191 1192 // Get oop result if there is one and reset the value in the thread. 1193 if (oop_result->is_valid()) { 1194 get_vm_result(oop_result); 1195 } 1196 1197 _last_calls_return_pc = return_pc; 1198 BLOCK_COMMENT("} call_VM"); 1199 } 1200 1201 void MacroAssembler::call_VM_leaf_base(address entry_point) { 1202 BLOCK_COMMENT("call_VM_leaf {"); 1203 #if defined(ABI_ELFv2) 1204 call_c(entry_point, relocInfo::none); 1205 #else 1206 call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, entry_point), relocInfo::none); 1207 #endif 1208 BLOCK_COMMENT("} call_VM_leaf"); 1209 } 1210 1211 void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions) { 1212 call_VM_base(oop_result, noreg, entry_point, check_exceptions); 1213 } 1214 1215 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, 1216 bool check_exceptions) { 1217 // R3_ARG1 is reserved for the thread. 1218 mr_if_needed(R4_ARG2, arg_1); 1219 call_VM(oop_result, entry_point, check_exceptions); 1220 } 1221 1222 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, 1223 bool check_exceptions) { 1224 // R3_ARG1 is reserved for the thread 1225 mr_if_needed(R4_ARG2, arg_1); 1226 assert(arg_2 != R4_ARG2, "smashed argument"); 1227 mr_if_needed(R5_ARG3, arg_2); 1228 call_VM(oop_result, entry_point, check_exceptions); 1229 } 1230 1231 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3, 1232 bool check_exceptions) { 1233 // R3_ARG1 is reserved for the thread 1234 mr_if_needed(R4_ARG2, arg_1); 1235 assert(arg_2 != R4_ARG2, "smashed argument"); 1236 mr_if_needed(R5_ARG3, arg_2); 1237 mr_if_needed(R6_ARG4, arg_3); 1238 call_VM(oop_result, entry_point, check_exceptions); 1239 } 1240 1241 void MacroAssembler::call_VM_leaf(address entry_point) { 1242 call_VM_leaf_base(entry_point); 1243 } 1244 1245 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) { 1246 mr_if_needed(R3_ARG1, arg_1); 1247 call_VM_leaf(entry_point); 1248 } 1249 1250 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) { 1251 mr_if_needed(R3_ARG1, arg_1); 1252 assert(arg_2 != R3_ARG1, "smashed argument"); 1253 mr_if_needed(R4_ARG2, arg_2); 1254 call_VM_leaf(entry_point); 1255 } 1256 1257 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) { 1258 mr_if_needed(R3_ARG1, arg_1); 1259 assert(arg_2 != R3_ARG1, "smashed argument"); 1260 mr_if_needed(R4_ARG2, arg_2); 1261 assert(arg_3 != R3_ARG1 && arg_3 != R4_ARG2, "smashed argument"); 1262 mr_if_needed(R5_ARG3, arg_3); 1263 call_VM_leaf(entry_point); 1264 } 1265 1266 // Check whether instruction is a read access to the polling page 1267 // which was emitted by load_from_polling_page(..). 1268 bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext, 1269 address* polling_address_ptr) { 1270 if (!is_ld(instruction)) 1271 return false; // It's not a ld. Fail. 1272 1273 int rt = inv_rt_field(instruction); 1274 int ra = inv_ra_field(instruction); 1275 int ds = inv_ds_field(instruction); 1276 if (!(ds == 0 && ra != 0 && rt == 0)) { 1277 return false; // It's not a ld(r0, X, ra). Fail. 1278 } 1279 1280 if (!ucontext) { 1281 // Set polling address. 1282 if (polling_address_ptr != NULL) { 1283 *polling_address_ptr = NULL; 1284 } 1285 return true; // No ucontext given. Can't check value of ra. Assume true. 1286 } 1287 1288 #ifdef LINUX 1289 // Ucontext given. Check that register ra contains the address of 1290 // the safepoing polling page. 1291 ucontext_t* uc = (ucontext_t*) ucontext; 1292 // Set polling address. 1293 address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds; 1294 if (polling_address_ptr != NULL) { 1295 *polling_address_ptr = addr; 1296 } 1297 return os::is_poll_address(addr); 1298 #else 1299 // Not on Linux, ucontext must be NULL. 1300 ShouldNotReachHere(); 1301 return false; 1302 #endif 1303 } 1304 1305 bool MacroAssembler::is_memory_serialization(int instruction, JavaThread* thread, void* ucontext) { 1306 #ifdef LINUX 1307 ucontext_t* uc = (ucontext_t*) ucontext; 1308 1309 if (is_stwx(instruction) || is_stwux(instruction)) { 1310 int ra = inv_ra_field(instruction); 1311 int rb = inv_rb_field(instruction); 1312 1313 // look up content of ra and rb in ucontext 1314 address ra_val=(address)uc->uc_mcontext.regs->gpr[ra]; 1315 long rb_val=(long)uc->uc_mcontext.regs->gpr[rb]; 1316 return os::is_memory_serialize_page(thread, ra_val+rb_val); 1317 } else if (is_stw(instruction) || is_stwu(instruction)) { 1318 int ra = inv_ra_field(instruction); 1319 int d1 = inv_d1_field(instruction); 1320 1321 // look up content of ra in ucontext 1322 address ra_val=(address)uc->uc_mcontext.regs->gpr[ra]; 1323 return os::is_memory_serialize_page(thread, ra_val+d1); 1324 } else { 1325 return false; 1326 } 1327 #else 1328 // workaround not needed on !LINUX :-) 1329 ShouldNotCallThis(); 1330 return false; 1331 #endif 1332 } 1333 1334 void MacroAssembler::bang_stack_with_offset(int offset) { 1335 // When increasing the stack, the old stack pointer will be written 1336 // to the new top of stack according to the PPC64 abi. 1337 // Therefore, stack banging is not necessary when increasing 1338 // the stack by <= os::vm_page_size() bytes. 1339 // When increasing the stack by a larger amount, this method is 1340 // called repeatedly to bang the intermediate pages. 1341 1342 // Stack grows down, caller passes positive offset. 1343 assert(offset > 0, "must bang with positive offset"); 1344 1345 long stdoffset = -offset; 1346 1347 if (is_simm(stdoffset, 16)) { 1348 // Signed 16 bit offset, a simple std is ok. 1349 if (UseLoadInstructionsForStackBangingPPC64) { 1350 ld(R0, (int)(signed short)stdoffset, R1_SP); 1351 } else { 1352 std(R0,(int)(signed short)stdoffset, R1_SP); 1353 } 1354 } else if (is_simm(stdoffset, 31)) { 1355 const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset); 1356 const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset); 1357 1358 Register tmp = R11; 1359 addis(tmp, R1_SP, hi); 1360 if (UseLoadInstructionsForStackBangingPPC64) { 1361 ld(R0, lo, tmp); 1362 } else { 1363 std(R0, lo, tmp); 1364 } 1365 } else { 1366 ShouldNotReachHere(); 1367 } 1368 } 1369 1370 // If instruction is a stack bang of the form 1371 // std R0, x(Ry), (see bang_stack_with_offset()) 1372 // stdu R1_SP, x(R1_SP), (see push_frame(), resize_frame()) 1373 // or stdux R1_SP, Rx, R1_SP (see push_frame(), resize_frame()) 1374 // return the banged address. Otherwise, return 0. 1375 address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) { 1376 #ifdef LINUX 1377 ucontext_t* uc = (ucontext_t*) ucontext; 1378 int rs = inv_rs_field(instruction); 1379 int ra = inv_ra_field(instruction); 1380 if ( (is_ld(instruction) && rs == 0 && UseLoadInstructionsForStackBangingPPC64) 1381 || (is_std(instruction) && rs == 0 && !UseLoadInstructionsForStackBangingPPC64) 1382 || (is_stdu(instruction) && rs == 1)) { 1383 int ds = inv_ds_field(instruction); 1384 // return banged address 1385 return ds+(address)uc->uc_mcontext.regs->gpr[ra]; 1386 } else if (is_stdux(instruction) && rs == 1) { 1387 int rb = inv_rb_field(instruction); 1388 address sp = (address)uc->uc_mcontext.regs->gpr[1]; 1389 long rb_val = (long)uc->uc_mcontext.regs->gpr[rb]; 1390 return ra != 1 || rb_val >= 0 ? NULL // not a stack bang 1391 : sp + rb_val; // banged address 1392 } 1393 return NULL; // not a stack bang 1394 #else 1395 // workaround not needed on !LINUX :-) 1396 ShouldNotCallThis(); 1397 return NULL; 1398 #endif 1399 } 1400 1401 void MacroAssembler::reserved_stack_check(Register return_pc) { 1402 // Test if reserved zone needs to be enabled. 1403 Label no_reserved_zone_enabling; 1404 1405 ld_ptr(R0, JavaThread::reserved_stack_activation_offset(), R16_thread); 1406 cmpld(CCR0, R1_SP, R0); 1407 blt_predict_taken(CCR0, no_reserved_zone_enabling); 1408 1409 // Enable reserved zone again, throw stack overflow exception. 1410 push_frame_reg_args(0, R0); 1411 call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), R16_thread); 1412 pop_frame(); 1413 mtlr(return_pc); 1414 load_const_optimized(R0, StubRoutines::throw_delayed_StackOverflowError_entry()); 1415 mtctr(R0); 1416 bctr(); 1417 1418 should_not_reach_here(); 1419 1420 bind(no_reserved_zone_enabling); 1421 } 1422 1423 void MacroAssembler::getandsetd(Register dest_current_value, Register exchange_value, Register addr_base, 1424 bool cmpxchgx_hint) { 1425 Label retry; 1426 bind(retry); 1427 ldarx(dest_current_value, addr_base, cmpxchgx_hint); 1428 stdcx_(exchange_value, addr_base); 1429 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1430 bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0. 1431 } else { 1432 bne( CCR0, retry); // StXcx_ sets CCR0. 1433 } 1434 } 1435 1436 void MacroAssembler::getandaddd(Register dest_current_value, Register inc_value, Register addr_base, 1437 Register tmp, bool cmpxchgx_hint) { 1438 Label retry; 1439 bind(retry); 1440 ldarx(dest_current_value, addr_base, cmpxchgx_hint); 1441 add(tmp, dest_current_value, inc_value); 1442 stdcx_(tmp, addr_base); 1443 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1444 bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0. 1445 } else { 1446 bne( CCR0, retry); // StXcx_ sets CCR0. 1447 } 1448 } 1449 1450 // Word/sub-word atomic helper functions 1451 1452 // Temps and addr_base are killed if size < 4 and processor does not support respective instructions. 1453 // Only signed types are supported with size < 4. 1454 // Atomic add always kills tmp1. 1455 void MacroAssembler::atomic_get_and_modify_generic(Register dest_current_value, Register exchange_value, 1456 Register addr_base, Register tmp1, Register tmp2, Register tmp3, 1457 bool cmpxchgx_hint, bool is_add, int size) { 1458 // Sub-word instructions are available since Power 8. 1459 // For older processors, instruction_type != size holds, and we 1460 // emulate the sub-word instructions by constructing a 4-byte value 1461 // that leaves the other bytes unchanged. 1462 const int instruction_type = VM_Version::has_lqarx() ? size : 4; 1463 1464 Label retry; 1465 Register shift_amount = noreg, 1466 val32 = dest_current_value, 1467 modval = is_add ? tmp1 : exchange_value; 1468 1469 if (instruction_type != size) { 1470 assert_different_registers(tmp1, tmp2, tmp3, dest_current_value, exchange_value, addr_base); 1471 modval = tmp1; 1472 shift_amount = tmp2; 1473 val32 = tmp3; 1474 // Need some preperation: Compute shift amount, align address. Note: shorts must be 2 byte aligned. 1475 #ifdef VM_LITTLE_ENDIAN 1476 rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8; 1477 clrrdi(addr_base, addr_base, 2); 1478 #else 1479 xori(shift_amount, addr_base, (size == 1) ? 3 : 2); 1480 clrrdi(addr_base, addr_base, 2); 1481 rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16; 1482 #endif 1483 } 1484 1485 // atomic emulation loop 1486 bind(retry); 1487 1488 switch (instruction_type) { 1489 case 4: lwarx(val32, addr_base, cmpxchgx_hint); break; 1490 case 2: lharx(val32, addr_base, cmpxchgx_hint); break; 1491 case 1: lbarx(val32, addr_base, cmpxchgx_hint); break; 1492 default: ShouldNotReachHere(); 1493 } 1494 1495 if (instruction_type != size) { 1496 srw(dest_current_value, val32, shift_amount); 1497 } 1498 1499 if (is_add) { add(modval, dest_current_value, exchange_value); } 1500 1501 if (instruction_type != size) { 1502 // Transform exchange value such that the replacement can be done by one xor instruction. 1503 xorr(modval, dest_current_value, is_add ? modval : exchange_value); 1504 clrldi(modval, modval, (size == 1) ? 56 : 48); 1505 slw(modval, modval, shift_amount); 1506 xorr(modval, val32, modval); 1507 } 1508 1509 switch (instruction_type) { 1510 case 4: stwcx_(modval, addr_base); break; 1511 case 2: sthcx_(modval, addr_base); break; 1512 case 1: stbcx_(modval, addr_base); break; 1513 default: ShouldNotReachHere(); 1514 } 1515 1516 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1517 bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0. 1518 } else { 1519 bne( CCR0, retry); // StXcx_ sets CCR0. 1520 } 1521 1522 // l?arx zero-extends, but Java wants byte/short values sign-extended. 1523 if (size == 1) { 1524 extsb(dest_current_value, dest_current_value); 1525 } else if (size == 2) { 1526 extsh(dest_current_value, dest_current_value); 1527 }; 1528 } 1529 1530 // Temps, addr_base and exchange_value are killed if size < 4 and processor does not support respective instructions. 1531 // Only signed types are supported with size < 4. 1532 void MacroAssembler::cmpxchg_loop_body(ConditionRegister flag, Register dest_current_value, 1533 Register compare_value, Register exchange_value, 1534 Register addr_base, Register tmp1, Register tmp2, 1535 Label &retry, Label &failed, bool cmpxchgx_hint, int size) { 1536 // Sub-word instructions are available since Power 8. 1537 // For older processors, instruction_type != size holds, and we 1538 // emulate the sub-word instructions by constructing a 4-byte value 1539 // that leaves the other bytes unchanged. 1540 const int instruction_type = VM_Version::has_lqarx() ? size : 4; 1541 1542 Register shift_amount = noreg, 1543 val32 = dest_current_value, 1544 modval = exchange_value; 1545 1546 if (instruction_type != size) { 1547 assert_different_registers(tmp1, tmp2, dest_current_value, compare_value, exchange_value, addr_base); 1548 shift_amount = tmp1; 1549 val32 = tmp2; 1550 modval = tmp2; 1551 // Need some preperation: Compute shift amount, align address. Note: shorts must be 2 byte aligned. 1552 #ifdef VM_LITTLE_ENDIAN 1553 rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8; 1554 clrrdi(addr_base, addr_base, 2); 1555 #else 1556 xori(shift_amount, addr_base, (size == 1) ? 3 : 2); 1557 clrrdi(addr_base, addr_base, 2); 1558 rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16; 1559 #endif 1560 // Transform exchange value such that the replacement can be done by one xor instruction. 1561 xorr(exchange_value, compare_value, exchange_value); 1562 clrldi(exchange_value, exchange_value, (size == 1) ? 56 : 48); 1563 slw(exchange_value, exchange_value, shift_amount); 1564 } 1565 1566 // atomic emulation loop 1567 bind(retry); 1568 1569 switch (instruction_type) { 1570 case 4: lwarx(val32, addr_base, cmpxchgx_hint); break; 1571 case 2: lharx(val32, addr_base, cmpxchgx_hint); break; 1572 case 1: lbarx(val32, addr_base, cmpxchgx_hint); break; 1573 default: ShouldNotReachHere(); 1574 } 1575 1576 if (instruction_type != size) { 1577 srw(dest_current_value, val32, shift_amount); 1578 } 1579 if (size == 1) { 1580 extsb(dest_current_value, dest_current_value); 1581 } else if (size == 2) { 1582 extsh(dest_current_value, dest_current_value); 1583 }; 1584 1585 cmpw(flag, dest_current_value, compare_value); 1586 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1587 bne_predict_not_taken(flag, failed); 1588 } else { 1589 bne( flag, failed); 1590 } 1591 // branch to done => (flag == ne), (dest_current_value != compare_value) 1592 // fall through => (flag == eq), (dest_current_value == compare_value) 1593 1594 if (instruction_type != size) { 1595 xorr(modval, val32, exchange_value); 1596 } 1597 1598 switch (instruction_type) { 1599 case 4: stwcx_(modval, addr_base); break; 1600 case 2: sthcx_(modval, addr_base); break; 1601 case 1: stbcx_(modval, addr_base); break; 1602 default: ShouldNotReachHere(); 1603 } 1604 } 1605 1606 // CmpxchgX sets condition register to cmpX(current, compare). 1607 void MacroAssembler::cmpxchg_generic(ConditionRegister flag, Register dest_current_value, 1608 Register compare_value, Register exchange_value, 1609 Register addr_base, Register tmp1, Register tmp2, 1610 int semantics, bool cmpxchgx_hint, 1611 Register int_flag_success, bool contention_hint, bool weak, int size) { 1612 Label retry; 1613 Label failed; 1614 Label done; 1615 1616 // Save one branch if result is returned via register and 1617 // result register is different from the other ones. 1618 bool use_result_reg = (int_flag_success != noreg); 1619 bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value && 1620 int_flag_success != exchange_value && int_flag_success != addr_base && 1621 int_flag_success != tmp1 && int_flag_success != tmp2); 1622 assert(!weak || flag == CCR0, "weak only supported with CCR0"); 1623 assert(size == 1 || size == 2 || size == 4, "unsupported"); 1624 1625 if (use_result_reg && preset_result_reg) { 1626 li(int_flag_success, 0); // preset (assume cas failed) 1627 } 1628 1629 // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM). 1630 if (contention_hint) { // Don't try to reserve if cmp fails. 1631 switch (size) { 1632 case 1: lbz(dest_current_value, 0, addr_base); extsb(dest_current_value, dest_current_value); break; 1633 case 2: lha(dest_current_value, 0, addr_base); break; 1634 case 4: lwz(dest_current_value, 0, addr_base); break; 1635 default: ShouldNotReachHere(); 1636 } 1637 cmpw(flag, dest_current_value, compare_value); 1638 bne(flag, failed); 1639 } 1640 1641 // release/fence semantics 1642 if (semantics & MemBarRel) { 1643 release(); 1644 } 1645 1646 cmpxchg_loop_body(flag, dest_current_value, compare_value, exchange_value, addr_base, tmp1, tmp2, 1647 retry, failed, cmpxchgx_hint, size); 1648 if (!weak || use_result_reg) { 1649 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1650 bne_predict_not_taken(CCR0, weak ? failed : retry); // StXcx_ sets CCR0. 1651 } else { 1652 bne( CCR0, weak ? failed : retry); // StXcx_ sets CCR0. 1653 } 1654 } 1655 // fall through => (flag == eq), (dest_current_value == compare_value), (swapped) 1656 1657 // Result in register (must do this at the end because int_flag_success can be the 1658 // same register as one above). 1659 if (use_result_reg) { 1660 li(int_flag_success, 1); 1661 } 1662 1663 if (semantics & MemBarFenceAfter) { 1664 fence(); 1665 } else if (semantics & MemBarAcq) { 1666 isync(); 1667 } 1668 1669 if (use_result_reg && !preset_result_reg) { 1670 b(done); 1671 } 1672 1673 bind(failed); 1674 if (use_result_reg && !preset_result_reg) { 1675 li(int_flag_success, 0); 1676 } 1677 1678 bind(done); 1679 // (flag == ne) => (dest_current_value != compare_value), (!swapped) 1680 // (flag == eq) => (dest_current_value == compare_value), ( swapped) 1681 } 1682 1683 // Preforms atomic compare exchange: 1684 // if (compare_value == *addr_base) 1685 // *addr_base = exchange_value 1686 // int_flag_success = 1; 1687 // else 1688 // int_flag_success = 0; 1689 // 1690 // ConditionRegister flag = cmp(compare_value, *addr_base) 1691 // Register dest_current_value = *addr_base 1692 // Register compare_value Used to compare with value in memory 1693 // Register exchange_value Written to memory if compare_value == *addr_base 1694 // Register addr_base The memory location to compareXChange 1695 // Register int_flag_success Set to 1 if exchange_value was written to *addr_base 1696 // 1697 // To avoid the costly compare exchange the value is tested beforehand. 1698 // Several special cases exist to avoid that unnecessary information is generated. 1699 // 1700 void MacroAssembler::cmpxchgd(ConditionRegister flag, 1701 Register dest_current_value, RegisterOrConstant compare_value, Register exchange_value, 1702 Register addr_base, int semantics, bool cmpxchgx_hint, 1703 Register int_flag_success, Label* failed_ext, bool contention_hint, bool weak) { 1704 Label retry; 1705 Label failed_int; 1706 Label& failed = (failed_ext != NULL) ? *failed_ext : failed_int; 1707 Label done; 1708 1709 // Save one branch if result is returned via register and result register is different from the other ones. 1710 bool use_result_reg = (int_flag_success!=noreg); 1711 bool preset_result_reg = (int_flag_success!=dest_current_value && int_flag_success!=compare_value.register_or_noreg() && 1712 int_flag_success!=exchange_value && int_flag_success!=addr_base); 1713 assert(!weak || flag == CCR0, "weak only supported with CCR0"); 1714 assert(int_flag_success == noreg || failed_ext == NULL, "cannot have both"); 1715 1716 if (use_result_reg && preset_result_reg) { 1717 li(int_flag_success, 0); // preset (assume cas failed) 1718 } 1719 1720 // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM). 1721 if (contention_hint) { // Don't try to reserve if cmp fails. 1722 ld(dest_current_value, 0, addr_base); 1723 cmpd(flag, compare_value, dest_current_value); 1724 bne(flag, failed); 1725 } 1726 1727 // release/fence semantics 1728 if (semantics & MemBarRel) { 1729 release(); 1730 } 1731 1732 // atomic emulation loop 1733 bind(retry); 1734 1735 ldarx(dest_current_value, addr_base, cmpxchgx_hint); 1736 cmpd(flag, compare_value, dest_current_value); 1737 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1738 bne_predict_not_taken(flag, failed); 1739 } else { 1740 bne( flag, failed); 1741 } 1742 1743 stdcx_(exchange_value, addr_base); 1744 if (!weak || use_result_reg || failed_ext) { 1745 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1746 bne_predict_not_taken(CCR0, weak ? failed : retry); // stXcx_ sets CCR0 1747 } else { 1748 bne( CCR0, weak ? failed : retry); // stXcx_ sets CCR0 1749 } 1750 } 1751 1752 // result in register (must do this at the end because int_flag_success can be the same register as one above) 1753 if (use_result_reg) { 1754 li(int_flag_success, 1); 1755 } 1756 1757 if (semantics & MemBarFenceAfter) { 1758 fence(); 1759 } else if (semantics & MemBarAcq) { 1760 isync(); 1761 } 1762 1763 if (use_result_reg && !preset_result_reg) { 1764 b(done); 1765 } 1766 1767 bind(failed_int); 1768 if (use_result_reg && !preset_result_reg) { 1769 li(int_flag_success, 0); 1770 } 1771 1772 bind(done); 1773 // (flag == ne) => (dest_current_value != compare_value), (!swapped) 1774 // (flag == eq) => (dest_current_value == compare_value), ( swapped) 1775 } 1776 1777 // Look up the method for a megamorphic invokeinterface call. 1778 // The target method is determined by <intf_klass, itable_index>. 1779 // The receiver klass is in recv_klass. 1780 // On success, the result will be in method_result, and execution falls through. 1781 // On failure, execution transfers to the given label. 1782 void MacroAssembler::lookup_interface_method(Register recv_klass, 1783 Register intf_klass, 1784 RegisterOrConstant itable_index, 1785 Register method_result, 1786 Register scan_temp, 1787 Register temp2, 1788 Label& L_no_such_interface, 1789 bool return_method) { 1790 assert_different_registers(recv_klass, intf_klass, method_result, scan_temp); 1791 1792 // Compute start of first itableOffsetEntry (which is at the end of the vtable). 1793 int vtable_base = in_bytes(Klass::vtable_start_offset()); 1794 int itentry_off = itableMethodEntry::method_offset_in_bytes(); 1795 int logMEsize = exact_log2(itableMethodEntry::size() * wordSize); 1796 int scan_step = itableOffsetEntry::size() * wordSize; 1797 int log_vte_size= exact_log2(vtableEntry::size_in_bytes()); 1798 1799 lwz(scan_temp, in_bytes(Klass::vtable_length_offset()), recv_klass); 1800 // %%% We should store the aligned, prescaled offset in the klassoop. 1801 // Then the next several instructions would fold away. 1802 1803 sldi(scan_temp, scan_temp, log_vte_size); 1804 addi(scan_temp, scan_temp, vtable_base); 1805 add(scan_temp, recv_klass, scan_temp); 1806 1807 // Adjust recv_klass by scaled itable_index, so we can free itable_index. 1808 if (return_method) { 1809 if (itable_index.is_register()) { 1810 Register itable_offset = itable_index.as_register(); 1811 sldi(method_result, itable_offset, logMEsize); 1812 if (itentry_off) { addi(method_result, method_result, itentry_off); } 1813 add(method_result, method_result, recv_klass); 1814 } else { 1815 long itable_offset = (long)itable_index.as_constant(); 1816 // static address, no relocation 1817 add_const_optimized(method_result, recv_klass, (itable_offset << logMEsize) + itentry_off, temp2); 1818 } 1819 } 1820 1821 // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) { 1822 // if (scan->interface() == intf) { 1823 // result = (klass + scan->offset() + itable_index); 1824 // } 1825 // } 1826 Label search, found_method; 1827 1828 for (int peel = 1; peel >= 0; peel--) { 1829 // %%%% Could load both offset and interface in one ldx, if they were 1830 // in the opposite order. This would save a load. 1831 ld(temp2, itableOffsetEntry::interface_offset_in_bytes(), scan_temp); 1832 1833 // Check that this entry is non-null. A null entry means that 1834 // the receiver class doesn't implement the interface, and wasn't the 1835 // same as when the caller was compiled. 1836 cmpd(CCR0, temp2, intf_klass); 1837 1838 if (peel) { 1839 beq(CCR0, found_method); 1840 } else { 1841 bne(CCR0, search); 1842 // (invert the test to fall through to found_method...) 1843 } 1844 1845 if (!peel) break; 1846 1847 bind(search); 1848 1849 cmpdi(CCR0, temp2, 0); 1850 beq(CCR0, L_no_such_interface); 1851 addi(scan_temp, scan_temp, scan_step); 1852 } 1853 1854 bind(found_method); 1855 1856 // Got a hit. 1857 if (return_method) { 1858 int ito_offset = itableOffsetEntry::offset_offset_in_bytes(); 1859 lwz(scan_temp, ito_offset, scan_temp); 1860 ldx(method_result, scan_temp, method_result); 1861 } 1862 } 1863 1864 // virtual method calling 1865 void MacroAssembler::lookup_virtual_method(Register recv_klass, 1866 RegisterOrConstant vtable_index, 1867 Register method_result) { 1868 1869 assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg()); 1870 1871 const int base = in_bytes(Klass::vtable_start_offset()); 1872 assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below"); 1873 1874 if (vtable_index.is_register()) { 1875 sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord); 1876 add(recv_klass, vtable_index.as_register(), recv_klass); 1877 } else { 1878 addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord); 1879 } 1880 ld(R19_method, base + vtableEntry::method_offset_in_bytes(), recv_klass); 1881 } 1882 1883 /////////////////////////////////////////// subtype checking //////////////////////////////////////////// 1884 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass, 1885 Register super_klass, 1886 Register temp1_reg, 1887 Register temp2_reg, 1888 Label* L_success, 1889 Label* L_failure, 1890 Label* L_slow_path, 1891 RegisterOrConstant super_check_offset) { 1892 1893 const Register check_cache_offset = temp1_reg; 1894 const Register cached_super = temp2_reg; 1895 1896 assert_different_registers(sub_klass, super_klass, check_cache_offset, cached_super); 1897 1898 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1899 int sc_offset = in_bytes(Klass::secondary_super_cache_offset()); 1900 1901 bool must_load_sco = (super_check_offset.constant_or_zero() == -1); 1902 bool need_slow_path = (must_load_sco || super_check_offset.constant_or_zero() == sco_offset); 1903 1904 Label L_fallthrough; 1905 int label_nulls = 0; 1906 if (L_success == NULL) { L_success = &L_fallthrough; label_nulls++; } 1907 if (L_failure == NULL) { L_failure = &L_fallthrough; label_nulls++; } 1908 if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; } 1909 assert(label_nulls <= 1 || 1910 (L_slow_path == &L_fallthrough && label_nulls <= 2 && !need_slow_path), 1911 "at most one NULL in the batch, usually"); 1912 1913 // If the pointers are equal, we are done (e.g., String[] elements). 1914 // This self-check enables sharing of secondary supertype arrays among 1915 // non-primary types such as array-of-interface. Otherwise, each such 1916 // type would need its own customized SSA. 1917 // We move this check to the front of the fast path because many 1918 // type checks are in fact trivially successful in this manner, 1919 // so we get a nicely predicted branch right at the start of the check. 1920 cmpd(CCR0, sub_klass, super_klass); 1921 beq(CCR0, *L_success); 1922 1923 // Check the supertype display: 1924 if (must_load_sco) { 1925 // The super check offset is always positive... 1926 lwz(check_cache_offset, sco_offset, super_klass); 1927 super_check_offset = RegisterOrConstant(check_cache_offset); 1928 // super_check_offset is register. 1929 assert_different_registers(sub_klass, super_klass, cached_super, super_check_offset.as_register()); 1930 } 1931 // The loaded value is the offset from KlassOopDesc. 1932 1933 ld(cached_super, super_check_offset, sub_klass); 1934 cmpd(CCR0, cached_super, super_klass); 1935 1936 // This check has worked decisively for primary supers. 1937 // Secondary supers are sought in the super_cache ('super_cache_addr'). 1938 // (Secondary supers are interfaces and very deeply nested subtypes.) 1939 // This works in the same check above because of a tricky aliasing 1940 // between the super_cache and the primary super display elements. 1941 // (The 'super_check_addr' can address either, as the case requires.) 1942 // Note that the cache is updated below if it does not help us find 1943 // what we need immediately. 1944 // So if it was a primary super, we can just fail immediately. 1945 // Otherwise, it's the slow path for us (no success at this point). 1946 1947 #define FINAL_JUMP(label) if (&(label) != &L_fallthrough) { b(label); } 1948 1949 if (super_check_offset.is_register()) { 1950 beq(CCR0, *L_success); 1951 cmpwi(CCR0, super_check_offset.as_register(), sc_offset); 1952 if (L_failure == &L_fallthrough) { 1953 beq(CCR0, *L_slow_path); 1954 } else { 1955 bne(CCR0, *L_failure); 1956 FINAL_JUMP(*L_slow_path); 1957 } 1958 } else { 1959 if (super_check_offset.as_constant() == sc_offset) { 1960 // Need a slow path; fast failure is impossible. 1961 if (L_slow_path == &L_fallthrough) { 1962 beq(CCR0, *L_success); 1963 } else { 1964 bne(CCR0, *L_slow_path); 1965 FINAL_JUMP(*L_success); 1966 } 1967 } else { 1968 // No slow path; it's a fast decision. 1969 if (L_failure == &L_fallthrough) { 1970 beq(CCR0, *L_success); 1971 } else { 1972 bne(CCR0, *L_failure); 1973 FINAL_JUMP(*L_success); 1974 } 1975 } 1976 } 1977 1978 bind(L_fallthrough); 1979 #undef FINAL_JUMP 1980 } 1981 1982 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass, 1983 Register super_klass, 1984 Register temp1_reg, 1985 Register temp2_reg, 1986 Label* L_success, 1987 Register result_reg) { 1988 const Register array_ptr = temp1_reg; // current value from cache array 1989 const Register temp = temp2_reg; 1990 1991 assert_different_registers(sub_klass, super_klass, array_ptr, temp); 1992 1993 int source_offset = in_bytes(Klass::secondary_supers_offset()); 1994 int target_offset = in_bytes(Klass::secondary_super_cache_offset()); 1995 1996 int length_offset = Array<Klass*>::length_offset_in_bytes(); 1997 int base_offset = Array<Klass*>::base_offset_in_bytes(); 1998 1999 Label hit, loop, failure, fallthru; 2000 2001 ld(array_ptr, source_offset, sub_klass); 2002 2003 // TODO: PPC port: assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated."); 2004 lwz(temp, length_offset, array_ptr); 2005 cmpwi(CCR0, temp, 0); 2006 beq(CCR0, result_reg!=noreg ? failure : fallthru); // length 0 2007 2008 mtctr(temp); // load ctr 2009 2010 bind(loop); 2011 // Oops in table are NO MORE compressed. 2012 ld(temp, base_offset, array_ptr); 2013 cmpd(CCR0, temp, super_klass); 2014 beq(CCR0, hit); 2015 addi(array_ptr, array_ptr, BytesPerWord); 2016 bdnz(loop); 2017 2018 bind(failure); 2019 if (result_reg!=noreg) li(result_reg, 1); // load non-zero result (indicates a miss) 2020 b(fallthru); 2021 2022 bind(hit); 2023 std(super_klass, target_offset, sub_klass); // save result to cache 2024 if (result_reg != noreg) { li(result_reg, 0); } // load zero result (indicates a hit) 2025 if (L_success != NULL) { b(*L_success); } 2026 else if (result_reg == noreg) { blr(); } // return with CR0.eq if neither label nor result reg provided 2027 2028 bind(fallthru); 2029 } 2030 2031 // Try fast path, then go to slow one if not successful 2032 void MacroAssembler::check_klass_subtype(Register sub_klass, 2033 Register super_klass, 2034 Register temp1_reg, 2035 Register temp2_reg, 2036 Label& L_success) { 2037 Label L_failure; 2038 check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success, &L_failure); 2039 check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success); 2040 bind(L_failure); // Fallthru if not successful. 2041 } 2042 2043 void MacroAssembler::check_method_handle_type(Register mtype_reg, Register mh_reg, 2044 Register temp_reg, 2045 Label& wrong_method_type) { 2046 assert_different_registers(mtype_reg, mh_reg, temp_reg); 2047 // Compare method type against that of the receiver. 2048 load_heap_oop(temp_reg, delayed_value(java_lang_invoke_MethodHandle::type_offset_in_bytes, temp_reg), mh_reg, 2049 noreg, noreg, false, IS_NOT_NULL); 2050 cmpd(CCR0, temp_reg, mtype_reg); 2051 bne(CCR0, wrong_method_type); 2052 } 2053 2054 RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot, 2055 Register temp_reg, 2056 int extra_slot_offset) { 2057 // cf. TemplateTable::prepare_invoke(), if (load_receiver). 2058 int stackElementSize = Interpreter::stackElementSize; 2059 int offset = extra_slot_offset * stackElementSize; 2060 if (arg_slot.is_constant()) { 2061 offset += arg_slot.as_constant() * stackElementSize; 2062 return offset; 2063 } else { 2064 assert(temp_reg != noreg, "must specify"); 2065 sldi(temp_reg, arg_slot.as_register(), exact_log2(stackElementSize)); 2066 if (offset != 0) 2067 addi(temp_reg, temp_reg, offset); 2068 return temp_reg; 2069 } 2070 } 2071 2072 // Supports temp2_reg = R0. 2073 void MacroAssembler::biased_locking_enter(ConditionRegister cr_reg, Register obj_reg, 2074 Register mark_reg, Register temp_reg, 2075 Register temp2_reg, Label& done, Label* slow_case) { 2076 assert(UseBiasedLocking, "why call this otherwise?"); 2077 2078 #ifdef ASSERT 2079 assert_different_registers(obj_reg, mark_reg, temp_reg, temp2_reg); 2080 #endif 2081 2082 Label cas_label; 2083 2084 // Branch to done if fast path fails and no slow_case provided. 2085 Label *slow_case_int = (slow_case != NULL) ? slow_case : &done; 2086 2087 // Biased locking 2088 // See whether the lock is currently biased toward our thread and 2089 // whether the epoch is still valid 2090 // Note that the runtime guarantees sufficient alignment of JavaThread 2091 // pointers to allow age to be placed into low bits 2092 assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, 2093 "biased locking makes assumptions about bit layout"); 2094 2095 if (PrintBiasedLockingStatistics) { 2096 load_const(temp2_reg, (address) BiasedLocking::total_entry_count_addr(), temp_reg); 2097 lwzx(temp_reg, temp2_reg); 2098 addi(temp_reg, temp_reg, 1); 2099 stwx(temp_reg, temp2_reg); 2100 } 2101 2102 andi(temp_reg, mark_reg, markOopDesc::biased_lock_mask_in_place); 2103 cmpwi(cr_reg, temp_reg, markOopDesc::biased_lock_pattern); 2104 bne(cr_reg, cas_label); 2105 2106 load_klass(temp_reg, obj_reg); 2107 2108 load_const_optimized(temp2_reg, ~((int) markOopDesc::age_mask_in_place)); 2109 ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg); 2110 orr(temp_reg, R16_thread, temp_reg); 2111 xorr(temp_reg, mark_reg, temp_reg); 2112 andr(temp_reg, temp_reg, temp2_reg); 2113 cmpdi(cr_reg, temp_reg, 0); 2114 if (PrintBiasedLockingStatistics) { 2115 Label l; 2116 bne(cr_reg, l); 2117 load_const(temp2_reg, (address) BiasedLocking::biased_lock_entry_count_addr()); 2118 lwzx(mark_reg, temp2_reg); 2119 addi(mark_reg, mark_reg, 1); 2120 stwx(mark_reg, temp2_reg); 2121 // restore mark_reg 2122 ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg); 2123 bind(l); 2124 } 2125 beq(cr_reg, done); 2126 2127 Label try_revoke_bias; 2128 Label try_rebias; 2129 2130 // At this point we know that the header has the bias pattern and 2131 // that we are not the bias owner in the current epoch. We need to 2132 // figure out more details about the state of the header in order to 2133 // know what operations can be legally performed on the object's 2134 // header. 2135 2136 // If the low three bits in the xor result aren't clear, that means 2137 // the prototype header is no longer biased and we have to revoke 2138 // the bias on this object. 2139 andi(temp2_reg, temp_reg, markOopDesc::biased_lock_mask_in_place); 2140 cmpwi(cr_reg, temp2_reg, 0); 2141 bne(cr_reg, try_revoke_bias); 2142 2143 // Biasing is still enabled for this data type. See whether the 2144 // epoch of the current bias is still valid, meaning that the epoch 2145 // bits of the mark word are equal to the epoch bits of the 2146 // prototype header. (Note that the prototype header's epoch bits 2147 // only change at a safepoint.) If not, attempt to rebias the object 2148 // toward the current thread. Note that we must be absolutely sure 2149 // that the current epoch is invalid in order to do this because 2150 // otherwise the manipulations it performs on the mark word are 2151 // illegal. 2152 2153 int shift_amount = 64 - markOopDesc::epoch_shift; 2154 // rotate epoch bits to right (little) end and set other bits to 0 2155 // [ big part | epoch | little part ] -> [ 0..0 | epoch ] 2156 rldicl_(temp2_reg, temp_reg, shift_amount, 64 - markOopDesc::epoch_bits); 2157 // branch if epoch bits are != 0, i.e. they differ, because the epoch has been incremented 2158 bne(CCR0, try_rebias); 2159 2160 // The epoch of the current bias is still valid but we know nothing 2161 // about the owner; it might be set or it might be clear. Try to 2162 // acquire the bias of the object using an atomic operation. If this 2163 // fails we will go in to the runtime to revoke the object's bias. 2164 // Note that we first construct the presumed unbiased header so we 2165 // don't accidentally blow away another thread's valid bias. 2166 andi(mark_reg, mark_reg, (markOopDesc::biased_lock_mask_in_place | 2167 markOopDesc::age_mask_in_place | 2168 markOopDesc::epoch_mask_in_place)); 2169 orr(temp_reg, R16_thread, mark_reg); 2170 2171 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 2172 2173 // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg). 2174 cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg, 2175 /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg, 2176 /*where=*/obj_reg, 2177 MacroAssembler::MemBarAcq, 2178 MacroAssembler::cmpxchgx_hint_acquire_lock(), 2179 noreg, slow_case_int); // bail out if failed 2180 2181 // If the biasing toward our thread failed, this means that 2182 // another thread succeeded in biasing it toward itself and we 2183 // need to revoke that bias. The revocation will occur in the 2184 // interpreter runtime in the slow case. 2185 if (PrintBiasedLockingStatistics) { 2186 load_const(temp2_reg, (address) BiasedLocking::anonymously_biased_lock_entry_count_addr(), temp_reg); 2187 lwzx(temp_reg, temp2_reg); 2188 addi(temp_reg, temp_reg, 1); 2189 stwx(temp_reg, temp2_reg); 2190 } 2191 b(done); 2192 2193 bind(try_rebias); 2194 // At this point we know the epoch has expired, meaning that the 2195 // current "bias owner", if any, is actually invalid. Under these 2196 // circumstances _only_, we are allowed to use the current header's 2197 // value as the comparison value when doing the cas to acquire the 2198 // bias in the current epoch. In other words, we allow transfer of 2199 // the bias from one thread to another directly in this situation. 2200 load_klass(temp_reg, obj_reg); 2201 andi(temp2_reg, mark_reg, markOopDesc::age_mask_in_place); 2202 orr(temp2_reg, R16_thread, temp2_reg); 2203 ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg); 2204 orr(temp_reg, temp2_reg, temp_reg); 2205 2206 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 2207 2208 cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg, 2209 /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg, 2210 /*where=*/obj_reg, 2211 MacroAssembler::MemBarAcq, 2212 MacroAssembler::cmpxchgx_hint_acquire_lock(), 2213 noreg, slow_case_int); // bail out if failed 2214 2215 // If the biasing toward our thread failed, this means that 2216 // another thread succeeded in biasing it toward itself and we 2217 // need to revoke that bias. The revocation will occur in the 2218 // interpreter runtime in the slow case. 2219 if (PrintBiasedLockingStatistics) { 2220 load_const(temp2_reg, (address) BiasedLocking::rebiased_lock_entry_count_addr(), temp_reg); 2221 lwzx(temp_reg, temp2_reg); 2222 addi(temp_reg, temp_reg, 1); 2223 stwx(temp_reg, temp2_reg); 2224 } 2225 b(done); 2226 2227 bind(try_revoke_bias); 2228 // The prototype mark in the klass doesn't have the bias bit set any 2229 // more, indicating that objects of this data type are not supposed 2230 // to be biased any more. We are going to try to reset the mark of 2231 // this object to the prototype value and fall through to the 2232 // CAS-based locking scheme. Note that if our CAS fails, it means 2233 // that another thread raced us for the privilege of revoking the 2234 // bias of this particular object, so it's okay to continue in the 2235 // normal locking code. 2236 load_klass(temp_reg, obj_reg); 2237 ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg); 2238 andi(temp2_reg, mark_reg, markOopDesc::age_mask_in_place); 2239 orr(temp_reg, temp_reg, temp2_reg); 2240 2241 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 2242 2243 // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg). 2244 cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg, 2245 /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg, 2246 /*where=*/obj_reg, 2247 MacroAssembler::MemBarAcq, 2248 MacroAssembler::cmpxchgx_hint_acquire_lock()); 2249 2250 // reload markOop in mark_reg before continuing with lightweight locking 2251 ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg); 2252 2253 // Fall through to the normal CAS-based lock, because no matter what 2254 // the result of the above CAS, some thread must have succeeded in 2255 // removing the bias bit from the object's header. 2256 if (PrintBiasedLockingStatistics) { 2257 Label l; 2258 bne(cr_reg, l); 2259 load_const(temp2_reg, (address) BiasedLocking::revoked_lock_entry_count_addr(), temp_reg); 2260 lwzx(temp_reg, temp2_reg); 2261 addi(temp_reg, temp_reg, 1); 2262 stwx(temp_reg, temp2_reg); 2263 bind(l); 2264 } 2265 2266 bind(cas_label); 2267 } 2268 2269 void MacroAssembler::biased_locking_exit (ConditionRegister cr_reg, Register mark_addr, Register temp_reg, Label& done) { 2270 // Check for biased locking unlock case, which is a no-op 2271 // Note: we do not have to check the thread ID for two reasons. 2272 // First, the interpreter checks for IllegalMonitorStateException at 2273 // a higher level. Second, if the bias was revoked while we held the 2274 // lock, the object could not be rebiased toward another thread, so 2275 // the bias bit would be clear. 2276 2277 ld(temp_reg, 0, mark_addr); 2278 andi(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place); 2279 2280 cmpwi(cr_reg, temp_reg, markOopDesc::biased_lock_pattern); 2281 beq(cr_reg, done); 2282 } 2283 2284 // allocation (for C1) 2285 void MacroAssembler::eden_allocate( 2286 Register obj, // result: pointer to object after successful allocation 2287 Register var_size_in_bytes, // object size in bytes if unknown at compile time; invalid otherwise 2288 int con_size_in_bytes, // object size in bytes if known at compile time 2289 Register t1, // temp register 2290 Register t2, // temp register 2291 Label& slow_case // continuation point if fast allocation fails 2292 ) { 2293 b(slow_case); 2294 } 2295 2296 void MacroAssembler::tlab_allocate( 2297 Register obj, // result: pointer to object after successful allocation 2298 Register var_size_in_bytes, // object size in bytes if unknown at compile time; invalid otherwise 2299 int con_size_in_bytes, // object size in bytes if known at compile time 2300 Register t1, // temp register 2301 Label& slow_case // continuation point if fast allocation fails 2302 ) { 2303 // make sure arguments make sense 2304 assert_different_registers(obj, var_size_in_bytes, t1); 2305 assert(0 <= con_size_in_bytes && is_simm13(con_size_in_bytes), "illegal object size"); 2306 assert((con_size_in_bytes & MinObjAlignmentInBytesMask) == 0, "object size is not multiple of alignment"); 2307 2308 const Register new_top = t1; 2309 //verify_tlab(); not implemented 2310 2311 ld(obj, in_bytes(JavaThread::tlab_top_offset()), R16_thread); 2312 ld(R0, in_bytes(JavaThread::tlab_end_offset()), R16_thread); 2313 if (var_size_in_bytes == noreg) { 2314 addi(new_top, obj, con_size_in_bytes); 2315 } else { 2316 add(new_top, obj, var_size_in_bytes); 2317 } 2318 cmpld(CCR0, new_top, R0); 2319 bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_case); 2320 2321 #ifdef ASSERT 2322 // make sure new free pointer is properly aligned 2323 { 2324 Label L; 2325 andi_(R0, new_top, MinObjAlignmentInBytesMask); 2326 beq(CCR0, L); 2327 stop("updated TLAB free is not properly aligned", 0x934); 2328 bind(L); 2329 } 2330 #endif // ASSERT 2331 2332 // update the tlab top pointer 2333 std(new_top, in_bytes(JavaThread::tlab_top_offset()), R16_thread); 2334 //verify_tlab(); not implemented 2335 } 2336 void MacroAssembler::incr_allocated_bytes(RegisterOrConstant size_in_bytes, Register t1, Register t2) { 2337 unimplemented("incr_allocated_bytes"); 2338 } 2339 2340 address MacroAssembler::emit_trampoline_stub(int destination_toc_offset, 2341 int insts_call_instruction_offset, Register Rtoc) { 2342 // Start the stub. 2343 address stub = start_a_stub(64); 2344 if (stub == NULL) { return NULL; } // CodeCache full: bail out 2345 2346 // Create a trampoline stub relocation which relates this trampoline stub 2347 // with the call instruction at insts_call_instruction_offset in the 2348 // instructions code-section. 2349 relocate(trampoline_stub_Relocation::spec(code()->insts()->start() + insts_call_instruction_offset)); 2350 const int stub_start_offset = offset(); 2351 2352 // For java_to_interp stubs we use R11_scratch1 as scratch register 2353 // and in call trampoline stubs we use R12_scratch2. This way we 2354 // can distinguish them (see is_NativeCallTrampolineStub_at()). 2355 Register reg_scratch = R12_scratch2; 2356 2357 // Now, create the trampoline stub's code: 2358 // - load the TOC 2359 // - load the call target from the constant pool 2360 // - call 2361 if (Rtoc == noreg) { 2362 calculate_address_from_global_toc(reg_scratch, method_toc()); 2363 Rtoc = reg_scratch; 2364 } 2365 2366 ld_largeoffset_unchecked(reg_scratch, destination_toc_offset, Rtoc, false); 2367 mtctr(reg_scratch); 2368 bctr(); 2369 2370 const address stub_start_addr = addr_at(stub_start_offset); 2371 2372 // Assert that the encoded destination_toc_offset can be identified and that it is correct. 2373 assert(destination_toc_offset == NativeCallTrampolineStub_at(stub_start_addr)->destination_toc_offset(), 2374 "encoded offset into the constant pool must match"); 2375 // Trampoline_stub_size should be good. 2376 assert((uint)(offset() - stub_start_offset) <= trampoline_stub_size, "should be good size"); 2377 assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline"); 2378 2379 // End the stub. 2380 end_a_stub(); 2381 return stub; 2382 } 2383 2384 // TM on PPC64. 2385 void MacroAssembler::atomic_inc_ptr(Register addr, Register result, int simm16) { 2386 Label retry; 2387 bind(retry); 2388 ldarx(result, addr, /*hint*/ false); 2389 addi(result, result, simm16); 2390 stdcx_(result, addr); 2391 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 2392 bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0 2393 } else { 2394 bne( CCR0, retry); // stXcx_ sets CCR0 2395 } 2396 } 2397 2398 void MacroAssembler::atomic_ori_int(Register addr, Register result, int uimm16) { 2399 Label retry; 2400 bind(retry); 2401 lwarx(result, addr, /*hint*/ false); 2402 ori(result, result, uimm16); 2403 stwcx_(result, addr); 2404 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 2405 bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0 2406 } else { 2407 bne( CCR0, retry); // stXcx_ sets CCR0 2408 } 2409 } 2410 2411 #if INCLUDE_RTM_OPT 2412 2413 // Update rtm_counters based on abort status 2414 // input: abort_status 2415 // rtm_counters (RTMLockingCounters*) 2416 void MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters_Reg) { 2417 // Mapping to keep PreciseRTMLockingStatistics similar to x86. 2418 // x86 ppc (! means inverted, ? means not the same) 2419 // 0 31 Set if abort caused by XABORT instruction. 2420 // 1 ! 7 If set, the transaction may succeed on a retry. This bit is always clear if bit 0 is set. 2421 // 2 13 Set if another logical processor conflicted with a memory address that was part of the transaction that aborted. 2422 // 3 10 Set if an internal buffer overflowed. 2423 // 4 ?12 Set if a debug breakpoint was hit. 2424 // 5 ?32 Set if an abort occurred during execution of a nested transaction. 2425 const int tm_failure_bit[] = {Assembler::tm_tabort, // Note: Seems like signal handler sets this, too. 2426 Assembler::tm_failure_persistent, // inverted: transient 2427 Assembler::tm_trans_cf, 2428 Assembler::tm_footprint_of, 2429 Assembler::tm_non_trans_cf, 2430 Assembler::tm_suspended}; 2431 const bool tm_failure_inv[] = {false, true, false, false, false, false}; 2432 assert(sizeof(tm_failure_bit)/sizeof(int) == RTMLockingCounters::ABORT_STATUS_LIMIT, "adapt mapping!"); 2433 2434 const Register addr_Reg = R0; 2435 // Keep track of offset to where rtm_counters_Reg had pointed to. 2436 int counters_offs = RTMLockingCounters::abort_count_offset(); 2437 addi(addr_Reg, rtm_counters_Reg, counters_offs); 2438 const Register temp_Reg = rtm_counters_Reg; 2439 2440 //atomic_inc_ptr(addr_Reg, temp_Reg); We don't increment atomically 2441 ldx(temp_Reg, addr_Reg); 2442 addi(temp_Reg, temp_Reg, 1); 2443 stdx(temp_Reg, addr_Reg); 2444 2445 if (PrintPreciseRTMLockingStatistics) { 2446 int counters_offs_delta = RTMLockingCounters::abortX_count_offset() - counters_offs; 2447 2448 //mftexasr(abort_status); done by caller 2449 for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) { 2450 counters_offs += counters_offs_delta; 2451 li(temp_Reg, counters_offs_delta); // can't use addi with R0 2452 add(addr_Reg, addr_Reg, temp_Reg); // point to next counter 2453 counters_offs_delta = sizeof(uintx); 2454 2455 Label check_abort; 2456 rldicr_(temp_Reg, abort_status, tm_failure_bit[i], 0); 2457 if (tm_failure_inv[i]) { 2458 bne(CCR0, check_abort); 2459 } else { 2460 beq(CCR0, check_abort); 2461 } 2462 //atomic_inc_ptr(addr_Reg, temp_Reg); We don't increment atomically 2463 ldx(temp_Reg, addr_Reg); 2464 addi(temp_Reg, temp_Reg, 1); 2465 stdx(temp_Reg, addr_Reg); 2466 bind(check_abort); 2467 } 2468 } 2469 li(temp_Reg, -counters_offs); // can't use addi with R0 2470 add(rtm_counters_Reg, addr_Reg, temp_Reg); // restore 2471 } 2472 2473 // Branch if (random & (count-1) != 0), count is 2^n 2474 // tmp and CR0 are killed 2475 void MacroAssembler::branch_on_random_using_tb(Register tmp, int count, Label& brLabel) { 2476 mftb(tmp); 2477 andi_(tmp, tmp, count-1); 2478 bne(CCR0, brLabel); 2479 } 2480 2481 // Perform abort ratio calculation, set no_rtm bit if high ratio. 2482 // input: rtm_counters_Reg (RTMLockingCounters* address) - KILLED 2483 void MacroAssembler::rtm_abort_ratio_calculation(Register rtm_counters_Reg, 2484 RTMLockingCounters* rtm_counters, 2485 Metadata* method_data) { 2486 Label L_done, L_check_always_rtm1, L_check_always_rtm2; 2487 2488 if (RTMLockingCalculationDelay > 0) { 2489 // Delay calculation. 2490 ld(rtm_counters_Reg, (RegisterOrConstant)(intptr_t)RTMLockingCounters::rtm_calculation_flag_addr()); 2491 cmpdi(CCR0, rtm_counters_Reg, 0); 2492 beq(CCR0, L_done); 2493 load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload 2494 } 2495 // Abort ratio calculation only if abort_count > RTMAbortThreshold. 2496 // Aborted transactions = abort_count * 100 2497 // All transactions = total_count * RTMTotalCountIncrRate 2498 // Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio) 2499 ld(R0, RTMLockingCounters::abort_count_offset(), rtm_counters_Reg); 2500 if (is_simm(RTMAbortThreshold, 16)) { // cmpdi can handle 16bit immediate only. 2501 cmpdi(CCR0, R0, RTMAbortThreshold); 2502 blt(CCR0, L_check_always_rtm2); // reload of rtm_counters_Reg not necessary 2503 } else { 2504 load_const_optimized(rtm_counters_Reg, RTMAbortThreshold); 2505 cmpd(CCR0, R0, rtm_counters_Reg); 2506 blt(CCR0, L_check_always_rtm1); // reload of rtm_counters_Reg required 2507 } 2508 mulli(R0, R0, 100); 2509 2510 const Register tmpReg = rtm_counters_Reg; 2511 ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg); 2512 mulli(tmpReg, tmpReg, RTMTotalCountIncrRate); // allowable range: int16 2513 mulli(tmpReg, tmpReg, RTMAbortRatio); // allowable range: int16 2514 cmpd(CCR0, R0, tmpReg); 2515 blt(CCR0, L_check_always_rtm1); // jump to reload 2516 if (method_data != NULL) { 2517 // Set rtm_state to "no rtm" in MDO. 2518 // Not using a metadata relocation. Method and Class Loader are kept alive anyway. 2519 // (See nmethod::metadata_do and CodeBuffer::finalize_oop_references.) 2520 load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg); 2521 atomic_ori_int(R0, tmpReg, NoRTM); 2522 } 2523 b(L_done); 2524 2525 bind(L_check_always_rtm1); 2526 load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload 2527 bind(L_check_always_rtm2); 2528 ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg); 2529 int64_t thresholdValue = RTMLockingThreshold / RTMTotalCountIncrRate; 2530 if (is_simm(thresholdValue, 16)) { // cmpdi can handle 16bit immediate only. 2531 cmpdi(CCR0, tmpReg, thresholdValue); 2532 } else { 2533 load_const_optimized(R0, thresholdValue); 2534 cmpd(CCR0, tmpReg, R0); 2535 } 2536 blt(CCR0, L_done); 2537 if (method_data != NULL) { 2538 // Set rtm_state to "always rtm" in MDO. 2539 // Not using a metadata relocation. See above. 2540 load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg); 2541 atomic_ori_int(R0, tmpReg, UseRTM); 2542 } 2543 bind(L_done); 2544 } 2545 2546 // Update counters and perform abort ratio calculation. 2547 // input: abort_status_Reg 2548 void MacroAssembler::rtm_profiling(Register abort_status_Reg, Register temp_Reg, 2549 RTMLockingCounters* rtm_counters, 2550 Metadata* method_data, 2551 bool profile_rtm) { 2552 2553 assert(rtm_counters != NULL, "should not be NULL when profiling RTM"); 2554 // Update rtm counters based on state at abort. 2555 // Reads abort_status_Reg, updates flags. 2556 assert_different_registers(abort_status_Reg, temp_Reg); 2557 load_const_optimized(temp_Reg, (address)rtm_counters, R0); 2558 rtm_counters_update(abort_status_Reg, temp_Reg); 2559 if (profile_rtm) { 2560 assert(rtm_counters != NULL, "should not be NULL when profiling RTM"); 2561 rtm_abort_ratio_calculation(temp_Reg, rtm_counters, method_data); 2562 } 2563 } 2564 2565 // Retry on abort if abort's status indicates non-persistent failure. 2566 // inputs: retry_count_Reg 2567 // : abort_status_Reg 2568 // output: retry_count_Reg decremented by 1 2569 void MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, 2570 Label& retryLabel, Label* checkRetry) { 2571 Label doneRetry; 2572 rldicr_(R0, abort_status_Reg, tm_failure_persistent, 0); 2573 bne(CCR0, doneRetry); 2574 if (checkRetry) { bind(*checkRetry); } 2575 addic_(retry_count_Reg, retry_count_Reg, -1); 2576 blt(CCR0, doneRetry); 2577 b(retryLabel); 2578 bind(doneRetry); 2579 } 2580 2581 // Spin and retry if lock is busy. 2582 // inputs: owner_addr_Reg (monitor address) 2583 // : retry_count_Reg 2584 // output: retry_count_Reg decremented by 1 2585 // CTR is killed 2586 void MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register owner_addr_Reg, Label& retryLabel) { 2587 Label SpinLoop, doneRetry, doRetry; 2588 addic_(retry_count_Reg, retry_count_Reg, -1); 2589 blt(CCR0, doneRetry); 2590 2591 if (RTMSpinLoopCount > 1) { 2592 li(R0, RTMSpinLoopCount); 2593 mtctr(R0); 2594 } 2595 2596 // low thread priority 2597 smt_prio_low(); 2598 bind(SpinLoop); 2599 2600 if (RTMSpinLoopCount > 1) { 2601 bdz(doRetry); 2602 ld(R0, 0, owner_addr_Reg); 2603 cmpdi(CCR0, R0, 0); 2604 bne(CCR0, SpinLoop); 2605 } 2606 2607 bind(doRetry); 2608 2609 // restore thread priority to default in userspace 2610 #ifdef LINUX 2611 smt_prio_medium_low(); 2612 #else 2613 smt_prio_medium(); 2614 #endif 2615 2616 b(retryLabel); 2617 2618 bind(doneRetry); 2619 } 2620 2621 // Use RTM for normal stack locks. 2622 // Input: objReg (object to lock) 2623 void MacroAssembler::rtm_stack_locking(ConditionRegister flag, 2624 Register obj, Register mark_word, Register tmp, 2625 Register retry_on_abort_count_Reg, 2626 RTMLockingCounters* stack_rtm_counters, 2627 Metadata* method_data, bool profile_rtm, 2628 Label& DONE_LABEL, Label& IsInflated) { 2629 assert(UseRTMForStackLocks, "why call this otherwise?"); 2630 assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking"); 2631 Label L_rtm_retry, L_decrement_retry, L_on_abort; 2632 2633 if (RTMRetryCount > 0) { 2634 load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort 2635 bind(L_rtm_retry); 2636 } 2637 andi_(R0, mark_word, markOopDesc::monitor_value); // inflated vs stack-locked|neutral|biased 2638 bne(CCR0, IsInflated); 2639 2640 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 2641 Label L_noincrement; 2642 if (RTMTotalCountIncrRate > 1) { 2643 branch_on_random_using_tb(tmp, RTMTotalCountIncrRate, L_noincrement); 2644 } 2645 assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM"); 2646 load_const_optimized(tmp, (address)stack_rtm_counters->total_count_addr(), R0); 2647 //atomic_inc_ptr(tmp, /*temp, will be reloaded*/mark_word); We don't increment atomically 2648 ldx(mark_word, tmp); 2649 addi(mark_word, mark_word, 1); 2650 stdx(mark_word, tmp); 2651 bind(L_noincrement); 2652 } 2653 tbegin_(); 2654 beq(CCR0, L_on_abort); 2655 ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // Reload in transaction, conflicts need to be tracked. 2656 andi(R0, mark_word, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits 2657 cmpwi(flag, R0, markOopDesc::unlocked_value); // bits = 001 unlocked 2658 beq(flag, DONE_LABEL); // all done if unlocked 2659 2660 if (UseRTMXendForLockBusy) { 2661 tend_(); 2662 b(L_decrement_retry); 2663 } else { 2664 tabort_(); 2665 } 2666 bind(L_on_abort); 2667 const Register abort_status_Reg = tmp; 2668 mftexasr(abort_status_Reg); 2669 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 2670 rtm_profiling(abort_status_Reg, /*temp*/mark_word, stack_rtm_counters, method_data, profile_rtm); 2671 } 2672 ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // reload 2673 if (RTMRetryCount > 0) { 2674 // Retry on lock abort if abort status is not permanent. 2675 rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry, &L_decrement_retry); 2676 } else { 2677 bind(L_decrement_retry); 2678 } 2679 } 2680 2681 // Use RTM for inflating locks 2682 // inputs: obj (object to lock) 2683 // mark_word (current header - KILLED) 2684 // boxReg (on-stack box address (displaced header location) - KILLED) 2685 void MacroAssembler::rtm_inflated_locking(ConditionRegister flag, 2686 Register obj, Register mark_word, Register boxReg, 2687 Register retry_on_busy_count_Reg, Register retry_on_abort_count_Reg, 2688 RTMLockingCounters* rtm_counters, 2689 Metadata* method_data, bool profile_rtm, 2690 Label& DONE_LABEL) { 2691 assert(UseRTMLocking, "why call this otherwise?"); 2692 Label L_rtm_retry, L_decrement_retry, L_on_abort; 2693 // Clean monitor_value bit to get valid pointer. 2694 int owner_offset = ObjectMonitor::owner_offset_in_bytes() - markOopDesc::monitor_value; 2695 2696 // Store non-null, using boxReg instead of (intptr_t)markOopDesc::unused_mark(). 2697 std(boxReg, BasicLock::displaced_header_offset_in_bytes(), boxReg); 2698 const Register tmpReg = boxReg; 2699 const Register owner_addr_Reg = mark_word; 2700 addi(owner_addr_Reg, mark_word, owner_offset); 2701 2702 if (RTMRetryCount > 0) { 2703 load_const_optimized(retry_on_busy_count_Reg, RTMRetryCount); // Retry on lock busy. 2704 load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort. 2705 bind(L_rtm_retry); 2706 } 2707 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 2708 Label L_noincrement; 2709 if (RTMTotalCountIncrRate > 1) { 2710 branch_on_random_using_tb(R0, RTMTotalCountIncrRate, L_noincrement); 2711 } 2712 assert(rtm_counters != NULL, "should not be NULL when profiling RTM"); 2713 load_const(R0, (address)rtm_counters->total_count_addr(), tmpReg); 2714 //atomic_inc_ptr(R0, tmpReg); We don't increment atomically 2715 ldx(tmpReg, R0); 2716 addi(tmpReg, tmpReg, 1); 2717 stdx(tmpReg, R0); 2718 bind(L_noincrement); 2719 } 2720 tbegin_(); 2721 beq(CCR0, L_on_abort); 2722 // We don't reload mark word. Will only be reset at safepoint. 2723 ld(R0, 0, owner_addr_Reg); // Load in transaction, conflicts need to be tracked. 2724 cmpdi(flag, R0, 0); 2725 beq(flag, DONE_LABEL); 2726 2727 if (UseRTMXendForLockBusy) { 2728 tend_(); 2729 b(L_decrement_retry); 2730 } else { 2731 tabort_(); 2732 } 2733 bind(L_on_abort); 2734 const Register abort_status_Reg = tmpReg; 2735 mftexasr(abort_status_Reg); 2736 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 2737 rtm_profiling(abort_status_Reg, /*temp*/ owner_addr_Reg, rtm_counters, method_data, profile_rtm); 2738 // Restore owner_addr_Reg 2739 ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); 2740 #ifdef ASSERT 2741 andi_(R0, mark_word, markOopDesc::monitor_value); 2742 asm_assert_ne("must be inflated", 0xa754); // Deflating only allowed at safepoint. 2743 #endif 2744 addi(owner_addr_Reg, mark_word, owner_offset); 2745 } 2746 if (RTMRetryCount > 0) { 2747 // Retry on lock abort if abort status is not permanent. 2748 rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry); 2749 } 2750 2751 // Appears unlocked - try to swing _owner from null to non-null. 2752 cmpxchgd(flag, /*current val*/ R0, (intptr_t)0, /*new val*/ R16_thread, owner_addr_Reg, 2753 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq, 2754 MacroAssembler::cmpxchgx_hint_acquire_lock(), noreg, &L_decrement_retry, true); 2755 2756 if (RTMRetryCount > 0) { 2757 // success done else retry 2758 b(DONE_LABEL); 2759 bind(L_decrement_retry); 2760 // Spin and retry if lock is busy. 2761 rtm_retry_lock_on_busy(retry_on_busy_count_Reg, owner_addr_Reg, L_rtm_retry); 2762 } else { 2763 bind(L_decrement_retry); 2764 } 2765 } 2766 2767 #endif // INCLUDE_RTM_OPT 2768 2769 // "The box" is the space on the stack where we copy the object mark. 2770 void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register oop, Register box, 2771 Register temp, Register displaced_header, Register current_header, 2772 bool try_bias, 2773 RTMLockingCounters* rtm_counters, 2774 RTMLockingCounters* stack_rtm_counters, 2775 Metadata* method_data, 2776 bool use_rtm, bool profile_rtm) { 2777 assert_different_registers(oop, box, temp, displaced_header, current_header); 2778 assert(flag != CCR0, "bad condition register"); 2779 Label cont; 2780 Label object_has_monitor; 2781 Label cas_failed; 2782 2783 // Load markOop from object into displaced_header. 2784 ld(displaced_header, oopDesc::mark_offset_in_bytes(), oop); 2785 2786 2787 // Always do locking in runtime. 2788 if (EmitSync & 0x01) { 2789 cmpdi(flag, oop, 0); // Oop can't be 0 here => always false. 2790 return; 2791 } 2792 2793 if (try_bias) { 2794 biased_locking_enter(flag, oop, displaced_header, temp, current_header, cont); 2795 } 2796 2797 #if INCLUDE_RTM_OPT 2798 if (UseRTMForStackLocks && use_rtm) { 2799 rtm_stack_locking(flag, oop, displaced_header, temp, /*temp*/ current_header, 2800 stack_rtm_counters, method_data, profile_rtm, 2801 cont, object_has_monitor); 2802 } 2803 #endif // INCLUDE_RTM_OPT 2804 2805 // Handle existing monitor. 2806 if ((EmitSync & 0x02) == 0) { 2807 // The object has an existing monitor iff (mark & monitor_value) != 0. 2808 andi_(temp, displaced_header, markOopDesc::monitor_value); 2809 bne(CCR0, object_has_monitor); 2810 } 2811 2812 // Set displaced_header to be (markOop of object | UNLOCK_VALUE). 2813 ori(displaced_header, displaced_header, markOopDesc::unlocked_value); 2814 2815 // Load Compare Value application register. 2816 2817 // Initialize the box. (Must happen before we update the object mark!) 2818 std(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box); 2819 2820 // Must fence, otherwise, preceding store(s) may float below cmpxchg. 2821 // Compare object markOop with mark and if equal exchange scratch1 with object markOop. 2822 cmpxchgd(/*flag=*/flag, 2823 /*current_value=*/current_header, 2824 /*compare_value=*/displaced_header, 2825 /*exchange_value=*/box, 2826 /*where=*/oop, 2827 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq, 2828 MacroAssembler::cmpxchgx_hint_acquire_lock(), 2829 noreg, 2830 &cas_failed, 2831 /*check without membar and ldarx first*/true); 2832 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 2833 2834 // If the compare-and-exchange succeeded, then we found an unlocked 2835 // object and we have now locked it. 2836 b(cont); 2837 2838 bind(cas_failed); 2839 // We did not see an unlocked object so try the fast recursive case. 2840 2841 // Check if the owner is self by comparing the value in the markOop of object 2842 // (current_header) with the stack pointer. 2843 sub(current_header, current_header, R1_SP); 2844 load_const_optimized(temp, ~(os::vm_page_size()-1) | markOopDesc::lock_mask_in_place); 2845 2846 and_(R0/*==0?*/, current_header, temp); 2847 // If condition is true we are cont and hence we can store 0 as the 2848 // displaced header in the box, which indicates that it is a recursive lock. 2849 mcrf(flag,CCR0); 2850 std(R0/*==0, perhaps*/, BasicLock::displaced_header_offset_in_bytes(), box); 2851 2852 // Handle existing monitor. 2853 if ((EmitSync & 0x02) == 0) { 2854 b(cont); 2855 2856 bind(object_has_monitor); 2857 // The object's monitor m is unlocked iff m->owner == NULL, 2858 // otherwise m->owner may contain a thread or a stack address. 2859 2860 #if INCLUDE_RTM_OPT 2861 // Use the same RTM locking code in 32- and 64-bit VM. 2862 if (use_rtm) { 2863 rtm_inflated_locking(flag, oop, displaced_header, box, temp, /*temp*/ current_header, 2864 rtm_counters, method_data, profile_rtm, cont); 2865 } else { 2866 #endif // INCLUDE_RTM_OPT 2867 2868 // Try to CAS m->owner from NULL to current thread. 2869 addi(temp, displaced_header, ObjectMonitor::owner_offset_in_bytes()-markOopDesc::monitor_value); 2870 cmpxchgd(/*flag=*/flag, 2871 /*current_value=*/current_header, 2872 /*compare_value=*/(intptr_t)0, 2873 /*exchange_value=*/R16_thread, 2874 /*where=*/temp, 2875 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq, 2876 MacroAssembler::cmpxchgx_hint_acquire_lock()); 2877 2878 // Store a non-null value into the box. 2879 std(box, BasicLock::displaced_header_offset_in_bytes(), box); 2880 2881 # ifdef ASSERT 2882 bne(flag, cont); 2883 // We have acquired the monitor, check some invariants. 2884 addi(/*monitor=*/temp, temp, -ObjectMonitor::owner_offset_in_bytes()); 2885 // Invariant 1: _recursions should be 0. 2886 //assert(ObjectMonitor::recursions_size_in_bytes() == 8, "unexpected size"); 2887 asm_assert_mem8_is_zero(ObjectMonitor::recursions_offset_in_bytes(), temp, 2888 "monitor->_recursions should be 0", -1); 2889 # endif 2890 2891 #if INCLUDE_RTM_OPT 2892 } // use_rtm() 2893 #endif 2894 } 2895 2896 bind(cont); 2897 // flag == EQ indicates success 2898 // flag == NE indicates failure 2899 } 2900 2901 void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register oop, Register box, 2902 Register temp, Register displaced_header, Register current_header, 2903 bool try_bias, bool use_rtm) { 2904 assert_different_registers(oop, box, temp, displaced_header, current_header); 2905 assert(flag != CCR0, "bad condition register"); 2906 Label cont; 2907 Label object_has_monitor; 2908 2909 // Always do locking in runtime. 2910 if (EmitSync & 0x01) { 2911 cmpdi(flag, oop, 0); // Oop can't be 0 here => always false. 2912 return; 2913 } 2914 2915 if (try_bias) { 2916 biased_locking_exit(flag, oop, current_header, cont); 2917 } 2918 2919 #if INCLUDE_RTM_OPT 2920 if (UseRTMForStackLocks && use_rtm) { 2921 assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking"); 2922 Label L_regular_unlock; 2923 ld(current_header, oopDesc::mark_offset_in_bytes(), oop); // fetch markword 2924 andi(R0, current_header, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits 2925 cmpwi(flag, R0, markOopDesc::unlocked_value); // bits = 001 unlocked 2926 bne(flag, L_regular_unlock); // else RegularLock 2927 tend_(); // otherwise end... 2928 b(cont); // ... and we're done 2929 bind(L_regular_unlock); 2930 } 2931 #endif 2932 2933 // Find the lock address and load the displaced header from the stack. 2934 ld(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box); 2935 2936 // If the displaced header is 0, we have a recursive unlock. 2937 cmpdi(flag, displaced_header, 0); 2938 beq(flag, cont); 2939 2940 // Handle existing monitor. 2941 if ((EmitSync & 0x02) == 0) { 2942 // The object has an existing monitor iff (mark & monitor_value) != 0. 2943 RTM_OPT_ONLY( if (!(UseRTMForStackLocks && use_rtm)) ) // skip load if already done 2944 ld(current_header, oopDesc::mark_offset_in_bytes(), oop); 2945 andi_(R0, current_header, markOopDesc::monitor_value); 2946 bne(CCR0, object_has_monitor); 2947 } 2948 2949 // Check if it is still a light weight lock, this is is true if we see 2950 // the stack address of the basicLock in the markOop of the object. 2951 // Cmpxchg sets flag to cmpd(current_header, box). 2952 cmpxchgd(/*flag=*/flag, 2953 /*current_value=*/current_header, 2954 /*compare_value=*/box, 2955 /*exchange_value=*/displaced_header, 2956 /*where=*/oop, 2957 MacroAssembler::MemBarRel, 2958 MacroAssembler::cmpxchgx_hint_release_lock(), 2959 noreg, 2960 &cont); 2961 2962 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 2963 2964 // Handle existing monitor. 2965 if ((EmitSync & 0x02) == 0) { 2966 b(cont); 2967 2968 bind(object_has_monitor); 2969 addi(current_header, current_header, -markOopDesc::monitor_value); // monitor 2970 ld(temp, ObjectMonitor::owner_offset_in_bytes(), current_header); 2971 2972 // It's inflated. 2973 #if INCLUDE_RTM_OPT 2974 if (use_rtm) { 2975 Label L_regular_inflated_unlock; 2976 // Clean monitor_value bit to get valid pointer 2977 cmpdi(flag, temp, 0); 2978 bne(flag, L_regular_inflated_unlock); 2979 tend_(); 2980 b(cont); 2981 bind(L_regular_inflated_unlock); 2982 } 2983 #endif 2984 2985 ld(displaced_header, ObjectMonitor::recursions_offset_in_bytes(), current_header); 2986 xorr(temp, R16_thread, temp); // Will be 0 if we are the owner. 2987 orr(temp, temp, displaced_header); // Will be 0 if there are 0 recursions. 2988 cmpdi(flag, temp, 0); 2989 bne(flag, cont); 2990 2991 ld(temp, ObjectMonitor::EntryList_offset_in_bytes(), current_header); 2992 ld(displaced_header, ObjectMonitor::cxq_offset_in_bytes(), current_header); 2993 orr(temp, temp, displaced_header); // Will be 0 if both are 0. 2994 cmpdi(flag, temp, 0); 2995 bne(flag, cont); 2996 release(); 2997 std(temp, ObjectMonitor::owner_offset_in_bytes(), current_header); 2998 } 2999 3000 bind(cont); 3001 // flag == EQ indicates success 3002 // flag == NE indicates failure 3003 } 3004 3005 // Write serialization page so VM thread can do a pseudo remote membar. 3006 // We use the current thread pointer to calculate a thread specific 3007 // offset to write to within the page. This minimizes bus traffic 3008 // due to cache line collision. 3009 void MacroAssembler::serialize_memory(Register thread, Register tmp1, Register tmp2) { 3010 srdi(tmp2, thread, os::get_serialize_page_shift_count()); 3011 3012 int mask = os::vm_page_size() - sizeof(int); 3013 if (Assembler::is_simm(mask, 16)) { 3014 andi(tmp2, tmp2, mask); 3015 } else { 3016 lis(tmp1, (int)((signed short) (mask >> 16))); 3017 ori(tmp1, tmp1, mask & 0x0000ffff); 3018 andr(tmp2, tmp2, tmp1); 3019 } 3020 3021 load_const(tmp1, (long) os::get_memory_serialize_page()); 3022 release(); 3023 stwx(R0, tmp1, tmp2); 3024 } 3025 3026 void MacroAssembler::safepoint_poll(Label& slow_path, Register temp_reg) { 3027 if (SafepointMechanism::uses_thread_local_poll()) { 3028 ld(temp_reg, in_bytes(Thread::polling_page_offset()), R16_thread); 3029 // Armed page has poll_bit set. 3030 andi_(temp_reg, temp_reg, SafepointMechanism::poll_bit()); 3031 } else { 3032 lwz(temp_reg, (RegisterOrConstant)(intptr_t)SafepointSynchronize::address_of_state()); 3033 cmpwi(CCR0, temp_reg, SafepointSynchronize::_not_synchronized); 3034 } 3035 bne(CCR0, slow_path); 3036 } 3037 3038 void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2, bool needs_frame) { 3039 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 3040 bs->resolve_jobject(this, value, tmp1, tmp2, needs_frame); 3041 } 3042 3043 // Values for last_Java_pc, and last_Java_sp must comply to the rules 3044 // in frame_ppc.hpp. 3045 void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) { 3046 // Always set last_Java_pc and flags first because once last_Java_sp 3047 // is visible has_last_Java_frame is true and users will look at the 3048 // rest of the fields. (Note: flags should always be zero before we 3049 // get here so doesn't need to be set.) 3050 3051 // Verify that last_Java_pc was zeroed on return to Java 3052 asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread, 3053 "last_Java_pc not zeroed before leaving Java", 0x200); 3054 3055 // When returning from calling out from Java mode the frame anchor's 3056 // last_Java_pc will always be set to NULL. It is set here so that 3057 // if we are doing a call to native (not VM) that we capture the 3058 // known pc and don't have to rely on the native call having a 3059 // standard frame linkage where we can find the pc. 3060 if (last_Java_pc != noreg) 3061 std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread); 3062 3063 // Set last_Java_sp last. 3064 std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread); 3065 } 3066 3067 void MacroAssembler::reset_last_Java_frame(void) { 3068 asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()), 3069 R16_thread, "SP was not set, still zero", 0x202); 3070 3071 BLOCK_COMMENT("reset_last_Java_frame {"); 3072 li(R0, 0); 3073 3074 // _last_Java_sp = 0 3075 std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread); 3076 3077 // _last_Java_pc = 0 3078 std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread); 3079 BLOCK_COMMENT("} reset_last_Java_frame"); 3080 } 3081 3082 void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1) { 3083 assert_different_registers(sp, tmp1); 3084 3085 // sp points to a TOP_IJAVA_FRAME, retrieve frame's PC via 3086 // TOP_IJAVA_FRAME_ABI. 3087 // FIXME: assert that we really have a TOP_IJAVA_FRAME here! 3088 address entry = pc(); 3089 load_const_optimized(tmp1, entry); 3090 3091 set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1); 3092 } 3093 3094 void MacroAssembler::get_vm_result(Register oop_result) { 3095 // Read: 3096 // R16_thread 3097 // R16_thread->in_bytes(JavaThread::vm_result_offset()) 3098 // 3099 // Updated: 3100 // oop_result 3101 // R16_thread->in_bytes(JavaThread::vm_result_offset()) 3102 3103 verify_thread(); 3104 3105 ld(oop_result, in_bytes(JavaThread::vm_result_offset()), R16_thread); 3106 li(R0, 0); 3107 std(R0, in_bytes(JavaThread::vm_result_offset()), R16_thread); 3108 3109 verify_oop(oop_result); 3110 } 3111 3112 void MacroAssembler::get_vm_result_2(Register metadata_result) { 3113 // Read: 3114 // R16_thread 3115 // R16_thread->in_bytes(JavaThread::vm_result_2_offset()) 3116 // 3117 // Updated: 3118 // metadata_result 3119 // R16_thread->in_bytes(JavaThread::vm_result_2_offset()) 3120 3121 ld(metadata_result, in_bytes(JavaThread::vm_result_2_offset()), R16_thread); 3122 li(R0, 0); 3123 std(R0, in_bytes(JavaThread::vm_result_2_offset()), R16_thread); 3124 } 3125 3126 Register MacroAssembler::encode_klass_not_null(Register dst, Register src) { 3127 Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided. 3128 if (Universe::narrow_klass_base() != 0) { 3129 // Use dst as temp if it is free. 3130 sub_const_optimized(dst, current, Universe::narrow_klass_base(), R0); 3131 current = dst; 3132 } 3133 if (Universe::narrow_klass_shift() != 0) { 3134 srdi(dst, current, Universe::narrow_klass_shift()); 3135 current = dst; 3136 } 3137 return current; 3138 } 3139 3140 void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) { 3141 if (UseCompressedClassPointers) { 3142 Register compressedKlass = encode_klass_not_null(ck, klass); 3143 stw(compressedKlass, oopDesc::klass_offset_in_bytes(), dst_oop); 3144 } else { 3145 std(klass, oopDesc::klass_offset_in_bytes(), dst_oop); 3146 } 3147 } 3148 3149 void MacroAssembler::store_klass_gap(Register dst_oop, Register val) { 3150 if (UseCompressedClassPointers) { 3151 if (val == noreg) { 3152 val = R0; 3153 li(val, 0); 3154 } 3155 stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop); // klass gap if compressed 3156 } 3157 } 3158 3159 int MacroAssembler::instr_size_for_decode_klass_not_null() { 3160 if (!UseCompressedClassPointers) return 0; 3161 int num_instrs = 1; // shift or move 3162 if (Universe::narrow_klass_base() != 0) num_instrs = 7; // shift + load const + add 3163 return num_instrs * BytesPerInstWord; 3164 } 3165 3166 void MacroAssembler::decode_klass_not_null(Register dst, Register src) { 3167 assert(dst != R0, "Dst reg may not be R0, as R0 is used here."); 3168 if (src == noreg) src = dst; 3169 Register shifted_src = src; 3170 if (Universe::narrow_klass_shift() != 0 || 3171 Universe::narrow_klass_base() == 0 && src != dst) { // Move required. 3172 shifted_src = dst; 3173 sldi(shifted_src, src, Universe::narrow_klass_shift()); 3174 } 3175 if (Universe::narrow_klass_base() != 0) { 3176 add_const_optimized(dst, shifted_src, Universe::narrow_klass_base(), R0); 3177 } 3178 } 3179 3180 void MacroAssembler::load_klass(Register dst, Register src) { 3181 if (UseCompressedClassPointers) { 3182 lwz(dst, oopDesc::klass_offset_in_bytes(), src); 3183 // Attention: no null check here! 3184 decode_klass_not_null(dst, dst); 3185 } else { 3186 ld(dst, oopDesc::klass_offset_in_bytes(), src); 3187 } 3188 } 3189 3190 // ((OopHandle)result).resolve(); 3191 void MacroAssembler::resolve_oop_handle(Register result) { 3192 // OopHandle::resolve is an indirection. 3193 ld(result, 0, result); 3194 } 3195 3196 void MacroAssembler::load_mirror_from_const_method(Register mirror, Register const_method) { 3197 ld(mirror, in_bytes(ConstMethod::constants_offset()), const_method); 3198 ld(mirror, ConstantPool::pool_holder_offset_in_bytes(), mirror); 3199 ld(mirror, in_bytes(Klass::java_mirror_offset()), mirror); 3200 resolve_oop_handle(mirror); 3201 } 3202 3203 // Clear Array 3204 // For very short arrays. tmp == R0 is allowed. 3205 void MacroAssembler::clear_memory_unrolled(Register base_ptr, int cnt_dwords, Register tmp, int offset) { 3206 if (cnt_dwords > 0) { li(tmp, 0); } 3207 for (int i = 0; i < cnt_dwords; ++i) { std(tmp, offset + i * 8, base_ptr); } 3208 } 3209 3210 // Version for constant short array length. Kills base_ptr. tmp == R0 is allowed. 3211 void MacroAssembler::clear_memory_constlen(Register base_ptr, int cnt_dwords, Register tmp) { 3212 if (cnt_dwords < 8) { 3213 clear_memory_unrolled(base_ptr, cnt_dwords, tmp); 3214 return; 3215 } 3216 3217 Label loop; 3218 const long loopcnt = cnt_dwords >> 1, 3219 remainder = cnt_dwords & 1; 3220 3221 li(tmp, loopcnt); 3222 mtctr(tmp); 3223 li(tmp, 0); 3224 bind(loop); 3225 std(tmp, 0, base_ptr); 3226 std(tmp, 8, base_ptr); 3227 addi(base_ptr, base_ptr, 16); 3228 bdnz(loop); 3229 if (remainder) { std(tmp, 0, base_ptr); } 3230 } 3231 3232 // Kills both input registers. tmp == R0 is allowed. 3233 void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp, long const_cnt) { 3234 // Procedure for large arrays (uses data cache block zero instruction). 3235 Label startloop, fast, fastloop, small_rest, restloop, done; 3236 const int cl_size = VM_Version::L1_data_cache_line_size(), 3237 cl_dwords = cl_size >> 3, 3238 cl_dw_addr_bits = exact_log2(cl_dwords), 3239 dcbz_min = 1, // Min count of dcbz executions, needs to be >0. 3240 min_cnt = ((dcbz_min + 1) << cl_dw_addr_bits) - 1; 3241 3242 if (const_cnt >= 0) { 3243 // Constant case. 3244 if (const_cnt < min_cnt) { 3245 clear_memory_constlen(base_ptr, const_cnt, tmp); 3246 return; 3247 } 3248 load_const_optimized(cnt_dwords, const_cnt, tmp); 3249 } else { 3250 // cnt_dwords already loaded in register. Need to check size. 3251 cmpdi(CCR1, cnt_dwords, min_cnt); // Big enough? (ensure >= dcbz_min lines included). 3252 blt(CCR1, small_rest); 3253 } 3254 rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits); // Extract dword offset within first cache line. 3255 beq(CCR0, fast); // Already 128byte aligned. 3256 3257 subfic(tmp, tmp, cl_dwords); 3258 mtctr(tmp); // Set ctr to hit 128byte boundary (0<ctr<cl_dwords). 3259 subf(cnt_dwords, tmp, cnt_dwords); // rest. 3260 li(tmp, 0); 3261 3262 bind(startloop); // Clear at the beginning to reach 128byte boundary. 3263 std(tmp, 0, base_ptr); // Clear 8byte aligned block. 3264 addi(base_ptr, base_ptr, 8); 3265 bdnz(startloop); 3266 3267 bind(fast); // Clear 128byte blocks. 3268 srdi(tmp, cnt_dwords, cl_dw_addr_bits); // Loop count for 128byte loop (>0). 3269 andi(cnt_dwords, cnt_dwords, cl_dwords-1); // Rest in dwords. 3270 mtctr(tmp); // Load counter. 3271 3272 bind(fastloop); 3273 dcbz(base_ptr); // Clear 128byte aligned block. 3274 addi(base_ptr, base_ptr, cl_size); 3275 bdnz(fastloop); 3276 3277 bind(small_rest); 3278 cmpdi(CCR0, cnt_dwords, 0); // size 0? 3279 beq(CCR0, done); // rest == 0 3280 li(tmp, 0); 3281 mtctr(cnt_dwords); // Load counter. 3282 3283 bind(restloop); // Clear rest. 3284 std(tmp, 0, base_ptr); // Clear 8byte aligned block. 3285 addi(base_ptr, base_ptr, 8); 3286 bdnz(restloop); 3287 3288 bind(done); 3289 } 3290 3291 /////////////////////////////////////////// String intrinsics //////////////////////////////////////////// 3292 3293 #ifdef COMPILER2 3294 // Intrinsics for CompactStrings 3295 3296 // Compress char[] to byte[] by compressing 16 bytes at once. 3297 void MacroAssembler::string_compress_16(Register src, Register dst, Register cnt, 3298 Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, 3299 Label& Lfailure) { 3300 3301 const Register tmp0 = R0; 3302 assert_different_registers(src, dst, cnt, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5); 3303 Label Lloop, Lslow; 3304 3305 // Check if cnt >= 8 (= 16 bytes) 3306 lis(tmp1, 0xFF); // tmp1 = 0x00FF00FF00FF00FF 3307 srwi_(tmp2, cnt, 3); 3308 beq(CCR0, Lslow); 3309 ori(tmp1, tmp1, 0xFF); 3310 rldimi(tmp1, tmp1, 32, 0); 3311 mtctr(tmp2); 3312 3313 // 2x unrolled loop 3314 bind(Lloop); 3315 ld(tmp2, 0, src); // _0_1_2_3 (Big Endian) 3316 ld(tmp4, 8, src); // _4_5_6_7 3317 3318 orr(tmp0, tmp2, tmp4); 3319 rldicl(tmp3, tmp2, 6*8, 64-24); // _____1_2 3320 rldimi(tmp2, tmp2, 2*8, 2*8); // _0_2_3_3 3321 rldicl(tmp5, tmp4, 6*8, 64-24); // _____5_6 3322 rldimi(tmp4, tmp4, 2*8, 2*8); // _4_6_7_7 3323 3324 andc_(tmp0, tmp0, tmp1); 3325 bne(CCR0, Lfailure); // Not latin1. 3326 addi(src, src, 16); 3327 3328 rlwimi(tmp3, tmp2, 0*8, 24, 31);// _____1_3 3329 srdi(tmp2, tmp2, 3*8); // ____0_2_ 3330 rlwimi(tmp5, tmp4, 0*8, 24, 31);// _____5_7 3331 srdi(tmp4, tmp4, 3*8); // ____4_6_ 3332 3333 orr(tmp2, tmp2, tmp3); // ____0123 3334 orr(tmp4, tmp4, tmp5); // ____4567 3335 3336 stw(tmp2, 0, dst); 3337 stw(tmp4, 4, dst); 3338 addi(dst, dst, 8); 3339 bdnz(Lloop); 3340 3341 bind(Lslow); // Fallback to slow version 3342 } 3343 3344 // Compress char[] to byte[]. cnt must be positive int. 3345 void MacroAssembler::string_compress(Register src, Register dst, Register cnt, Register tmp, Label& Lfailure) { 3346 Label Lloop; 3347 mtctr(cnt); 3348 3349 bind(Lloop); 3350 lhz(tmp, 0, src); 3351 cmplwi(CCR0, tmp, 0xff); 3352 bgt(CCR0, Lfailure); // Not latin1. 3353 addi(src, src, 2); 3354 stb(tmp, 0, dst); 3355 addi(dst, dst, 1); 3356 bdnz(Lloop); 3357 } 3358 3359 // Inflate byte[] to char[] by inflating 16 bytes at once. 3360 void MacroAssembler::string_inflate_16(Register src, Register dst, Register cnt, 3361 Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) { 3362 const Register tmp0 = R0; 3363 assert_different_registers(src, dst, cnt, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5); 3364 Label Lloop, Lslow; 3365 3366 // Check if cnt >= 8 3367 srwi_(tmp2, cnt, 3); 3368 beq(CCR0, Lslow); 3369 lis(tmp1, 0xFF); // tmp1 = 0x00FF00FF 3370 ori(tmp1, tmp1, 0xFF); 3371 mtctr(tmp2); 3372 3373 // 2x unrolled loop 3374 bind(Lloop); 3375 lwz(tmp2, 0, src); // ____0123 (Big Endian) 3376 lwz(tmp4, 4, src); // ____4567 3377 addi(src, src, 8); 3378 3379 rldicl(tmp3, tmp2, 7*8, 64-8); // _______2 3380 rlwimi(tmp2, tmp2, 3*8, 16, 23);// ____0113 3381 rldicl(tmp5, tmp4, 7*8, 64-8); // _______6 3382 rlwimi(tmp4, tmp4, 3*8, 16, 23);// ____4557 3383 3384 andc(tmp0, tmp2, tmp1); // ____0_1_ 3385 rlwimi(tmp2, tmp3, 2*8, 0, 23); // _____2_3 3386 andc(tmp3, tmp4, tmp1); // ____4_5_ 3387 rlwimi(tmp4, tmp5, 2*8, 0, 23); // _____6_7 3388 3389 rldimi(tmp2, tmp0, 3*8, 0*8); // _0_1_2_3 3390 rldimi(tmp4, tmp3, 3*8, 0*8); // _4_5_6_7 3391 3392 std(tmp2, 0, dst); 3393 std(tmp4, 8, dst); 3394 addi(dst, dst, 16); 3395 bdnz(Lloop); 3396 3397 bind(Lslow); // Fallback to slow version 3398 } 3399 3400 // Inflate byte[] to char[]. cnt must be positive int. 3401 void MacroAssembler::string_inflate(Register src, Register dst, Register cnt, Register tmp) { 3402 Label Lloop; 3403 mtctr(cnt); 3404 3405 bind(Lloop); 3406 lbz(tmp, 0, src); 3407 addi(src, src, 1); 3408 sth(tmp, 0, dst); 3409 addi(dst, dst, 2); 3410 bdnz(Lloop); 3411 } 3412 3413 void MacroAssembler::string_compare(Register str1, Register str2, 3414 Register cnt1, Register cnt2, 3415 Register tmp1, Register result, int ae) { 3416 const Register tmp0 = R0, 3417 diff = tmp1; 3418 3419 assert_different_registers(str1, str2, cnt1, cnt2, tmp0, tmp1, result); 3420 Label Ldone, Lslow, Lloop, Lreturn_diff; 3421 3422 // Note: Making use of the fact that compareTo(a, b) == -compareTo(b, a) 3423 // we interchange str1 and str2 in the UL case and negate the result. 3424 // Like this, str1 is always latin1 encoded, except for the UU case. 3425 // In addition, we need 0 (or sign which is 0) extend. 3426 3427 if (ae == StrIntrinsicNode::UU) { 3428 srwi(cnt1, cnt1, 1); 3429 } else { 3430 clrldi(cnt1, cnt1, 32); 3431 } 3432 3433 if (ae != StrIntrinsicNode::LL) { 3434 srwi(cnt2, cnt2, 1); 3435 } else { 3436 clrldi(cnt2, cnt2, 32); 3437 } 3438 3439 // See if the lengths are different, and calculate min in cnt1. 3440 // Save diff in case we need it for a tie-breaker. 3441 subf_(diff, cnt2, cnt1); // diff = cnt1 - cnt2 3442 // if (diff > 0) { cnt1 = cnt2; } 3443 if (VM_Version::has_isel()) { 3444 isel(cnt1, CCR0, Assembler::greater, /*invert*/ false, cnt2); 3445 } else { 3446 Label Lskip; 3447 blt(CCR0, Lskip); 3448 mr(cnt1, cnt2); 3449 bind(Lskip); 3450 } 3451 3452 // Rename registers 3453 Register chr1 = result; 3454 Register chr2 = tmp0; 3455 3456 // Compare multiple characters in fast loop (only implemented for same encoding). 3457 int stride1 = 8, stride2 = 8; 3458 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3459 int log2_chars_per_iter = (ae == StrIntrinsicNode::LL) ? 3 : 2; 3460 Label Lfastloop, Lskipfast; 3461 3462 srwi_(tmp0, cnt1, log2_chars_per_iter); 3463 beq(CCR0, Lskipfast); 3464 rldicl(cnt2, cnt1, 0, 64 - log2_chars_per_iter); // Remaining characters. 3465 li(cnt1, 1 << log2_chars_per_iter); // Initialize for failure case: Rescan characters from current iteration. 3466 mtctr(tmp0); 3467 3468 bind(Lfastloop); 3469 ld(chr1, 0, str1); 3470 ld(chr2, 0, str2); 3471 cmpd(CCR0, chr1, chr2); 3472 bne(CCR0, Lslow); 3473 addi(str1, str1, stride1); 3474 addi(str2, str2, stride2); 3475 bdnz(Lfastloop); 3476 mr(cnt1, cnt2); // Remaining characters. 3477 bind(Lskipfast); 3478 } 3479 3480 // Loop which searches the first difference character by character. 3481 cmpwi(CCR0, cnt1, 0); 3482 beq(CCR0, Lreturn_diff); 3483 bind(Lslow); 3484 mtctr(cnt1); 3485 3486 switch (ae) { 3487 case StrIntrinsicNode::LL: stride1 = 1; stride2 = 1; break; 3488 case StrIntrinsicNode::UL: // fallthru (see comment above) 3489 case StrIntrinsicNode::LU: stride1 = 1; stride2 = 2; break; 3490 case StrIntrinsicNode::UU: stride1 = 2; stride2 = 2; break; 3491 default: ShouldNotReachHere(); break; 3492 } 3493 3494 bind(Lloop); 3495 if (stride1 == 1) { lbz(chr1, 0, str1); } else { lhz(chr1, 0, str1); } 3496 if (stride2 == 1) { lbz(chr2, 0, str2); } else { lhz(chr2, 0, str2); } 3497 subf_(result, chr2, chr1); // result = chr1 - chr2 3498 bne(CCR0, Ldone); 3499 addi(str1, str1, stride1); 3500 addi(str2, str2, stride2); 3501 bdnz(Lloop); 3502 3503 // If strings are equal up to min length, return the length difference. 3504 bind(Lreturn_diff); 3505 mr(result, diff); 3506 3507 // Otherwise, return the difference between the first mismatched chars. 3508 bind(Ldone); 3509 if (ae == StrIntrinsicNode::UL) { 3510 neg(result, result); // Negate result (see note above). 3511 } 3512 } 3513 3514 void MacroAssembler::array_equals(bool is_array_equ, Register ary1, Register ary2, 3515 Register limit, Register tmp1, Register result, bool is_byte) { 3516 const Register tmp0 = R0; 3517 assert_different_registers(ary1, ary2, limit, tmp0, tmp1, result); 3518 Label Ldone, Lskiploop, Lloop, Lfastloop, Lskipfast; 3519 bool limit_needs_shift = false; 3520 3521 if (is_array_equ) { 3522 const int length_offset = arrayOopDesc::length_offset_in_bytes(); 3523 const int base_offset = arrayOopDesc::base_offset_in_bytes(is_byte ? T_BYTE : T_CHAR); 3524 3525 // Return true if the same array. 3526 cmpd(CCR0, ary1, ary2); 3527 beq(CCR0, Lskiploop); 3528 3529 // Return false if one of them is NULL. 3530 cmpdi(CCR0, ary1, 0); 3531 cmpdi(CCR1, ary2, 0); 3532 li(result, 0); 3533 cror(CCR0, Assembler::equal, CCR1, Assembler::equal); 3534 beq(CCR0, Ldone); 3535 3536 // Load the lengths of arrays. 3537 lwz(limit, length_offset, ary1); 3538 lwz(tmp0, length_offset, ary2); 3539 3540 // Return false if the two arrays are not equal length. 3541 cmpw(CCR0, limit, tmp0); 3542 bne(CCR0, Ldone); 3543 3544 // Load array addresses. 3545 addi(ary1, ary1, base_offset); 3546 addi(ary2, ary2, base_offset); 3547 } else { 3548 limit_needs_shift = !is_byte; 3549 li(result, 0); // Assume not equal. 3550 } 3551 3552 // Rename registers 3553 Register chr1 = tmp0; 3554 Register chr2 = tmp1; 3555 3556 // Compare 8 bytes per iteration in fast loop. 3557 const int log2_chars_per_iter = is_byte ? 3 : 2; 3558 3559 srwi_(tmp0, limit, log2_chars_per_iter + (limit_needs_shift ? 1 : 0)); 3560 beq(CCR0, Lskipfast); 3561 mtctr(tmp0); 3562 3563 bind(Lfastloop); 3564 ld(chr1, 0, ary1); 3565 ld(chr2, 0, ary2); 3566 addi(ary1, ary1, 8); 3567 addi(ary2, ary2, 8); 3568 cmpd(CCR0, chr1, chr2); 3569 bne(CCR0, Ldone); 3570 bdnz(Lfastloop); 3571 3572 bind(Lskipfast); 3573 rldicl_(limit, limit, limit_needs_shift ? 64 - 1 : 0, 64 - log2_chars_per_iter); // Remaining characters. 3574 beq(CCR0, Lskiploop); 3575 mtctr(limit); 3576 3577 // Character by character. 3578 bind(Lloop); 3579 if (is_byte) { 3580 lbz(chr1, 0, ary1); 3581 lbz(chr2, 0, ary2); 3582 addi(ary1, ary1, 1); 3583 addi(ary2, ary2, 1); 3584 } else { 3585 lhz(chr1, 0, ary1); 3586 lhz(chr2, 0, ary2); 3587 addi(ary1, ary1, 2); 3588 addi(ary2, ary2, 2); 3589 } 3590 cmpw(CCR0, chr1, chr2); 3591 bne(CCR0, Ldone); 3592 bdnz(Lloop); 3593 3594 bind(Lskiploop); 3595 li(result, 1); // All characters are equal. 3596 bind(Ldone); 3597 } 3598 3599 void MacroAssembler::string_indexof(Register result, Register haystack, Register haycnt, 3600 Register needle, ciTypeArray* needle_values, Register needlecnt, int needlecntval, 3601 Register tmp1, Register tmp2, Register tmp3, Register tmp4, int ae) { 3602 3603 // Ensure 0<needlecnt<=haycnt in ideal graph as prerequisite! 3604 Label L_TooShort, L_Found, L_NotFound, L_End; 3605 Register last_addr = haycnt, // Kill haycnt at the beginning. 3606 addr = tmp1, 3607 n_start = tmp2, 3608 ch1 = tmp3, 3609 ch2 = R0; 3610 3611 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 3612 const int h_csize = (ae == StrIntrinsicNode::LL) ? 1 : 2; 3613 const int n_csize = (ae == StrIntrinsicNode::UU) ? 2 : 1; 3614 3615 // ************************************************************************************************** 3616 // Prepare for main loop: optimized for needle count >=2, bail out otherwise. 3617 // ************************************************************************************************** 3618 3619 // Compute last haystack addr to use if no match gets found. 3620 clrldi(haycnt, haycnt, 32); // Ensure positive int is valid as 64 bit value. 3621 addi(addr, haystack, -h_csize); // Accesses use pre-increment. 3622 if (needlecntval == 0) { // variable needlecnt 3623 cmpwi(CCR6, needlecnt, 2); 3624 clrldi(needlecnt, needlecnt, 32); // Ensure positive int is valid as 64 bit value. 3625 blt(CCR6, L_TooShort); // Variable needlecnt: handle short needle separately. 3626 } 3627 3628 if (n_csize == 2) { lwz(n_start, 0, needle); } else { lhz(n_start, 0, needle); } // Load first 2 characters of needle. 3629 3630 if (needlecntval == 0) { // variable needlecnt 3631 subf(ch1, needlecnt, haycnt); // Last character index to compare is haycnt-needlecnt. 3632 addi(needlecnt, needlecnt, -2); // Rest of needle. 3633 } else { // constant needlecnt 3634 guarantee(needlecntval != 1, "IndexOf with single-character needle must be handled separately"); 3635 assert((needlecntval & 0x7fff) == needlecntval, "wrong immediate"); 3636 addi(ch1, haycnt, -needlecntval); // Last character index to compare is haycnt-needlecnt. 3637 if (needlecntval > 3) { li(needlecnt, needlecntval - 2); } // Rest of needle. 3638 } 3639 3640 if (h_csize == 2) { slwi(ch1, ch1, 1); } // Scale to number of bytes. 3641 3642 if (ae ==StrIntrinsicNode::UL) { 3643 srwi(tmp4, n_start, 1*8); // ___0 3644 rlwimi(n_start, tmp4, 2*8, 0, 23); // _0_1 3645 } 3646 3647 add(last_addr, haystack, ch1); // Point to last address to compare (haystack+2*(haycnt-needlecnt)). 3648 3649 // Main Loop (now we have at least 2 characters). 3650 Label L_OuterLoop, L_InnerLoop, L_FinalCheck, L_Comp1, L_Comp2; 3651 bind(L_OuterLoop); // Search for 1st 2 characters. 3652 Register addr_diff = tmp4; 3653 subf(addr_diff, addr, last_addr); // Difference between already checked address and last address to check. 3654 addi(addr, addr, h_csize); // This is the new address we want to use for comparing. 3655 srdi_(ch2, addr_diff, h_csize); 3656 beq(CCR0, L_FinalCheck); // 2 characters left? 3657 mtctr(ch2); // num of characters / 2 3658 bind(L_InnerLoop); // Main work horse (2x unrolled search loop) 3659 if (h_csize == 2) { // Load 2 characters of haystack (ignore alignment). 3660 lwz(ch1, 0, addr); 3661 lwz(ch2, 2, addr); 3662 } else { 3663 lhz(ch1, 0, addr); 3664 lhz(ch2, 1, addr); 3665 } 3666 cmpw(CCR0, ch1, n_start); // Compare 2 characters (1 would be sufficient but try to reduce branches to CompLoop). 3667 cmpw(CCR1, ch2, n_start); 3668 beq(CCR0, L_Comp1); // Did we find the needle start? 3669 beq(CCR1, L_Comp2); 3670 addi(addr, addr, 2 * h_csize); 3671 bdnz(L_InnerLoop); 3672 bind(L_FinalCheck); 3673 andi_(addr_diff, addr_diff, h_csize); // Remaining characters not covered by InnerLoop: (num of characters) & 1. 3674 beq(CCR0, L_NotFound); 3675 if (h_csize == 2) { lwz(ch1, 0, addr); } else { lhz(ch1, 0, addr); } // One position left at which we have to compare. 3676 cmpw(CCR1, ch1, n_start); 3677 beq(CCR1, L_Comp1); 3678 bind(L_NotFound); 3679 li(result, -1); // not found 3680 b(L_End); 3681 3682 // ************************************************************************************************** 3683 // Special Case: unfortunately, the variable needle case can be called with needlecnt<2 3684 // ************************************************************************************************** 3685 if (needlecntval == 0) { // We have to handle these cases separately. 3686 Label L_OneCharLoop; 3687 bind(L_TooShort); 3688 mtctr(haycnt); 3689 if (n_csize == 2) { lhz(n_start, 0, needle); } else { lbz(n_start, 0, needle); } // First character of needle 3690 bind(L_OneCharLoop); 3691 if (h_csize == 2) { lhzu(ch1, 2, addr); } else { lbzu(ch1, 1, addr); } 3692 cmpw(CCR1, ch1, n_start); 3693 beq(CCR1, L_Found); // Did we find the one character needle? 3694 bdnz(L_OneCharLoop); 3695 li(result, -1); // Not found. 3696 b(L_End); 3697 } 3698 3699 // ************************************************************************************************** 3700 // Regular Case Part II: compare rest of needle (first 2 characters have been compared already) 3701 // ************************************************************************************************** 3702 3703 // Compare the rest 3704 bind(L_Comp2); 3705 addi(addr, addr, h_csize); // First comparison has failed, 2nd one hit. 3706 bind(L_Comp1); // Addr points to possible needle start. 3707 if (needlecntval != 2) { // Const needlecnt==2? 3708 if (needlecntval != 3) { 3709 if (needlecntval == 0) { beq(CCR6, L_Found); } // Variable needlecnt==2? 3710 Register n_ind = tmp4, 3711 h_ind = n_ind; 3712 li(n_ind, 2 * n_csize); // First 2 characters are already compared, use index 2. 3713 mtctr(needlecnt); // Decremented by 2, still > 0. 3714 Label L_CompLoop; 3715 bind(L_CompLoop); 3716 if (ae ==StrIntrinsicNode::UL) { 3717 h_ind = ch1; 3718 sldi(h_ind, n_ind, 1); 3719 } 3720 if (n_csize == 2) { lhzx(ch2, needle, n_ind); } else { lbzx(ch2, needle, n_ind); } 3721 if (h_csize == 2) { lhzx(ch1, addr, h_ind); } else { lbzx(ch1, addr, h_ind); } 3722 cmpw(CCR1, ch1, ch2); 3723 bne(CCR1, L_OuterLoop); 3724 addi(n_ind, n_ind, n_csize); 3725 bdnz(L_CompLoop); 3726 } else { // No loop required if there's only one needle character left. 3727 if (n_csize == 2) { lhz(ch2, 2 * 2, needle); } else { lbz(ch2, 2 * 1, needle); } 3728 if (h_csize == 2) { lhz(ch1, 2 * 2, addr); } else { lbz(ch1, 2 * 1, addr); } 3729 cmpw(CCR1, ch1, ch2); 3730 bne(CCR1, L_OuterLoop); 3731 } 3732 } 3733 // Return index ... 3734 bind(L_Found); 3735 subf(result, haystack, addr); // relative to haystack, ... 3736 if (h_csize == 2) { srdi(result, result, 1); } // in characters. 3737 bind(L_End); 3738 } // string_indexof 3739 3740 void MacroAssembler::string_indexof_char(Register result, Register haystack, Register haycnt, 3741 Register needle, jchar needleChar, Register tmp1, Register tmp2, bool is_byte) { 3742 assert_different_registers(haystack, haycnt, needle, tmp1, tmp2); 3743 3744 Label L_InnerLoop, L_FinalCheck, L_Found1, L_Found2, L_NotFound, L_End; 3745 Register addr = tmp1, 3746 ch1 = tmp2, 3747 ch2 = R0; 3748 3749 const int h_csize = is_byte ? 1 : 2; 3750 3751 //4: 3752 srwi_(tmp2, haycnt, 1); // Shift right by exact_log2(UNROLL_FACTOR). 3753 mr(addr, haystack); 3754 beq(CCR0, L_FinalCheck); 3755 mtctr(tmp2); // Move to count register. 3756 //8: 3757 bind(L_InnerLoop); // Main work horse (2x unrolled search loop). 3758 if (!is_byte) { 3759 lhz(ch1, 0, addr); 3760 lhz(ch2, 2, addr); 3761 } else { 3762 lbz(ch1, 0, addr); 3763 lbz(ch2, 1, addr); 3764 } 3765 (needle != R0) ? cmpw(CCR0, ch1, needle) : cmplwi(CCR0, ch1, (unsigned int)needleChar); 3766 (needle != R0) ? cmpw(CCR1, ch2, needle) : cmplwi(CCR1, ch2, (unsigned int)needleChar); 3767 beq(CCR0, L_Found1); // Did we find the needle? 3768 beq(CCR1, L_Found2); 3769 addi(addr, addr, 2 * h_csize); 3770 bdnz(L_InnerLoop); 3771 //16: 3772 bind(L_FinalCheck); 3773 andi_(R0, haycnt, 1); 3774 beq(CCR0, L_NotFound); 3775 if (!is_byte) { lhz(ch1, 0, addr); } else { lbz(ch1, 0, addr); } // One position left at which we have to compare. 3776 (needle != R0) ? cmpw(CCR1, ch1, needle) : cmplwi(CCR1, ch1, (unsigned int)needleChar); 3777 beq(CCR1, L_Found1); 3778 //21: 3779 bind(L_NotFound); 3780 li(result, -1); // Not found. 3781 b(L_End); 3782 3783 bind(L_Found2); 3784 addi(addr, addr, h_csize); 3785 //24: 3786 bind(L_Found1); // Return index ... 3787 subf(result, haystack, addr); // relative to haystack, ... 3788 if (!is_byte) { srdi(result, result, 1); } // in characters. 3789 bind(L_End); 3790 } // string_indexof_char 3791 3792 3793 void MacroAssembler::has_negatives(Register src, Register cnt, Register result, 3794 Register tmp1, Register tmp2) { 3795 const Register tmp0 = R0; 3796 assert_different_registers(src, result, cnt, tmp0, tmp1, tmp2); 3797 Label Lfastloop, Lslow, Lloop, Lnoneg, Ldone; 3798 3799 // Check if cnt >= 8 (= 16 bytes) 3800 lis(tmp1, (int)(short)0x8080); // tmp1 = 0x8080808080808080 3801 srwi_(tmp2, cnt, 4); 3802 li(result, 1); // Assume there's a negative byte. 3803 beq(CCR0, Lslow); 3804 ori(tmp1, tmp1, 0x8080); 3805 rldimi(tmp1, tmp1, 32, 0); 3806 mtctr(tmp2); 3807 3808 // 2x unrolled loop 3809 bind(Lfastloop); 3810 ld(tmp2, 0, src); 3811 ld(tmp0, 8, src); 3812 3813 orr(tmp0, tmp2, tmp0); 3814 3815 and_(tmp0, tmp0, tmp1); 3816 bne(CCR0, Ldone); // Found negative byte. 3817 addi(src, src, 16); 3818 3819 bdnz(Lfastloop); 3820 3821 bind(Lslow); // Fallback to slow version 3822 rldicl_(tmp0, cnt, 0, 64-4); 3823 beq(CCR0, Lnoneg); 3824 mtctr(tmp0); 3825 bind(Lloop); 3826 lbz(tmp0, 0, src); 3827 addi(src, src, 1); 3828 andi_(tmp0, tmp0, 0x80); 3829 bne(CCR0, Ldone); // Found negative byte. 3830 bdnz(Lloop); 3831 bind(Lnoneg); 3832 li(result, 0); 3833 3834 bind(Ldone); 3835 } 3836 3837 #endif // Compiler2 3838 3839 // Helpers for Intrinsic Emitters 3840 // 3841 // Revert the byte order of a 32bit value in a register 3842 // src: 0x44556677 3843 // dst: 0x77665544 3844 // Three steps to obtain the result: 3845 // 1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word 3846 // into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared. 3847 // This value initializes dst. 3848 // 2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost 3849 // byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go. 3850 // This value is mask inserted into dst with a [0..23] mask of 1s. 3851 // 3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position. 3852 // This value is mask inserted into dst with a [8..15] mask of 1s. 3853 void MacroAssembler::load_reverse_32(Register dst, Register src) { 3854 assert_different_registers(dst, src); 3855 3856 rldicl(dst, src, (4+1)*8, 56); // Rotate byte 4 into position 7 (rightmost), clear all to the left. 3857 rlwimi(dst, src, 3*8, 0, 23); // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone. 3858 rlwimi(dst, src, 1*8, 8, 15); // Insert byte 6 into position 5, leave the rest alone. 3859 } 3860 3861 // Calculate the column addresses of the crc32 lookup table into distinct registers. 3862 // This loop-invariant calculation is moved out of the loop body, reducing the loop 3863 // body size from 20 to 16 instructions. 3864 // Returns the offset that was used to calculate the address of column tc3. 3865 // Due to register shortage, setting tc3 may overwrite table. With the return offset 3866 // at hand, the original table address can be easily reconstructed. 3867 int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) { 3868 3869 #ifdef VM_LITTLE_ENDIAN 3870 // This is what we implement (the DOLIT4 part): 3871 // ========================================================================= */ 3872 // #define DOLIT4 c ^= *buf4++; \ 3873 // c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^ \ 3874 // crc_table[1][(c >> 16) & 0xff] ^ crc_table[0][c >> 24] 3875 // #define DOLIT32 DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4 3876 // ========================================================================= */ 3877 const int ix0 = 3*(4*CRC32_COLUMN_SIZE); 3878 const int ix1 = 2*(4*CRC32_COLUMN_SIZE); 3879 const int ix2 = 1*(4*CRC32_COLUMN_SIZE); 3880 const int ix3 = 0*(4*CRC32_COLUMN_SIZE); 3881 #else 3882 // This is what we implement (the DOBIG4 part): 3883 // ========================================================================= 3884 // #define DOBIG4 c ^= *++buf4; \ 3885 // c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \ 3886 // crc_table[6][(c >> 16) & 0xff] ^ crc_table[7][c >> 24] 3887 // #define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4 3888 // ========================================================================= 3889 const int ix0 = 4*(4*CRC32_COLUMN_SIZE); 3890 const int ix1 = 5*(4*CRC32_COLUMN_SIZE); 3891 const int ix2 = 6*(4*CRC32_COLUMN_SIZE); 3892 const int ix3 = 7*(4*CRC32_COLUMN_SIZE); 3893 #endif 3894 assert_different_registers(table, tc0, tc1, tc2); 3895 assert(table == tc3, "must be!"); 3896 3897 addi(tc0, table, ix0); 3898 addi(tc1, table, ix1); 3899 addi(tc2, table, ix2); 3900 if (ix3 != 0) addi(tc3, table, ix3); 3901 3902 return ix3; 3903 } 3904 3905 /** 3906 * uint32_t crc; 3907 * timesXtoThe32[crc & 0xFF] ^ (crc >> 8); 3908 */ 3909 void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) { 3910 assert_different_registers(crc, table, tmp); 3911 assert_different_registers(val, table); 3912 3913 if (crc == val) { // Must rotate first to use the unmodified value. 3914 rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest. 3915 // As we use a word (4-byte) instruction, we have to adapt the mask bit positions. 3916 srwi(crc, crc, 8); // Unsigned shift, clear leftmost 8 bits. 3917 } else { 3918 srwi(crc, crc, 8); // Unsigned shift, clear leftmost 8 bits. 3919 rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest. 3920 } 3921 lwzx(tmp, table, tmp); 3922 xorr(crc, crc, tmp); 3923 } 3924 3925 /** 3926 * uint32_t crc; 3927 * timesXtoThe32[crc & 0xFF] ^ (crc >> 8); 3928 */ 3929 void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) { 3930 fold_byte_crc32(crc, crc, table, tmp); 3931 } 3932 3933 /** 3934 * Emits code to update CRC-32 with a byte value according to constants in table. 3935 * 3936 * @param [in,out]crc Register containing the crc. 3937 * @param [in]val Register containing the byte to fold into the CRC. 3938 * @param [in]table Register containing the table of crc constants. 3939 * 3940 * uint32_t crc; 3941 * val = crc_table[(val ^ crc) & 0xFF]; 3942 * crc = val ^ (crc >> 8); 3943 */ 3944 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) { 3945 BLOCK_COMMENT("update_byte_crc32:"); 3946 xorr(val, val, crc); 3947 fold_byte_crc32(crc, val, table, val); 3948 } 3949 3950 /** 3951 * @param crc register containing existing CRC (32-bit) 3952 * @param buf register pointing to input byte buffer (byte*) 3953 * @param len register containing number of bytes 3954 * @param table register pointing to CRC table 3955 */ 3956 void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table, 3957 Register data, bool loopAlignment) { 3958 assert_different_registers(crc, buf, len, table, data); 3959 3960 Label L_mainLoop, L_done; 3961 const int mainLoop_stepping = 1; 3962 const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4; 3963 3964 // Process all bytes in a single-byte loop. 3965 clrldi_(len, len, 32); // Enforce 32 bit. Anything to do? 3966 beq(CCR0, L_done); 3967 3968 mtctr(len); 3969 align(mainLoop_alignment); 3970 BIND(L_mainLoop); 3971 lbz(data, 0, buf); // Byte from buffer, zero-extended. 3972 addi(buf, buf, mainLoop_stepping); // Advance buffer position. 3973 update_byte_crc32(crc, data, table); 3974 bdnz(L_mainLoop); // Iterate. 3975 3976 bind(L_done); 3977 } 3978 3979 /** 3980 * Emits code to update CRC-32 with a 4-byte value according to constants in table 3981 * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c 3982 */ 3983 // A not on the lookup table address(es): 3984 // The lookup table consists of two sets of four columns each. 3985 // The columns {0..3} are used for little-endian machines. 3986 // The columns {4..7} are used for big-endian machines. 3987 // To save the effort of adding the column offset to the table address each time 3988 // a table element is looked up, it is possible to pass the pre-calculated 3989 // column addresses. 3990 // Uses R9..R12 as work register. Must be saved/restored by caller, if necessary. 3991 void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc, 3992 Register t0, Register t1, Register t2, Register t3, 3993 Register tc0, Register tc1, Register tc2, Register tc3) { 3994 assert_different_registers(crc, t3); 3995 3996 // XOR crc with next four bytes of buffer. 3997 lwz(t3, bufDisp, buf); 3998 if (bufInc != 0) { 3999 addi(buf, buf, bufInc); 4000 } 4001 xorr(t3, t3, crc); 4002 4003 // Chop crc into 4 single-byte pieces, shifted left 2 bits, to form the table indices. 4004 rlwinm(t0, t3, 2, 24-2, 31-2); // ((t1 >> 0) & 0xff) << 2 4005 rlwinm(t1, t3, 32+(2- 8), 24-2, 31-2); // ((t1 >> 8) & 0xff) << 2 4006 rlwinm(t2, t3, 32+(2-16), 24-2, 31-2); // ((t1 >> 16) & 0xff) << 2 4007 rlwinm(t3, t3, 32+(2-24), 24-2, 31-2); // ((t1 >> 24) & 0xff) << 2 4008 4009 // Use the pre-calculated column addresses. 4010 // Load pre-calculated table values. 4011 lwzx(t0, tc0, t0); 4012 lwzx(t1, tc1, t1); 4013 lwzx(t2, tc2, t2); 4014 lwzx(t3, tc3, t3); 4015 4016 // Calculate new crc from table values. 4017 xorr(t0, t0, t1); 4018 xorr(t2, t2, t3); 4019 xorr(crc, t0, t2); // Now crc contains the final checksum value. 4020 } 4021 4022 /** 4023 * @param crc register containing existing CRC (32-bit) 4024 * @param buf register pointing to input byte buffer (byte*) 4025 * @param len register containing number of bytes 4026 * @param table register pointing to CRC table 4027 * 4028 * Uses R9..R12 as work register. Must be saved/restored by caller! 4029 */ 4030 void MacroAssembler::kernel_crc32_2word(Register crc, Register buf, Register len, Register table, 4031 Register t0, Register t1, Register t2, Register t3, 4032 Register tc0, Register tc1, Register tc2, Register tc3, 4033 bool invertCRC) { 4034 assert_different_registers(crc, buf, len, table); 4035 4036 Label L_mainLoop, L_tail; 4037 Register tmp = t0; 4038 Register data = t0; 4039 Register tmp2 = t1; 4040 const int mainLoop_stepping = 8; 4041 const int tailLoop_stepping = 1; 4042 const int log_stepping = exact_log2(mainLoop_stepping); 4043 const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32; 4044 const int complexThreshold = 2*mainLoop_stepping; 4045 4046 // Don't test for len <= 0 here. This pathological case should not occur anyway. 4047 // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles 4048 // for all well-behaved cases. The situation itself is detected and handled correctly 4049 // within update_byteLoop_crc32. 4050 assert(tailLoop_stepping == 1, "check tailLoop_stepping!"); 4051 4052 BLOCK_COMMENT("kernel_crc32_2word {"); 4053 4054 if (invertCRC) { 4055 nand(crc, crc, crc); // 1s complement of crc 4056 } 4057 4058 // Check for short (<mainLoop_stepping) buffer. 4059 cmpdi(CCR0, len, complexThreshold); 4060 blt(CCR0, L_tail); 4061 4062 // Pre-mainLoop alignment did show a slight (1%) positive effect on performance. 4063 // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions. 4064 { 4065 // Align buf addr to mainLoop_stepping boundary. 4066 neg(tmp2, buf); // Calculate # preLoop iterations for alignment. 4067 rldicl(tmp2, tmp2, 0, 64-log_stepping); // Rotate tmp2 0 bits, insert into tmp2, anding with mask with 1s from 62..63. 4068 4069 if (complexThreshold > mainLoop_stepping) { 4070 sub(len, len, tmp2); // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed). 4071 } else { 4072 sub(tmp, len, tmp2); // Remaining bytes for main loop. 4073 cmpdi(CCR0, tmp, mainLoop_stepping); 4074 blt(CCR0, L_tail); // For less than one mainloop_stepping left, do only tail processing 4075 mr(len, tmp); // remaining bytes for main loop (>=mainLoop_stepping is guaranteed). 4076 } 4077 update_byteLoop_crc32(crc, buf, tmp2, table, data, false); 4078 } 4079 4080 srdi(tmp2, len, log_stepping); // #iterations for mainLoop 4081 andi(len, len, mainLoop_stepping-1); // remaining bytes for tailLoop 4082 mtctr(tmp2); 4083 4084 #ifdef VM_LITTLE_ENDIAN 4085 Register crc_rv = crc; 4086 #else 4087 Register crc_rv = tmp; // Load_reverse needs separate registers to work on. 4088 // Occupies tmp, but frees up crc. 4089 load_reverse_32(crc_rv, crc); // Revert byte order because we are dealing with big-endian data. 4090 tmp = crc; 4091 #endif 4092 4093 int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3); 4094 4095 align(mainLoop_alignment); // Octoword-aligned loop address. Shows 2% improvement. 4096 BIND(L_mainLoop); 4097 update_1word_crc32(crc_rv, buf, table, 0, 0, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3); 4098 update_1word_crc32(crc_rv, buf, table, 4, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3); 4099 bdnz(L_mainLoop); 4100 4101 #ifndef VM_LITTLE_ENDIAN 4102 load_reverse_32(crc, crc_rv); // Revert byte order because we are dealing with big-endian data. 4103 tmp = crc_rv; // Tmp uses it's original register again. 4104 #endif 4105 4106 // Restore original table address for tailLoop. 4107 if (reconstructTableOffset != 0) { 4108 addi(table, table, -reconstructTableOffset); 4109 } 4110 4111 // Process last few (<complexThreshold) bytes of buffer. 4112 BIND(L_tail); 4113 update_byteLoop_crc32(crc, buf, len, table, data, false); 4114 4115 if (invertCRC) { 4116 nand(crc, crc, crc); // 1s complement of crc 4117 } 4118 BLOCK_COMMENT("} kernel_crc32_2word"); 4119 } 4120 4121 /** 4122 * @param crc register containing existing CRC (32-bit) 4123 * @param buf register pointing to input byte buffer (byte*) 4124 * @param len register containing number of bytes 4125 * @param table register pointing to CRC table 4126 * 4127 * uses R9..R12 as work register. Must be saved/restored by caller! 4128 */ 4129 void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table, 4130 Register t0, Register t1, Register t2, Register t3, 4131 Register tc0, Register tc1, Register tc2, Register tc3, 4132 bool invertCRC) { 4133 assert_different_registers(crc, buf, len, table); 4134 4135 Label L_mainLoop, L_tail; 4136 Register tmp = t0; 4137 Register data = t0; 4138 Register tmp2 = t1; 4139 const int mainLoop_stepping = 4; 4140 const int tailLoop_stepping = 1; 4141 const int log_stepping = exact_log2(mainLoop_stepping); 4142 const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32; 4143 const int complexThreshold = 2*mainLoop_stepping; 4144 4145 // Don't test for len <= 0 here. This pathological case should not occur anyway. 4146 // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles 4147 // for all well-behaved cases. The situation itself is detected and handled correctly 4148 // within update_byteLoop_crc32. 4149 assert(tailLoop_stepping == 1, "check tailLoop_stepping!"); 4150 4151 BLOCK_COMMENT("kernel_crc32_1word {"); 4152 4153 if (invertCRC) { 4154 nand(crc, crc, crc); // 1s complement of crc 4155 } 4156 4157 // Check for short (<mainLoop_stepping) buffer. 4158 cmpdi(CCR0, len, complexThreshold); 4159 blt(CCR0, L_tail); 4160 4161 // Pre-mainLoop alignment did show a slight (1%) positive effect on performance. 4162 // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions. 4163 { 4164 // Align buf addr to mainLoop_stepping boundary. 4165 neg(tmp2, buf); // Calculate # preLoop iterations for alignment. 4166 rldicl(tmp2, tmp2, 0, 64-log_stepping); // Rotate tmp2 0 bits, insert into tmp2, anding with mask with 1s from 62..63. 4167 4168 if (complexThreshold > mainLoop_stepping) { 4169 sub(len, len, tmp2); // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed). 4170 } else { 4171 sub(tmp, len, tmp2); // Remaining bytes for main loop. 4172 cmpdi(CCR0, tmp, mainLoop_stepping); 4173 blt(CCR0, L_tail); // For less than one mainloop_stepping left, do only tail processing 4174 mr(len, tmp); // remaining bytes for main loop (>=mainLoop_stepping is guaranteed). 4175 } 4176 update_byteLoop_crc32(crc, buf, tmp2, table, data, false); 4177 } 4178 4179 srdi(tmp2, len, log_stepping); // #iterations for mainLoop 4180 andi(len, len, mainLoop_stepping-1); // remaining bytes for tailLoop 4181 mtctr(tmp2); 4182 4183 #ifdef VM_LITTLE_ENDIAN 4184 Register crc_rv = crc; 4185 #else 4186 Register crc_rv = tmp; // Load_reverse needs separate registers to work on. 4187 // Occupies tmp, but frees up crc. 4188 load_reverse_32(crc_rv, crc); // Revert byte order because we are dealing with big-endian data. 4189 tmp = crc; 4190 #endif 4191 4192 int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3); 4193 4194 align(mainLoop_alignment); // Octoword-aligned loop address. Shows 2% improvement. 4195 BIND(L_mainLoop); 4196 update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3); 4197 bdnz(L_mainLoop); 4198 4199 #ifndef VM_LITTLE_ENDIAN 4200 load_reverse_32(crc, crc_rv); // Revert byte order because we are dealing with big-endian data. 4201 tmp = crc_rv; // Tmp uses it's original register again. 4202 #endif 4203 4204 // Restore original table address for tailLoop. 4205 if (reconstructTableOffset != 0) { 4206 addi(table, table, -reconstructTableOffset); 4207 } 4208 4209 // Process last few (<complexThreshold) bytes of buffer. 4210 BIND(L_tail); 4211 update_byteLoop_crc32(crc, buf, len, table, data, false); 4212 4213 if (invertCRC) { 4214 nand(crc, crc, crc); // 1s complement of crc 4215 } 4216 BLOCK_COMMENT("} kernel_crc32_1word"); 4217 } 4218 4219 /** 4220 * @param crc register containing existing CRC (32-bit) 4221 * @param buf register pointing to input byte buffer (byte*) 4222 * @param len register containing number of bytes 4223 * @param table register pointing to CRC table 4224 * 4225 * Uses R7_ARG5, R8_ARG6 as work registers. 4226 */ 4227 void MacroAssembler::kernel_crc32_1byte(Register crc, Register buf, Register len, Register table, 4228 Register t0, Register t1, Register t2, Register t3, 4229 bool invertCRC) { 4230 assert_different_registers(crc, buf, len, table); 4231 4232 Register data = t0; // Holds the current byte to be folded into crc. 4233 4234 BLOCK_COMMENT("kernel_crc32_1byte {"); 4235 4236 if (invertCRC) { 4237 nand(crc, crc, crc); // 1s complement of crc 4238 } 4239 4240 // Process all bytes in a single-byte loop. 4241 update_byteLoop_crc32(crc, buf, len, table, data, true); 4242 4243 if (invertCRC) { 4244 nand(crc, crc, crc); // 1s complement of crc 4245 } 4246 BLOCK_COMMENT("} kernel_crc32_1byte"); 4247 } 4248 4249 /** 4250 * @param crc register containing existing CRC (32-bit) 4251 * @param buf register pointing to input byte buffer (byte*) 4252 * @param len register containing number of bytes 4253 * @param table register pointing to CRC table 4254 * @param constants register pointing to CRC table for 128-bit aligned memory 4255 * @param barretConstants register pointing to table for barrett reduction 4256 * @param t0-t4 temp registers 4257 */ 4258 void MacroAssembler::kernel_crc32_1word_vpmsum(Register crc, Register buf, Register len, Register table, 4259 Register constants, Register barretConstants, 4260 Register t0, Register t1, Register t2, Register t3, Register t4, 4261 bool invertCRC) { 4262 assert_different_registers(crc, buf, len, table); 4263 4264 Label L_alignedHead, L_tail; 4265 4266 BLOCK_COMMENT("kernel_crc32_1word_vpmsum {"); 4267 4268 // 1. ~c 4269 if (invertCRC) { 4270 nand(crc, crc, crc); // 1s complement of crc 4271 } 4272 4273 // 2. use kernel_crc32_1word for short len 4274 clrldi(len, len, 32); 4275 cmpdi(CCR0, len, 512); 4276 blt(CCR0, L_tail); 4277 4278 // 3. calculate from 0 to first aligned address 4279 const int alignment = 16; 4280 Register prealign = t0; 4281 4282 andi_(prealign, buf, alignment - 1); 4283 beq(CCR0, L_alignedHead); 4284 subfic(prealign, prealign, alignment); 4285 4286 subf(len, prealign, len); 4287 update_byteLoop_crc32(crc, buf, prealign, table, t2, false); 4288 4289 // 4. calculate from first aligned address as far as possible 4290 BIND(L_alignedHead); 4291 kernel_crc32_1word_aligned(crc, buf, len, constants, barretConstants, t0, t1, t2, t3, t4); 4292 4293 // 5. remaining bytes 4294 BIND(L_tail); 4295 Register tc0 = t4; 4296 Register tc1 = constants; 4297 Register tc2 = barretConstants; 4298 kernel_crc32_1word(crc, buf, len, table, t0, t1, t2, t3, tc0, tc1, tc2, table, false); 4299 4300 // 6. ~c 4301 if (invertCRC) { 4302 nand(crc, crc, crc); // 1s complement of crc 4303 } 4304 4305 BLOCK_COMMENT("} kernel_crc32_1word_vpmsum"); 4306 } 4307 4308 /** 4309 * @param crc register containing existing CRC (32-bit) 4310 * @param buf register pointing to input byte buffer (byte*) 4311 * @param len register containing number of bytes (will get updated to remaining bytes) 4312 * @param constants register pointing to CRC table for 128-bit aligned memory 4313 * @param barretConstants register pointing to table for barrett reduction 4314 * @param t0-t4 temp registers 4315 * Precondition: len should be >= 512. Otherwise, nothing will be done. 4316 */ 4317 void MacroAssembler::kernel_crc32_1word_aligned(Register crc, Register buf, Register len, 4318 Register constants, Register barretConstants, 4319 Register t0, Register t1, Register t2, Register t3, Register t4) { 4320 4321 // Save non-volatile vector registers (frameless). 4322 Register offset = t1; 4323 int offsetInt = 0; 4324 offsetInt -= 16; li(offset, offsetInt); stvx(VR20, offset, R1_SP); 4325 offsetInt -= 16; li(offset, offsetInt); stvx(VR21, offset, R1_SP); 4326 offsetInt -= 16; li(offset, offsetInt); stvx(VR22, offset, R1_SP); 4327 offsetInt -= 16; li(offset, offsetInt); stvx(VR23, offset, R1_SP); 4328 offsetInt -= 16; li(offset, offsetInt); stvx(VR24, offset, R1_SP); 4329 offsetInt -= 16; li(offset, offsetInt); stvx(VR25, offset, R1_SP); 4330 #ifndef VM_LITTLE_ENDIAN 4331 offsetInt -= 16; li(offset, offsetInt); stvx(VR26, offset, R1_SP); 4332 #endif 4333 offsetInt -= 8; std(R14, offsetInt, R1_SP); 4334 offsetInt -= 8; std(R15, offsetInt, R1_SP); 4335 offsetInt -= 8; std(R16, offsetInt, R1_SP); 4336 offsetInt -= 8; std(R17, offsetInt, R1_SP); 4337 4338 // Implementation uses an inner loop which uses between 256 and 16 * unroll_factor 4339 // bytes per iteration. The basic scheme is: 4340 // lvx: load vector (Big Endian needs reversal) 4341 // vpmsumw: carry-less 32 bit multiplications with constant representing a large CRC shift 4342 // vxor: xor partial results together to get unroll_factor2 vectors 4343 4344 // Outer loop performs the CRC shifts needed to combine the unroll_factor2 vectors. 4345 4346 // Using 16 * unroll_factor / unroll_factor_2 bytes for constants. 4347 const int unroll_factor = 2048; 4348 const int unroll_factor2 = 8; 4349 4350 // Support registers. 4351 Register offs[] = { noreg, t0, t1, t2, t3, t4, crc /* will live in VCRC */, R14 }; 4352 Register num_bytes = R15, 4353 loop_count = R16, 4354 cur_const = R17; 4355 // Constant array for outer loop: unroll_factor2 - 1 registers, 4356 // Constant array for inner loop: unroll_factor / unroll_factor2 registers. 4357 VectorRegister consts0[] = { VR16, VR17, VR18, VR19, VR20, VR21, VR22 }, 4358 consts1[] = { VR23, VR24 }; 4359 // Data register arrays: 2 arrays with unroll_factor2 registers. 4360 VectorRegister data0[] = { VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7 }, 4361 data1[] = { VR8, VR9, VR10, VR11, VR12, VR13, VR14, VR15 }; 4362 4363 VectorRegister VCRC = data0[0]; 4364 VectorRegister Vc = VR25; 4365 VectorRegister swap_bytes = VR26; // Only for Big Endian. 4366 4367 // We have at least 1 iteration (ensured by caller). 4368 Label L_outer_loop, L_inner_loop, L_last; 4369 4370 // If supported set DSCR pre-fetch to deepest. 4371 if (VM_Version::has_mfdscr()) { 4372 load_const_optimized(t0, VM_Version::_dscr_val | 7); 4373 mtdscr(t0); 4374 } 4375 4376 mtvrwz(VCRC, crc); // crc lives lives in VCRC, now 4377 4378 for (int i = 1; i < unroll_factor2; ++i) { 4379 li(offs[i], 16 * i); 4380 } 4381 4382 // Load consts for outer loop 4383 lvx(consts0[0], constants); 4384 for (int i = 1; i < unroll_factor2 - 1; ++i) { 4385 lvx(consts0[i], offs[i], constants); 4386 } 4387 addi(constants, constants, (unroll_factor2 - 1) * 16); 4388 4389 load_const_optimized(num_bytes, 16 * unroll_factor); 4390 load_const_optimized(loop_count, unroll_factor / (2 * unroll_factor2) - 1); // One double-iteration peeled off. 4391 4392 // Reuse data registers outside of the loop. 4393 VectorRegister Vtmp = data1[0]; 4394 VectorRegister Vtmp2 = data1[1]; 4395 VectorRegister zeroes = data1[2]; 4396 4397 vspltisb(Vtmp, 0); 4398 vsldoi(VCRC, Vtmp, VCRC, 8); // 96 bit zeroes, 32 bit CRC. 4399 4400 // Load vector for vpermxor (to xor both 64 bit parts together) 4401 lvsl(Vtmp, buf); // 000102030405060708090a0b0c0d0e0f 4402 vspltisb(Vc, 4); 4403 vsl(Vc, Vtmp, Vc); // 00102030405060708090a0b0c0d0e0f0 4404 xxspltd(Vc->to_vsr(), Vc->to_vsr(), 0); 4405 vor(Vc, Vtmp, Vc); // 001122334455667708192a3b4c5d6e7f 4406 4407 #ifdef VM_LITTLE_ENDIAN 4408 #define BE_swap_bytes(x) 4409 #else 4410 vspltisb(Vtmp2, 0xf); 4411 vxor(swap_bytes, Vtmp, Vtmp2); 4412 #define BE_swap_bytes(x) vperm(x, x, x, swap_bytes) 4413 #endif 4414 4415 cmpd(CCR0, len, num_bytes); 4416 blt(CCR0, L_last); 4417 4418 // ********** Main loop start ********** 4419 align(32); 4420 bind(L_outer_loop); 4421 4422 // Begin of unrolled first iteration (no xor). 4423 lvx(data1[0], buf); 4424 mr(cur_const, constants); 4425 for (int i = 1; i < unroll_factor2 / 2; ++i) { 4426 lvx(data1[i], offs[i], buf); 4427 } 4428 vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result. 4429 lvx(consts1[0], cur_const); 4430 mtctr(loop_count); 4431 for (int i = 0; i < unroll_factor2 / 2; ++i) { 4432 BE_swap_bytes(data1[i]); 4433 if (i == 0) { vxor(data1[0], data1[0], VCRC); } // xor in previous CRC. 4434 lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf); 4435 vpmsumw(data0[i], data1[i], consts1[0]); 4436 } 4437 addi(buf, buf, 16 * unroll_factor2); 4438 subf(len, num_bytes, len); 4439 lvx(consts1[1], offs[1], cur_const); 4440 addi(cur_const, cur_const, 32); 4441 // Begin of unrolled second iteration (head). 4442 for (int i = 0; i < unroll_factor2 / 2; ++i) { 4443 BE_swap_bytes(data1[i + unroll_factor2 / 2]); 4444 if (i == 0) { lvx(data1[0], buf); } else { lvx(data1[i], offs[i], buf); } 4445 vpmsumw(data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[0]); 4446 } 4447 for (int i = 0; i < unroll_factor2 / 2; ++i) { 4448 BE_swap_bytes(data1[i]); 4449 lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf); 4450 vpmsumw(data1[i], data1[i], consts1[1]); 4451 } 4452 addi(buf, buf, 16 * unroll_factor2); 4453 4454 // Generate most performance relevant code. Loads + half of the vpmsumw have been generated. 4455 // Double-iteration allows using the 2 constant registers alternatingly. 4456 align(32); 4457 bind(L_inner_loop); 4458 for (int j = 1; j < 3; ++j) { // j < unroll_factor / unroll_factor2 - 1 for complete unrolling. 4459 if (j & 1) { 4460 lvx(consts1[0], cur_const); 4461 } else { 4462 lvx(consts1[1], offs[1], cur_const); 4463 addi(cur_const, cur_const, 32); 4464 } 4465 for (int i = 0; i < unroll_factor2; ++i) { 4466 int idx = i + unroll_factor2 / 2, inc = 0; // For modulo-scheduled input. 4467 if (idx >= unroll_factor2) { idx -= unroll_factor2; inc = 1; } 4468 BE_swap_bytes(data1[idx]); 4469 vxor(data0[i], data0[i], data1[i]); 4470 if (i == 0) lvx(data1[0], buf); else lvx(data1[i], offs[i], buf); 4471 vpmsumw(data1[idx], data1[idx], consts1[(j + inc) & 1]); 4472 } 4473 addi(buf, buf, 16 * unroll_factor2); 4474 } 4475 bdnz(L_inner_loop); 4476 4477 // Tail of last iteration (no loads). 4478 for (int i = 0; i < unroll_factor2 / 2; ++i) { 4479 BE_swap_bytes(data1[i + unroll_factor2 / 2]); 4480 vxor(data0[i], data0[i], data1[i]); 4481 vpmsumw(data1[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[1]); 4482 } 4483 for (int i = 0; i < unroll_factor2 / 2; ++i) { 4484 vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]); // First half of fixup shifts. 4485 vxor(data0[i + unroll_factor2 / 2], data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2]); 4486 } 4487 4488 // Last data register is ok, other ones need fixup shift. 4489 for (int i = unroll_factor2 / 2; i < unroll_factor2 - 1; ++i) { 4490 vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]); 4491 } 4492 4493 // Combine to 128 bit result vector VCRC = data0[0]. 4494 for (int i = 1; i < unroll_factor2; i<<=1) { 4495 for (int j = 0; j <= unroll_factor2 - 2*i; j+=2*i) { 4496 vxor(data0[j], data0[j], data0[j+i]); 4497 } 4498 } 4499 cmpd(CCR0, len, num_bytes); 4500 bge(CCR0, L_outer_loop); 4501 4502 // Last chance with lower num_bytes. 4503 bind(L_last); 4504 srdi(loop_count, len, exact_log2(16 * 2 * unroll_factor2)); // Use double-iterations. 4505 add_const_optimized(constants, constants, 16 * (unroll_factor / unroll_factor2)); // Point behind last one. 4506 sldi(R0, loop_count, exact_log2(16 * 2)); // Bytes of constants to be used. 4507 clrrdi(num_bytes, len, exact_log2(16 * 2 * unroll_factor2)); 4508 subf(constants, R0, constants); // Point to constant to be used first. 4509 4510 addic_(loop_count, loop_count, -1); // One double-iteration peeled off. 4511 bgt(CCR0, L_outer_loop); 4512 // ********** Main loop end ********** 4513 #undef BE_swap_bytes 4514 4515 // Restore DSCR pre-fetch value. 4516 if (VM_Version::has_mfdscr()) { 4517 load_const_optimized(t0, VM_Version::_dscr_val); 4518 mtdscr(t0); 4519 } 4520 4521 vspltisb(zeroes, 0); 4522 4523 // Combine to 64 bit result. 4524 vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result. 4525 4526 // Reduce to 32 bit CRC: Remainder by multiply-high. 4527 lvx(Vtmp, barretConstants); 4528 vsldoi(Vtmp2, zeroes, VCRC, 12); // Extract high 32 bit. 4529 vpmsumd(Vtmp2, Vtmp2, Vtmp); // Multiply by inverse long poly. 4530 vsldoi(Vtmp2, zeroes, Vtmp2, 12); // Extract high 32 bit. 4531 vsldoi(Vtmp, zeroes, Vtmp, 8); 4532 vpmsumd(Vtmp2, Vtmp2, Vtmp); // Multiply quotient by long poly. 4533 vxor(VCRC, VCRC, Vtmp2); // Remainder fits into 32 bit. 4534 4535 // Move result. len is already updated. 4536 vsldoi(VCRC, VCRC, zeroes, 8); 4537 mfvrd(crc, VCRC); 4538 4539 // Restore non-volatile Vector registers (frameless). 4540 offsetInt = 0; 4541 offsetInt -= 16; li(offset, offsetInt); lvx(VR20, offset, R1_SP); 4542 offsetInt -= 16; li(offset, offsetInt); lvx(VR21, offset, R1_SP); 4543 offsetInt -= 16; li(offset, offsetInt); lvx(VR22, offset, R1_SP); 4544 offsetInt -= 16; li(offset, offsetInt); lvx(VR23, offset, R1_SP); 4545 offsetInt -= 16; li(offset, offsetInt); lvx(VR24, offset, R1_SP); 4546 offsetInt -= 16; li(offset, offsetInt); lvx(VR25, offset, R1_SP); 4547 #ifndef VM_LITTLE_ENDIAN 4548 offsetInt -= 16; li(offset, offsetInt); lvx(VR26, offset, R1_SP); 4549 #endif 4550 offsetInt -= 8; ld(R14, offsetInt, R1_SP); 4551 offsetInt -= 8; ld(R15, offsetInt, R1_SP); 4552 offsetInt -= 8; ld(R16, offsetInt, R1_SP); 4553 offsetInt -= 8; ld(R17, offsetInt, R1_SP); 4554 } 4555 4556 void MacroAssembler::kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp, bool invertCRC) { 4557 assert_different_registers(crc, buf, /* len, not used!! */ table, tmp); 4558 4559 BLOCK_COMMENT("kernel_crc32_singleByte:"); 4560 if (invertCRC) { 4561 nand(crc, crc, crc); // 1s complement of crc 4562 } 4563 4564 lbz(tmp, 0, buf); // Byte from buffer, zero-extended. 4565 update_byte_crc32(crc, tmp, table); 4566 4567 if (invertCRC) { 4568 nand(crc, crc, crc); // 1s complement of crc 4569 } 4570 } 4571 4572 void MacroAssembler::kernel_crc32_singleByteReg(Register crc, Register val, Register table, bool invertCRC) { 4573 assert_different_registers(crc, val, table); 4574 4575 BLOCK_COMMENT("kernel_crc32_singleByteReg:"); 4576 if (invertCRC) { 4577 nand(crc, crc, crc); // 1s complement of crc 4578 } 4579 4580 update_byte_crc32(crc, val, table); 4581 4582 if (invertCRC) { 4583 nand(crc, crc, crc); // 1s complement of crc 4584 } 4585 } 4586 4587 // dest_lo += src1 + src2 4588 // dest_hi += carry1 + carry2 4589 void MacroAssembler::add2_with_carry(Register dest_hi, 4590 Register dest_lo, 4591 Register src1, Register src2) { 4592 li(R0, 0); 4593 addc(dest_lo, dest_lo, src1); 4594 adde(dest_hi, dest_hi, R0); 4595 addc(dest_lo, dest_lo, src2); 4596 adde(dest_hi, dest_hi, R0); 4597 } 4598 4599 // Multiply 64 bit by 64 bit first loop. 4600 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, 4601 Register x_xstart, 4602 Register y, Register y_idx, 4603 Register z, 4604 Register carry, 4605 Register product_high, Register product, 4606 Register idx, Register kdx, 4607 Register tmp) { 4608 // jlong carry, x[], y[], z[]; 4609 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) { 4610 // huge_128 product = y[idx] * x[xstart] + carry; 4611 // z[kdx] = (jlong)product; 4612 // carry = (jlong)(product >>> 64); 4613 // } 4614 // z[xstart] = carry; 4615 4616 Label L_first_loop, L_first_loop_exit; 4617 Label L_one_x, L_one_y, L_multiply; 4618 4619 addic_(xstart, xstart, -1); 4620 blt(CCR0, L_one_x); // Special case: length of x is 1. 4621 4622 // Load next two integers of x. 4623 sldi(tmp, xstart, LogBytesPerInt); 4624 ldx(x_xstart, x, tmp); 4625 #ifdef VM_LITTLE_ENDIAN 4626 rldicl(x_xstart, x_xstart, 32, 0); 4627 #endif 4628 4629 align(32, 16); 4630 bind(L_first_loop); 4631 4632 cmpdi(CCR0, idx, 1); 4633 blt(CCR0, L_first_loop_exit); 4634 addi(idx, idx, -2); 4635 beq(CCR0, L_one_y); 4636 4637 // Load next two integers of y. 4638 sldi(tmp, idx, LogBytesPerInt); 4639 ldx(y_idx, y, tmp); 4640 #ifdef VM_LITTLE_ENDIAN 4641 rldicl(y_idx, y_idx, 32, 0); 4642 #endif 4643 4644 4645 bind(L_multiply); 4646 multiply64(product_high, product, x_xstart, y_idx); 4647 4648 li(tmp, 0); 4649 addc(product, product, carry); // Add carry to result. 4650 adde(product_high, product_high, tmp); // Add carry of the last addition. 4651 addi(kdx, kdx, -2); 4652 4653 // Store result. 4654 #ifdef VM_LITTLE_ENDIAN 4655 rldicl(product, product, 32, 0); 4656 #endif 4657 sldi(tmp, kdx, LogBytesPerInt); 4658 stdx(product, z, tmp); 4659 mr_if_needed(carry, product_high); 4660 b(L_first_loop); 4661 4662 4663 bind(L_one_y); // Load one 32 bit portion of y as (0,value). 4664 4665 lwz(y_idx, 0, y); 4666 b(L_multiply); 4667 4668 4669 bind(L_one_x); // Load one 32 bit portion of x as (0,value). 4670 4671 lwz(x_xstart, 0, x); 4672 b(L_first_loop); 4673 4674 bind(L_first_loop_exit); 4675 } 4676 4677 // Multiply 64 bit by 64 bit and add 128 bit. 4678 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y, 4679 Register z, Register yz_idx, 4680 Register idx, Register carry, 4681 Register product_high, Register product, 4682 Register tmp, int offset) { 4683 4684 // huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry; 4685 // z[kdx] = (jlong)product; 4686 4687 sldi(tmp, idx, LogBytesPerInt); 4688 if (offset) { 4689 addi(tmp, tmp, offset); 4690 } 4691 ldx(yz_idx, y, tmp); 4692 #ifdef VM_LITTLE_ENDIAN 4693 rldicl(yz_idx, yz_idx, 32, 0); 4694 #endif 4695 4696 multiply64(product_high, product, x_xstart, yz_idx); 4697 ldx(yz_idx, z, tmp); 4698 #ifdef VM_LITTLE_ENDIAN 4699 rldicl(yz_idx, yz_idx, 32, 0); 4700 #endif 4701 4702 add2_with_carry(product_high, product, carry, yz_idx); 4703 4704 sldi(tmp, idx, LogBytesPerInt); 4705 if (offset) { 4706 addi(tmp, tmp, offset); 4707 } 4708 #ifdef VM_LITTLE_ENDIAN 4709 rldicl(product, product, 32, 0); 4710 #endif 4711 stdx(product, z, tmp); 4712 } 4713 4714 // Multiply 128 bit by 128 bit. Unrolled inner loop. 4715 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart, 4716 Register y, Register z, 4717 Register yz_idx, Register idx, Register carry, 4718 Register product_high, Register product, 4719 Register carry2, Register tmp) { 4720 4721 // jlong carry, x[], y[], z[]; 4722 // int kdx = ystart+1; 4723 // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop 4724 // huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry; 4725 // z[kdx+idx+1] = (jlong)product; 4726 // jlong carry2 = (jlong)(product >>> 64); 4727 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry2; 4728 // z[kdx+idx] = (jlong)product; 4729 // carry = (jlong)(product >>> 64); 4730 // } 4731 // idx += 2; 4732 // if (idx > 0) { 4733 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry; 4734 // z[kdx+idx] = (jlong)product; 4735 // carry = (jlong)(product >>> 64); 4736 // } 4737 4738 Label L_third_loop, L_third_loop_exit, L_post_third_loop_done; 4739 const Register jdx = R0; 4740 4741 // Scale the index. 4742 srdi_(jdx, idx, 2); 4743 beq(CCR0, L_third_loop_exit); 4744 mtctr(jdx); 4745 4746 align(32, 16); 4747 bind(L_third_loop); 4748 4749 addi(idx, idx, -4); 4750 4751 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8); 4752 mr_if_needed(carry2, product_high); 4753 4754 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0); 4755 mr_if_needed(carry, product_high); 4756 bdnz(L_third_loop); 4757 4758 bind(L_third_loop_exit); // Handle any left-over operand parts. 4759 4760 andi_(idx, idx, 0x3); 4761 beq(CCR0, L_post_third_loop_done); 4762 4763 Label L_check_1; 4764 4765 addic_(idx, idx, -2); 4766 blt(CCR0, L_check_1); 4767 4768 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0); 4769 mr_if_needed(carry, product_high); 4770 4771 bind(L_check_1); 4772 4773 addi(idx, idx, 0x2); 4774 andi_(idx, idx, 0x1); 4775 addic_(idx, idx, -1); 4776 blt(CCR0, L_post_third_loop_done); 4777 4778 sldi(tmp, idx, LogBytesPerInt); 4779 lwzx(yz_idx, y, tmp); 4780 multiply64(product_high, product, x_xstart, yz_idx); 4781 lwzx(yz_idx, z, tmp); 4782 4783 add2_with_carry(product_high, product, yz_idx, carry); 4784 4785 sldi(tmp, idx, LogBytesPerInt); 4786 stwx(product, z, tmp); 4787 srdi(product, product, 32); 4788 4789 sldi(product_high, product_high, 32); 4790 orr(product, product, product_high); 4791 mr_if_needed(carry, product); 4792 4793 bind(L_post_third_loop_done); 4794 } // multiply_128_x_128_loop 4795 4796 void MacroAssembler::muladd(Register out, Register in, 4797 Register offset, Register len, Register k, 4798 Register tmp1, Register tmp2, Register carry) { 4799 4800 // Labels 4801 Label LOOP, SKIP; 4802 4803 // Make sure length is positive. 4804 cmpdi (CCR0, len, 0); 4805 4806 // Prepare variables 4807 subi (offset, offset, 4); 4808 li (carry, 0); 4809 ble (CCR0, SKIP); 4810 4811 mtctr (len); 4812 subi (len, len, 1 ); 4813 sldi (len, len, 2 ); 4814 4815 // Main loop 4816 bind(LOOP); 4817 lwzx (tmp1, len, in ); 4818 lwzx (tmp2, offset, out ); 4819 mulld (tmp1, tmp1, k ); 4820 add (tmp2, carry, tmp2 ); 4821 add (tmp2, tmp1, tmp2 ); 4822 stwx (tmp2, offset, out ); 4823 srdi (carry, tmp2, 32 ); 4824 subi (offset, offset, 4 ); 4825 subi (len, len, 4 ); 4826 bdnz (LOOP); 4827 bind(SKIP); 4828 } 4829 4830 void MacroAssembler::multiply_to_len(Register x, Register xlen, 4831 Register y, Register ylen, 4832 Register z, Register zlen, 4833 Register tmp1, Register tmp2, 4834 Register tmp3, Register tmp4, 4835 Register tmp5, Register tmp6, 4836 Register tmp7, Register tmp8, 4837 Register tmp9, Register tmp10, 4838 Register tmp11, Register tmp12, 4839 Register tmp13) { 4840 4841 ShortBranchVerifier sbv(this); 4842 4843 assert_different_registers(x, xlen, y, ylen, z, zlen, 4844 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6); 4845 assert_different_registers(x, xlen, y, ylen, z, zlen, 4846 tmp1, tmp2, tmp3, tmp4, tmp5, tmp7); 4847 assert_different_registers(x, xlen, y, ylen, z, zlen, 4848 tmp1, tmp2, tmp3, tmp4, tmp5, tmp8); 4849 4850 const Register idx = tmp1; 4851 const Register kdx = tmp2; 4852 const Register xstart = tmp3; 4853 4854 const Register y_idx = tmp4; 4855 const Register carry = tmp5; 4856 const Register product = tmp6; 4857 const Register product_high = tmp7; 4858 const Register x_xstart = tmp8; 4859 const Register tmp = tmp9; 4860 4861 // First Loop. 4862 // 4863 // final static long LONG_MASK = 0xffffffffL; 4864 // int xstart = xlen - 1; 4865 // int ystart = ylen - 1; 4866 // long carry = 0; 4867 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx-, kdx--) { 4868 // long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry; 4869 // z[kdx] = (int)product; 4870 // carry = product >>> 32; 4871 // } 4872 // z[xstart] = (int)carry; 4873 4874 mr_if_needed(idx, ylen); // idx = ylen 4875 mr_if_needed(kdx, zlen); // kdx = xlen + ylen 4876 li(carry, 0); // carry = 0 4877 4878 Label L_done; 4879 4880 addic_(xstart, xlen, -1); 4881 blt(CCR0, L_done); 4882 4883 multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, 4884 carry, product_high, product, idx, kdx, tmp); 4885 4886 Label L_second_loop; 4887 4888 cmpdi(CCR0, kdx, 0); 4889 beq(CCR0, L_second_loop); 4890 4891 Label L_carry; 4892 4893 addic_(kdx, kdx, -1); 4894 beq(CCR0, L_carry); 4895 4896 // Store lower 32 bits of carry. 4897 sldi(tmp, kdx, LogBytesPerInt); 4898 stwx(carry, z, tmp); 4899 srdi(carry, carry, 32); 4900 addi(kdx, kdx, -1); 4901 4902 4903 bind(L_carry); 4904 4905 // Store upper 32 bits of carry. 4906 sldi(tmp, kdx, LogBytesPerInt); 4907 stwx(carry, z, tmp); 4908 4909 // Second and third (nested) loops. 4910 // 4911 // for (int i = xstart-1; i >= 0; i--) { // Second loop 4912 // carry = 0; 4913 // for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop 4914 // long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) + 4915 // (z[k] & LONG_MASK) + carry; 4916 // z[k] = (int)product; 4917 // carry = product >>> 32; 4918 // } 4919 // z[i] = (int)carry; 4920 // } 4921 // 4922 // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx 4923 4924 bind(L_second_loop); 4925 4926 li(carry, 0); // carry = 0; 4927 4928 addic_(xstart, xstart, -1); // i = xstart-1; 4929 blt(CCR0, L_done); 4930 4931 Register zsave = tmp10; 4932 4933 mr(zsave, z); 4934 4935 4936 Label L_last_x; 4937 4938 sldi(tmp, xstart, LogBytesPerInt); 4939 add(z, z, tmp); // z = z + k - j 4940 addi(z, z, 4); 4941 addic_(xstart, xstart, -1); // i = xstart-1; 4942 blt(CCR0, L_last_x); 4943 4944 sldi(tmp, xstart, LogBytesPerInt); 4945 ldx(x_xstart, x, tmp); 4946 #ifdef VM_LITTLE_ENDIAN 4947 rldicl(x_xstart, x_xstart, 32, 0); 4948 #endif 4949 4950 4951 Label L_third_loop_prologue; 4952 4953 bind(L_third_loop_prologue); 4954 4955 Register xsave = tmp11; 4956 Register xlensave = tmp12; 4957 Register ylensave = tmp13; 4958 4959 mr(xsave, x); 4960 mr(xlensave, xstart); 4961 mr(ylensave, ylen); 4962 4963 4964 multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen, 4965 carry, product_high, product, x, tmp); 4966 4967 mr(z, zsave); 4968 mr(x, xsave); 4969 mr(xlen, xlensave); // This is the decrement of the loop counter! 4970 mr(ylen, ylensave); 4971 4972 addi(tmp3, xlen, 1); 4973 sldi(tmp, tmp3, LogBytesPerInt); 4974 stwx(carry, z, tmp); 4975 addic_(tmp3, tmp3, -1); 4976 blt(CCR0, L_done); 4977 4978 srdi(carry, carry, 32); 4979 sldi(tmp, tmp3, LogBytesPerInt); 4980 stwx(carry, z, tmp); 4981 b(L_second_loop); 4982 4983 // Next infrequent code is moved outside loops. 4984 bind(L_last_x); 4985 4986 lwz(x_xstart, 0, x); 4987 b(L_third_loop_prologue); 4988 4989 bind(L_done); 4990 } // multiply_to_len 4991 4992 void MacroAssembler::asm_assert(bool check_equal, const char *msg, int id) { 4993 #ifdef ASSERT 4994 Label ok; 4995 if (check_equal) { 4996 beq(CCR0, ok); 4997 } else { 4998 bne(CCR0, ok); 4999 } 5000 stop(msg, id); 5001 bind(ok); 5002 #endif 5003 } 5004 5005 void MacroAssembler::asm_assert_mems_zero(bool check_equal, int size, int mem_offset, 5006 Register mem_base, const char* msg, int id) { 5007 #ifdef ASSERT 5008 switch (size) { 5009 case 4: 5010 lwz(R0, mem_offset, mem_base); 5011 cmpwi(CCR0, R0, 0); 5012 break; 5013 case 8: 5014 ld(R0, mem_offset, mem_base); 5015 cmpdi(CCR0, R0, 0); 5016 break; 5017 default: 5018 ShouldNotReachHere(); 5019 } 5020 asm_assert(check_equal, msg, id); 5021 #endif // ASSERT 5022 } 5023 5024 void MacroAssembler::verify_thread() { 5025 if (VerifyThread) { 5026 unimplemented("'VerifyThread' currently not implemented on PPC"); 5027 } 5028 } 5029 5030 // READ: oop. KILL: R0. Volatile floats perhaps. 5031 void MacroAssembler::verify_oop(Register oop, const char* msg) { 5032 if (!VerifyOops) { 5033 return; 5034 } 5035 5036 address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address(); 5037 const Register tmp = R11; // Will be preserved. 5038 const int nbytes_save = MacroAssembler::num_volatile_regs * 8; 5039 save_volatile_gprs(R1_SP, -nbytes_save); // except R0 5040 5041 mr_if_needed(R4_ARG2, oop); 5042 save_LR_CR(tmp); // save in old frame 5043 push_frame_reg_args(nbytes_save, tmp); 5044 // load FunctionDescriptor** / entry_address * 5045 load_const_optimized(tmp, fd, R0); 5046 // load FunctionDescriptor* / entry_address 5047 ld(tmp, 0, tmp); 5048 load_const_optimized(R3_ARG1, (address)msg, R0); 5049 // Call destination for its side effect. 5050 call_c(tmp); 5051 5052 pop_frame(); 5053 restore_LR_CR(tmp); 5054 restore_volatile_gprs(R1_SP, -nbytes_save); // except R0 5055 } 5056 5057 void MacroAssembler::verify_oop_addr(RegisterOrConstant offs, Register base, const char* msg) { 5058 if (!VerifyOops) { 5059 return; 5060 } 5061 5062 address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address(); 5063 const Register tmp = R11; // Will be preserved. 5064 const int nbytes_save = MacroAssembler::num_volatile_regs * 8; 5065 save_volatile_gprs(R1_SP, -nbytes_save); // except R0 5066 5067 ld(R4_ARG2, offs, base); 5068 save_LR_CR(tmp); // save in old frame 5069 push_frame_reg_args(nbytes_save, tmp); 5070 // load FunctionDescriptor** / entry_address * 5071 load_const_optimized(tmp, fd, R0); 5072 // load FunctionDescriptor* / entry_address 5073 ld(tmp, 0, tmp); 5074 load_const_optimized(R3_ARG1, (address)msg, R0); 5075 // Call destination for its side effect. 5076 call_c(tmp); 5077 5078 pop_frame(); 5079 restore_LR_CR(tmp); 5080 restore_volatile_gprs(R1_SP, -nbytes_save); // except R0 5081 } 5082 5083 const char* stop_types[] = { 5084 "stop", 5085 "untested", 5086 "unimplemented", 5087 "shouldnotreachhere" 5088 }; 5089 5090 static void stop_on_request(int tp, const char* msg) { 5091 tty->print("PPC assembly code requires stop: (%s) %s\n", stop_types[tp%/*stop_end*/4], msg); 5092 guarantee(false, "PPC assembly code requires stop: %s", msg); 5093 } 5094 5095 // Call a C-function that prints output. 5096 void MacroAssembler::stop(int type, const char* msg, int id) { 5097 #ifndef PRODUCT 5098 block_comment(err_msg("stop: %s %s {", stop_types[type%stop_end], msg)); 5099 #else 5100 block_comment("stop {"); 5101 #endif 5102 5103 // setup arguments 5104 load_const_optimized(R3_ARG1, type); 5105 load_const_optimized(R4_ARG2, (void *)msg, /*tmp=*/R0); 5106 call_VM_leaf(CAST_FROM_FN_PTR(address, stop_on_request), R3_ARG1, R4_ARG2); 5107 illtrap(); 5108 emit_int32(id); 5109 block_comment("} stop;"); 5110 } 5111 5112 #ifndef PRODUCT 5113 // Write pattern 0x0101010101010101 in memory region [low-before, high+after]. 5114 // Val, addr are temp registers. 5115 // If low == addr, addr is killed. 5116 // High is preserved. 5117 void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) { 5118 if (!ZapMemory) return; 5119 5120 assert_different_registers(low, val); 5121 5122 BLOCK_COMMENT("zap memory region {"); 5123 load_const_optimized(val, 0x0101010101010101); 5124 int size = before + after; 5125 if (low == high && size < 5 && size > 0) { 5126 int offset = -before*BytesPerWord; 5127 for (int i = 0; i < size; ++i) { 5128 std(val, offset, low); 5129 offset += (1*BytesPerWord); 5130 } 5131 } else { 5132 addi(addr, low, -before*BytesPerWord); 5133 assert_different_registers(high, val); 5134 if (after) addi(high, high, after * BytesPerWord); 5135 Label loop; 5136 bind(loop); 5137 std(val, 0, addr); 5138 addi(addr, addr, 8); 5139 cmpd(CCR6, addr, high); 5140 ble(CCR6, loop); 5141 if (after) addi(high, high, -after * BytesPerWord); // Correct back to old value. 5142 } 5143 BLOCK_COMMENT("} zap memory region"); 5144 } 5145 5146 #endif // !PRODUCT 5147 5148 void SkipIfEqualZero::skip_to_label_if_equal_zero(MacroAssembler* masm, Register temp, 5149 const bool* flag_addr, Label& label) { 5150 int simm16_offset = masm->load_const_optimized(temp, (address)flag_addr, R0, true); 5151 assert(sizeof(bool) == 1, "PowerPC ABI"); 5152 masm->lbz(temp, simm16_offset, temp); 5153 masm->cmpwi(CCR0, temp, 0); 5154 masm->beq(CCR0, label); 5155 } 5156 5157 SkipIfEqualZero::SkipIfEqualZero(MacroAssembler* masm, Register temp, const bool* flag_addr) : _masm(masm), _label() { 5158 skip_to_label_if_equal_zero(masm, temp, flag_addr, _label); 5159 } 5160 5161 SkipIfEqualZero::~SkipIfEqualZero() { 5162 _masm->bind(_label); 5163 }