/*
 * Copyright (c) 1997, 2016, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2012, 2015 SAP AG. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "compiler/disassembler.hpp"
#include "gc/shared/cardTableModRefBS.hpp"
#include "gc/shared/collectedHeap.inline.hpp"
#include "interpreter/interpreter.hpp"
#include "memory/resourceArea.hpp"
#include "nativeInst_ppc.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/biasedLocking.hpp"
#include "runtime/icache.hpp"
#include "runtime/interfaceSupport.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/os.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/macros.hpp"
#if INCLUDE_ALL_GCS
#include "gc/g1/g1CollectedHeap.inline.hpp"
#include "gc/g1/g1SATBCardTableModRefBS.hpp"
#include "gc/g1/heapRegion.hpp"
#endif // INCLUDE_ALL_GCS

#ifdef PRODUCT
#define BLOCK_COMMENT(str) // nothing
#else
#define BLOCK_COMMENT(str) block_comment(str)
#endif
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

#ifdef ASSERT
// On RISC, there's no benefit to verifying instruction boundaries.
bool AbstractAssembler::pd_check_instruction_mark() { return false; }
#endif

void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) {
  assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range");
  if (Assembler::is_simm(si31, 16)) {
    ld(d, si31, a);
    if (emit_filler_nop) nop();
  } else {
    const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31);
    const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31);
    addis(d, a, hi);
    ld(d, lo, d);
  }
}

void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) {
  assert_different_registers(d, a);
  ld_largeoffset_unchecked(d, si31, a, emit_filler_nop);
}

void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base,
                                      size_t size_in_bytes, bool is_signed) {
  switch (size_in_bytes) {
  case  8:              ld(dst, offs, base);                         break;
  case  4:  is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break;
  case  2:  is_signed ? lha(dst, offs, base) : lhz(dst, offs, base); break;
  case  1:  lbz(dst, offs, base); if (is_signed) extsb(dst, dst);    break; // lba doesn't exist :(
  default:  ShouldNotReachHere();
  }
}
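
// Note on load_sized_value above: the 2-byte signed case is a single lha,
// while the 1-byte signed case needs an explicit extsb since PPC has no
// signed byte load. Illustrative calls (register choice arbitrary):
//   load_sized_value(R3, offs, base, 2, /*is_signed=*/true);  // emits: lha R3,offs(base)
//   load_sized_value(R3, offs, base, 1, /*is_signed=*/true);  // emits: lbz + extsb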

void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base,
                                       size_t size_in_bytes) {
  switch (size_in_bytes) {
  case  8:  std(dst, offs, base); break;
  case  4:  stw(dst, offs, base); break;
  case  2:  sth(dst, offs, base); break;
  case  1:  stb(dst, offs, base); break;
  default:  ShouldNotReachHere();
  }
}

void MacroAssembler::align(int modulus, int max, int rem) {
  int padding = (rem + modulus - (offset() % modulus)) % modulus;
  if (padding > max) return;
  for (int c = (padding >> 2); c > 0; --c) { nop(); }
}

// Issue instructions that calculate given TOC from global TOC.
void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16,
                                                       bool add_relocation, bool emit_dummy_addr) {
  int offset = -1;
  if (emit_dummy_addr) {
    offset = -128; // dummy address
  } else if (addr != (address)(intptr_t)-1) {
    offset = MacroAssembler::offset_to_global_toc(addr);
  }

  if (hi16) {
    addis(dst, R29_TOC, MacroAssembler::largeoffset_si16_si16_hi(offset));
  }
  if (lo16) {
    if (add_relocation) {
      // Relocate at the addi to avoid confusion with a load from the method's TOC.
      relocate(internal_word_Relocation::spec(addr));
    }
    addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset));
  }
}

int MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) {
  const int offset = MacroAssembler::offset_to_global_toc(addr);

  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the addi,
  // and the addi reads and writes the same register dst.
  const int dst = inv_rt_field(inst2);
  assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");

  // Now, find the preceding addis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  while (inst1_addr >= bound) {
    inst1 = *(int *) inst1_addr;
    if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
      // Stop, found the addis which writes dst.
      break;
    }
    inst1_addr -= BytesPerInstWord;
  }

  assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
  set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset));
  set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset));
  return (int)((intptr_t)addr - (intptr_t)inst1_addr);
}

address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) {
  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the addi,
  // and the addi reads and writes the same register dst.
  const int dst = inv_rt_field(inst2);
  assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");

  // Now, find the preceding addis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  while (inst1_addr >= bound) {
    inst1 = *(int *) inst1_addr;
    if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
      // stop, found the addis which writes dst
      break;
    }
    inst1_addr -= BytesPerInstWord;
  }

  assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");

  int offset = (get_imm(inst1_addr, 0) << 16) + get_imm(inst2_addr, 0);
  // -1 is a special case
  if (offset == -1) {
    return (address)(intptr_t)-1;
  } else {
    return global_toc() + offset;
  }
}
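
// The patch/decode routines above operate on the sequence emitted by
// calculate_address_from_global_toc:
//
//   addis dst, R29_TOC, offset.hi16   // possibly separated from the addi
//   ...                               // by unrelated instructions
//   addi  dst, dst, offset.lo16      <-- relocation points here
//
// The backwards search (bounded by 'bound') recovers the addis belonging
// to the relocated addi, so both 16-bit immediates can be read or
// rewritten as one 32-bit TOC offset.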

#ifdef _LP64
// Patch compressed oops or klass constants.
// Assembler sequence is
// 1) compressed oops:
//    lis  rx = const.hi
//    ori  rx = rx | const.lo
// 2) compressed klass:
//    lis  rx = const.hi
//    clrldi rx = rx & 0xFFFFffff // clearMS32b, optional
//    ori  rx = rx | const.lo
// The clrldi, if present, is skipped over by the patching code.
int MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) {
  assert(UseCompressedOops, "Should only patch compressed oops");

  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the ori,
  // and the ori reads and writes the same register dst.
  const int dst = inv_rta_field(inst2);
  assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
  // Now, find the preceding lis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  bool inst1_found = false;
  while (inst1_addr >= bound) {
    inst1 = *(int *)inst1_addr;
    if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
    inst1_addr -= BytesPerInstWord;
  }
  assert(inst1_found, "inst is not lis");

  int xc = (data >> 16) & 0xffff;
  int xd = (data >>  0) & 0xffff;

  set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo
  set_imm((int *)inst2_addr, (xd)); // unsigned int
  return (int)((intptr_t)inst2_addr - (intptr_t)inst1_addr);
}

// Get compressed oop or klass constant.
narrowOop MacroAssembler::get_narrow_oop(address a, address bound) {
  assert(UseCompressedOops, "Should only patch compressed oops");

  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the ori,
  // and the ori reads and writes the same register dst.
  const int dst = inv_rta_field(inst2);
  assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
  // Now, find the preceding lis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  bool inst1_found = false;

  while (inst1_addr >= bound) {
    inst1 = *(int *) inst1_addr;
    if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
    inst1_addr -= BytesPerInstWord;
  }
  assert(inst1_found, "inst is not lis");

  uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff));
  uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16);

  return (int) (xl | xh);
}
#endif // _LP64
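
// Illustrative example for the patch/get pair above: a narrow oop with
// value 0x789abcde is materialized as
//   lis rx, 0x789a      // high halfword
//   ori rx, rx, 0xbcde  // low halfword
// and patching simply rewrites the two 16-bit immediates in place.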

// Returns true if successful.
bool MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a,
                                                Register toc, bool fixed_size) {
  int toc_offset = 0;
  // Use RelocationHolder::none for the constant pool entry, otherwise
  // we will end up with a failing NativeCall::verify(x) where x is
  // the address of the constant pool entry.
  // FIXME: We should insert relocation information for oops at the constant
  // pool entries instead of inserting it at the loads; patching of a constant
  // pool entry should be less expensive.
  address const_address = address_constant((address)a.value(), RelocationHolder::none);
  if (const_address == NULL) { return false; } // allocation failure
  // Relocate at the pc of the load.
  relocate(a.rspec());
  toc_offset = (int)(const_address - code()->consts()->start());
  ld_largeoffset_unchecked(dst, toc_offset, toc, fixed_size);
  return true;
}

bool MacroAssembler::is_load_const_from_method_toc_at(address a) {
  const address inst1_addr = a;
  const int inst1 = *(int *)inst1_addr;

  // The relocation points to the ld or the addis.
  return (is_ld(inst1)) ||
         (is_addis(inst1) && inv_ra_field(inst1) != 0);
}

int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) {
  assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc");

  const address inst1_addr = a;
  const int inst1 = *(int *)inst1_addr;

  if (is_ld(inst1)) {
    return inv_d1_field(inst1);
  } else if (is_addis(inst1)) {
    const int dst = inv_rt_field(inst1);

    // Now, find the succeeding ld which reads and writes to dst.
    address inst2_addr = inst1_addr + BytesPerInstWord;
    int inst2 = 0;
    while (true) {
      inst2 = *(int *) inst2_addr;
      if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) {
        // Stop, found the ld which reads and writes dst.
        break;
      }
      inst2_addr += BytesPerInstWord;
    }
    return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2);
  }
  ShouldNotReachHere();
  return 0;
}

// Get the constant from a `load_const' sequence.
long MacroAssembler::get_const(address a) {
  assert(is_load_const_at(a), "not a load of a constant");
  const int *p = (const int*) a;
  unsigned long x = (((unsigned long) (get_imm(a,0) & 0xffff)) << 48);
  if (is_ori(*(p+1))) {
    x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 32);
    x |= (((unsigned long) (get_imm(a,3) & 0xffff)) << 16);
    x |= (((unsigned long) (get_imm(a,4) & 0xffff)));
  } else if (is_lis(*(p+1))) {
    x |= (((unsigned long) (get_imm(a,2) & 0xffff)) << 32);
    x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 16);
    x |= (((unsigned long) (get_imm(a,3) & 0xffff)));
  } else {
    ShouldNotReachHere();
    return (long) 0;
  }
  return (long) x;
}
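
// For reference, the two `load_const' shapes accepted by get_const above
// and patch_const below (sketch; immediates live at instruction indices
// 0,1,3,4 resp. 0,1,2,3, matching the get_imm/set_imm calls):
//
//   without tmp register:        with tmp register (parallel):
//     lis  d, x.48                 lis  tmp, x.48
//     ori  d, d, x.32              lis  d,   x.16
//     sldi d, d, 32                ori  tmp, tmp, x.32
//     oris d, d, x.16              ori  d,   d,   x.0
//     ori  d, d, x.0               rldimi d, tmp, 32, 0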

// Patch the 64 bit constant of a `load_const' sequence. This is a low
// level procedure. It neither flushes the instruction cache nor is it
// mt safe.
void MacroAssembler::patch_const(address a, long x) {
  assert(is_load_const_at(a), "not a load of a constant");
  int *p = (int*) a;
  if (is_ori(*(p+1))) {
    set_imm(0 + p, (x >> 48) & 0xffff);
    set_imm(1 + p, (x >> 32) & 0xffff);
    set_imm(3 + p, (x >> 16) & 0xffff);
    set_imm(4 + p, x & 0xffff);
  } else if (is_lis(*(p+1))) {
    set_imm(0 + p, (x >> 48) & 0xffff);
    set_imm(2 + p, (x >> 32) & 0xffff);
    set_imm(1 + p, (x >> 16) & 0xffff);
    set_imm(3 + p, x & 0xffff);
  } else {
    ShouldNotReachHere();
  }
}

AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) {
  assert(oop_recorder() != NULL, "this assembler needs a Recorder");
  int index = oop_recorder()->allocate_metadata_index(obj);
  RelocationHolder rspec = metadata_Relocation::spec(index);
  return AddressLiteral((address)obj, rspec);
}

AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) {
  assert(oop_recorder() != NULL, "this assembler needs a Recorder");
  int index = oop_recorder()->find_index(obj);
  RelocationHolder rspec = metadata_Relocation::spec(index);
  return AddressLiteral((address)obj, rspec);
}

AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) {
  assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int oop_index = oop_recorder()->allocate_oop_index(obj);
  return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
}

AddressLiteral MacroAssembler::constant_oop_address(jobject obj) {
  assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int oop_index = oop_recorder()->find_index(obj);
  return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
}

RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
                                                      Register tmp, int offset) {
  intptr_t value = *delayed_value_addr;
  if (value != 0) {
    return RegisterOrConstant(value + offset);
  }

  // Load indirectly to solve generation ordering problem.
  // static address, no relocation
  int simm16_offset = load_const_optimized(tmp, delayed_value_addr, noreg, true);
  ld(tmp, simm16_offset, tmp); // must be aligned ((xa & 3) == 0)

  if (offset != 0) {
    addi(tmp, tmp, offset);
  }

  return RegisterOrConstant(tmp);
}

#ifndef PRODUCT
void MacroAssembler::pd_print_patched_instruction(address branch) {
  Unimplemented(); // TODO: PPC port
}
#endif // ndef PRODUCT
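
// In the conditional branch routines below, 'boint' is the BO field of a
// PowerPC conditional branch (branch operation and prediction hints) and
// 'biint' is the BI field (which condition register bit to test).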

// Conditional far branch for destinations encodable in 24+2 bits.
void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) {

  // If requested by flag optimize, relocate the bc_far as a
  // runtime_call and prepare for optimizing it when the code gets
  // relocated.
  if (optimize == bc_far_optimize_on_relocate) {
    relocate(relocInfo::runtime_call_type);
  }

  // variant 2:
  //
  //    b!cxx SKIP
  //    bxx   DEST
  //  SKIP:
  //

  const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
                                                opposite_bcond(inv_boint_bcond(boint)));

  // We emit two branches.
  // First, a conditional branch which jumps around the far branch.
  const address not_taken_pc = pc() + 2 * BytesPerInstWord;
  const address bc_pc        = pc();
  bc(opposite_boint, biint, not_taken_pc);

  const int bc_instr = *(int*)bc_pc;
  assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition");
  assert(opposite_boint == inv_bo_field(bc_instr), "postcondition");
  assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))),
                                     opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))),
         "postcondition");
  assert(biint == inv_bi_field(bc_instr), "postcondition");

  // Second, an unconditional far branch which jumps to dest.
  // Note: target(dest) remembers the current pc (see CodeSection::target)
  //       and returns the current pc if the label is not bound yet; when
  //       the label gets bound, the unconditional far branch will be patched.
  const address target_pc = target(dest);
  const address b_pc = pc();
  b(target_pc);

  assert(not_taken_pc == pc(),                 "postcondition");
  assert(dest.is_bound() || target_pc == b_pc, "postcondition");
}

// 1 or 2 instructions
void MacroAssembler::bc_far_optimized(int boint, int biint, Label& dest) {
  if (dest.is_bound() && is_within_range_of_bcxx(target(dest), pc())) {
    bc(boint, biint, dest);
  } else {
    bc_far(boint, biint, dest, MacroAssembler::bc_far_optimize_on_relocate);
  }
}

bool MacroAssembler::is_bc_far_at(address instruction_addr) {
  return is_bc_far_variant1_at(instruction_addr) ||
         is_bc_far_variant2_at(instruction_addr) ||
         is_bc_far_variant3_at(instruction_addr);
}

address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) {
  if (is_bc_far_variant1_at(instruction_addr)) {
    const address instruction_1_addr = instruction_addr;
    const int instruction_1 = *(int*)instruction_1_addr;
    return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr);
  } else if (is_bc_far_variant2_at(instruction_addr)) {
    const address instruction_2_addr = instruction_addr + 4;
    return bxx_destination(instruction_2_addr);
  } else if (is_bc_far_variant3_at(instruction_addr)) {
    return instruction_addr + 8;
  }
  // variant 4 ???
  ShouldNotReachHere();
  return NULL;
}
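
// Summary of the bc_far variants distinguished above and rewritten below:
//   variant 1:  bcxx  DEST          (destination within bcxx range)
//               nop
//   variant 2:  b!cxx SKIP          (far destination)
//               bxx   DEST
//             SKIP:
//   variant 3:  nop                 (branch to the next instruction,
//               endgroup             patched away entirely)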

void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) {

  if (is_bc_far_variant3_at(instruction_addr)) {
    // variant 3, far cond branch to the next instruction, already patched to nops:
    //
    //    nop
    //    endgroup
    //  SKIP/DEST:
    //
    return;
  }

  // first, extract boint and biint from the current branch
  int boint = 0;
  int biint = 0;

  ResourceMark rm;
  const int code_size = 2 * BytesPerInstWord;
  CodeBuffer buf(instruction_addr, code_size);
  MacroAssembler masm(&buf);
  if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) {
    // Far branch to next instruction: Optimize it by patching nops (produce variant 3).
    masm.nop();
    masm.endgroup();
  } else {
    if (is_bc_far_variant1_at(instruction_addr)) {
      // variant 1, the 1st instruction contains the destination address:
      //
      //    bcxx  DEST
      //    nop
      //
      const int instruction_1 = *(int*)(instruction_addr);
      boint = inv_bo_field(instruction_1);
      biint = inv_bi_field(instruction_1);
    } else if (is_bc_far_variant2_at(instruction_addr)) {
      // variant 2, the 2nd instruction contains the destination address:
      //
      //    b!cxx SKIP
      //    bxx   DEST
      //  SKIP:
      //
      const int instruction_1 = *(int*)(instruction_addr);
      boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))),
                                 opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1))));
      biint = inv_bi_field(instruction_1);
    } else {
      // variant 4???
      ShouldNotReachHere();
    }

    // second, set the new branch destination and optimize the code
    if (dest != instruction_addr + 4 && // the bc_far is still unbound!
        masm.is_within_range_of_bcxx(dest, instruction_addr)) {
      // variant 1:
      //
      //    bcxx  DEST
      //    nop
      //
      masm.bc(boint, biint, dest);
      masm.nop();
    } else {
      // variant 2:
      //
      //    b!cxx SKIP
      //    bxx   DEST
      //  SKIP:
      //
      const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
                                                    opposite_bcond(inv_boint_bcond(boint)));
      const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord;
      masm.bc(opposite_boint, biint, not_taken_pc);
      masm.b(dest);
    }
  }
  ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
}

// Emit a NOT mt-safe patchable 64 bit absolute call/jump.
void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) {
  // get current pc
  uint64_t start_pc = (uint64_t) pc();

  const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last
  const address pc_of_b  = (address) (start_pc + (0*BytesPerInstWord)); // b is first

  // relocate here
  if (rt != relocInfo::none) {
    relocate(rt);
  }

  if ( ReoptimizeCallSequences &&
       (( link && is_within_range_of_b(dest, pc_of_bl)) ||
        (!link && is_within_range_of_b(dest, pc_of_b)))) {
    // variant 2:
    // Emit an optimized, pc-relative call/jump.

    if (link) {
      // some padding
      nop();
      nop();
      nop();
      nop();
      nop();
      nop();

      // do the call
      assert(pc() == pc_of_bl, "just checking");
      bl(dest, relocInfo::none);
    } else {
      // do the jump
      assert(pc() == pc_of_b, "just checking");
      b(dest, relocInfo::none);

      // some padding
      nop();
      nop();
      nop();
      nop();
      nop();
      nop();
    }

    // Assert that we can identify the emitted call/jump.
    assert(is_bxx64_patchable_variant2_at((address)start_pc, link),
           "can't identify emitted call");
  } else {
    // variant 1:
    mr(R0, R11); // spill R11 -> R0.

    // Load the destination address into CTR,
    // calculate destination relative to global toc.
    calculate_address_from_global_toc(R11, dest, true, true, false);

    mtctr(R11);
    mr(R11, R0); // spill R11 <- R0.
    nop();

    // do the call/jump
    if (link) {
      bctrl();
    } else {
      bctr();
    }
    // Assert that we can identify the emitted call/jump.
    assert(is_bxx64_patchable_variant1b_at((address)start_pc, link),
           "can't identify emitted call");
  }

  // Assert that we can identify the emitted call/jump.
  assert(is_bxx64_patchable_at((address)start_pc, link),
         "can't identify emitted call");
  assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest,
         "wrong encoding of dest address");
}

// Identify a bxx64_patchable instruction.
bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) {
  return is_bxx64_patchable_variant1b_at(instruction_addr, link)
    //|| is_bxx64_patchable_variant1_at(instruction_addr, link)
      || is_bxx64_patchable_variant2_at(instruction_addr, link);
}

// Does the call64_patchable instruction use a pc-relative encoding of
// the call destination?
bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) {
  // variant 2 is pc-relative
  return is_bxx64_patchable_variant2_at(instruction_addr, link);
}

// Identify variant 1.
bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) {
  unsigned int* instr = (unsigned int*) instruction_addr;
  return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
      && is_mtctr(instr[5]) // mtctr
      && is_load_const_at(instruction_addr);
}

// Identify variant 1b: load destination relative to global toc.
bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) {
  unsigned int* instr = (unsigned int*) instruction_addr;
  return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
      && is_mtctr(instr[3]) // mtctr
      && is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr);
}

// Identify variant 2.
bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) {
  unsigned int* instr = (unsigned int*) instruction_addr;
  if (link) {
    return is_bl (instr[6])  // bl dest is last
        && is_nop(instr[0])  // nop
        && is_nop(instr[1])  // nop
        && is_nop(instr[2])  // nop
        && is_nop(instr[3])  // nop
        && is_nop(instr[4])  // nop
        && is_nop(instr[5]); // nop
  } else {
    return is_b  (instr[0])  // b dest is first
        && is_nop(instr[1])  // nop
        && is_nop(instr[2])  // nop
        && is_nop(instr[3])  // nop
        && is_nop(instr[4])  // nop
        && is_nop(instr[5])  // nop
        && is_nop(instr[6]); // nop
  }
}
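
// For reference, the layout of variant 1b as emitted by bxx64_patchable
// above (7 instructions, instr[] indices as checked by the predicates):
//   0: mr    R0, R11               // spill R11
//   1: addis R11, R29_TOC, dest.hi // dest relative to global TOC
//   2: addi  R11, R11, dest.lo
//   3: mtctr R11
//   4: mr    R11, R0               // restore R11
//   5: nop
//   6: bctr[l]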

// Set dest address of a bxx64_patchable instruction.
void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) {
  ResourceMark rm;
  int code_size = MacroAssembler::bxx64_patchable_size;
  CodeBuffer buf(instruction_addr, code_size);
  MacroAssembler masm(&buf);
  masm.bxx64_patchable(dest, relocInfo::none, link);
  ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
}

// Get dest address of a bxx64_patchable instruction.
address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) {
  if (is_bxx64_patchable_variant1_at(instruction_addr, link)) {
    return (address) (unsigned long) get_const(instruction_addr);
  } else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) {
    unsigned int* instr = (unsigned int*) instruction_addr;
    if (link) {
      const int instr_idx = 6; // bl is last
      int branchoffset = branch_destination(instr[instr_idx], 0);
      return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
    } else {
      const int instr_idx = 0; // b is first
      int branchoffset = branch_destination(instr[instr_idx], 0);
      return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
    }
  // Load dest relative to global toc.
  } else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) {
    return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord,
                                                               instruction_addr);
  } else {
    ShouldNotReachHere();
    return NULL;
  }
}

// Uses ordering which corresponds to ABI:
//    _savegpr0_14:  std  r14,-144(r1)
//    _savegpr0_15:  std  r15,-136(r1)
//    _savegpr0_16:  std  r16,-128(r1)
void MacroAssembler::save_nonvolatile_gprs(Register dst, int offset) {
  std(R14, offset, dst);   offset += 8;
  std(R15, offset, dst);   offset += 8;
  std(R16, offset, dst);   offset += 8;
  std(R17, offset, dst);   offset += 8;
  std(R18, offset, dst);   offset += 8;
  std(R19, offset, dst);   offset += 8;
  std(R20, offset, dst);   offset += 8;
  std(R21, offset, dst);   offset += 8;
  std(R22, offset, dst);   offset += 8;
  std(R23, offset, dst);   offset += 8;
  std(R24, offset, dst);   offset += 8;
  std(R25, offset, dst);   offset += 8;
  std(R26, offset, dst);   offset += 8;
  std(R27, offset, dst);   offset += 8;
  std(R28, offset, dst);   offset += 8;
  std(R29, offset, dst);   offset += 8;
  std(R30, offset, dst);   offset += 8;
  std(R31, offset, dst);   offset += 8;

  stfd(F14, offset, dst);  offset += 8;
  stfd(F15, offset, dst);  offset += 8;
  stfd(F16, offset, dst);  offset += 8;
  stfd(F17, offset, dst);  offset += 8;
  stfd(F18, offset, dst);  offset += 8;
  stfd(F19, offset, dst);  offset += 8;
  stfd(F20, offset, dst);  offset += 8;
  stfd(F21, offset, dst);  offset += 8;
  stfd(F22, offset, dst);  offset += 8;
  stfd(F23, offset, dst);  offset += 8;
  stfd(F24, offset, dst);  offset += 8;
  stfd(F25, offset, dst);  offset += 8;
  stfd(F26, offset, dst);  offset += 8;
  stfd(F27, offset, dst);  offset += 8;
  stfd(F28, offset, dst);  offset += 8;
  stfd(F29, offset, dst);  offset += 8;
  stfd(F30, offset, dst);  offset += 8;
  stfd(F31, offset, dst);
}
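
// The save area written above holds 18 GPR slots (R14..R31) followed by
// 18 FPR slots (F14..F31), 8 bytes each, i.e. 36 * 8 = 288 bytes starting
// at dst+offset. restore_nonvolatile_gprs below reads back the same layout.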

// Uses ordering which corresponds to ABI:
//    _restgpr0_14:  ld   r14,-144(r1)
//    _restgpr0_15:  ld   r15,-136(r1)
//    _restgpr0_16:  ld   r16,-128(r1)
void MacroAssembler::restore_nonvolatile_gprs(Register src, int offset) {
  ld(R14, offset, src);   offset += 8;
  ld(R15, offset, src);   offset += 8;
  ld(R16, offset, src);   offset += 8;
  ld(R17, offset, src);   offset += 8;
  ld(R18, offset, src);   offset += 8;
  ld(R19, offset, src);   offset += 8;
  ld(R20, offset, src);   offset += 8;
  ld(R21, offset, src);   offset += 8;
  ld(R22, offset, src);   offset += 8;
  ld(R23, offset, src);   offset += 8;
  ld(R24, offset, src);   offset += 8;
  ld(R25, offset, src);   offset += 8;
  ld(R26, offset, src);   offset += 8;
  ld(R27, offset, src);   offset += 8;
  ld(R28, offset, src);   offset += 8;
  ld(R29, offset, src);   offset += 8;
  ld(R30, offset, src);   offset += 8;
  ld(R31, offset, src);   offset += 8;

  // FP registers
  lfd(F14, offset, src);  offset += 8;
  lfd(F15, offset, src);  offset += 8;
  lfd(F16, offset, src);  offset += 8;
  lfd(F17, offset, src);  offset += 8;
  lfd(F18, offset, src);  offset += 8;
  lfd(F19, offset, src);  offset += 8;
  lfd(F20, offset, src);  offset += 8;
  lfd(F21, offset, src);  offset += 8;
  lfd(F22, offset, src);  offset += 8;
  lfd(F23, offset, src);  offset += 8;
  lfd(F24, offset, src);  offset += 8;
  lfd(F25, offset, src);  offset += 8;
  lfd(F26, offset, src);  offset += 8;
  lfd(F27, offset, src);  offset += 8;
  lfd(F28, offset, src);  offset += 8;
  lfd(F29, offset, src);  offset += 8;
  lfd(F30, offset, src);  offset += 8;
  lfd(F31, offset, src);
}

// For verify_oops.
void MacroAssembler::save_volatile_gprs(Register dst, int offset) {
  std(R2,  offset, dst);   offset += 8;
  std(R3,  offset, dst);   offset += 8;
  std(R4,  offset, dst);   offset += 8;
  std(R5,  offset, dst);   offset += 8;
  std(R6,  offset, dst);   offset += 8;
  std(R7,  offset, dst);   offset += 8;
  std(R8,  offset, dst);   offset += 8;
  std(R9,  offset, dst);   offset += 8;
  std(R10, offset, dst);   offset += 8;
  std(R11, offset, dst);   offset += 8;
  std(R12, offset, dst);   offset += 8;

  stfd(F0,  offset, dst);  offset += 8;
  stfd(F1,  offset, dst);  offset += 8;
  stfd(F2,  offset, dst);  offset += 8;
  stfd(F3,  offset, dst);  offset += 8;
  stfd(F4,  offset, dst);  offset += 8;
  stfd(F5,  offset, dst);  offset += 8;
  stfd(F6,  offset, dst);  offset += 8;
  stfd(F7,  offset, dst);  offset += 8;
  stfd(F8,  offset, dst);  offset += 8;
  stfd(F9,  offset, dst);  offset += 8;
  stfd(F10, offset, dst);  offset += 8;
  stfd(F11, offset, dst);  offset += 8;
  stfd(F12, offset, dst);  offset += 8;
  stfd(F13, offset, dst);
}

// For verify_oops.
void MacroAssembler::restore_volatile_gprs(Register src, int offset) {
  ld(R2,  offset, src);   offset += 8;
  ld(R3,  offset, src);   offset += 8;
  ld(R4,  offset, src);   offset += 8;
  ld(R5,  offset, src);   offset += 8;
  ld(R6,  offset, src);   offset += 8;
  ld(R7,  offset, src);   offset += 8;
  ld(R8,  offset, src);   offset += 8;
  ld(R9,  offset, src);   offset += 8;
  ld(R10, offset, src);   offset += 8;
  ld(R11, offset, src);   offset += 8;
  ld(R12, offset, src);   offset += 8;

  lfd(F0,  offset, src);  offset += 8;
  lfd(F1,  offset, src);  offset += 8;
  lfd(F2,  offset, src);  offset += 8;
  lfd(F3,  offset, src);  offset += 8;
  lfd(F4,  offset, src);  offset += 8;
  lfd(F5,  offset, src);  offset += 8;
  lfd(F6,  offset, src);  offset += 8;
  lfd(F7,  offset, src);  offset += 8;
  lfd(F8,  offset, src);  offset += 8;
  lfd(F9,  offset, src);  offset += 8;
  lfd(F10, offset, src);  offset += 8;
  lfd(F11, offset, src);  offset += 8;
  lfd(F12, offset, src);  offset += 8;
  lfd(F13, offset, src);
}

void MacroAssembler::save_LR_CR(Register tmp) {
  mfcr(tmp);
  std(tmp, _abi(cr), R1_SP);
  mflr(tmp);
  std(tmp, _abi(lr), R1_SP);
  // Tmp must contain lr on exit! (see return_addr and prolog in ppc64.ad)
}
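
// Illustrative pairing of save_LR_CR above with the frame helpers below
// (a sketch only; real frame sizes depend on the caller):
//   save_LR_CR(R0);               // LR/CR into the caller's ABI save area
//   push_frame_reg_args(0, R11);  // new frame with ABI argument area
//   ...
//   pop_frame();
//   restore_LR_CR(R0);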

void MacroAssembler::restore_LR_CR(Register tmp) {
  assert(tmp != R1_SP, "must be distinct");
  ld(tmp, _abi(lr), R1_SP);
  mtlr(tmp);
  ld(tmp, _abi(cr), R1_SP);
  mtcr(tmp);
}

address MacroAssembler::get_PC_trash_LR(Register result) {
  Label L;
  bl(L);
  bind(L);
  address lr_pc = pc();
  mflr(result);
  return lr_pc;
}

void MacroAssembler::resize_frame(Register offset, Register tmp) {
#ifdef ASSERT
  assert_different_registers(offset, tmp, R1_SP);
  andi_(tmp, offset, frame::alignment_in_bytes-1);
  asm_assert_eq("resize_frame: unaligned", 0x204);
#endif

  // tmp <- *(SP)
  ld(tmp, _abi(callers_sp), R1_SP);
  // addr <- SP + offset;
  // *(addr) <- tmp;
  // SP <- addr
  stdux(tmp, R1_SP, offset);
}

void MacroAssembler::resize_frame(int offset, Register tmp) {
  assert(is_simm(offset, 16), "too big an offset");
  assert_different_registers(tmp, R1_SP);
  assert((offset & (frame::alignment_in_bytes-1)) == 0, "resize_frame: unaligned");
  // tmp <- *(SP)
  ld(tmp, _abi(callers_sp), R1_SP);
  // addr <- SP + offset;
  // *(addr) <- tmp;
  // SP <- addr
  stdu(tmp, offset, R1_SP);
}

void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) {
  // (addr == tmp1) || (addr == tmp2) is allowed here!
  assert(tmp1 != tmp2, "must be distinct");

  // compute offset w.r.t. current stack pointer
  // tmp_1 <- addr - SP (!)
  subf(tmp1, R1_SP, addr);

  // atomically update SP keeping back link.
  resize_frame(tmp1/* offset */, tmp2/* tmp */);
}

void MacroAssembler::push_frame(Register bytes, Register tmp) {
#ifdef ASSERT
  assert(bytes != R0, "r0 not allowed here");
  andi_(R0, bytes, frame::alignment_in_bytes-1);
  asm_assert_eq("push_frame(Reg, Reg): unaligned", 0x203);
#endif
  neg(tmp, bytes);
  stdux(R1_SP, R1_SP, tmp);
}

// Push a frame of size `bytes'.
void MacroAssembler::push_frame(unsigned int bytes, Register tmp) {
  long offset = align_addr(bytes, frame::alignment_in_bytes);
  if (is_simm(-offset, 16)) {
    stdu(R1_SP, -offset, R1_SP);
  } else {
    load_const_optimized(tmp, -offset);
    stdux(R1_SP, R1_SP, tmp);
  }
}

// Push a frame of size `bytes' plus abi_reg_args on top.
void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) {
  push_frame(bytes + frame::abi_reg_args_size, tmp);
}

// Set up a new C frame with a spill area for non-volatile GPRs and
// additional space for local variables.
void MacroAssembler::push_frame_reg_args_nonvolatiles(unsigned int bytes,
                                                      Register tmp) {
  push_frame(bytes + frame::abi_reg_args_size + frame::spill_nonvolatiles_size, tmp);
}

// Pop current C frame.
void MacroAssembler::pop_frame() {
  ld(R1_SP, _abi(callers_sp), R1_SP);
}

#if defined(ABI_ELFv2)
address MacroAssembler::branch_to(Register r_function_entry, bool and_link) {
  // TODO(asmundak): make sure the caller uses R12 as function descriptor
  // most of the times.
  if (R12 != r_function_entry) {
    mr(R12, r_function_entry);
  }
  mtctr(R12);
  // Do a call or a branch.
  if (and_link) {
    bctrl();
  } else {
    bctr();
  }
  _last_calls_return_pc = pc();

  return _last_calls_return_pc;
}

// Call a C function via a function descriptor and use full C
// calling conventions. Updates and returns _last_calls_return_pc.
address MacroAssembler::call_c(Register r_function_entry) {
  return branch_to(r_function_entry, /*and_link=*/true);
}

// For tail calls: only branch, don't link, so callee returns to caller of this function.
address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) {
  return branch_to(r_function_entry, /*and_link=*/false);
}

address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) {
  load_const(R12, function_entry, R0);
  return branch_to(R12, /*and_link=*/true);
}

#else
// Generic version of a call to C function via a function descriptor
// with variable support for C calling conventions (TOC, ENV, etc.).
// Updates and returns _last_calls_return_pc.
address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call,
                                  bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) {
  // we emit standard ptrgl glue code here
  assert((function_descriptor != R0), "function_descriptor cannot be R0");

  // retrieve necessary entries from the function descriptor
  ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor);
  mtctr(R0);

  if (load_toc_of_callee) {
    ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor);
  }
  if (load_env_of_callee) {
    ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor);
  } else if (load_toc_of_callee) {
    li(R11, 0);
  }

  // do a call or a branch
  if (and_link) {
    bctrl();
  } else {
    bctr();
  }
  _last_calls_return_pc = pc();

  return _last_calls_return_pc;
}

// Call a C function via a function descriptor and use full C calling
// conventions.
// We don't use the TOC in generated code, so there is no need to save
// and restore its value.
address MacroAssembler::call_c(Register fd) {
  return branch_to(fd, /*and_link=*/true,
                       /*save toc=*/false,
                       /*restore toc=*/false,
                       /*load toc=*/true,
                       /*load env=*/true);
}

address MacroAssembler::call_c_and_return_to_caller(Register fd) {
  return branch_to(fd, /*and_link=*/false,
                       /*save toc=*/false,
                       /*restore toc=*/false,
                       /*load toc=*/true,
                       /*load env=*/true);
}

address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) {
  if (rt != relocInfo::none) {
    // this call needs to be relocatable
    if (!ReoptimizeCallSequences
        || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
        || fd == NULL   // support code-size estimation
        || !fd->is_friend_function()
        || fd->entry() == NULL) {
      // it's not a friend function as defined by class FunctionDescriptor,
      // so do a full call-c here.
      load_const(R11, (address)fd, R0);

      bool has_env = (fd != NULL && fd->env() != NULL);
      return branch_to(R11, /*and_link=*/true,
                            /*save toc=*/false,
                            /*restore toc=*/false,
                            /*load toc=*/true,
                            /*load env=*/has_env);
    } else {
      // It's a friend function. Load the entry point and don't care about
      // toc and env. Use an optimizable call instruction, but ensure the
      // same code-size as in the case of a non-friend function.
      nop();
      nop();
      nop();
      bl64_patchable(fd->entry(), rt);
      _last_calls_return_pc = pc();
      return _last_calls_return_pc;
    }
  } else {
    // This call does not need to be relocatable, do more aggressive
    // optimizations.
    if (!ReoptimizeCallSequences
        || !fd->is_friend_function()) {
      // It's not a friend function as defined by class FunctionDescriptor,
      // so do a full call-c here.
      load_const(R11, (address)fd, R0);
      return branch_to(R11, /*and_link=*/true,
                            /*save toc=*/false,
                            /*restore toc=*/false,
                            /*load toc=*/true,
                            /*load env=*/true);
    } else {
      // it's a friend function, load the entry point and don't care about
      // toc and env.
      address dest = fd->entry();
      if (is_within_range_of_b(dest, pc())) {
        bl(dest);
      } else {
        bl64_patchable(dest, rt);
      }
      _last_calls_return_pc = pc();
      return _last_calls_return_pc;
    }
  }
}

// Call a C function. All constants needed reside in TOC.
//
// Read the address to call from the TOC.
// Read env from TOC, if fd specifies an env.
// Read new TOC from TOC.
address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd,
                                         relocInfo::relocType rt, Register toc) {
  if (!ReoptimizeCallSequences
      || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
      || !fd->is_friend_function()) {
    // It's not a friend function as defined by class FunctionDescriptor,
    // so do a full call-c here.
    assert(fd->entry() != NULL, "function must be linked");

    AddressLiteral fd_entry(fd->entry());
    bool success = load_const_from_method_toc(R11, fd_entry, toc, /*fixed_size*/ true);
    mtctr(R11);
    if (fd->env() == NULL) {
      li(R11, 0);
      nop();
    } else {
      AddressLiteral fd_env(fd->env());
      success = success && load_const_from_method_toc(R11, fd_env, toc, /*fixed_size*/ true);
    }
    AddressLiteral fd_toc(fd->toc());
    // Set R2_TOC (load from toc)
    success = success && load_const_from_method_toc(R2_TOC, fd_toc, toc, /*fixed_size*/ true);
    bctrl();
    _last_calls_return_pc = pc();
    if (!success) { return NULL; }
  } else {
    // It's a friend function, load the entry point and don't care about
    // toc and env. Use an optimizable call instruction, but ensure the
    // same code-size as in the case of a non-friend function.
    nop();
    bl64_patchable(fd->entry(), rt);
    _last_calls_return_pc = pc();
  }
  return _last_calls_return_pc;
}
#endif // ABI_ELFv2

void MacroAssembler::call_VM_base(Register oop_result,
                                  Register last_java_sp,
                                  address  entry_point,
                                  bool     check_exceptions) {
  BLOCK_COMMENT("call_VM {");
  // Determine last_java_sp register.
  if (!last_java_sp->is_valid()) {
    last_java_sp = R1_SP;
  }
  set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1);

  // ARG1 must hold thread address.
  mr(R3_ARG1, R16_thread);
#if defined(ABI_ELFv2)
  address return_pc = call_c(entry_point, relocInfo::none);
#else
  address return_pc = call_c((FunctionDescriptor*)entry_point, relocInfo::none);
#endif

  reset_last_Java_frame();

  // Check for pending exceptions.
  if (check_exceptions) {
    // We don't check for exceptions here.
    ShouldNotReachHere();
  }

  // Get oop result if there is one and reset the value in the thread.
  if (oop_result->is_valid()) {
    get_vm_result(oop_result);
  }

  _last_calls_return_pc = return_pc;
  BLOCK_COMMENT("} call_VM");
}

void MacroAssembler::call_VM_leaf_base(address entry_point) {
  BLOCK_COMMENT("call_VM_leaf {");
#if defined(ABI_ELFv2)
  call_c(entry_point, relocInfo::none);
#else
  call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, entry_point), relocInfo::none);
#endif
  BLOCK_COMMENT("} call_VM_leaf");
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions) {
  call_VM_base(oop_result, noreg, entry_point, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread.
  mr_if_needed(R4_ARG2, arg_1);
  call_VM(oop_result, entry_point, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread
  mr_if_needed(R4_ARG2, arg_1);
  assert(arg_2 != R4_ARG2, "smashed argument");
  mr_if_needed(R5_ARG3, arg_2);
  call_VM(oop_result, entry_point, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread
  mr_if_needed(R4_ARG2, arg_1);
  assert(arg_2 != R4_ARG2, "smashed argument");
  mr_if_needed(R5_ARG3, arg_2);
  mr_if_needed(R6_ARG4, arg_3);
  call_VM(oop_result, entry_point, check_exceptions);
}

void MacroAssembler::call_VM_leaf(address entry_point) {
  call_VM_leaf_base(entry_point);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) {
  mr_if_needed(R3_ARG1, arg_1);
  call_VM_leaf(entry_point);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) {
  mr_if_needed(R3_ARG1, arg_1);
  assert(arg_2 != R3_ARG1, "smashed argument");
  mr_if_needed(R4_ARG2, arg_2);
  call_VM_leaf(entry_point);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) {
  mr_if_needed(R3_ARG1, arg_1);
  assert(arg_2 != R3_ARG1, "smashed argument");
  mr_if_needed(R4_ARG2, arg_2);
  assert(arg_3 != R3_ARG1 && arg_3 != R4_ARG2, "smashed argument");
  mr_if_needed(R5_ARG3, arg_3);
  call_VM_leaf(entry_point);
}

// Check whether instruction is a read access to the polling page
// which was emitted by load_from_polling_page(..).
bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext,
                                               address* polling_address_ptr) {
  if (!is_ld(instruction))
    return false; // It's not a ld. Fail.

  int rt = inv_rt_field(instruction);
  int ra = inv_ra_field(instruction);
  int ds = inv_ds_field(instruction);
  if (!(ds == 0 && ra != 0 && rt == 0)) {
    return false; // It's not a ld(r0, X, ra). Fail.
  }

  if (!ucontext) {
    // Set polling address.
    if (polling_address_ptr != NULL) {
      *polling_address_ptr = NULL;
    }
    return true; // No ucontext given. Can't check value of ra. Assume true.
  }

#ifdef LINUX
  // Ucontext given. Check that register ra contains the address of
  // the safepoint polling page.
  ucontext_t* uc = (ucontext_t*) ucontext;
  // Set polling address.
  address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds;
  if (polling_address_ptr != NULL) {
    *polling_address_ptr = addr;
  }
  return os::is_poll_address(addr);
#else
  // Not on Linux, ucontext must be NULL.
  ShouldNotReachHere();
  return false;
#endif
}

bool MacroAssembler::is_memory_serialization(int instruction, JavaThread* thread, void* ucontext) {
#ifdef LINUX
  ucontext_t* uc = (ucontext_t*) ucontext;

  if (is_stwx(instruction) || is_stwux(instruction)) {
    int ra = inv_ra_field(instruction);
    int rb = inv_rb_field(instruction);

    // look up content of ra and rb in ucontext
    address ra_val = (address)uc->uc_mcontext.regs->gpr[ra];
    long    rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
    return os::is_memory_serialize_page(thread, ra_val + rb_val);
  } else if (is_stw(instruction) || is_stwu(instruction)) {
    int ra = inv_ra_field(instruction);
    int d1 = inv_d1_field(instruction);

    // look up content of ra in ucontext
    address ra_val = (address)uc->uc_mcontext.regs->gpr[ra];
    return os::is_memory_serialize_page(thread, ra_val + d1);
  } else {
    return false;
  }
#else
  // workaround not needed on !LINUX :-)
  ShouldNotCallThis();
  return false;
#endif
}

void MacroAssembler::bang_stack_with_offset(int offset) {
  // When increasing the stack, the old stack pointer will be written
  // to the new top of stack according to the PPC64 ABI.
  // Therefore, stack banging is not necessary when increasing
  // the stack by <= os::vm_page_size() bytes.
  // When increasing the stack by a larger amount, this method is
  // called repeatedly to bang the intermediate pages.

  // Stack grows down, caller passes positive offset.
  assert(offset > 0, "must bang with positive offset");

  long stdoffset = -offset;

  if (is_simm(stdoffset, 16)) {
    // Signed 16 bit offset, a simple std is ok.
    if (UseLoadInstructionsForStackBangingPPC64) {
      ld(R0, (int)(signed short)stdoffset, R1_SP);
    } else {
      std(R0, (int)(signed short)stdoffset, R1_SP);
    }
  } else if (is_simm(stdoffset, 31)) {
    const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset);
    const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset);

    Register tmp = R11;
    addis(tmp, R1_SP, hi);
    if (UseLoadInstructionsForStackBangingPPC64) {
      ld(R0, lo, tmp);
    } else {
      std(R0, lo, tmp);
    }
  } else {
    ShouldNotReachHere();
  }
}
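
// Illustrative use of bang_stack_with_offset above: growing the stack by
// three pages is banged page by page, each call emitting one std (or ld,
// if UseLoadInstructionsForStackBangingPPC64 is set):
//   bang_stack_with_offset(1 * os::vm_page_size());
//   bang_stack_with_offset(2 * os::vm_page_size());
//   bang_stack_with_offset(3 * os::vm_page_size());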

// If instruction is a stack bang of the form
//    std    R0,    x(Ry),       (see bang_stack_with_offset())
//    stdu   R1_SP, x(R1_SP),    (see push_frame(), resize_frame())
// or stdux  R1_SP, Rx, R1_SP    (see push_frame(), resize_frame())
// return the banged address. Otherwise, return 0.
address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) {
#ifdef LINUX
  ucontext_t* uc = (ucontext_t*) ucontext;
  int rs = inv_rs_field(instruction);
  int ra = inv_ra_field(instruction);
  if (   (is_ld(instruction)   && rs == 0 &&  UseLoadInstructionsForStackBangingPPC64)
      || (is_std(instruction)  && rs == 0 && !UseLoadInstructionsForStackBangingPPC64)
      || (is_stdu(instruction) && rs == 1)) {
    int ds = inv_ds_field(instruction);
    // return banged address
    return ds + (address)uc->uc_mcontext.regs->gpr[ra];
  } else if (is_stdux(instruction) && rs == 1) {
    int rb = inv_rb_field(instruction);
    address sp = (address)uc->uc_mcontext.regs->gpr[1];
    long rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
    return ra != 1 || rb_val >= 0 ? NULL         // not a stack bang
                                  : sp + rb_val; // banged address
  }
  return NULL; // not a stack bang
#else
  // workaround not needed on !LINUX :-)
  ShouldNotCallThis();
  return NULL;
#endif
}

// CmpxchgX sets condition register to cmpX(current, compare).
void MacroAssembler::cmpxchgw(ConditionRegister flag, Register dest_current_value,
                              Register compare_value, Register exchange_value,
                              Register addr_base, int semantics, bool cmpxchgx_hint,
                              Register int_flag_success, bool contention_hint) {
  Label retry;
  Label failed;
  Label done;

  // Save one branch if result is returned via register and
  // result register is different from the other ones.
  bool use_result_reg    = (int_flag_success != noreg);
  bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value &&
                            int_flag_success != exchange_value && int_flag_success != addr_base);

  if (use_result_reg && preset_result_reg) {
    li(int_flag_success, 0); // preset (assume cas failed)
  }

  // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
  if (contention_hint) { // Don't try to reserve if cmp fails.
    lwz(dest_current_value, 0, addr_base);
    cmpw(flag, dest_current_value, compare_value);
    bne(flag, failed);
  }

  // release/fence semantics
  if (semantics & MemBarRel) {
    release();
  }

  // atomic emulation loop
  bind(retry);

  lwarx(dest_current_value, addr_base, cmpxchgx_hint);
  cmpw(flag, dest_current_value, compare_value);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(flag, failed);
  } else {
    bne(                  flag, failed);
  }
  // branch to done  => (flag == ne), (dest_current_value != compare_value)
  // fall through    => (flag == eq), (dest_current_value == compare_value)

  stwcx_(exchange_value, addr_base);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
  } else {
    bne(                  CCR0, retry); // StXcx_ sets CCR0.
  }
  // fall through    => (flag == eq), (dest_current_value == compare_value), (swapped)

  // Result in register (must do this at the end because int_flag_success can be the
  // same register as one above).
  if (use_result_reg) {
    li(int_flag_success, 1);
  }

  if (semantics & MemBarFenceAfter) {
    fence();
  } else if (semantics & MemBarAcq) {
    isync();
  }

  if (use_result_reg && !preset_result_reg) {
    b(done);
  }

  bind(failed);
  if (use_result_reg && !preset_result_reg) {
    li(int_flag_success, 0);
  }

  bind(done);
  // (flag == ne) => (dest_current_value != compare_value), (!swapped)
  // (flag == eq) => (dest_current_value == compare_value), ( swapped)
}
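
// Illustrative use of cmpxchgw above as a fully fenced 32-bit CAS that
// also returns a success flag in a GPR (register choice arbitrary):
//   cmpxchgw(CCR0, Rcur, Rcmp, Rnew, Rbase,
//            MacroAssembler::MemBarRel | MacroAssembler::MemBarFenceAfter,
//            /*cmpxchgx_hint=*/false, Rres);
// Afterwards Rres is 1 on success and 0 on failure, and CCR0 shows 'eq'
// exactly on success.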

// Performs an atomic compare exchange:
//   if (compare_value == *addr_base)
//     *addr_base = exchange_value
//     int_flag_success = 1;
//   else
//     int_flag_success = 0;
//
// ConditionRegister flag       = cmp(compare_value, *addr_base)
// Register dest_current_value  = *addr_base
// Register compare_value       Used to compare with value in memory
// Register exchange_value      Written to memory if compare_value == *addr_base
// Register addr_base           The memory location to compareXChange
// Register int_flag_success    Set to 1 if exchange_value was written to *addr_base
//
// To avoid the costly compare exchange, the value is tested beforehand.
// Several special cases are handled to avoid generating unnecessary code.
//
void MacroAssembler::cmpxchgd(ConditionRegister flag,
                              Register dest_current_value, RegisterOrConstant compare_value, Register exchange_value,
                              Register addr_base, int semantics, bool cmpxchgx_hint,
                              Register int_flag_success, Label* failed_ext, bool contention_hint) {
  Label retry;
  Label failed_int;
  Label& failed = (failed_ext != NULL) ? *failed_ext : failed_int;
  Label done;

  // Save one branch if result is returned via register and result register is different from the other ones.
  bool use_result_reg    = (int_flag_success != noreg);
  bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value.register_or_noreg() &&
                            int_flag_success != exchange_value && int_flag_success != addr_base);
  assert(int_flag_success == noreg || failed_ext == NULL, "cannot have both");

  if (use_result_reg && preset_result_reg) {
    li(int_flag_success, 0); // preset (assume cas failed)
  }

  // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
  if (contention_hint) { // Don't try to reserve if cmp fails.
    ld(dest_current_value, 0, addr_base);
    cmpd(flag, compare_value, dest_current_value);
    bne(flag, failed);
  }

  // release/fence semantics
  if (semantics & MemBarRel) {
    release();
  }

  // atomic emulation loop
  bind(retry);

  ldarx(dest_current_value, addr_base, cmpxchgx_hint);
  cmpd(flag, compare_value, dest_current_value);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(flag, failed);
  } else {
    bne(                  flag, failed);
  }

  stdcx_(exchange_value, addr_base);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
  } else {
    bne(                  CCR0, retry); // stXcx_ sets CCR0
  }

  // result in register (must do this at the end because int_flag_success can be the same register as one above)
  if (use_result_reg) {
    li(int_flag_success, 1);
  }

  if (semantics & MemBarFenceAfter) {
    fence();
  } else if (semantics & MemBarAcq) {
    isync();
  }

  if (use_result_reg && !preset_result_reg) {
    b(done);
  }

  bind(failed_int);
  if (use_result_reg && !preset_result_reg) {
    li(int_flag_success, 0);
  }

  bind(done);
  // (flag == ne) => (dest_current_value != compare_value), (!swapped)
  // (flag == eq) => (dest_current_value == compare_value), ( swapped)
}
1601 if (itable_index.is_register()) { 1602 Register itable_offset = itable_index.as_register(); 1603 sldi(itable_offset, itable_offset, logMEsize); 1604 if (itentry_off) addi(itable_offset, itable_offset, itentry_off); 1605 add(recv_klass, itable_offset, recv_klass); 1606 } else { 1607 long itable_offset = (long)itable_index.as_constant(); 1608 load_const_optimized(sethi_temp, (itable_offset<<logMEsize)+itentry_off); // static address, no relocation 1609 add(recv_klass, sethi_temp, recv_klass); 1610 } 1611 1612 // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) { 1613 // if (scan->interface() == intf) { 1614 // result = (klass + scan->offset() + itable_index); 1615 // } 1616 // } 1617 Label search, found_method; 1618 1619 for (int peel = 1; peel >= 0; peel--) { 1620 // %%%% Could load both offset and interface in one ldx, if they were 1621 // in the opposite order. This would save a load. 1622 ld(method_result, itableOffsetEntry::interface_offset_in_bytes(), scan_temp); 1623 1624 // Check that this entry is non-null. A null entry means that 1625 // the receiver class doesn't implement the interface, and wasn't the 1626 // same as when the caller was compiled. 1627 cmpd(CCR0, method_result, intf_klass); 1628 1629 if (peel) { 1630 beq(CCR0, found_method); 1631 } else { 1632 bne(CCR0, search); 1633 // (invert the test to fall through to found_method...) 1634 } 1635 1636 if (!peel) break; 1637 1638 bind(search); 1639 1640 cmpdi(CCR0, method_result, 0); 1641 beq(CCR0, L_no_such_interface); 1642 addi(scan_temp, scan_temp, scan_step); 1643 } 1644 1645 bind(found_method); 1646 1647 // Got a hit. 1648 int ito_offset = itableOffsetEntry::offset_offset_in_bytes(); 1649 lwz(scan_temp, ito_offset, scan_temp); 1650 ldx(method_result, scan_temp, recv_klass); 1651 } 1652 1653 // virtual method calling 1654 void MacroAssembler::lookup_virtual_method(Register recv_klass, 1655 RegisterOrConstant vtable_index, 1656 Register method_result) { 1657 1658 assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg()); 1659 1660 const int base = in_bytes(InstanceKlass::vtable_start_offset()); 1661 assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below"); 1662 1663 if (vtable_index.is_register()) { 1664 sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord); 1665 add(recv_klass, vtable_index.as_register(), recv_klass); 1666 } else { 1667 addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord); 1668 } 1669 ld(R19_method, base + vtableEntry::method_offset_in_bytes(), recv_klass); 1670 } 1671 1672 /////////////////////////////////////////// subtype checking //////////////////////////////////////////// 1673 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass, 1674 Register super_klass, 1675 Register temp1_reg, 1676 Register temp2_reg, 1677 Label* L_success, 1678 Label* L_failure, 1679 Label* L_slow_path, 1680 RegisterOrConstant super_check_offset) { 1681 1682 const Register check_cache_offset = temp1_reg; 1683 const Register cached_super = temp2_reg; 1684 1685 assert_different_registers(sub_klass, super_klass, check_cache_offset, cached_super); 1686 1687 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1688 int sc_offset = in_bytes(Klass::secondary_super_cache_offset()); 1689 1690 bool must_load_sco = (super_check_offset.constant_or_zero() == -1); 1691 bool need_slow_path = (must_load_sco || super_check_offset.constant_or_zero() == sco_offset); 1692 1693 Label 
L_fallthrough; 1694 int label_nulls = 0; 1695 if (L_success == NULL) { L_success = &L_fallthrough; label_nulls++; } 1696 if (L_failure == NULL) { L_failure = &L_fallthrough; label_nulls++; } 1697 if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; } 1698 assert(label_nulls <= 1 || 1699 (L_slow_path == &L_fallthrough && label_nulls <= 2 && !need_slow_path), 1700 "at most one NULL in the batch, usually"); 1701 1702 // If the pointers are equal, we are done (e.g., String[] elements). 1703 // This self-check enables sharing of secondary supertype arrays among 1704 // non-primary types such as array-of-interface. Otherwise, each such 1705 // type would need its own customized SSA. 1706 // We move this check to the front of the fast path because many 1707 // type checks are in fact trivially successful in this manner, 1708 // so we get a nicely predicted branch right at the start of the check. 1709 cmpd(CCR0, sub_klass, super_klass); 1710 beq(CCR0, *L_success); 1711 1712 // Check the supertype display: 1713 if (must_load_sco) { 1714 // The super check offset is always positive... 1715 lwz(check_cache_offset, sco_offset, super_klass); 1716 super_check_offset = RegisterOrConstant(check_cache_offset); 1717 // super_check_offset is register. 1718 assert_different_registers(sub_klass, super_klass, cached_super, super_check_offset.as_register()); 1719 } 1720 // The loaded value is the offset from KlassOopDesc. 1721 1722 ld(cached_super, super_check_offset, sub_klass); 1723 cmpd(CCR0, cached_super, super_klass); 1724 1725 // This check has worked decisively for primary supers. 1726 // Secondary supers are sought in the super_cache ('super_cache_addr'). 1727 // (Secondary supers are interfaces and very deeply nested subtypes.) 1728 // This works in the same check above because of a tricky aliasing 1729 // between the super_cache and the primary super display elements. 1730 // (The 'super_check_addr' can address either, as the case requires.) 1731 // Note that the cache is updated below if it does not help us find 1732 // what we need immediately. 1733 // So if it was a primary super, we can just fail immediately. 1734 // Otherwise, it's the slow path for us (no success at this point). 1735 1736 #define FINAL_JUMP(label) if (&(label) != &L_fallthrough) { b(label); } 1737 1738 if (super_check_offset.is_register()) { 1739 beq(CCR0, *L_success); 1740 cmpwi(CCR0, super_check_offset.as_register(), sc_offset); 1741 if (L_failure == &L_fallthrough) { 1742 beq(CCR0, *L_slow_path); 1743 } else { 1744 bne(CCR0, *L_failure); 1745 FINAL_JUMP(*L_slow_path); 1746 } 1747 } else { 1748 if (super_check_offset.as_constant() == sc_offset) { 1749 // Need a slow path; fast failure is impossible. 1750 if (L_slow_path == &L_fallthrough) { 1751 beq(CCR0, *L_success); 1752 } else { 1753 bne(CCR0, *L_slow_path); 1754 FINAL_JUMP(*L_success); 1755 } 1756 } else { 1757 // No slow path; it's a fast decision. 
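// (Rough summary of this constant-offset case: CCR0 already holds the result
// of cmpd(cached_super, super_klass); since the constant offset cannot address
// the secondary-super cache here, eq decides success and ne decides failure.)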
1758 if (L_failure == &L_fallthrough) {
1759 beq(CCR0, *L_success);
1760 } else {
1761 bne(CCR0, *L_failure);
1762 FINAL_JUMP(*L_success);
1763 }
1764 }
1765 }
1766
1767 bind(L_fallthrough);
1768 #undef FINAL_JUMP
1769 }
1770
1771 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
1772 Register super_klass,
1773 Register temp1_reg,
1774 Register temp2_reg,
1775 Label* L_success,
1776 Register result_reg) {
1777 const Register array_ptr = temp1_reg; // current value from cache array
1778 const Register temp = temp2_reg;
1779
1780 assert_different_registers(sub_klass, super_klass, array_ptr, temp);
1781
1782 int source_offset = in_bytes(Klass::secondary_supers_offset());
1783 int target_offset = in_bytes(Klass::secondary_super_cache_offset());
1784
1785 int length_offset = Array<Klass*>::length_offset_in_bytes();
1786 int base_offset = Array<Klass*>::base_offset_in_bytes();
1787
1788 Label hit, loop, failure, fallthru;
1789
1790 ld(array_ptr, source_offset, sub_klass);
1791
1792 // TODO: PPC port: assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated.");
1793 lwz(temp, length_offset, array_ptr);
1794 cmpwi(CCR0, temp, 0);
1795 beq(CCR0, result_reg!=noreg ? failure : fallthru); // length 0
1796
1797 mtctr(temp); // load ctr
1798
1799 bind(loop);
1800 // Oops in the table are no longer compressed.
1801 ld(temp, base_offset, array_ptr);
1802 cmpd(CCR0, temp, super_klass);
1803 beq(CCR0, hit);
1804 addi(array_ptr, array_ptr, BytesPerWord);
1805 bdnz(loop);
1806
1807 bind(failure);
1808 if (result_reg!=noreg) li(result_reg, 1); // load non-zero result (indicates a miss)
1809 b(fallthru);
1810
1811 bind(hit);
1812 std(super_klass, target_offset, sub_klass); // save result to cache
1813 if (result_reg != noreg) { li(result_reg, 0); } // load zero result (indicates a hit)
1814 if (L_success != NULL) { b(*L_success); }
1815 else if (result_reg == noreg) { blr(); } // return with CR0.eq if neither label nor result reg provided
1816
1817 bind(fallthru);
1818 }
1819
1820 // Try the fast path, then fall back to the slow path if not successful.
1821 void MacroAssembler::check_klass_subtype(Register sub_klass,
1822 Register super_klass,
1823 Register temp1_reg,
1824 Register temp2_reg,
1825 Label& L_success) {
1826 Label L_failure;
1827 check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success, &L_failure);
1828 check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success);
1829 bind(L_failure); // Fallthru if not successful.
1830 }
1831
1832 void MacroAssembler::check_method_handle_type(Register mtype_reg, Register mh_reg,
1833 Register temp_reg,
1834 Label& wrong_method_type) {
1835 assert_different_registers(mtype_reg, mh_reg, temp_reg);
1836 // Compare method type against that of the receiver.
1837 load_heap_oop_not_null(temp_reg, delayed_value(java_lang_invoke_MethodHandle::type_offset_in_bytes, temp_reg), mh_reg);
1838 cmpd(CCR0, temp_reg, mtype_reg);
1839 bne(CCR0, wrong_method_type);
1840 }
1841
1842 RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot,
1843 Register temp_reg,
1844 int extra_slot_offset) {
1845 // cf. TemplateTable::prepare_invoke(), if (load_receiver).
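// In effect (sketch): returns (arg_slot + extra_slot_offset) * stackElementSize,
// folded into a constant when arg_slot is a constant, otherwise computed into
// temp_reg below.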
1846 int stackElementSize = Interpreter::stackElementSize; 1847 int offset = extra_slot_offset * stackElementSize; 1848 if (arg_slot.is_constant()) { 1849 offset += arg_slot.as_constant() * stackElementSize; 1850 return offset; 1851 } else { 1852 assert(temp_reg != noreg, "must specify"); 1853 sldi(temp_reg, arg_slot.as_register(), exact_log2(stackElementSize)); 1854 if (offset != 0) 1855 addi(temp_reg, temp_reg, offset); 1856 return temp_reg; 1857 } 1858 } 1859 1860 // Supports temp2_reg = R0. 1861 void MacroAssembler::biased_locking_enter(ConditionRegister cr_reg, Register obj_reg, 1862 Register mark_reg, Register temp_reg, 1863 Register temp2_reg, Label& done, Label* slow_case) { 1864 assert(UseBiasedLocking, "why call this otherwise?"); 1865 1866 #ifdef ASSERT 1867 assert_different_registers(obj_reg, mark_reg, temp_reg, temp2_reg); 1868 #endif 1869 1870 Label cas_label; 1871 1872 // Branch to done if fast path fails and no slow_case provided. 1873 Label *slow_case_int = (slow_case != NULL) ? slow_case : &done; 1874 1875 // Biased locking 1876 // See whether the lock is currently biased toward our thread and 1877 // whether the epoch is still valid 1878 // Note that the runtime guarantees sufficient alignment of JavaThread 1879 // pointers to allow age to be placed into low bits 1880 assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, 1881 "biased locking makes assumptions about bit layout"); 1882 1883 if (PrintBiasedLockingStatistics) { 1884 load_const(temp2_reg, (address) BiasedLocking::total_entry_count_addr(), temp_reg); 1885 lwzx(temp_reg, temp2_reg); 1886 addi(temp_reg, temp_reg, 1); 1887 stwx(temp_reg, temp2_reg); 1888 } 1889 1890 andi(temp_reg, mark_reg, markOopDesc::biased_lock_mask_in_place); 1891 cmpwi(cr_reg, temp_reg, markOopDesc::biased_lock_pattern); 1892 bne(cr_reg, cas_label); 1893 1894 load_klass(temp_reg, obj_reg); 1895 1896 load_const_optimized(temp2_reg, ~((int) markOopDesc::age_mask_in_place)); 1897 ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg); 1898 orr(temp_reg, R16_thread, temp_reg); 1899 xorr(temp_reg, mark_reg, temp_reg); 1900 andr(temp_reg, temp_reg, temp2_reg); 1901 cmpdi(cr_reg, temp_reg, 0); 1902 if (PrintBiasedLockingStatistics) { 1903 Label l; 1904 bne(cr_reg, l); 1905 load_const(temp2_reg, (address) BiasedLocking::biased_lock_entry_count_addr()); 1906 lwzx(mark_reg, temp2_reg); 1907 addi(mark_reg, mark_reg, 1); 1908 stwx(mark_reg, temp2_reg); 1909 // restore mark_reg 1910 ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg); 1911 bind(l); 1912 } 1913 beq(cr_reg, done); 1914 1915 Label try_revoke_bias; 1916 Label try_rebias; 1917 1918 // At this point we know that the header has the bias pattern and 1919 // that we are not the bias owner in the current epoch. We need to 1920 // figure out more details about the state of the header in order to 1921 // know what operations can be legally performed on the object's 1922 // header. 1923 1924 // If the low three bits in the xor result aren't clear, that means 1925 // the prototype header is no longer biased and we have to revoke 1926 // the bias on this object. 1927 andi(temp2_reg, temp_reg, markOopDesc::biased_lock_mask_in_place); 1928 cmpwi(cr_reg, temp2_reg, 0); 1929 bne(cr_reg, try_revoke_bias); 1930 1931 // Biasing is still enabled for this data type. See whether the 1932 // epoch of the current bias is still valid, meaning that the epoch 1933 // bits of the mark word are equal to the epoch bits of the 1934 // prototype header. 
(Note that the prototype header's epoch bits 1935 // only change at a safepoint.) If not, attempt to rebias the object 1936 // toward the current thread. Note that we must be absolutely sure 1937 // that the current epoch is invalid in order to do this because 1938 // otherwise the manipulations it performs on the mark word are 1939 // illegal. 1940 1941 int shift_amount = 64 - markOopDesc::epoch_shift; 1942 // rotate epoch bits to right (little) end and set other bits to 0 1943 // [ big part | epoch | little part ] -> [ 0..0 | epoch ] 1944 rldicl_(temp2_reg, temp_reg, shift_amount, 64 - markOopDesc::epoch_bits); 1945 // branch if epoch bits are != 0, i.e. they differ, because the epoch has been incremented 1946 bne(CCR0, try_rebias); 1947 1948 // The epoch of the current bias is still valid but we know nothing 1949 // about the owner; it might be set or it might be clear. Try to 1950 // acquire the bias of the object using an atomic operation. If this 1951 // fails we will go in to the runtime to revoke the object's bias. 1952 // Note that we first construct the presumed unbiased header so we 1953 // don't accidentally blow away another thread's valid bias. 1954 andi(mark_reg, mark_reg, (markOopDesc::biased_lock_mask_in_place | 1955 markOopDesc::age_mask_in_place | 1956 markOopDesc::epoch_mask_in_place)); 1957 orr(temp_reg, R16_thread, mark_reg); 1958 1959 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 1960 1961 // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg). 1962 cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg, 1963 /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg, 1964 /*where=*/obj_reg, 1965 MacroAssembler::MemBarAcq, 1966 MacroAssembler::cmpxchgx_hint_acquire_lock(), 1967 noreg, slow_case_int); // bail out if failed 1968 1969 // If the biasing toward our thread failed, this means that 1970 // another thread succeeded in biasing it toward itself and we 1971 // need to revoke that bias. The revocation will occur in the 1972 // interpreter runtime in the slow case. 1973 if (PrintBiasedLockingStatistics) { 1974 load_const(temp2_reg, (address) BiasedLocking::anonymously_biased_lock_entry_count_addr(), temp_reg); 1975 lwzx(temp_reg, temp2_reg); 1976 addi(temp_reg, temp_reg, 1); 1977 stwx(temp_reg, temp2_reg); 1978 } 1979 b(done); 1980 1981 bind(try_rebias); 1982 // At this point we know the epoch has expired, meaning that the 1983 // current "bias owner", if any, is actually invalid. Under these 1984 // circumstances _only_, we are allowed to use the current header's 1985 // value as the comparison value when doing the cas to acquire the 1986 // bias in the current epoch. In other words, we allow transfer of 1987 // the bias from one thread to another directly in this situation. 
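// Sketch of the rebias attempt that follows (pseudocode, not emitted code):
//   new_header = klass->prototype_header | current_thread | (mark & age_mask);
//   if (!CAS(&obj->mark, mark, new_header)) goto slow_case;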
1988 load_klass(temp_reg, obj_reg); 1989 andi(temp2_reg, mark_reg, markOopDesc::age_mask_in_place); 1990 orr(temp2_reg, R16_thread, temp2_reg); 1991 ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg); 1992 orr(temp_reg, temp2_reg, temp_reg); 1993 1994 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 1995 1996 cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg, 1997 /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg, 1998 /*where=*/obj_reg, 1999 MacroAssembler::MemBarAcq, 2000 MacroAssembler::cmpxchgx_hint_acquire_lock(), 2001 noreg, slow_case_int); // bail out if failed 2002 2003 // If the biasing toward our thread failed, this means that 2004 // another thread succeeded in biasing it toward itself and we 2005 // need to revoke that bias. The revocation will occur in the 2006 // interpreter runtime in the slow case. 2007 if (PrintBiasedLockingStatistics) { 2008 load_const(temp2_reg, (address) BiasedLocking::rebiased_lock_entry_count_addr(), temp_reg); 2009 lwzx(temp_reg, temp2_reg); 2010 addi(temp_reg, temp_reg, 1); 2011 stwx(temp_reg, temp2_reg); 2012 } 2013 b(done); 2014 2015 bind(try_revoke_bias); 2016 // The prototype mark in the klass doesn't have the bias bit set any 2017 // more, indicating that objects of this data type are not supposed 2018 // to be biased any more. We are going to try to reset the mark of 2019 // this object to the prototype value and fall through to the 2020 // CAS-based locking scheme. Note that if our CAS fails, it means 2021 // that another thread raced us for the privilege of revoking the 2022 // bias of this particular object, so it's okay to continue in the 2023 // normal locking code. 2024 load_klass(temp_reg, obj_reg); 2025 ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg); 2026 andi(temp2_reg, mark_reg, markOopDesc::age_mask_in_place); 2027 orr(temp_reg, temp_reg, temp2_reg); 2028 2029 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 2030 2031 // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg). 2032 cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg, 2033 /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg, 2034 /*where=*/obj_reg, 2035 MacroAssembler::MemBarAcq, 2036 MacroAssembler::cmpxchgx_hint_acquire_lock()); 2037 2038 // reload markOop in mark_reg before continuing with lightweight locking 2039 ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg); 2040 2041 // Fall through to the normal CAS-based lock, because no matter what 2042 // the result of the above CAS, some thread must have succeeded in 2043 // removing the bias bit from the object's header. 2044 if (PrintBiasedLockingStatistics) { 2045 Label l; 2046 bne(cr_reg, l); 2047 load_const(temp2_reg, (address) BiasedLocking::revoked_lock_entry_count_addr(), temp_reg); 2048 lwzx(temp_reg, temp2_reg); 2049 addi(temp_reg, temp_reg, 1); 2050 stwx(temp_reg, temp2_reg); 2051 bind(l); 2052 } 2053 2054 bind(cas_label); 2055 } 2056 2057 void MacroAssembler::biased_locking_exit (ConditionRegister cr_reg, Register mark_addr, Register temp_reg, Label& done) { 2058 // Check for biased locking unlock case, which is a no-op 2059 // Note: we do not have to check the thread ID for two reasons. 2060 // First, the interpreter checks for IllegalMonitorStateException at 2061 // a higher level. Second, if the bias was revoked while we held the 2062 // lock, the object could not be rebiased toward another thread, so 2063 // the bias bit would be clear. 
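// In effect (sketch): branch to done iff
//   (*mark_addr & biased_lock_mask) == biased_lock_pattern,
// i.e. the header still carries a bias, in which case the unlock is a no-op.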
2064 2065 ld(temp_reg, 0, mark_addr); 2066 andi(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place); 2067 2068 cmpwi(cr_reg, temp_reg, markOopDesc::biased_lock_pattern); 2069 beq(cr_reg, done); 2070 } 2071 2072 // allocation (for C1) 2073 void MacroAssembler::eden_allocate( 2074 Register obj, // result: pointer to object after successful allocation 2075 Register var_size_in_bytes, // object size in bytes if unknown at compile time; invalid otherwise 2076 int con_size_in_bytes, // object size in bytes if known at compile time 2077 Register t1, // temp register 2078 Register t2, // temp register 2079 Label& slow_case // continuation point if fast allocation fails 2080 ) { 2081 b(slow_case); 2082 } 2083 2084 void MacroAssembler::tlab_allocate( 2085 Register obj, // result: pointer to object after successful allocation 2086 Register var_size_in_bytes, // object size in bytes if unknown at compile time; invalid otherwise 2087 int con_size_in_bytes, // object size in bytes if known at compile time 2088 Register t1, // temp register 2089 Label& slow_case // continuation point if fast allocation fails 2090 ) { 2091 // make sure arguments make sense 2092 assert_different_registers(obj, var_size_in_bytes, t1); 2093 assert(0 <= con_size_in_bytes && is_simm13(con_size_in_bytes), "illegal object size"); 2094 assert((con_size_in_bytes & MinObjAlignmentInBytesMask) == 0, "object size is not multiple of alignment"); 2095 2096 const Register new_top = t1; 2097 //verify_tlab(); not implemented 2098 2099 ld(obj, in_bytes(JavaThread::tlab_top_offset()), R16_thread); 2100 ld(R0, in_bytes(JavaThread::tlab_end_offset()), R16_thread); 2101 if (var_size_in_bytes == noreg) { 2102 addi(new_top, obj, con_size_in_bytes); 2103 } else { 2104 add(new_top, obj, var_size_in_bytes); 2105 } 2106 cmpld(CCR0, new_top, R0); 2107 bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_case); 2108 2109 #ifdef ASSERT 2110 // make sure new free pointer is properly aligned 2111 { 2112 Label L; 2113 andi_(R0, new_top, MinObjAlignmentInBytesMask); 2114 beq(CCR0, L); 2115 stop("updated TLAB free is not properly aligned", 0x934); 2116 bind(L); 2117 } 2118 #endif // ASSERT 2119 2120 // update the tlab top pointer 2121 std(new_top, in_bytes(JavaThread::tlab_top_offset()), R16_thread); 2122 //verify_tlab(); not implemented 2123 } 2124 void MacroAssembler::tlab_refill(Label& retry_tlab, Label& try_eden, Label& slow_case) { 2125 unimplemented("tlab_refill"); 2126 } 2127 void MacroAssembler::incr_allocated_bytes(RegisterOrConstant size_in_bytes, Register t1, Register t2) { 2128 unimplemented("incr_allocated_bytes"); 2129 } 2130 2131 address MacroAssembler::emit_trampoline_stub(int destination_toc_offset, 2132 int insts_call_instruction_offset, Register Rtoc) { 2133 // Start the stub. 2134 address stub = start_a_stub(64); 2135 if (stub == NULL) { return NULL; } // CodeCache full: bail out 2136 2137 // Create a trampoline stub relocation which relates this trampoline stub 2138 // with the call instruction at insts_call_instruction_offset in the 2139 // instructions code-section. 2140 relocate(trampoline_stub_Relocation::spec(code()->insts()->start() + insts_call_instruction_offset)); 2141 const int stub_start_offset = offset(); 2142 2143 // For java_to_interp stubs we use R11_scratch1 as scratch register 2144 // and in call trampoline stubs we use R12_scratch2. This way we 2145 // can distinguish them (see is_NativeCallTrampolineStub_at()). 
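// Rough shape of the emitted trampoline (the exact load sequence depends on
// how large the TOC offset is, see ld_largeoffset_unchecked):
//   <load call target from constant pool at destination_toc_offset(Rtoc)>
//   mtctr  target
//   bctr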
2146 Register reg_scratch = R12_scratch2; 2147 2148 // Now, create the trampoline stub's code: 2149 // - load the TOC 2150 // - load the call target from the constant pool 2151 // - call 2152 if (Rtoc == noreg) { 2153 calculate_address_from_global_toc(reg_scratch, method_toc()); 2154 Rtoc = reg_scratch; 2155 } 2156 2157 ld_largeoffset_unchecked(reg_scratch, destination_toc_offset, Rtoc, false); 2158 mtctr(reg_scratch); 2159 bctr(); 2160 2161 const address stub_start_addr = addr_at(stub_start_offset); 2162 2163 // Assert that the encoded destination_toc_offset can be identified and that it is correct. 2164 assert(destination_toc_offset == NativeCallTrampolineStub_at(stub_start_addr)->destination_toc_offset(), 2165 "encoded offset into the constant pool must match"); 2166 // Trampoline_stub_size should be good. 2167 assert((uint)(offset() - stub_start_offset) <= trampoline_stub_size, "should be good size"); 2168 assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline"); 2169 2170 // End the stub. 2171 end_a_stub(); 2172 return stub; 2173 } 2174 2175 // TM on PPC64. 2176 void MacroAssembler::atomic_inc_ptr(Register addr, Register result, int simm16) { 2177 Label retry; 2178 bind(retry); 2179 ldarx(result, addr, /*hint*/ false); 2180 addi(result, result, simm16); 2181 stdcx_(result, addr); 2182 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 2183 bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0 2184 } else { 2185 bne( CCR0, retry); // stXcx_ sets CCR0 2186 } 2187 } 2188 2189 void MacroAssembler::atomic_ori_int(Register addr, Register result, int uimm16) { 2190 Label retry; 2191 bind(retry); 2192 lwarx(result, addr, /*hint*/ false); 2193 ori(result, result, uimm16); 2194 stwcx_(result, addr); 2195 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 2196 bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0 2197 } else { 2198 bne( CCR0, retry); // stXcx_ sets CCR0 2199 } 2200 } 2201 2202 #if INCLUDE_RTM_OPT 2203 2204 // Update rtm_counters based on abort status 2205 // input: abort_status 2206 // rtm_counters (RTMLockingCounters*) 2207 void MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters_Reg) { 2208 // Mapping to keep PreciseRTMLockingStatistics similar to x86. 2209 // x86 ppc (! means inverted, ? means not the same) 2210 // 0 31 Set if abort caused by XABORT instruction. 2211 // 1 ! 7 If set, the transaction may succeed on a retry. This bit is always clear if bit 0 is set. 2212 // 2 13 Set if another logical processor conflicted with a memory address that was part of the transaction that aborted. 2213 // 3 10 Set if an internal buffer overflowed. 2214 // 4 ?12 Set if a debug breakpoint was hit. 2215 // 5 ?32 Set if an abort occurred during execution of a nested transaction. 2216 const int tm_failure_bit[] = {Assembler::tm_tabort, // Note: Seems like signal handler sets this, too. 2217 Assembler::tm_failure_persistent, // inverted: transient 2218 Assembler::tm_trans_cf, 2219 Assembler::tm_footprint_of, 2220 Assembler::tm_non_trans_cf, 2221 Assembler::tm_suspended}; 2222 const bool tm_failure_inv[] = {false, true, false, false, false, false}; 2223 assert(sizeof(tm_failure_bit)/sizeof(int) == RTMLockingCounters::ABORT_STATUS_LIMIT, "adapt mapping!"); 2224 2225 const Register addr_Reg = R0; 2226 // Keep track of offset to where rtm_counters_Reg had pointed to. 
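// Note on the li/add pairs below: in addi, a "base" of R0 encodes the literal
// value zero rather than the register, so offsets involving R0 have to be
// materialized with li and added with add.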
2227 int counters_offs = RTMLockingCounters::abort_count_offset(); 2228 addi(addr_Reg, rtm_counters_Reg, counters_offs); 2229 const Register temp_Reg = rtm_counters_Reg; 2230 2231 //atomic_inc_ptr(addr_Reg, temp_Reg); We don't increment atomically 2232 ldx(temp_Reg, addr_Reg); 2233 addi(temp_Reg, temp_Reg, 1); 2234 stdx(temp_Reg, addr_Reg); 2235 2236 if (PrintPreciseRTMLockingStatistics) { 2237 int counters_offs_delta = RTMLockingCounters::abortX_count_offset() - counters_offs; 2238 2239 //mftexasr(abort_status); done by caller 2240 for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) { 2241 counters_offs += counters_offs_delta; 2242 li(temp_Reg, counters_offs_delta); // can't use addi with R0 2243 add(addr_Reg, addr_Reg, temp_Reg); // point to next counter 2244 counters_offs_delta = sizeof(uintx); 2245 2246 Label check_abort; 2247 rldicr_(temp_Reg, abort_status, tm_failure_bit[i], 0); 2248 if (tm_failure_inv[i]) { 2249 bne(CCR0, check_abort); 2250 } else { 2251 beq(CCR0, check_abort); 2252 } 2253 //atomic_inc_ptr(addr_Reg, temp_Reg); We don't increment atomically 2254 ldx(temp_Reg, addr_Reg); 2255 addi(temp_Reg, temp_Reg, 1); 2256 stdx(temp_Reg, addr_Reg); 2257 bind(check_abort); 2258 } 2259 } 2260 li(temp_Reg, -counters_offs); // can't use addi with R0 2261 add(rtm_counters_Reg, addr_Reg, temp_Reg); // restore 2262 } 2263 2264 // Branch if (random & (count-1) != 0), count is 2^n 2265 // tmp and CR0 are killed 2266 void MacroAssembler::branch_on_random_using_tb(Register tmp, int count, Label& brLabel) { 2267 mftb(tmp); 2268 andi_(tmp, tmp, count-1); 2269 bne(CCR0, brLabel); 2270 } 2271 2272 // Perform abort ratio calculation, set no_rtm bit if high ratio. 2273 // input: rtm_counters_Reg (RTMLockingCounters* address) - KILLED 2274 void MacroAssembler::rtm_abort_ratio_calculation(Register rtm_counters_Reg, 2275 RTMLockingCounters* rtm_counters, 2276 Metadata* method_data) { 2277 Label L_done, L_check_always_rtm1, L_check_always_rtm2; 2278 2279 if (RTMLockingCalculationDelay > 0) { 2280 // Delay calculation. 2281 ld(rtm_counters_Reg, (RegisterOrConstant)(intptr_t)RTMLockingCounters::rtm_calculation_flag_addr()); 2282 cmpdi(CCR0, rtm_counters_Reg, 0); 2283 beq(CCR0, L_done); 2284 load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload 2285 } 2286 // Abort ratio calculation only if abort_count > RTMAbortThreshold. 2287 // Aborted transactions = abort_count * 100 2288 // All transactions = total_count * RTMTotalCountIncrRate 2289 // Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio) 2290 ld(R0, RTMLockingCounters::abort_count_offset(), rtm_counters_Reg); 2291 cmpdi(CCR0, R0, RTMAbortThreshold); 2292 blt(CCR0, L_check_always_rtm2); 2293 mulli(R0, R0, 100); 2294 2295 const Register tmpReg = rtm_counters_Reg; 2296 ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg); 2297 mulli(tmpReg, tmpReg, RTMTotalCountIncrRate); 2298 mulli(tmpReg, tmpReg, RTMAbortRatio); 2299 cmpd(CCR0, R0, tmpReg); 2300 blt(CCR0, L_check_always_rtm1); // jump to reload 2301 if (method_data != NULL) { 2302 // Set rtm_state to "no rtm" in MDO. 2303 // Not using a metadata relocation. Method and Class Loader are kept alive anyway. 2304 // (See nmethod::metadata_do and CodeBuffer::finalize_oop_references.) 
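// (Sketch of the MDO update below: R0 receives the address of the rtm_state
// field and atomic_ori_int ORs the NoRTM bit into it with a lwarx/stwcx_
// loop; the OR is done atomically on the assumption that other threads may
// race on this field.)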
2305 load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg); 2306 atomic_ori_int(R0, tmpReg, NoRTM); 2307 } 2308 b(L_done); 2309 2310 bind(L_check_always_rtm1); 2311 load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload 2312 bind(L_check_always_rtm2); 2313 ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg); 2314 cmpdi(CCR0, tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate); 2315 blt(CCR0, L_done); 2316 if (method_data != NULL) { 2317 // Set rtm_state to "always rtm" in MDO. 2318 // Not using a metadata relocation. See above. 2319 load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg); 2320 atomic_ori_int(R0, tmpReg, UseRTM); 2321 } 2322 bind(L_done); 2323 } 2324 2325 // Update counters and perform abort ratio calculation. 2326 // input: abort_status_Reg 2327 void MacroAssembler::rtm_profiling(Register abort_status_Reg, Register temp_Reg, 2328 RTMLockingCounters* rtm_counters, 2329 Metadata* method_data, 2330 bool profile_rtm) { 2331 2332 assert(rtm_counters != NULL, "should not be NULL when profiling RTM"); 2333 // Update rtm counters based on state at abort. 2334 // Reads abort_status_Reg, updates flags. 2335 assert_different_registers(abort_status_Reg, temp_Reg); 2336 load_const_optimized(temp_Reg, (address)rtm_counters, R0); 2337 rtm_counters_update(abort_status_Reg, temp_Reg); 2338 if (profile_rtm) { 2339 assert(rtm_counters != NULL, "should not be NULL when profiling RTM"); 2340 rtm_abort_ratio_calculation(temp_Reg, rtm_counters, method_data); 2341 } 2342 } 2343 2344 // Retry on abort if abort's status indicates non-persistent failure. 2345 // inputs: retry_count_Reg 2346 // : abort_status_Reg 2347 // output: retry_count_Reg decremented by 1 2348 void MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, 2349 Label& retryLabel, Label* checkRetry) { 2350 Label doneRetry; 2351 rldicr_(R0, abort_status_Reg, tm_failure_persistent, 0); 2352 bne(CCR0, doneRetry); 2353 if (checkRetry) { bind(*checkRetry); } 2354 addic_(retry_count_Reg, retry_count_Reg, -1); 2355 blt(CCR0, doneRetry); 2356 smt_yield(); // Can't use wait(). No permission (SIGILL). 2357 b(retryLabel); 2358 bind(doneRetry); 2359 } 2360 2361 // Spin and retry if lock is busy. 2362 // inputs: box_Reg (monitor address) 2363 // : retry_count_Reg 2364 // output: retry_count_Reg decremented by 1 2365 // CTR is killed 2366 void MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register owner_addr_Reg, Label& retryLabel) { 2367 Label SpinLoop, doneRetry; 2368 addic_(retry_count_Reg, retry_count_Reg, -1); 2369 blt(CCR0, doneRetry); 2370 li(R0, RTMSpinLoopCount); 2371 mtctr(R0); 2372 2373 bind(SpinLoop); 2374 smt_yield(); // Can't use waitrsv(). No permission (SIGILL). 2375 bdz(retryLabel); 2376 ld(R0, 0, owner_addr_Reg); 2377 cmpdi(CCR0, R0, 0); 2378 bne(CCR0, SpinLoop); 2379 b(retryLabel); 2380 2381 bind(doneRetry); 2382 } 2383 2384 // Use RTM for normal stack locks. 
2385 // Input: objReg (object to lock) 2386 void MacroAssembler::rtm_stack_locking(ConditionRegister flag, 2387 Register obj, Register mark_word, Register tmp, 2388 Register retry_on_abort_count_Reg, 2389 RTMLockingCounters* stack_rtm_counters, 2390 Metadata* method_data, bool profile_rtm, 2391 Label& DONE_LABEL, Label& IsInflated) { 2392 assert(UseRTMForStackLocks, "why call this otherwise?"); 2393 assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking"); 2394 Label L_rtm_retry, L_decrement_retry, L_on_abort; 2395 2396 if (RTMRetryCount > 0) { 2397 load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort 2398 bind(L_rtm_retry); 2399 } 2400 andi_(R0, mark_word, markOopDesc::monitor_value); // inflated vs stack-locked|neutral|biased 2401 bne(CCR0, IsInflated); 2402 2403 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 2404 Label L_noincrement; 2405 if (RTMTotalCountIncrRate > 1) { 2406 branch_on_random_using_tb(tmp, (int)RTMTotalCountIncrRate, L_noincrement); 2407 } 2408 assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM"); 2409 load_const_optimized(tmp, (address)stack_rtm_counters->total_count_addr(), R0); 2410 //atomic_inc_ptr(tmp, /*temp, will be reloaded*/mark_word); We don't increment atomically 2411 ldx(mark_word, tmp); 2412 addi(mark_word, mark_word, 1); 2413 stdx(mark_word, tmp); 2414 bind(L_noincrement); 2415 } 2416 tbegin_(); 2417 beq(CCR0, L_on_abort); 2418 ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // Reload in transaction, conflicts need to be tracked. 2419 andi(R0, mark_word, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits 2420 cmpwi(flag, R0, markOopDesc::unlocked_value); // bits = 001 unlocked 2421 beq(flag, DONE_LABEL); // all done if unlocked 2422 2423 if (UseRTMXendForLockBusy) { 2424 tend_(); 2425 b(L_decrement_retry); 2426 } else { 2427 tabort_(); 2428 } 2429 bind(L_on_abort); 2430 const Register abort_status_Reg = tmp; 2431 mftexasr(abort_status_Reg); 2432 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 2433 rtm_profiling(abort_status_Reg, /*temp*/mark_word, stack_rtm_counters, method_data, profile_rtm); 2434 } 2435 ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // reload 2436 if (RTMRetryCount > 0) { 2437 // Retry on lock abort if abort status is not permanent. 2438 rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry, &L_decrement_retry); 2439 } else { 2440 bind(L_decrement_retry); 2441 } 2442 } 2443 2444 // Use RTM for inflating locks 2445 // inputs: obj (object to lock) 2446 // mark_word (current header - KILLED) 2447 // boxReg (on-stack box address (displaced header location) - KILLED) 2448 void MacroAssembler::rtm_inflated_locking(ConditionRegister flag, 2449 Register obj, Register mark_word, Register boxReg, 2450 Register retry_on_busy_count_Reg, Register retry_on_abort_count_Reg, 2451 RTMLockingCounters* rtm_counters, 2452 Metadata* method_data, bool profile_rtm, 2453 Label& DONE_LABEL) { 2454 assert(UseRTMLocking, "why call this otherwise?"); 2455 Label L_rtm_retry, L_decrement_retry, L_on_abort; 2456 // Clean monitor_value bit to get valid pointer. 2457 int owner_offset = ObjectMonitor::owner_offset_in_bytes() - markOopDesc::monitor_value; 2458 2459 // Store non-null, using boxReg instead of (intptr_t)markOopDesc::unused_mark(). 
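// (Any non-null value works for the displaced header here: the unlock path
// only compares the slot against zero, and zero means a recursive stack lock.)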
2460 std(boxReg, BasicLock::displaced_header_offset_in_bytes(), boxReg); 2461 const Register tmpReg = boxReg; 2462 const Register owner_addr_Reg = mark_word; 2463 addi(owner_addr_Reg, mark_word, owner_offset); 2464 2465 if (RTMRetryCount > 0) { 2466 load_const_optimized(retry_on_busy_count_Reg, RTMRetryCount); // Retry on lock busy. 2467 load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort. 2468 bind(L_rtm_retry); 2469 } 2470 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 2471 Label L_noincrement; 2472 if (RTMTotalCountIncrRate > 1) { 2473 branch_on_random_using_tb(R0, (int)RTMTotalCountIncrRate, L_noincrement); 2474 } 2475 assert(rtm_counters != NULL, "should not be NULL when profiling RTM"); 2476 load_const(R0, (address)rtm_counters->total_count_addr(), tmpReg); 2477 //atomic_inc_ptr(R0, tmpReg); We don't increment atomically 2478 ldx(tmpReg, R0); 2479 addi(tmpReg, tmpReg, 1); 2480 stdx(tmpReg, R0); 2481 bind(L_noincrement); 2482 } 2483 tbegin_(); 2484 beq(CCR0, L_on_abort); 2485 // We don't reload mark word. Will only be reset at safepoint. 2486 ld(R0, 0, owner_addr_Reg); // Load in transaction, conflicts need to be tracked. 2487 cmpdi(flag, R0, 0); 2488 beq(flag, DONE_LABEL); 2489 2490 if (UseRTMXendForLockBusy) { 2491 tend_(); 2492 b(L_decrement_retry); 2493 } else { 2494 tabort_(); 2495 } 2496 bind(L_on_abort); 2497 const Register abort_status_Reg = tmpReg; 2498 mftexasr(abort_status_Reg); 2499 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 2500 rtm_profiling(abort_status_Reg, /*temp*/ owner_addr_Reg, rtm_counters, method_data, profile_rtm); 2501 // Restore owner_addr_Reg 2502 ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); 2503 #ifdef ASSERT 2504 andi_(R0, mark_word, markOopDesc::monitor_value); 2505 asm_assert_ne("must be inflated", 0xa754); // Deflating only allowed at safepoint. 2506 #endif 2507 addi(owner_addr_Reg, mark_word, owner_offset); 2508 } 2509 if (RTMRetryCount > 0) { 2510 // Retry on lock abort if abort status is not permanent. 2511 rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry); 2512 } 2513 2514 // Appears unlocked - try to swing _owner from null to non-null. 2515 cmpxchgd(flag, /*current val*/ R0, (intptr_t)0, /*new val*/ R16_thread, owner_addr_Reg, 2516 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq, 2517 MacroAssembler::cmpxchgx_hint_acquire_lock(), noreg, &L_decrement_retry, true); 2518 2519 if (RTMRetryCount > 0) { 2520 // success done else retry 2521 b(DONE_LABEL); 2522 bind(L_decrement_retry); 2523 // Spin and retry if lock is busy. 2524 rtm_retry_lock_on_busy(retry_on_busy_count_Reg, owner_addr_Reg, L_rtm_retry); 2525 } else { 2526 bind(L_decrement_retry); 2527 } 2528 } 2529 2530 #endif // INCLUDE_RTM_OPT 2531 2532 // "The box" is the space on the stack where we copy the object mark. 2533 void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register oop, Register box, 2534 Register temp, Register displaced_header, Register current_header, 2535 bool try_bias, 2536 RTMLockingCounters* rtm_counters, 2537 RTMLockingCounters* stack_rtm_counters, 2538 Metadata* method_data, 2539 bool use_rtm, bool profile_rtm) { 2540 assert_different_registers(oop, box, temp, displaced_header, current_header); 2541 assert(flag != CCR0, "bad condition register"); 2542 Label cont; 2543 Label object_has_monitor; 2544 Label cas_failed; 2545 2546 // Load markOop from object into displaced_header. 
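// Overall fast-lock flow from here on (sketch):
//   mark = obj->mark;
//   if (mark & monitor_value) goto inflated;
//   box->displaced_header = mark | unlocked_value;
//   if (CAS(&obj->mark, mark | unlocked_value, box)) goto locked;
//   else: recursive if mark points into our own stack, otherwise runtime.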
2547 ld(displaced_header, oopDesc::mark_offset_in_bytes(), oop);
2548
2549
2550 // Always do locking in runtime.
2551 if (EmitSync & 0x01) {
2552 cmpdi(flag, oop, 0); // Oop can't be 0 here => always false.
2553 return;
2554 }
2555
2556 if (try_bias) {
2557 biased_locking_enter(flag, oop, displaced_header, temp, current_header, cont);
2558 }
2559
2560 #if INCLUDE_RTM_OPT
2561 if (UseRTMForStackLocks && use_rtm) {
2562 rtm_stack_locking(flag, oop, displaced_header, temp, /*temp*/ current_header,
2563 stack_rtm_counters, method_data, profile_rtm,
2564 cont, object_has_monitor);
2565 }
2566 #endif // INCLUDE_RTM_OPT
2567
2568 // Handle existing monitor.
2569 if ((EmitSync & 0x02) == 0) {
2570 // The object has an existing monitor iff (mark & monitor_value) != 0.
2571 andi_(temp, displaced_header, markOopDesc::monitor_value);
2572 bne(CCR0, object_has_monitor);
2573 }
2574
2575 // Set displaced_header to be (markOop of object | UNLOCK_VALUE).
2576 ori(displaced_header, displaced_header, markOopDesc::unlocked_value);
2577
2578 // displaced_header now holds the compare value for the CAS below.
2579
2580 // Initialize the box. (Must happen before we update the object mark!)
2581 std(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2582
2583 // Must fence; otherwise preceding store(s) may float below cmpxchg.
2584 // Compare object markOop with mark and if equal exchange scratch1 with object markOop.
2585 cmpxchgd(/*flag=*/flag,
2586 /*current_value=*/current_header,
2587 /*compare_value=*/displaced_header,
2588 /*exchange_value=*/box,
2589 /*where=*/oop,
2590 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2591 MacroAssembler::cmpxchgx_hint_acquire_lock(),
2592 noreg,
2593 &cas_failed,
2594 /*check without membar and ldarx first*/true);
2595 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2596
2597 // If the compare-and-exchange succeeded, then we found an unlocked
2598 // object and we have now locked it.
2599 b(cont);
2600
2601 bind(cas_failed);
2602 // We did not see an unlocked object so try the fast recursive case.
2603
2604 // Check if the owner is self by comparing the value in the markOop of object
2605 // (current_header) with the stack pointer.
2606 sub(current_header, current_header, R1_SP);
2607 load_const_optimized(temp, ~(os::vm_page_size()-1) | markOopDesc::lock_mask_in_place);
2608
2609 and_(R0/*==0?*/, current_header, temp);
2610 // If the condition is true (the masked difference is 0), the lock is already
2611 // ours, so store 0 as the displaced header in the box to mark a recursive lock.
2612 mcrf(flag,CCR0);
2613 std(R0/*==0, perhaps*/, BasicLock::displaced_header_offset_in_bytes(), box);
2614
2615 // Handle existing monitor.
2616 if ((EmitSync & 0x02) == 0) {
2617 b(cont);
2618
2619 bind(object_has_monitor);
2620 // The object's monitor m is unlocked iff m->owner == NULL,
2621 // otherwise m->owner may contain a thread or a stack address.
2622
2623 #if INCLUDE_RTM_OPT
2624 // Use the same RTM locking code in 32- and 64-bit VM.
2625 if (use_rtm) {
2626 rtm_inflated_locking(flag, oop, displaced_header, box, temp, /*temp*/ current_header,
2627 rtm_counters, method_data, profile_rtm, cont);
2628 } else {
2629 #endif // INCLUDE_RTM_OPT
2630
2631 // Try to CAS m->owner from NULL to current thread.
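// (Sketch: temp = tagged_monitor + owner_offset - monitor_value, i.e. the
// untagged address of ObjectMonitor::_owner; the CAS below then attempts
// NULL -> current thread on that word.)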
2632 addi(temp, displaced_header, ObjectMonitor::owner_offset_in_bytes()-markOopDesc::monitor_value);
2633 cmpxchgd(/*flag=*/flag,
2634 /*current_value=*/current_header,
2635 /*compare_value=*/(intptr_t)0,
2636 /*exchange_value=*/R16_thread,
2637 /*where=*/temp,
2638 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2639 MacroAssembler::cmpxchgx_hint_acquire_lock());
2640
2641 // Store a non-null value into the box.
2642 std(box, BasicLock::displaced_header_offset_in_bytes(), box);
2643
2644 # ifdef ASSERT
2645 bne(flag, cont);
2646 // We have acquired the monitor, check some invariants.
2647 addi(/*monitor=*/temp, temp, -ObjectMonitor::owner_offset_in_bytes());
2648 // Invariant 1: _recursions should be 0.
2649 //assert(ObjectMonitor::recursions_size_in_bytes() == 8, "unexpected size");
2650 asm_assert_mem8_is_zero(ObjectMonitor::recursions_offset_in_bytes(), temp,
2651 "monitor->_recursions should be 0", -1);
2652 // Invariant 2: OwnerIsThread shouldn't be 0.
2653 //assert(ObjectMonitor::OwnerIsThread_size_in_bytes() == 4, "unexpected size");
2654 //asm_assert_mem4_isnot_zero(ObjectMonitor::OwnerIsThread_offset_in_bytes(), temp,
2655 // "monitor->OwnerIsThread shouldn't be 0", -1);
2656 # endif
2657
2658 #if INCLUDE_RTM_OPT
2659 } // use_rtm()
2660 #endif
2661 }
2662
2663 bind(cont);
2664 // flag == EQ indicates success
2665 // flag == NE indicates failure
2666 }
2667
2668 void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register oop, Register box,
2669 Register temp, Register displaced_header, Register current_header,
2670 bool try_bias, bool use_rtm) {
2671 assert_different_registers(oop, box, temp, displaced_header, current_header);
2672 assert(flag != CCR0, "bad condition register");
2673 Label cont;
2674 Label object_has_monitor;
2675
2676 // Always do locking in runtime.
2677 if (EmitSync & 0x01) {
2678 cmpdi(flag, oop, 0); // Oop can't be 0 here => always false.
2679 return;
2680 }
2681
2682 if (try_bias) {
2683 biased_locking_exit(flag, oop, current_header, cont);
2684 }
2685
2686 #if INCLUDE_RTM_OPT
2687 if (UseRTMForStackLocks && use_rtm) {
2688 assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
2689 Label L_regular_unlock;
2690 ld(current_header, oopDesc::mark_offset_in_bytes(), oop); // fetch markword
2691 andi(R0, current_header, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
2692 cmpwi(flag, R0, markOopDesc::unlocked_value); // bits = 001 unlocked
2693 bne(flag, L_regular_unlock); // else RegularLock
2694 tend_(); // otherwise end...
2695 b(cont); // ... and we're done
2696 bind(L_regular_unlock);
2697 }
2698 #endif
2699
2700 // Find the lock address and load the displaced header from the stack.
2701 ld(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2702
2703 // If the displaced header is 0, we have a recursive unlock.
2704 cmpdi(flag, displaced_header, 0);
2705 beq(flag, cont);
2706
2707 // Handle existing monitor.
2708 if ((EmitSync & 0x02) == 0) {
2709 // The object has an existing monitor iff (mark & monitor_value) != 0.
2710 RTM_OPT_ONLY( if (!(UseRTMForStackLocks && use_rtm)) ) // skip load if already done
2711 ld(current_header, oopDesc::mark_offset_in_bytes(), oop);
2712 andi_(R0, current_header, markOopDesc::monitor_value);
2713 bne(CCR0, object_has_monitor);
2714 }
2715
2716 // Check if it is still a lightweight lock; this is true if we see
2717 // the stack address of the basicLock in the markOop of the object.
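// (Sketch: if (CAS(&obj->mark, box, displaced_header)) the lightweight lock
// has been released; on failure the mark has changed, e.g. the lock was
// inflated in the meantime, and we branch to cont with flag == NE.)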
2718 // Cmpxchg sets flag to cmpd(current_header, box). 2719 cmpxchgd(/*flag=*/flag, 2720 /*current_value=*/current_header, 2721 /*compare_value=*/box, 2722 /*exchange_value=*/displaced_header, 2723 /*where=*/oop, 2724 MacroAssembler::MemBarRel, 2725 MacroAssembler::cmpxchgx_hint_release_lock(), 2726 noreg, 2727 &cont); 2728 2729 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 2730 2731 // Handle existing monitor. 2732 if ((EmitSync & 0x02) == 0) { 2733 b(cont); 2734 2735 bind(object_has_monitor); 2736 addi(current_header, current_header, -markOopDesc::monitor_value); // monitor 2737 ld(temp, ObjectMonitor::owner_offset_in_bytes(), current_header); 2738 2739 // It's inflated. 2740 #if INCLUDE_RTM_OPT 2741 if (use_rtm) { 2742 Label L_regular_inflated_unlock; 2743 // Clean monitor_value bit to get valid pointer 2744 cmpdi(flag, temp, 0); 2745 bne(flag, L_regular_inflated_unlock); 2746 tend_(); 2747 b(cont); 2748 bind(L_regular_inflated_unlock); 2749 } 2750 #endif 2751 2752 ld(displaced_header, ObjectMonitor::recursions_offset_in_bytes(), current_header); 2753 xorr(temp, R16_thread, temp); // Will be 0 if we are the owner. 2754 orr(temp, temp, displaced_header); // Will be 0 if there are 0 recursions. 2755 cmpdi(flag, temp, 0); 2756 bne(flag, cont); 2757 2758 ld(temp, ObjectMonitor::EntryList_offset_in_bytes(), current_header); 2759 ld(displaced_header, ObjectMonitor::cxq_offset_in_bytes(), current_header); 2760 orr(temp, temp, displaced_header); // Will be 0 if both are 0. 2761 cmpdi(flag, temp, 0); 2762 bne(flag, cont); 2763 release(); 2764 std(temp, ObjectMonitor::owner_offset_in_bytes(), current_header); 2765 } 2766 2767 bind(cont); 2768 // flag == EQ indicates success 2769 // flag == NE indicates failure 2770 } 2771 2772 // Write serialization page so VM thread can do a pseudo remote membar. 2773 // We use the current thread pointer to calculate a thread specific 2774 // offset to write to within the page. This minimizes bus traffic 2775 // due to cache line collision. 2776 void MacroAssembler::serialize_memory(Register thread, Register tmp1, Register tmp2) { 2777 srdi(tmp2, thread, os::get_serialize_page_shift_count()); 2778 2779 int mask = os::vm_page_size() - sizeof(int); 2780 if (Assembler::is_simm(mask, 16)) { 2781 andi(tmp2, tmp2, mask); 2782 } else { 2783 lis(tmp1, (int)((signed short) (mask >> 16))); 2784 ori(tmp1, tmp1, mask & 0x0000ffff); 2785 andr(tmp2, tmp2, tmp1); 2786 } 2787 2788 load_const(tmp1, (long) os::get_memory_serialize_page()); 2789 release(); 2790 stwx(R0, tmp1, tmp2); 2791 } 2792 2793 2794 // GC barrier helper macros 2795 2796 // Write the card table byte if needed. 2797 void MacroAssembler::card_write_barrier_post(Register Rstore_addr, Register Rnew_val, Register Rtmp) { 2798 CardTableModRefBS* bs = 2799 barrier_set_cast<CardTableModRefBS>(Universe::heap()->barrier_set()); 2800 assert(bs->kind() == BarrierSet::CardTableForRS || 2801 bs->kind() == BarrierSet::CardTableExtension, "wrong barrier"); 2802 #ifdef ASSERT 2803 cmpdi(CCR0, Rnew_val, 0); 2804 asm_assert_ne("null oop not allowed", 0x321); 2805 #endif 2806 card_table_write(bs->byte_map_base, Rtmp, Rstore_addr); 2807 } 2808 2809 // Write the card table byte. 
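// Equivalent C sketch of card_table_write below:
//   byte_map_base[(uintptr_t)obj >> card_shift] = 0;   // 0 means dirty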
2810 void MacroAssembler::card_table_write(jbyte* byte_map_base, Register Rtmp, Register Robj) {
2811 assert_different_registers(Robj, Rtmp, R0);
2812 load_const_optimized(Rtmp, (address)byte_map_base, R0);
2813 srdi(Robj, Robj, CardTableModRefBS::card_shift);
2814 li(R0, 0); // dirty
2815 if (UseConcMarkSweepGC) membar(Assembler::StoreStore);
2816 stbx(R0, Rtmp, Robj);
2817 }
2818
2819 #if INCLUDE_ALL_GCS
2820 // General G1 pre-barrier generator.
2821 // Goal: record the previous value if it is not null.
2822 void MacroAssembler::g1_write_barrier_pre(Register Robj, RegisterOrConstant offset, Register Rpre_val,
2823 Register Rtmp1, Register Rtmp2, bool needs_frame) {
2824 Label runtime, filtered;
2825
2826 // Is marking active?
2827 if (in_bytes(SATBMarkQueue::byte_width_of_active()) == 4) {
2828 lwz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_active()), R16_thread);
2829 } else {
2830 guarantee(in_bytes(SATBMarkQueue::byte_width_of_active()) == 1, "Assumption");
2831 lbz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_active()), R16_thread);
2832 }
2833 cmpdi(CCR0, Rtmp1, 0);
2834 beq(CCR0, filtered);
2835
2836 // Do we need to load the previous value?
2837 if (Robj != noreg) {
2838 // Load the previous value...
2839 if (UseCompressedOops) {
2840 lwz(Rpre_val, offset, Robj);
2841 } else {
2842 ld(Rpre_val, offset, Robj);
2843 }
2844 // Previous value has been loaded into Rpre_val.
2845 }
2846 assert(Rpre_val != noreg, "must have a real register");
2847
2848 if (Robj != noreg && UseCompressedOops) {
2849 // Is the previous value null?
2850 cmpdi(CCR0, Rpre_val, 0);
2851 beq(CCR0, filtered);
2852
2853 decode_heap_oop_not_null(Rpre_val);
2854 }
2855
2856 // OK, it's not filtered, so we'll need to enqueue the previous value:
2857 // try to record it in the thread-local SATB buffer; if the buffer is
2858 // full (index == 0) we must call into the runtime instead, which may
2859 // require setting up a frame (see needs_frame below).
2860
2861 // Can we store original value in the thread's buffer?
2862 // Is index == 0?
2863 // (The index field is typed as size_t.)
2864 const Register Rbuffer = Rtmp1, Rindex = Rtmp2;
2865
2866 ld(Rindex, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_index()), R16_thread);
2867 cmpdi(CCR0, Rindex, 0);
2868 beq(CCR0, runtime); // If index == 0, goto runtime.
2869 ld(Rbuffer, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_buf()), R16_thread);
2870
2871 addi(Rindex, Rindex, -wordSize); // Decrement index.
2872 std(Rindex, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_index()), R16_thread);
2873
2874 // Record the previous value.
2875 stdx(Rpre_val, Rbuffer, Rindex);
2876 b(filtered);
2877
2878 bind(runtime);
2879
2880 // The VM call needs a proper ABI frame (and saved LR/CR) unless the caller already set one up.
2881 if (needs_frame) {
2882 save_LR_CR(Rtmp1);
2883 push_frame_reg_args(0, Rtmp2);
2884 }
2885
2886 if (Rpre_val->is_volatile() && Robj == noreg) mr(R31, Rpre_val); // Save pre_val across C call if it was preloaded.
2887 call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), Rpre_val, R16_thread);
2888 if (Rpre_val->is_volatile() && Robj == noreg) mr(Rpre_val, R31); // restore
2889
2890 if (needs_frame) {
2891 pop_frame();
2892 restore_LR_CR(Rtmp1);
2893 }
2894
2895 bind(filtered);
2896 }
2897
2898 // General G1 post-barrier generator.
2899 // Store cross-region card.
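// Rough shape of the post-barrier below (sketch):
//   if (((store_addr ^ new_val) >> LogOfHRGrainBytes) == 0) return;  // same region
//   card = &byte_map_base[store_addr >> card_shift];
//   if (*card == g1_young_card_val) return;
//   StoreLoad; if (*card == dirty_card_val) return;
//   *card = dirty_card_val; enqueue(card);  // runtime call if queue is full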
2900 void MacroAssembler::g1_write_barrier_post(Register Rstore_addr, Register Rnew_val, Register Rtmp1, Register Rtmp2, Register Rtmp3, Label *filtered_ext) {
2901 Label runtime, filtered_int;
2902 Label& filtered = (filtered_ext != NULL) ? *filtered_ext : filtered_int;
2903 assert_different_registers(Rstore_addr, Rnew_val, Rtmp1, Rtmp2);
2904
2905 G1SATBCardTableLoggingModRefBS* bs =
2906 barrier_set_cast<G1SATBCardTableLoggingModRefBS>(Universe::heap()->barrier_set());
2907
2908 // Does the store cross heap regions?
2909 if (G1RSBarrierRegionFilter) {
2910 xorr(Rtmp1, Rstore_addr, Rnew_val);
2911 srdi_(Rtmp1, Rtmp1, HeapRegion::LogOfHRGrainBytes);
2912 beq(CCR0, filtered);
2913 }
2914
2915 // Crosses regions, storing NULL?
2916 #ifdef ASSERT
2917 cmpdi(CCR0, Rnew_val, 0);
2918 asm_assert_ne("null oop not allowed (G1)", 0x322); // Checked by caller on PPC64, so following branch is obsolete:
2919 //beq(CCR0, filtered);
2920 #endif
2921
2922 // Storing region crossing non-NULL, is card already dirty?
2923 assert(sizeof(*bs->byte_map_base) == sizeof(jbyte), "adjust this code");
2924 const Register Rcard_addr = Rtmp1;
2925 Register Rbase = Rtmp2;
2926 load_const_optimized(Rbase, (address)bs->byte_map_base, /*temp*/ Rtmp3);
2927
2928 srdi(Rcard_addr, Rstore_addr, CardTableModRefBS::card_shift);
2929
2930 // Load the card value.
2931 lbzx(/*card value*/ Rtmp3, Rbase, Rcard_addr);
2932 cmpwi(CCR0, Rtmp3, (int)G1SATBCardTableModRefBS::g1_young_card_val());
2933 beq(CCR0, filtered);
2934
2935 membar(Assembler::StoreLoad);
2936 lbzx(/*card value*/ Rtmp3, Rbase, Rcard_addr); // Reload after membar.
2937 cmpwi(CCR0, Rtmp3 /* card value */, CardTableModRefBS::dirty_card_val());
2938 beq(CCR0, filtered);
2939
2940 // Storing a region crossing, non-NULL oop, card is clean.
2941 // Dirty card and log.
2942 li(Rtmp3, CardTableModRefBS::dirty_card_val());
2943 //release(); // G1: oops are allowed to get visible after dirty marking.
2944 stbx(Rtmp3, Rbase, Rcard_addr);
2945
2946 add(Rcard_addr, Rbase, Rcard_addr); // This is the address which needs to get enqueued.
2947 Rbase = noreg; // end of lifetime
2948
2949 const Register Rqueue_index = Rtmp2,
2950 Rqueue_buf = Rtmp3;
2951 ld(Rqueue_index, in_bytes(JavaThread::dirty_card_queue_offset() + DirtyCardQueue::byte_offset_of_index()), R16_thread);
2952 cmpdi(CCR0, Rqueue_index, 0);
2953 beq(CCR0, runtime); // If index == 0, jump to runtime.
2954 ld(Rqueue_buf, in_bytes(JavaThread::dirty_card_queue_offset() + DirtyCardQueue::byte_offset_of_buf()), R16_thread);
2955
2956 addi(Rqueue_index, Rqueue_index, -wordSize); // decrement index
2957 std(Rqueue_index, in_bytes(JavaThread::dirty_card_queue_offset() + DirtyCardQueue::byte_offset_of_index()), R16_thread);
2958
2959 stdx(Rcard_addr, Rqueue_buf, Rqueue_index); // store card
2960 b(filtered);
2961
2962 bind(runtime);
2963
2964 // Save the live input values.
2965 call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), Rcard_addr, R16_thread);
2966
2967 bind(filtered_int);
2968 }
2969 #endif // INCLUDE_ALL_GCS
2970
2971 // Values for last_Java_pc and last_Java_sp must comply with the rules
2972 // in frame_ppc.hpp.
2973 void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) {
2974 // Always set last_Java_pc and flags first because once last_Java_sp
2975 // is visible, has_last_Java_frame is true and users will look at the
2976 // rest of the fields. (Note: flags should always be zero before we
2977 // get here, so it doesn't need to be set.)
2978 2979 // Verify that last_Java_pc was zeroed on return to Java 2980 asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread, 2981 "last_Java_pc not zeroed before leaving Java", 0x200); 2982 2983 // When returning from calling out from Java mode the frame anchor's 2984 // last_Java_pc will always be set to NULL. It is set here so that 2985 // if we are doing a call to native (not VM) that we capture the 2986 // known pc and don't have to rely on the native call having a 2987 // standard frame linkage where we can find the pc. 2988 if (last_Java_pc != noreg) 2989 std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread); 2990 2991 // Set last_Java_sp last. 2992 std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread); 2993 } 2994 2995 void MacroAssembler::reset_last_Java_frame(void) { 2996 asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()), 2997 R16_thread, "SP was not set, still zero", 0x202); 2998 2999 BLOCK_COMMENT("reset_last_Java_frame {"); 3000 li(R0, 0); 3001 3002 // _last_Java_sp = 0 3003 std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread); 3004 3005 // _last_Java_pc = 0 3006 std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread); 3007 BLOCK_COMMENT("} reset_last_Java_frame"); 3008 } 3009 3010 void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1) { 3011 assert_different_registers(sp, tmp1); 3012 3013 // sp points to a TOP_IJAVA_FRAME, retrieve frame's PC via 3014 // TOP_IJAVA_FRAME_ABI. 3015 // FIXME: assert that we really have a TOP_IJAVA_FRAME here! 3016 address entry = pc(); 3017 load_const_optimized(tmp1, entry); 3018 3019 set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1); 3020 } 3021 3022 void MacroAssembler::get_vm_result(Register oop_result) { 3023 // Read: 3024 // R16_thread 3025 // R16_thread->in_bytes(JavaThread::vm_result_offset()) 3026 // 3027 // Updated: 3028 // oop_result 3029 // R16_thread->in_bytes(JavaThread::vm_result_offset()) 3030 3031 verify_thread(); 3032 3033 ld(oop_result, in_bytes(JavaThread::vm_result_offset()), R16_thread); 3034 li(R0, 0); 3035 std(R0, in_bytes(JavaThread::vm_result_offset()), R16_thread); 3036 3037 verify_oop(oop_result); 3038 } 3039 3040 void MacroAssembler::get_vm_result_2(Register metadata_result) { 3041 // Read: 3042 // R16_thread 3043 // R16_thread->in_bytes(JavaThread::vm_result_2_offset()) 3044 // 3045 // Updated: 3046 // metadata_result 3047 // R16_thread->in_bytes(JavaThread::vm_result_2_offset()) 3048 3049 ld(metadata_result, in_bytes(JavaThread::vm_result_2_offset()), R16_thread); 3050 li(R0, 0); 3051 std(R0, in_bytes(JavaThread::vm_result_2_offset()), R16_thread); 3052 } 3053 3054 Register MacroAssembler::encode_klass_not_null(Register dst, Register src) { 3055 Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided. 3056 if (Universe::narrow_klass_base() != 0) { 3057 // Use dst as temp if it is free. 
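// Encoding sketch: compressed = (klass - narrow_klass_base) >> narrow_klass_shift,
// where either step is omitted when the base or the shift is zero.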
    sub_const_optimized(dst, current, Universe::narrow_klass_base(), R0);
    current = dst;
  }
  if (Universe::narrow_klass_shift() != 0) {
    srdi(dst, current, Universe::narrow_klass_shift());
    current = dst;
  }
  return current;
}

void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) {
  if (UseCompressedClassPointers) {
    Register compressedKlass = encode_klass_not_null(ck, klass);
    stw(compressedKlass, oopDesc::klass_offset_in_bytes(), dst_oop);
  } else {
    std(klass, oopDesc::klass_offset_in_bytes(), dst_oop);
  }
}

void MacroAssembler::store_klass_gap(Register dst_oop, Register val) {
  if (UseCompressedClassPointers) {
    if (val == noreg) {
      val = R0;
      li(val, 0);
    }
    stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop); // klass gap if compressed
  }
}

int MacroAssembler::instr_size_for_decode_klass_not_null() {
  if (!UseCompressedClassPointers) return 0;
  int num_instrs = 1;  // shift or move
  if (Universe::narrow_klass_base() != 0) num_instrs = 7;  // shift + load const + add
  return num_instrs * BytesPerInstWord;
}

void MacroAssembler::decode_klass_not_null(Register dst, Register src) {
  assert(dst != R0, "Dst reg may not be R0, as R0 is used here.");
  if (src == noreg) src = dst;
  Register shifted_src = src;
  if (Universe::narrow_klass_shift() != 0 ||
      (Universe::narrow_klass_base() == 0 && src != dst)) {  // Move required.
    shifted_src = dst;
    sldi(shifted_src, src, Universe::narrow_klass_shift());
  }
  if (Universe::narrow_klass_base() != 0) {
    add_const_optimized(dst, shifted_src, Universe::narrow_klass_base(), R0);
  }
}

void MacroAssembler::load_klass(Register dst, Register src) {
  if (UseCompressedClassPointers) {
    lwz(dst, oopDesc::klass_offset_in_bytes(), src);
    // Attention: no null check here!
    decode_klass_not_null(dst, dst);
  } else {
    ld(dst, oopDesc::klass_offset_in_bytes(), src);
  }
}

// Clear Array
// Kills both input registers. tmp == R0 is allowed.
void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp) {
  // Procedure for large arrays (uses data cache block zero instruction).
  Label startloop, fast, fastloop, small_rest, restloop, done;
  const int cl_size         = VM_Version::L1_data_cache_line_size(),
            cl_dwords       = cl_size >> 3,
            cl_dw_addr_bits = exact_log2(cl_dwords),
            dcbz_min        = 1;  // Min count of dcbz executions, needs to be > 0.

  //2:
  cmpdi(CCR1, cnt_dwords, ((dcbz_min + 1) << cl_dw_addr_bits) - 1); // Big enough? (ensure >= dcbz_min lines included).
  blt(CCR1, small_rest);                              // Too small.
  rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits);   // Extract dword offset within first cache line.
  beq(CCR0, fast);                                    // Already 128-byte aligned.

  subfic(tmp, tmp, cl_dwords);
  mtctr(tmp);                        // Set ctr to hit 128-byte boundary (0 < ctr < cl_dwords).
  subf(cnt_dwords, tmp, cnt_dwords); // rest.
  li(tmp, 0);
  //10:
  bind(startloop);                   // Clear at the beginning to reach 128-byte boundary.
  std(tmp, 0, base_ptr);             // Clear 8-byte aligned block.
  addi(base_ptr, base_ptr, 8);
  bdnz(startloop);
  //13:
  bind(fast);                                  // Clear 128-byte blocks.
  srdi(tmp, cnt_dwords, cl_dw_addr_bits);      // Loop count for 128-byte loop (> 0).
  andi(cnt_dwords, cnt_dwords, cl_dwords - 1); // Rest in dwords.
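  // Each dcbz in the loop below zeroes one full L1 data cache line
  // (cl_size bytes) with a single instruction, which is why base_ptr was
  // first advanced to a cache-line boundary in startloop above.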
  mtctr(tmp);                        // Load counter.
  //16:
  bind(fastloop);
  dcbz(base_ptr);                    // Clear 128-byte aligned block.
  addi(base_ptr, base_ptr, cl_size);
  bdnz(fastloop);
  if (InsertEndGroupPPC64) { endgroup(); } else { nop(); }
  //20:
  bind(small_rest);
  cmpdi(CCR0, cnt_dwords, 0);        // size 0?
  beq(CCR0, done);                   // rest == 0
  li(tmp, 0);
  mtctr(cnt_dwords);                 // Load counter.
  //24:
  bind(restloop);                    // Clear rest.
  std(tmp, 0, base_ptr);             // Clear 8-byte aligned block.
  addi(base_ptr, base_ptr, 8);
  bdnz(restloop);
  //27:
  bind(done);
}

/////////////////////////////////////////// String intrinsics ////////////////////////////////////////////

// Search for a single jchar in a jchar[].
//
// Assumes that result differs from all other registers.
//
// Haystack, needle are the addresses of jchar-arrays.
// NeedleChar is needle[0] if it is known at compile time.
// Haycnt is the length of the haystack. We assume haycnt >= 1.
//
// Preserves haystack and haycnt, kills all other registers.
//
// If needle == R0, we search for the constant needleChar.
void MacroAssembler::string_indexof_1(Register result, Register haystack, Register haycnt,
                                      Register needle, jchar needleChar,
                                      Register tmp1, Register tmp2) {

  assert_different_registers(result, haystack, haycnt, needle, tmp1, tmp2);

  Label L_InnerLoop, L_FinalCheck, L_Found1, L_Found2, L_Found3, L_NotFound, L_End;
  Register needle0 = needle, // Contains needle[0].
           addr    = tmp1,
           ch1     = tmp2,
           ch2     = R0;

  //2 (variable) or 3 (const):
  if (needle != R0) lhz(needle0, 0, needle); // Preload needle character, needle has len==1.
  dcbtct(haystack, 0x00);                    // Indicate R/O access to haystack.

  srwi_(tmp2, haycnt, 1);   // Shift right by exact_log2(UNROLL_FACTOR).
  mr(addr, haystack);
  beq(CCR0, L_FinalCheck);
  mtctr(tmp2);              // Move to count register.
  //8:
  bind(L_InnerLoop);        // Main work horse (2x unrolled search loop).
  lhz(ch1, 0, addr);        // Load characters from haystack.
  lhz(ch2, 2, addr);
  (needle != R0) ? cmpw(CCR0, ch1, needle0) : cmplwi(CCR0, ch1, needleChar);
  (needle != R0) ? cmpw(CCR1, ch2, needle0) : cmplwi(CCR1, ch2, needleChar);
  beq(CCR0, L_Found1);      // Did we find the needle?
  beq(CCR1, L_Found2);
  addi(addr, addr, 4);
  bdnz(L_InnerLoop);
  //16:
  bind(L_FinalCheck);
  andi_(R0, haycnt, 1);
  beq(CCR0, L_NotFound);
  lhz(ch1, 0, addr);        // One position left at which we have to compare.
  (needle != R0) ? cmpw(CCR1, ch1, needle0) : cmplwi(CCR1, ch1, needleChar);
  beq(CCR1, L_Found3);
  //21:
  bind(L_NotFound);
  li(result, -1);           // Not found.
  b(L_End);

  bind(L_Found2);
  addi(addr, addr, 2);
  //24:
  bind(L_Found1);
  bind(L_Found3);             // Return index ...
  subf(addr, haystack, addr); // relative to haystack,
  srdi(result, addr, 1);      // in characters.
  bind(L_End);
}


// Implementation of IndexOf for jchar arrays.
//
// The lengths of haystack and needle are not constant, i.e. they are passed in registers.
//
// Preserves registers haystack, needle.
// Kills registers haycnt, needlecnt.
// Assumes that result differs from all other registers.
// Haystack, needle are the addresses of jchar-arrays.
// Haycnt, needlecnt are their respective lengths.
//
// Needlecntval must be zero or a 15-bit unsigned immediate > 1.
void MacroAssembler::string_indexof(Register result, Register haystack, Register haycnt,
                                    Register needle, ciTypeArray* needle_values, Register needlecnt, int needlecntval,
                                    Register tmp1, Register tmp2, Register tmp3, Register tmp4) {

  // Ensure 0 < needlecnt <= haycnt in ideal graph as prerequisite!
  Label L_TooShort, L_Found, L_NotFound, L_End;
  Register last_addr = haycnt, // Kill haycnt at the beginning.
           addr      = tmp1,
           n_start   = tmp2,
           ch1       = tmp3,
           ch2       = R0;

  // **************************************************************************************************
  // Prepare for main loop: optimized for needle count >= 2, bail out otherwise.
  // **************************************************************************************************

  //1 (variable) or 3 (const):
  dcbtct(needle, 0x00);    // Indicate R/O access to needle.
  dcbtct(haystack, 0x00);  // Indicate R/O access to haystack.

  // Compute last haystack addr to use if no match gets found.
  if (needlecntval == 0) { // variable needlecnt
    //3:
    subf(ch1, needlecnt, haycnt);   // Last character index to compare is haycnt-needlecnt.
    addi(addr, haystack, -2);       // Accesses use pre-increment.
    cmpwi(CCR6, needlecnt, 2);
    blt(CCR6, L_TooShort);          // Variable needlecnt: handle short needle separately.
    slwi(ch1, ch1, 1);              // Scale to number of bytes.
    lwz(n_start, 0, needle);        // Load first 2 characters of needle.
    add(last_addr, haystack, ch1);  // Point to last address to compare (haystack+2*(haycnt-needlecnt)).
    addi(needlecnt, needlecnt, -2); // Rest of needle.
  } else { // constant needlecnt
    guarantee(needlecntval != 1, "IndexOf with single-character needle must be handled separately");
    assert((needlecntval & 0x7fff) == needlecntval, "wrong immediate");
    //5:
    addi(ch1, haycnt, -needlecntval); // Last character index to compare is haycnt-needlecnt.
    lwz(n_start, 0, needle);          // Load first 2 characters of needle.
    addi(addr, haystack, -2);         // Accesses use pre-increment.
    slwi(ch1, ch1, 1);                // Scale to number of bytes.
    add(last_addr, haystack, ch1);    // Point to last address to compare (haystack+2*(haycnt-needlecnt)).
    li(needlecnt, needlecntval - 2);  // Rest of needle.
  }

  // Main Loop (now we have at least 3 characters).
  //11:
  Label L_OuterLoop, L_InnerLoop, L_FinalCheck, L_Comp1, L_Comp2, L_Comp3;
  bind(L_OuterLoop); // Search for 1st 2 characters.
  Register addr_diff = tmp4;
  subf(addr_diff, addr, last_addr); // Difference between already checked address and last address to check.
  addi(addr, addr, 2);              // This is the new address we want to use for comparing.
  srdi_(ch2, addr_diff, 2);
  beq(CCR0, L_FinalCheck);          // 2 characters left?
  mtctr(ch2);                       // addr_diff/4
  //16:
  bind(L_InnerLoop);                // Main work horse (2x unrolled search loop)
  lwz(ch1, 0, addr);                // Load 2 characters of haystack (ignore alignment).
  lwz(ch2, 2, addr);
  cmpw(CCR0, ch1, n_start);         // Compare 2 characters (1 would be sufficient but try to reduce branches to CompLoop).
  cmpw(CCR1, ch2, n_start);
  beq(CCR0, L_Comp1);               // Did we find the needle start?
  beq(CCR1, L_Comp2);
  addi(addr, addr, 4);
  bdnz(L_InnerLoop);
  //24:
  bind(L_FinalCheck);
  rldicl_(addr_diff, addr_diff, 64-1, 63); // Remaining characters not covered by InnerLoop: (addr_diff>>1)&1.
  beq(CCR0, L_NotFound);
  lwz(ch1, 0, addr);                       // One position left at which we have to compare.
  cmpw(CCR1, ch1, n_start);
  beq(CCR1, L_Comp3);
  //29:
  bind(L_NotFound);
  li(result, -1); // not found
  b(L_End);


  // **************************************************************************************************
  // Special Case: unfortunately, the variable needle case can be called with needlecnt<2
  // **************************************************************************************************
  //31:
  if ((needlecntval >> 1) != 1) { // Const needlecnt is 2 or 3? Reduce code size.
    int nopcnt = 5;
    if (needlecntval != 0) ++nopcnt; // Balance alignment (other case: see below).
    if (needlecntval == 0) {         // We have to handle these cases separately.
      Label L_OneCharLoop;
      bind(L_TooShort);
      mtctr(haycnt);
      lhz(n_start, 0, needle);       // First character of needle
      bind(L_OneCharLoop);
      lhzu(ch1, 2, addr);
      cmpw(CCR1, ch1, n_start);
      beq(CCR1, L_Found);            // Did we find the one character needle?
      bdnz(L_OneCharLoop);
      li(result, -1);                // Not found.
      b(L_End);
    } // 8 instructions, so no impact on alignment.
    for (int x = 0; x < nopcnt; ++x) nop();
  }

  // **************************************************************************************************
  // Regular Case Part II: compare rest of needle (first 2 characters have been compared already)
  // **************************************************************************************************

  // Compare the rest
  //36 if needlecntval==0, else 37:
  bind(L_Comp2);
  addi(addr, addr, 2); // First comparison has failed, 2nd one hit.
  bind(L_Comp1);       // Addr points to possible needle start.
  bind(L_Comp3);       // Could have created a copy and use a different return address but saving code size here.
  if (needlecntval != 2) { // Const needlecnt == 2?
    if (needlecntval != 3) {
      if (needlecntval == 0) beq(CCR6, L_Found); // Variable needlecnt == 2?
      Register ind_reg = tmp4;
      li(ind_reg, 2*2);  // First 2 characters are already compared, use index 2.
      mtctr(needlecnt);  // Decremented by 2, still > 0.
      //40:
      Label L_CompLoop;
      bind(L_CompLoop);
      lhzx(ch2, needle, ind_reg);
      lhzx(ch1, addr, ind_reg);
      cmpw(CCR1, ch1, ch2);
      bne(CCR1, L_OuterLoop);
      addi(ind_reg, ind_reg, 2);
      bdnz(L_CompLoop);
    } else { // No loop required if there's only one needle character left.
      lhz(ch2, 2*2, needle);
      lhz(ch1, 2*2, addr);
      cmpw(CCR1, ch1, ch2);
      bne(CCR1, L_OuterLoop);
    }
  }
  // Return index ...
  //46:
  bind(L_Found);
  subf(addr, haystack, addr); // relative to haystack, ...
  srdi(result, addr, 1);      // in characters.
  //48:
  bind(L_End);
}

// Implementation of Compare for jchar arrays.
//
// Kills the registers str1, str2, cnt1, cnt2.
// Kills cr0, ctr.
// Assumes that result differs from the input registers.
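//
// Reference semantics (an illustrative C-style sketch of what the emitted
// code computes; it mirrors java.lang.String.compareTo over the common
// prefix of the two arrays):
//
//   int lim = min(cnt1, cnt2);
//   for (int k = 0; k < lim; k++) {
//     if (str1[k] != str2[k]) return (int)str1[k] - (int)str2[k];
//   }
//   return cnt1 - cnt2;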
void MacroAssembler::string_compare(Register str1_reg, Register str2_reg, Register cnt1_reg, Register cnt2_reg,
                                    Register result_reg, Register tmp_reg) {
  assert_different_registers(result_reg, str1_reg, str2_reg, cnt1_reg, cnt2_reg, tmp_reg);

  Label Ldone, Lslow_case, Lslow_loop, Lfast_loop;
  Register cnt_diff  = R0,
           limit_reg = cnt1_reg,
           chr1_reg  = result_reg,
           chr2_reg  = cnt2_reg,
           addr_diff = str2_reg;

  // Offset 0 should be 32 byte aligned.
  //-4:
  dcbtct(str1_reg, 0x00);  // Indicate R/O access to str1.
  dcbtct(str2_reg, 0x00);  // Indicate R/O access to str2.
  //-2:
  // Compute min(cnt1, cnt2) and check if 0 (bail out if we don't need to compare characters).
  subf(result_reg, cnt2_reg, cnt1_reg);   // difference between cnt1/2
  subf_(addr_diff, str1_reg, str2_reg);   // alias?
  beq(CCR0, Ldone);                       // return cnt difference if both addresses are identical
  srawi(limit_reg, result_reg, 31);       // generate signmask (cnt1/2 must be non-negative so cnt_diff can't overflow)
  mr(cnt_diff, result_reg);
  andr(limit_reg, result_reg, limit_reg); // difference or zero (negative): cnt1 < cnt2 ? cnt1 - cnt2 : 0
  add_(limit_reg, cnt2_reg, limit_reg);   // min(cnt1, cnt2) == 0?
  beq(CCR0, Ldone);                       // return cnt difference if one has 0 length

  lhz(chr1_reg, 0, str1_reg);             // optional: early out if first characters mismatch
  lhzx(chr2_reg, str1_reg, addr_diff);    // optional: early out if first characters mismatch
  addi(tmp_reg, limit_reg, -1);           // min(cnt1, cnt2) - 1
  subf_(result_reg, chr2_reg, chr1_reg);  // optional: early out if first characters mismatch
  bne(CCR0, Ldone);                       // optional: early out if first characters mismatch

  // Set loop counter by scaling down tmp_reg
  srawi_(chr2_reg, tmp_reg, exact_log2(4)); // (min(cnt1, cnt2) - 1) / 4
  ble(CCR0, Lslow_case);                    // need > 4 characters for fast loop
  andi(limit_reg, tmp_reg, 4 - 1);          // remaining characters

  // Adapt str1_reg str2_reg for the first loop iteration
  mtctr(chr2_reg);                   // (min(cnt1, cnt2) - 1) / 4
  addi(limit_reg, limit_reg, 4 + 1); // compare last 5-8 characters in slow_case if mismatch found in fast_loop
  //16:
  // Compare the rest of the characters
  bind(Lfast_loop);
  ld(chr1_reg, 0, str1_reg);
  ldx(chr2_reg, str1_reg, addr_diff);
  cmpd(CCR0, chr2_reg, chr1_reg);
  bne(CCR0, Lslow_case);          // return chr1_reg
  addi(str1_reg, str1_reg, 4*2);
  bdnz(Lfast_loop);
  addi(limit_reg, limit_reg, -4); // no mismatch found in fast_loop, only 1-4 characters missing
  //23:
  bind(Lslow_case);
  mtctr(limit_reg);
  //24:
  bind(Lslow_loop);
  lhz(chr1_reg, 0, str1_reg);
  lhzx(chr2_reg, str1_reg, addr_diff);
  subf_(result_reg, chr2_reg, chr1_reg);
  bne(CCR0, Ldone);               // return chr1_reg
  addi(str1_reg, str1_reg, 1*2);
  bdnz(Lslow_loop);
  //30:
  // If strings are equal up to min length, return the length difference.
  mr(result_reg, cnt_diff);
  nop(); // alignment
  //32:
  // Otherwise, return the difference between the first mismatched chars.
  bind(Ldone);
}


// Compare char[] arrays.
//
// str1_reg   USE only
// str2_reg   USE only
// cnt_reg    USE_DEF, due to tmp reg shortage
// result_reg DEF only, might compromise USE only registers
void MacroAssembler::char_arrays_equals(Register str1_reg, Register str2_reg, Register cnt_reg, Register result_reg,
                                        Register tmp1_reg, Register tmp2_reg, Register tmp3_reg, Register tmp4_reg,
                                        Register tmp5_reg) {

  // Str1 may be the same register as str2, which can occur e.g. after scalar replacement.
  assert_different_registers(result_reg, str1_reg, cnt_reg, tmp1_reg, tmp2_reg, tmp3_reg, tmp4_reg, tmp5_reg);
  assert_different_registers(result_reg, str2_reg, cnt_reg, tmp1_reg, tmp2_reg, tmp3_reg, tmp4_reg, tmp5_reg);

  // Offset 0 should be 32 byte aligned.
  Label Linit_cbc, Lcbc, Lloop, Ldone_true, Ldone_false;
  Register index_reg = tmp5_reg;
  Register cbc_iter  = tmp4_reg;

  //-1:
  dcbtct(str1_reg, 0x00);  // Indicate R/O access to str1.
  dcbtct(str2_reg, 0x00);  // Indicate R/O access to str2.
  //1:
  andi(cbc_iter, cnt_reg, 4 - 1);          // Remaining iterations after the 4-characters-per-iteration main loop.
  li(index_reg, 0);                        // init
  li(result_reg, 0);                       // assume false
  srwi_(tmp2_reg, cnt_reg, exact_log2(4)); // Div: 4 java characters per iteration (main loop).

  cmpwi(CCR1, cbc_iter, 0);                // CCR1 = (cbc_iter == 0)
  beq(CCR0, Linit_cbc);                    // too short
  mtctr(tmp2_reg);
  //8:
  bind(Lloop);
  ldx(tmp1_reg, str1_reg, index_reg);
  ldx(tmp2_reg, str2_reg, index_reg);
  cmpd(CCR0, tmp1_reg, tmp2_reg);
  bne(CCR0, Ldone_false);                  // Unequal char pair found -> done.
  addi(index_reg, index_reg, 4*sizeof(jchar));
  bdnz(Lloop);
  //14:
  bind(Linit_cbc);
  beq(CCR1, Ldone_true);
  mtctr(cbc_iter);
  //16:
  bind(Lcbc);
  lhzx(tmp1_reg, str1_reg, index_reg);
  lhzx(tmp2_reg, str2_reg, index_reg);
  cmpw(CCR0, tmp1_reg, tmp2_reg);
  bne(CCR0, Ldone_false);                  // Unequal char pair found -> done.
  addi(index_reg, index_reg, 1*sizeof(jchar));
  bdnz(Lcbc);
  nop();
  bind(Ldone_true);
  li(result_reg, 1);
  //24:
  bind(Ldone_false);
}


void MacroAssembler::char_arrays_equalsImm(Register str1_reg, Register str2_reg, int cntval, Register result_reg,
                                           Register tmp1_reg, Register tmp2_reg) {
  // Str1 may be the same register as str2, which can occur e.g. after scalar replacement.
  assert_different_registers(result_reg, str1_reg, tmp1_reg, tmp2_reg);
  assert_different_registers(result_reg, str2_reg, tmp1_reg, tmp2_reg);
  assert(sizeof(jchar) == 2, "must be");
  assert(cntval >= 0 && ((cntval & 0x7fff) == cntval), "wrong immediate");

  Label Ldone_false;

  if (cntval < 16) { // short case
    if (cntval != 0) li(result_reg, 0); // assume false

    const int num_bytes = cntval * sizeof(jchar);
    int index = 0;
    for (int next_index; (next_index = index + 8) <= num_bytes; index = next_index) {
      ld(tmp1_reg, index, str1_reg);
      ld(tmp2_reg, index, str2_reg);
      cmpd(CCR0, tmp1_reg, tmp2_reg);
      bne(CCR0, Ldone_false);
    }
    if (cntval & 2) {
      lwz(tmp1_reg, index, str1_reg);
      lwz(tmp2_reg, index, str2_reg);
      cmpw(CCR0, tmp1_reg, tmp2_reg);
      bne(CCR0, Ldone_false);
      index += 4;
    }
    if (cntval & 1) {
      lhz(tmp1_reg, index, str1_reg);
      lhz(tmp2_reg, index, str2_reg);
      cmpw(CCR0, tmp1_reg, tmp2_reg);
      bne(CCR0, Ldone_false);
    }
    // fallthrough: true
  } else {
    Label Lloop;
    Register index_reg = tmp1_reg;
    const int loopcnt = cntval / 4;
    assert(loopcnt > 0, "must be");
    // Offset 0 should be 32 byte aligned.
    //2:
    dcbtct(str1_reg, 0x00);  // Indicate R/O access to str1.
    dcbtct(str2_reg, 0x00);  // Indicate R/O access to str2.
    li(tmp2_reg, loopcnt);
    li(index_reg, 0);        // init
    li(result_reg, 0);       // assume false
    mtctr(tmp2_reg);
    //8:
    bind(Lloop);
    ldx(R0, str1_reg, index_reg);
    ldx(tmp2_reg, str2_reg, index_reg);
    cmpd(CCR0, R0, tmp2_reg);
    bne(CCR0, Ldone_false);  // Unequal char pair found -> done.
    addi(index_reg, index_reg, 4*sizeof(jchar));
    bdnz(Lloop);
    //14:
    if (cntval & 2) {
      lwzx(R0, str1_reg, index_reg);
      lwzx(tmp2_reg, str2_reg, index_reg);
      cmpw(CCR0, R0, tmp2_reg);
      bne(CCR0, Ldone_false);
      if (cntval & 1) addi(index_reg, index_reg, 2*sizeof(jchar));
    }
    if (cntval & 1) {
      lhzx(R0, str1_reg, index_reg);
      lhzx(tmp2_reg, str2_reg, index_reg);
      cmpw(CCR0, R0, tmp2_reg);
      bne(CCR0, Ldone_false);
    }
    // fallthrough: true
  }
  li(result_reg, 1);
  bind(Ldone_false);
}

// Helpers for Intrinsic Emitters
//
// Reverse the byte order of a 32-bit value in a register.
//   src: 0x44556677
//   dst: 0x77665544
// Three steps to obtain the result:
//  1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word
//     into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared.
//     This value initializes dst.
//  2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost
//     byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go.
//     This value is mask inserted into dst with a [0..23] mask of 1s.
//  3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position.
//     This value is mask inserted into dst with a [8..15] mask of 1s.
void MacroAssembler::load_reverse_32(Register dst, Register src) {
  assert_different_registers(dst, src);

  rldicl(dst, src, (4+1)*8, 56);  // Rotate byte 4 into position 7 (rightmost), clear all to the left.
  rlwimi(dst, src, 3*8,  0, 23);  // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone.
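  // Worked trace for src = 0x44556677 (illustrative): after the rldicl above,
  // dst = 0x00000044; after the first rlwimi, dst = 0x77445544. Only word
  // byte 1 (value 0x66) is still wrong; the rlwimi below inserts it.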
  rlwimi(dst, src, 1*8,  8, 15);  // Insert byte 6 into position 5, leave the rest alone.
}

// Calculate the column addresses of the crc32 lookup table into distinct registers.
// This loop-invariant calculation is moved out of the loop body, reducing the loop
// body size from 20 to 16 instructions.
// Returns the offset that was used to calculate the address of column tc3.
// Due to register shortage, setting tc3 may overwrite table. With the return offset
// at hand, the original table address can be easily reconstructed.
int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) {

#ifdef VM_LITTLE_ENDIAN
  // This is what we implement (the DOLIT4 part):
  // =========================================================================
  // #define DOLIT4 c ^= *buf4++; \
  //         c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^ \
  //             crc_table[1][(c >> 16) & 0xff] ^ crc_table[0][c >> 24]
  // #define DOLIT32 DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4
  // =========================================================================
  const int ix0 = 3 * (4*CRC32_COLUMN_SIZE);
  const int ix1 = 2 * (4*CRC32_COLUMN_SIZE);
  const int ix2 = 1 * (4*CRC32_COLUMN_SIZE);
  const int ix3 = 0 * (4*CRC32_COLUMN_SIZE);
#else
  // This is what we implement (the DOBIG4 part):
  // =========================================================================
  // #define DOBIG4 c ^= *++buf4; \
  //         c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \
  //             crc_table[6][(c >> 16) & 0xff] ^ crc_table[7][c >> 24]
  // #define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4
  // =========================================================================
  const int ix0 = 4 * (4*CRC32_COLUMN_SIZE);
  const int ix1 = 5 * (4*CRC32_COLUMN_SIZE);
  const int ix2 = 6 * (4*CRC32_COLUMN_SIZE);
  const int ix3 = 7 * (4*CRC32_COLUMN_SIZE);
#endif
  assert_different_registers(table, tc0, tc1, tc2);
  assert(table == tc3, "must be!");

  if (ix0 != 0) addi(tc0, table, ix0);
  if (ix1 != 0) addi(tc1, table, ix1);
  if (ix2 != 0) addi(tc2, table, ix2);
  if (ix3 != 0) addi(tc3, table, ix3);

  return ix3;
}

/**
 * uint32_t crc;
 * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
 */
void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) {
  assert_different_registers(crc, table, tmp);
  assert_different_registers(val, table);

  if (crc == val) {                  // Must rotate first to use the unmodified value.
    rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
                                     // As we use a word (4-byte) instruction, we have to adapt the mask bit positions.
    srwi(crc, crc, 8);               // Unsigned shift, clear leftmost 8 bits.
  } else {
    srwi(crc, crc, 8);               // Unsigned shift, clear leftmost 8 bits.
    rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
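    // On both paths tmp now holds (val & 0xff) << 2, i.e. the low byte
    // scaled to a 4-byte table index, while crc holds crc >> 8.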
  }
  lwzx(tmp, table, tmp);
  xorr(crc, crc, tmp);
}

/**
 * uint32_t crc;
 * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
 */
void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
  fold_byte_crc32(crc, crc, table, tmp);
}

/**
 * Emits code to update CRC-32 with a byte value according to constants in table.
 *
 * @param [in,out]crc Register containing the crc.
 * @param [in]val     Register containing the byte to fold into the CRC.
 * @param [in]table   Register containing the table of crc constants.
 *
 * uint32_t crc;
 * val = crc_table[(val ^ crc) & 0xFF];
 * crc = val ^ (crc >> 8);
 */
void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
  BLOCK_COMMENT("update_byte_crc32:");
  xorr(val, val, crc);
  fold_byte_crc32(crc, val, table, val);
}

/**
 * @param crc   register containing existing CRC (32-bit)
 * @param buf   register pointing to input byte buffer (byte*)
 * @param len   register containing number of bytes
 * @param table register pointing to CRC table
 */
void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table,
                                           Register data, bool loopAlignment, bool invertCRC) {
  assert_different_registers(crc, buf, len, table, data);

  Label L_mainLoop, L_done;
  const int mainLoop_stepping  = 1;
  const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4;

  // Process all bytes in a single-byte loop.
  cmpdi(CCR0, len, 0); // Anything to do?
  mtctr(len);
  beq(CCR0, L_done);

  if (invertCRC) {
    nand(crc, crc, crc); // ~c
  }

  align(mainLoop_alignment);
  BIND(L_mainLoop);
  lbz(data, 0, buf);                   // Byte from buffer, zero-extended.
  addi(buf, buf, mainLoop_stepping);   // Advance buffer position.
  update_byte_crc32(crc, data, table);
  bdnz(L_mainLoop);                    // Iterate.

  if (invertCRC) {
    nand(crc, crc, crc); // ~c
  }

  bind(L_done);
}

/**
 * Emits code to update CRC-32 with a 4-byte value according to constants in table.
 * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c.
 */
// A note on the lookup table address(es):
// The lookup table consists of two sets of four columns each.
// The columns {0..3} are used for little-endian machines.
// The columns {4..7} are used for big-endian machines.
// To save the effort of adding the column offset to the table address each time
// a table element is looked up, it is possible to pass the pre-calculated
// column addresses.
// Uses R9..R12 as work registers. Must be saved/restored by caller, if necessary.
void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc,
                                        Register t0,  Register t1,  Register t2,  Register t3,
                                        Register tc0, Register tc1, Register tc2, Register tc3) {
  assert_different_registers(crc, t3);

  // XOR crc with next four bytes of buffer.
  lwz(t3, bufDisp, buf);
  if (bufInc != 0) {
    addi(buf, buf, bufInc);
  }
  xorr(t3, t3, crc);

  // Chop crc into 4 single-byte pieces, shifted left 2 bits, to form the table indices.
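  // Each rlwinm below is a rotate-left-word-and-mask: rotating t3 so that the
  // desired byte lands in bit positions 22..29 (i.e. byte value << 2) and
  // clearing everything else yields a ready-to-use 4-byte table index in a
  // single instruction.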
  rlwinm(t0, t3,  2,         24-2, 31-2);  // ((t3 >>  0) & 0xff) << 2
  rlwinm(t1, t3,  32+(2- 8), 24-2, 31-2);  // ((t3 >>  8) & 0xff) << 2
  rlwinm(t2, t3,  32+(2-16), 24-2, 31-2);  // ((t3 >> 16) & 0xff) << 2
  rlwinm(t3, t3,  32+(2-24), 24-2, 31-2);  // ((t3 >> 24) & 0xff) << 2

  // Use the pre-calculated column addresses.
  // Load pre-calculated table values.
  lwzx(t0, tc0, t0);
  lwzx(t1, tc1, t1);
  lwzx(t2, tc2, t2);
  lwzx(t3, tc3, t3);

  // Calculate new crc from table values.
  xorr(t0, t0, t1);
  xorr(t2, t2, t3);
  xorr(crc, t0, t2); // Now crc contains the final checksum value.
}

/**
 * @param crc   register containing existing CRC (32-bit)
 * @param buf   register pointing to input byte buffer (byte*)
 * @param len   register containing number of bytes
 * @param table register pointing to CRC table
 *
 * Uses R9..R12 as work registers. Must be saved/restored by caller!
 */
void MacroAssembler::kernel_crc32_2word(Register crc, Register buf, Register len, Register table,
                                        Register t0,  Register t1,  Register t2,  Register t3,
                                        Register tc0, Register tc1, Register tc2, Register tc3) {
  assert_different_registers(crc, buf, len, table);

  Label L_mainLoop, L_tail;
  Register tmp  = t0;
  Register data = t0;
  Register tmp2 = t1;
  const int mainLoop_stepping  = 8;
  const int tailLoop_stepping  = 1;
  const int log_stepping       = exact_log2(mainLoop_stepping);
  const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
  const int complexThreshold   = 2 * mainLoop_stepping;

  // Don't test for len <= 0 here. This pathological case should not occur anyway.
  // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles.
  // The situation itself is detected and handled correctly by the conditional branches
  // following the adjustments of len by -stepping and +stepping.
  assert(tailLoop_stepping == 1, "check tailLoop_stepping!");

  BLOCK_COMMENT("kernel_crc32_2word {");

  nand(crc, crc, crc); // ~c

  // Check for short (<mainLoop_stepping) buffer.
  cmpdi(CCR0, len, complexThreshold);
  blt(CCR0, L_tail);

  // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
  // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
  {
    // Align buf addr to mainLoop_stepping boundary.
    neg(tmp2, buf);                         // Calculate # preLoop iterations for alignment.
    rldicl(tmp2, tmp2, 0, 64-log_stepping); // Keep only the low log_stepping bits (here: bits 61..63).

    if (complexThreshold > mainLoop_stepping) {
      sub(len, len, tmp2);                  // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
    } else {
      sub(tmp, len, tmp2);                  // Remaining bytes for main loop.
      cmpdi(CCR0, tmp, mainLoop_stepping);
      blt(CCR0, L_tail);                    // For less than one mainLoop_stepping left, do only tail processing.
      mr(len, tmp);                         // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
    }
    update_byteLoop_crc32(crc, buf, tmp2, table, data, false, false);
  }

  srdi(tmp2, len, log_stepping);            // #iterations for mainLoop
  andi(len, len, mainLoop_stepping-1);      // remaining bytes for tailLoop
  mtctr(tmp2);

#ifdef VM_LITTLE_ENDIAN
  Register crc_rv = crc;
#else
  Register crc_rv = tmp;                    // Load_reverse needs separate registers to work on.
                                            // Occupies tmp, but frees up crc.
  load_reverse_32(crc_rv, crc);             // Revert byte order because we are dealing with big-endian data.
  tmp = crc;
#endif

  int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);

  align(mainLoop_alignment);                // Octoword-aligned loop address. Shows 2% improvement.
  BIND(L_mainLoop);
  update_1word_crc32(crc_rv, buf, table, 0, 0, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
  update_1word_crc32(crc_rv, buf, table, 4, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
  bdnz(L_mainLoop);

#ifndef VM_LITTLE_ENDIAN
  load_reverse_32(crc, crc_rv);             // Revert byte order because we are dealing with big-endian data.
  tmp = crc_rv;                             // Tmp uses its original register again.
#endif

  // Restore original table address for tailLoop.
  if (reconstructTableOffset != 0) {
    addi(table, table, -reconstructTableOffset);
  }

  // Process last few (<complexThreshold) bytes of buffer.
  BIND(L_tail);
  update_byteLoop_crc32(crc, buf, len, table, data, false, false);

  nand(crc, crc, crc); // ~c
  BLOCK_COMMENT("} kernel_crc32_2word");
}

/**
 * @param crc   register containing existing CRC (32-bit)
 * @param buf   register pointing to input byte buffer (byte*)
 * @param len   register containing number of bytes
 * @param table register pointing to CRC table
 *
 * Uses R9..R12 as work registers. Must be saved/restored by caller!
 */
void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table,
                                        Register t0,  Register t1,  Register t2,  Register t3,
                                        Register tc0, Register tc1, Register tc2, Register tc3) {
  assert_different_registers(crc, buf, len, table);

  Label L_mainLoop, L_tail;
  Register tmp  = t0;
  Register data = t0;
  Register tmp2 = t1;
  const int mainLoop_stepping  = 4;
  const int tailLoop_stepping  = 1;
  const int log_stepping       = exact_log2(mainLoop_stepping);
  const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
  const int complexThreshold   = 2 * mainLoop_stepping;

  // Don't test for len <= 0 here. This pathological case should not occur anyway.
  // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles.
  // The situation itself is detected and handled correctly by the conditional branches
  // following the adjustments of len by -stepping and +stepping.
  assert(tailLoop_stepping == 1, "check tailLoop_stepping!");

  BLOCK_COMMENT("kernel_crc32_1word {");

  nand(crc, crc, crc); // ~c

  // Check for short (<mainLoop_stepping) buffer.
  cmpdi(CCR0, len, complexThreshold);
  blt(CCR0, L_tail);

  // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
  // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
  {
    // Align buf addr to mainLoop_stepping boundary.
    neg(tmp2, buf);                         // Calculate # preLoop iterations for alignment.
    rldicl(tmp2, tmp2, 0, 64-log_stepping); // Keep only the low log_stepping bits (here: bits 62..63).

    if (complexThreshold > mainLoop_stepping) {
      sub(len, len, tmp2);                  // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
    } else {
      sub(tmp, len, tmp2);                  // Remaining bytes for main loop.
      cmpdi(CCR0, tmp, mainLoop_stepping);
      blt(CCR0, L_tail);                    // For less than one mainLoop_stepping left, do only tail processing.
      mr(len, tmp);                         // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
    }
    update_byteLoop_crc32(crc, buf, tmp2, table, data, false, false);
  }

  srdi(tmp2, len, log_stepping);            // #iterations for mainLoop
  andi(len, len, mainLoop_stepping-1);      // remaining bytes for tailLoop
  mtctr(tmp2);

#ifdef VM_LITTLE_ENDIAN
  Register crc_rv = crc;
#else
  Register crc_rv = tmp;                    // Load_reverse needs separate registers to work on.
                                            // Occupies tmp, but frees up crc.
  load_reverse_32(crc_rv, crc);             // Revert byte order because we are dealing with big-endian data.
  tmp = crc;
#endif

  int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);

  align(mainLoop_alignment);                // Octoword-aligned loop address. Shows 2% improvement.
  BIND(L_mainLoop);
  update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
  bdnz(L_mainLoop);

#ifndef VM_LITTLE_ENDIAN
  load_reverse_32(crc, crc_rv);             // Revert byte order because we are dealing with big-endian data.
  tmp = crc_rv;                             // Tmp uses its original register again.
#endif

  // Restore original table address for tailLoop.
  if (reconstructTableOffset != 0) {
    addi(table, table, -reconstructTableOffset);
  }

  // Process last few (<complexThreshold) bytes of buffer.
  BIND(L_tail);
  update_byteLoop_crc32(crc, buf, len, table, data, false, false);

  nand(crc, crc, crc); // ~c
  BLOCK_COMMENT("} kernel_crc32_1word");
}

/**
 * @param crc   register containing existing CRC (32-bit)
 * @param buf   register pointing to input byte buffer (byte*)
 * @param len   register containing number of bytes
 * @param table register pointing to CRC table
 *
 * Uses R7_ARG5, R8_ARG6 as work registers.
 */
void MacroAssembler::kernel_crc32_1byte(Register crc, Register buf, Register len, Register table,
                                        Register t0, Register t1, Register t2, Register t3) {
  assert_different_registers(crc, buf, len, table);

  Register data = t0; // Holds the current byte to be folded into crc.

  BLOCK_COMMENT("kernel_crc32_1byte {");

  // Process all bytes in a single-byte loop.
  update_byteLoop_crc32(crc, buf, len, table, data, true, true);

  BLOCK_COMMENT("} kernel_crc32_1byte");
}

void MacroAssembler::kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp) {
  assert_different_registers(crc, buf, /* len, not used!! */ table, tmp);

  BLOCK_COMMENT("kernel_crc32_singleByte:");
  nand(crc, crc, crc); // ~c

  lbz(tmp, 0, buf);    // Byte from buffer, zero-extended.
  update_byte_crc32(crc, tmp, table);

  nand(crc, crc, crc); // ~c
}

// dest_lo += src1 + src2
// dest_hi += carry out of the two additions above
void MacroAssembler::add2_with_carry(Register dest_hi,
                                     Register dest_lo,
                                     Register src1, Register src2) {
  li(R0, 0);
  addc(dest_lo, dest_lo, src1);
  adde(dest_hi, dest_hi, R0);
  addc(dest_lo, dest_lo, src2);
  adde(dest_hi, dest_hi, R0);
}

// Multiply 64 bit by 64 bit first loop.
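// A note on operand layout (illustrative): x, y and z are int[] arrays as in
// java.math.BigInteger.multiplyToLen(). Each 64-bit load below picks up a
// pair of 32-bit digits at once; on little-endian, rldicl(reg, reg, 32, 0)
// swaps the two word halves so the digits appear in big-endian digit order.
// One 64x64->128-bit multiply (multiply64) then processes two digits of y
// per iteration.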
void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart,
                                           Register x_xstart,
                                           Register y, Register y_idx,
                                           Register z,
                                           Register carry,
                                           Register product_high, Register product,
                                           Register idx, Register kdx,
                                           Register tmp) {
  //  jlong carry, x[], y[], z[];
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //    huge_128 product = y[idx] * x[xstart] + carry;
  //    z[kdx] = (jlong)product;
  //    carry  = (jlong)(product >>> 64);
  //  }
  //  z[xstart] = carry;

  Label L_first_loop, L_first_loop_exit;
  Label L_one_x, L_one_y, L_multiply;

  addic_(xstart, xstart, -1);
  blt(CCR0, L_one_x);   // Special case: length of x is 1.

  // Load next two integers of x.
  sldi(tmp, xstart, LogBytesPerInt);
  ldx(x_xstart, x, tmp);
#ifdef VM_LITTLE_ENDIAN
  rldicl(x_xstart, x_xstart, 32, 0);
#endif

  align(32, 16);
  bind(L_first_loop);

  cmpdi(CCR0, idx, 1);
  blt(CCR0, L_first_loop_exit);
  addi(idx, idx, -2);
  beq(CCR0, L_one_y);

  // Load next two integers of y.
  sldi(tmp, idx, LogBytesPerInt);
  ldx(y_idx, y, tmp);
#ifdef VM_LITTLE_ENDIAN
  rldicl(y_idx, y_idx, 32, 0);
#endif


  bind(L_multiply);
  multiply64(product_high, product, x_xstart, y_idx);

  li(tmp, 0);
  addc(product, product, carry);         // Add carry to result.
  adde(product_high, product_high, tmp); // Add carry of the last addition.
  addi(kdx, kdx, -2);

  // Store result.
#ifdef VM_LITTLE_ENDIAN
  rldicl(product, product, 32, 0);
#endif
  sldi(tmp, kdx, LogBytesPerInt);
  stdx(product, z, tmp);
  mr_if_needed(carry, product_high);
  b(L_first_loop);


  bind(L_one_y); // Load one 32 bit portion of y as (0,value).

  lwz(y_idx, 0, y);
  b(L_multiply);


  bind(L_one_x); // Load one 32 bit portion of x as (0,value).

  lwz(x_xstart, 0, x);
  b(L_first_loop);

  bind(L_first_loop_exit);
}

// Multiply 64 bit by 64 bit and add 128 bit.
void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y,
                                            Register z, Register yz_idx,
                                            Register idx, Register carry,
                                            Register product_high, Register product,
                                            Register tmp, int offset) {

  //  huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
  //  z[kdx] = (jlong)product;

  sldi(tmp, idx, LogBytesPerInt);
  if (offset) {
    addi(tmp, tmp, offset);
  }
  ldx(yz_idx, y, tmp);
#ifdef VM_LITTLE_ENDIAN
  rldicl(yz_idx, yz_idx, 32, 0);
#endif

  multiply64(product_high, product, x_xstart, yz_idx);
  ldx(yz_idx, z, tmp);
#ifdef VM_LITTLE_ENDIAN
  rldicl(yz_idx, yz_idx, 32, 0);
#endif

  add2_with_carry(product_high, product, carry, yz_idx);

  sldi(tmp, idx, LogBytesPerInt);
  if (offset) {
    addi(tmp, tmp, offset);
  }
#ifdef VM_LITTLE_ENDIAN
  rldicl(product, product, 32, 0);
#endif
  stdx(product, z, tmp);
}

// Multiply 128 bit by 128 bit. Unrolled inner loop.
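// The loop below processes four 32-bit digits of y per iteration (two
// unrolled digit pairs): srdi_(jdx, idx, 2) computes the trip count, and the
// 0..3 leftover digits are handled separately after L_third_loop_exit.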
void MacroAssembler::multiply_128_x_128_loop(Register x_xstart,
                                             Register y, Register z,
                                             Register yz_idx, Register idx, Register carry,
                                             Register product_high, Register product,
                                             Register carry2, Register tmp) {

  //  jlong carry, x[], y[], z[];
  //  int kdx = ystart+1;
  //  for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
  //    huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
  //    z[kdx+idx+1] = (jlong)product;
  //    jlong carry2 = (jlong)(product >>> 64);
  //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
  //    z[kdx+idx] = (jlong)product;
  //    carry = (jlong)(product >>> 64);
  //  }
  //  idx += 2;
  //  if (idx > 0) {
  //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
  //    z[kdx+idx] = (jlong)product;
  //    carry = (jlong)(product >>> 64);
  //  }

  Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
  const Register jdx = R0;

  // Scale the index.
  srdi_(jdx, idx, 2);
  beq(CCR0, L_third_loop_exit);
  mtctr(jdx);

  align(32, 16);
  bind(L_third_loop);

  addi(idx, idx, -4);

  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8);
  mr_if_needed(carry2, product_high);

  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0);
  mr_if_needed(carry, product_high);
  bdnz(L_third_loop);

  bind(L_third_loop_exit);  // Handle any left-over operand parts.

  andi_(idx, idx, 0x3);
  beq(CCR0, L_post_third_loop_done);

  Label L_check_1;

  addic_(idx, idx, -2);
  blt(CCR0, L_check_1);

  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0);
  mr_if_needed(carry, product_high);

  bind(L_check_1);

  addi(idx, idx, 0x2);
  andi_(idx, idx, 0x1);
  addic_(idx, idx, -1);
  blt(CCR0, L_post_third_loop_done);

  sldi(tmp, idx, LogBytesPerInt);
  lwzx(yz_idx, y, tmp);
  multiply64(product_high, product, x_xstart, yz_idx);
  lwzx(yz_idx, z, tmp);

  add2_with_carry(product_high, product, yz_idx, carry);

  sldi(tmp, idx, LogBytesPerInt);
  stwx(product, z, tmp);
  srdi(product, product, 32);

  sldi(product_high, product_high, 32);
  orr(product, product, product_high);
  mr_if_needed(carry, product);

  bind(L_post_third_loop_done);
} // multiply_128_x_128_loop

void MacroAssembler::multiply_to_len(Register x, Register xlen,
                                     Register y, Register ylen,
                                     Register z, Register zlen,
                                     Register tmp1, Register tmp2,
                                     Register tmp3, Register tmp4,
                                     Register tmp5, Register tmp6,
                                     Register tmp7, Register tmp8,
                                     Register tmp9, Register tmp10,
                                     Register tmp11, Register tmp12,
                                     Register tmp13) {

  ShortBranchVerifier sbv(this);

  assert_different_registers(x, xlen, y, ylen, z, zlen,
                             tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
  assert_different_registers(x, xlen, y, ylen, z, zlen,
                             tmp1, tmp2, tmp3, tmp4, tmp5, tmp7);
  assert_different_registers(x, xlen, y, ylen, z, zlen,
                             tmp1, tmp2, tmp3, tmp4, tmp5, tmp8);

  const Register idx = tmp1;
  const Register kdx = tmp2;
  const Register xstart = tmp3;

  const Register y_idx = tmp4;
  const Register carry = tmp5;
  const Register product = tmp6;
  const Register product_high = tmp7;
  const Register x_xstart = tmp8;
  const Register tmp = tmp9;

  // First Loop.
  //
  //  final static long LONG_MASK = 0xffffffffL;
  //  int xstart = xlen - 1;
  //  int ystart = ylen - 1;
  //  long carry = 0;
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
  //    z[kdx] = (int)product;
  //    carry = product >>> 32;
  //  }
  //  z[xstart] = (int)carry;

  mr_if_needed(idx, ylen);  // idx = ylen
  mr_if_needed(kdx, zlen);  // kdx = xlen + ylen
  li(carry, 0);             // carry = 0

  Label L_done;

  addic_(xstart, xlen, -1);
  blt(CCR0, L_done);

  multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z,
                        carry, product_high, product, idx, kdx, tmp);

  Label L_second_loop;

  cmpdi(CCR0, kdx, 0);
  beq(CCR0, L_second_loop);

  Label L_carry;

  addic_(kdx, kdx, -1);
  beq(CCR0, L_carry);

  // Store lower 32 bits of carry.
  sldi(tmp, kdx, LogBytesPerInt);
  stwx(carry, z, tmp);
  srdi(carry, carry, 32);
  addi(kdx, kdx, -1);


  bind(L_carry);

  // Store upper 32 bits of carry.
  sldi(tmp, kdx, LogBytesPerInt);
  stwx(carry, z, tmp);

  // Second and third (nested) loops.
  //
  //  for (int i = xstart-1; i >= 0; i--) { // Second loop
  //    carry = 0;
  //    for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
  //      long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
  //                     (z[k] & LONG_MASK) + carry;
  //      z[k] = (int)product;
  //      carry = product >>> 32;
  //    }
  //    z[i] = (int)carry;
  //  }
  //
  //  i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx

  bind(L_second_loop);

  li(carry, 0);               // carry = 0;

  addic_(xstart, xstart, -1); // i = xstart-1;
  blt(CCR0, L_done);

  Register zsave = tmp10;

  mr(zsave, z);


  Label L_last_x;

  sldi(tmp, xstart, LogBytesPerInt);
  add(z, z, tmp);             // z = z + k - j
  addi(z, z, 4);
  addic_(xstart, xstart, -1); // i = xstart-1;
  blt(CCR0, L_last_x);

  sldi(tmp, xstart, LogBytesPerInt);
  ldx(x_xstart, x, tmp);
#ifdef VM_LITTLE_ENDIAN
  rldicl(x_xstart, x_xstart, 32, 0);
#endif


  Label L_third_loop_prologue;

  bind(L_third_loop_prologue);

  Register xsave = tmp11;
  Register xlensave = tmp12;
  Register ylensave = tmp13;

  mr(xsave, x);
  mr(xlensave, xstart);
  mr(ylensave, ylen);


  multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen,
                          carry, product_high, product, x, tmp);

  mr(z, zsave);
  mr(x, xsave);
  mr(xlen, xlensave);   // This is the decrement of the loop counter!
  mr(ylen, ylensave);

  addi(tmp3, xlen, 1);
  sldi(tmp, tmp3, LogBytesPerInt);
  stwx(carry, z, tmp);
  addic_(tmp3, tmp3, -1);
  blt(CCR0, L_done);

  srdi(carry, carry, 32);
  sldi(tmp, tmp3, LogBytesPerInt);
  stwx(carry, z, tmp);
  b(L_second_loop);

  // Next infrequent code is moved outside loops.
  bind(L_last_x);

  lwz(x_xstart, 0, x);
  b(L_third_loop_prologue);

  bind(L_done);
} // multiply_to_len

void MacroAssembler::asm_assert(bool check_equal, const char *msg, int id) {
#ifdef ASSERT
  Label ok;
  if (check_equal) {
    beq(CCR0, ok);
  } else {
    bne(CCR0, ok);
  }
  stop(msg, id);
  bind(ok);
#endif
}

void MacroAssembler::asm_assert_mems_zero(bool check_equal, int size, int mem_offset,
                                          Register mem_base, const char* msg, int id) {
#ifdef ASSERT
  switch (size) {
    case 4:
      lwz(R0, mem_offset, mem_base);
      cmpwi(CCR0, R0, 0);
      break;
    case 8:
      ld(R0, mem_offset, mem_base);
      cmpdi(CCR0, R0, 0);
      break;
    default:
      ShouldNotReachHere();
  }
  asm_assert(check_equal, msg, id);
#endif // ASSERT
}

void MacroAssembler::verify_thread() {
  if (VerifyThread) {
    unimplemented("'VerifyThread' currently not implemented on PPC");
  }
}

// READ: oop. KILL: R0. Volatile floats perhaps.
void MacroAssembler::verify_oop(Register oop, const char* msg) {
  if (!VerifyOops) {
    return;
  }

  address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
  const Register tmp = R11; // Will be preserved.
  const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
  save_volatile_gprs(R1_SP, -nbytes_save); // except R0

  mr_if_needed(R4_ARG2, oop);
  save_LR_CR(tmp); // save in old frame
  push_frame_reg_args(nbytes_save, tmp);
  // load FunctionDescriptor** / entry_address *
  load_const_optimized(tmp, fd, R0);
  // load FunctionDescriptor* / entry_address
  ld(tmp, 0, tmp);
  load_const_optimized(R3_ARG1, (address)msg, R0);
  // Call destination for its side effect.
  call_c(tmp);

  pop_frame();
  restore_LR_CR(tmp);
  restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
}

void MacroAssembler::verify_oop_addr(RegisterOrConstant offs, Register base, const char* msg) {
  if (!VerifyOops) {
    return;
  }

  address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
  const Register tmp = R11; // Will be preserved.
  const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
  save_volatile_gprs(R1_SP, -nbytes_save); // except R0

  ld(R4_ARG2, offs, base);
  save_LR_CR(tmp); // save in old frame
  push_frame_reg_args(nbytes_save, tmp);
  // load FunctionDescriptor** / entry_address *
  load_const_optimized(tmp, fd, R0);
  // load FunctionDescriptor* / entry_address
  ld(tmp, 0, tmp);
  load_const_optimized(R3_ARG1, (address)msg, R0);
  // Call destination for its side effect.
  call_c(tmp);

  pop_frame();
  restore_LR_CR(tmp);
  restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
}

const char* stop_types[] = {
  "stop",
  "untested",
  "unimplemented",
  "shouldnotreachhere"
};

static void stop_on_request(int tp, const char* msg) {
  tty->print("PPC assembly code requires stop: (%s) %s\n", stop_types[tp%/*stop_end*/4], msg);
  guarantee(false, "PPC assembly code requires stop: %s", msg);
}

// Call a C-function that prints output.
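// The reporting call is followed by an illtrap so that execution cannot fall
// through, and the 32-bit id is emitted right behind the trap instruction,
// where a debugger can recover it from the trap pc.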
void MacroAssembler::stop(int type, const char* msg, int id) {
#ifndef PRODUCT
  block_comment(err_msg("stop: %s %s {", stop_types[type%stop_end], msg));
#else
  block_comment("stop {");
#endif

  // setup arguments
  load_const_optimized(R3_ARG1, type);
  load_const_optimized(R4_ARG2, (void *)msg, /*tmp=*/R0);
  call_VM_leaf(CAST_FROM_FN_PTR(address, stop_on_request), R3_ARG1, R4_ARG2);
  illtrap();
  emit_int32(id);
  block_comment("} stop;");
}

#ifndef PRODUCT
// Write pattern 0x0101010101010101 in memory region [low-before, high+after].
// Val, addr are temp registers.
// If low == addr, addr is killed.
// High is preserved.
void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) {
  if (!ZapMemory) return;

  assert_different_registers(low, val);

  BLOCK_COMMENT("zap memory region {");
  load_const_optimized(val, 0x0101010101010101);
  int size = before + after;
  if (low == high && size < 5 && size > 0) {
    int offset = -before*BytesPerWord;
    for (int i = 0; i < size; ++i) {
      std(val, offset, low);
      offset += (1*BytesPerWord);
    }
  } else {
    addi(addr, low, -before*BytesPerWord);
    assert_different_registers(high, val);
    if (after) addi(high, high, after * BytesPerWord);
    Label loop;
    bind(loop);
    std(val, 0, addr);
    addi(addr, addr, 8);
    cmpd(CCR6, addr, high);
    ble(CCR6, loop);
    if (after) addi(high, high, -after * BytesPerWord); // Correct back to old value.
  }
  BLOCK_COMMENT("} zap memory region");
}

#endif // !PRODUCT

SkipIfEqualZero::SkipIfEqualZero(MacroAssembler* masm, Register temp, const bool* flag_addr) : _masm(masm), _label() {
  int simm16_offset = masm->load_const_optimized(temp, (address)flag_addr, R0, true);
  assert(sizeof(bool) == 1, "PowerPC ABI");
  masm->lbz(temp, simm16_offset, temp);
  masm->cmpwi(CCR0, temp, 0);
  masm->beq(CCR0, _label);
}

SkipIfEqualZero::~SkipIfEqualZero() {
  _masm->bind(_label);
}
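
// Usage sketch for SkipIfEqualZero (illustrative only; 'SomeBoolFlag' and
// 'Rtemp' are placeholder names): the guarded instructions are emitted
// normally but skipped at runtime whenever *flag_addr == 0. The destructor
// binds the skip target.
//
//   {
//     SkipIfEqualZero skip_if_zero(masm, Rtemp, &SomeBoolFlag);
//     // ... instructions emitted here run only if SomeBoolFlag is true ...
//   } // ~SkipIfEqualZero binds the label right behind the guarded code.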