/*
 * Copyright (c) 1997, 2016, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2016 SAP SE. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "compiler/disassembler.hpp"
#include "gc/shared/cardTableModRefBS.hpp"
#include "gc/shared/collectedHeap.inline.hpp"
#include "interpreter/interpreter.hpp"
#include "memory/resourceArea.hpp"
#include "nativeInst_ppc.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/biasedLocking.hpp"
#include "runtime/icache.hpp"
#include "runtime/interfaceSupport.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/os.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/macros.hpp"
#if INCLUDE_ALL_GCS
#include "gc/g1/g1CollectedHeap.inline.hpp"
#include "gc/g1/g1SATBCardTableModRefBS.hpp"
#include "gc/g1/heapRegion.hpp"
#endif // INCLUDE_ALL_GCS
#ifdef COMPILER2
#include "opto/intrinsicnode.hpp"
#endif

#ifdef PRODUCT
#define BLOCK_COMMENT(str) // nothing
#else
#define BLOCK_COMMENT(str) block_comment(str)
#endif
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

#ifdef ASSERT
// On RISC, there's no benefit to verifying instruction boundaries.
bool AbstractAssembler::pd_check_instruction_mark() { return false; }
#endif

void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) {
  assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range");
  if (Assembler::is_simm(si31, 16)) {
    ld(d, si31, a);
    if (emit_filler_nop) nop();
  } else {
    const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31);
    const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31);
    addis(d, a, hi);
    ld(d, lo, d);
  }
}

void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) {
  assert_different_registers(d, a);
  ld_largeoffset_unchecked(d, si31, a, emit_filler_nop);
}
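// Worked example (added): the hi/lo split used by ld_largeoffset_unchecked
// above must compensate for the sign-extension of the low 16 bits. For
// si31 = 0x12348000,
//   lo = (short)0x8000 = -0x8000 and hi = 0x1235,
// so that (hi << 16) + lo == 0x12348000 again.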
void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base,
                                      size_t size_in_bytes, bool is_signed) {
  switch (size_in_bytes) {
  case  8:              ld(dst, offs, base);                         break;
  case  4:  is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break;
  case  2:  is_signed ? lha(dst, offs, base) : lhz(dst, offs, base); break;
  case  1:  lbz(dst, offs, base); if (is_signed) extsb(dst, dst);    break; // lba doesn't exist :(
  default:  ShouldNotReachHere();
  }
}

void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base,
                                       size_t size_in_bytes) {
  switch (size_in_bytes) {
  case  8:  std(dst, offs, base); break;
  case  4:  stw(dst, offs, base); break;
  case  2:  sth(dst, offs, base); break;
  case  1:  stb(dst, offs, base); break;
  default:  ShouldNotReachHere();
  }
}

void MacroAssembler::align(int modulus, int max, int rem) {
  int padding = (rem + modulus - (offset() % modulus)) % modulus;
  if (padding > max) return;
  for (int c = (padding >> 2); c > 0; --c) { nop(); }
}

// Issue instructions that calculate given TOC from global TOC.
void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16,
                                                       bool add_relocation, bool emit_dummy_addr) {
  int offset = -1;
  if (emit_dummy_addr) {
    offset = -128; // dummy address
  } else if (addr != (address)(intptr_t)-1) {
    offset = MacroAssembler::offset_to_global_toc(addr);
  }

  if (hi16) {
    addis(dst, R29_TOC, MacroAssembler::largeoffset_si16_si16_hi(offset));
  }
  if (lo16) {
    if (add_relocation) {
      // Relocate at the addi to avoid confusion with a load from the method's TOC.
      relocate(internal_word_Relocation::spec(addr));
    }
    addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset));
  }
}

int MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) {
  const int offset = MacroAssembler::offset_to_global_toc(addr);

  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the addi,
  // and the addi reads and writes the same register dst.
  const int dst = inv_rt_field(inst2);
  assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");

  // Now, find the preceding addis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  while (inst1_addr >= bound) {
    inst1 = *(int *) inst1_addr;
    if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
      // Stop, found the addis which writes dst.
      break;
    }
    inst1_addr -= BytesPerInstWord;
  }

  assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
  set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset));
  set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset));
  return (int)((intptr_t)addr - (intptr_t)inst1_addr);
}
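// Illustrative shape (added) of the sequence patched above:
//   addis dst, R29_TOC, offset@ha
//   addi  dst, dst,     offset@l
// where @ha/@l denote the carry-compensated halves computed by
// largeoffset_si16_si16_hi/_lo.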
address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) {
  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the addi,
  // and the addi reads and writes the same register dst.
  const int dst = inv_rt_field(inst2);
  assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");

  // Now, find the preceding addis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  while (inst1_addr >= bound) {
    inst1 = *(int *) inst1_addr;
    if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
      // Stop, found the addis which writes dst.
      break;
    }
    inst1_addr -= BytesPerInstWord;
  }

  assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");

  int offset = (get_imm(inst1_addr, 0) << 16) + get_imm(inst2_addr, 0);
  // -1 is a special case
  if (offset == -1) {
    return (address)(intptr_t)-1;
  } else {
    return global_toc() + offset;
  }
}

#ifdef _LP64
// Patch compressed oops or klass constants.
// Assembler sequence is
// 1) compressed oops:
//    lis rx = const.hi
//    ori rx = rx | const.lo
// 2) compressed klass:
//    lis rx = const.hi
//    clrldi rx = rx & 0xFFFFffff // clearMS32b, optional
//    ori rx = rx | const.lo
// A clrldi, if present, is simply skipped over.
int MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) {
  assert(UseCompressedOops, "Should only patch compressed oops");

  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the ori,
  // and the ori reads and writes the same register dst.
  const int dst = inv_rta_field(inst2);
  assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
  // Now, find the preceding lis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  bool inst1_found = false;
  while (inst1_addr >= bound) {
    inst1 = *(int *)inst1_addr;
    if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
    inst1_addr -= BytesPerInstWord;
  }
  assert(inst1_found, "inst is not lis");

  int xc = (data >> 16) & 0xffff;
  int xd = (data >>  0) & 0xffff;

  set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo
  set_imm((int *)inst2_addr, (xd)); // unsigned int
  return (int)((intptr_t)inst2_addr - (intptr_t)inst1_addr);
}
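// Concrete instance (added) of the patch above: for narrowOop data = 0x00abcdef
// the two immediates become
//   lis rx, 0x00ab      // imm of inst1 (stored as signed short)
//   ori rx, rx, 0xcdef  // imm of inst2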
// Get compressed oop or klass constant.
narrowOop MacroAssembler::get_narrow_oop(address a, address bound) {
  assert(UseCompressedOops, "Should only patch compressed oops");

  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the ori,
  // and the ori reads and writes the same register dst.
  const int dst = inv_rta_field(inst2);
  assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
  // Now, find the preceding lis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  bool inst1_found = false;

  while (inst1_addr >= bound) {
    inst1 = *(int *) inst1_addr;
    if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
    inst1_addr -= BytesPerInstWord;
  }
  assert(inst1_found, "inst is not lis");

  uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff));
  uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16);

  return (int) (xl | xh);
}
#endif // _LP64

// Returns true if successful.
bool MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a,
                                                Register toc, bool fixed_size) {
  int toc_offset = 0;
  // Use RelocationHolder::none for the constant pool entry, otherwise
  // we will end up with a failing NativeCall::verify(x) where x is
  // the address of the constant pool entry.
  // FIXME: We should insert relocation information for oops at the constant
  // pool entries instead of inserting it at the loads; patching of a constant
  // pool entry should be less expensive.
  address const_address = address_constant((address)a.value(), RelocationHolder::none);
  if (const_address == NULL) { return false; } // allocation failure
  // Relocate at the pc of the load.
  relocate(a.rspec());
  toc_offset = (int)(const_address - code()->consts()->start());
  ld_largeoffset_unchecked(dst, toc_offset, toc, fixed_size);
  return true;
}

bool MacroAssembler::is_load_const_from_method_toc_at(address a) {
  const address inst1_addr = a;
  const int inst1 = *(int *)inst1_addr;

  // The relocation points to the ld or the addis.
  return (is_ld(inst1)) ||
         (is_addis(inst1) && inv_ra_field(inst1) != 0);
}

int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) {
  assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc");

  const address inst1_addr = a;
  const int inst1 = *(int *)inst1_addr;

  if (is_ld(inst1)) {
    return inv_d1_field(inst1);
  } else if (is_addis(inst1)) {
    const int dst = inv_rt_field(inst1);

    // Now, find the succeeding ld which reads and writes to dst.
    address inst2_addr = inst1_addr + BytesPerInstWord;
    int inst2 = 0;
    while (true) {
      inst2 = *(int *) inst2_addr;
      if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) {
        // Stop, found the ld which reads and writes dst.
        break;
      }
      inst2_addr += BytesPerInstWord;
    }
    return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2);
  }
  ShouldNotReachHere();
  return 0;
}

// Get the constant from a `load_const' sequence.
long MacroAssembler::get_const(address a) {
  assert(is_load_const_at(a), "not a load of a constant");
  const int *p = (const int*) a;
  unsigned long x = (((unsigned long) (get_imm(a,0) & 0xffff)) << 48);
  if (is_ori(*(p+1))) {
    x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 32);
    x |= (((unsigned long) (get_imm(a,3) & 0xffff)) << 16);
    x |= (((unsigned long) (get_imm(a,4) & 0xffff)));
  } else if (is_lis(*(p+1))) {
    x |= (((unsigned long) (get_imm(a,2) & 0xffff)) << 32);
    x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 16);
    x |= (((unsigned long) (get_imm(a,3) & 0xffff)));
  } else {
    ShouldNotReachHere();
    return (long) 0;
  }
  return (long) x;
}
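// Shape handled above (added for illustration), ori variant of a load_const
// materializing a 64-bit constant x in five instructions:
//   lis  d, x>>48;  ori d, d, (x>>32)&0xffff;  sldi d, d, 32;
//   oris d, d, (x>>16)&0xffff;  ori d, d, x&0xffff
// matching the immediate slots 0, 1, 3 and 4 read by get_const().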
// Patch the 64 bit constant of a `load_const' sequence. This is a low
// level procedure. It neither flushes the instruction cache nor is it
// mt safe.
void MacroAssembler::patch_const(address a, long x) {
  assert(is_load_const_at(a), "not a load of a constant");
  int *p = (int*) a;
  if (is_ori(*(p+1))) {
    set_imm(0 + p, (x >> 48) & 0xffff);
    set_imm(1 + p, (x >> 32) & 0xffff);
    set_imm(3 + p, (x >> 16) & 0xffff);
    set_imm(4 + p, x & 0xffff);
  } else if (is_lis(*(p+1))) {
    set_imm(0 + p, (x >> 48) & 0xffff);
    set_imm(2 + p, (x >> 32) & 0xffff);
    set_imm(1 + p, (x >> 16) & 0xffff);
    set_imm(3 + p, x & 0xffff);
  } else {
    ShouldNotReachHere();
  }
}

AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) {
  assert(oop_recorder() != NULL, "this assembler needs a Recorder");
  int index = oop_recorder()->allocate_metadata_index(obj);
  RelocationHolder rspec = metadata_Relocation::spec(index);
  return AddressLiteral((address)obj, rspec);
}

AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) {
  assert(oop_recorder() != NULL, "this assembler needs a Recorder");
  int index = oop_recorder()->find_index(obj);
  RelocationHolder rspec = metadata_Relocation::spec(index);
  return AddressLiteral((address)obj, rspec);
}

AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) {
  assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int oop_index = oop_recorder()->allocate_oop_index(obj);
  return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
}

AddressLiteral MacroAssembler::constant_oop_address(jobject obj) {
  assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int oop_index = oop_recorder()->find_index(obj);
  return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
}

RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
                                                      Register tmp, int offset) {
  intptr_t value = *delayed_value_addr;
  if (value != 0) {
    return RegisterOrConstant(value + offset);
  }

  // Load indirectly to solve generation ordering problem.
  // static address, no relocation
  int simm16_offset = load_const_optimized(tmp, delayed_value_addr, noreg, true);
  ld(tmp, simm16_offset, tmp); // must be aligned ((xa & 3) == 0)

  if (offset != 0) {
    addi(tmp, tmp, offset);
  }

  return RegisterOrConstant(tmp);
}

#ifndef PRODUCT
void MacroAssembler::pd_print_patched_instruction(address branch) {
  Unimplemented(); // TODO: PPC port
}
#endif // ndef PRODUCT

// Conditional far branch for destinations encodable in 24+2 bits.
void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) {

  // If requested by flag optimize, relocate the bc_far as a
  // runtime_call and prepare for optimizing it when the code gets
  // relocated.
  if (optimize == bc_far_optimize_on_relocate) {
    relocate(relocInfo::runtime_call_type);
  }

  // variant 2:
  //
  //    b!cxx SKIP
  //    bxx   DEST
  //  SKIP:
  //

  const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
                                                opposite_bcond(inv_boint_bcond(boint)));

  // We emit two branches.
  // First, a conditional branch which jumps around the far branch.
  const address not_taken_pc = pc() + 2 * BytesPerInstWord;
  const address bc_pc = pc();
  bc(opposite_boint, biint, not_taken_pc);

  const int bc_instr = *(int*)bc_pc;
  assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition");
  assert(opposite_boint == inv_bo_field(bc_instr), "postcondition");
  assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))),
                                     opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))),
         "postcondition");
  assert(biint == inv_bi_field(bc_instr), "postcondition");

  // Second, an unconditional far branch which jumps to dest.
  // Note: target(dest) remembers the current pc (see CodeSection::target)
  // and returns the current pc if the label is not bound yet; when
  // the label gets bound, the unconditional far branch will be patched.
  const address target_pc = target(dest);
  const address b_pc = pc();
  b(target_pc);

  assert(not_taken_pc == pc(),                 "postcondition");
  assert(dest.is_bound() || target_pc == b_pc, "postcondition");
}
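// Illustrative instance (added): a far "beq CCR0, dest" comes out as
//   bne CCR0, SKIP
//   b   dest
// SKIP:
// using the opposite branch condition computed above.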
// 1 or 2 instructions
void MacroAssembler::bc_far_optimized(int boint, int biint, Label& dest) {
  if (dest.is_bound() && is_within_range_of_bcxx(target(dest), pc())) {
    bc(boint, biint, dest);
  } else {
    bc_far(boint, biint, dest, MacroAssembler::bc_far_optimize_on_relocate);
  }
}

bool MacroAssembler::is_bc_far_at(address instruction_addr) {
  return is_bc_far_variant1_at(instruction_addr) ||
         is_bc_far_variant2_at(instruction_addr) ||
         is_bc_far_variant3_at(instruction_addr);
}

address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) {
  if (is_bc_far_variant1_at(instruction_addr)) {
    const address instruction_1_addr = instruction_addr;
    const int instruction_1 = *(int*)instruction_1_addr;
    return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr);
  } else if (is_bc_far_variant2_at(instruction_addr)) {
    const address instruction_2_addr = instruction_addr + 4;
    return bxx_destination(instruction_2_addr);
  } else if (is_bc_far_variant3_at(instruction_addr)) {
    return instruction_addr + 8;
  }
  // variant 4 ???
  ShouldNotReachHere();
  return NULL;
}

void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) {

  if (is_bc_far_variant3_at(instruction_addr)) {
    // variant 3, far cond branch to the next instruction, already patched to nops:
    //
    //    nop
    //    endgroup
    //  SKIP/DEST:
    //
    return;
  }

  // first, extract boint and biint from the current branch
  int boint = 0;
  int biint = 0;

  ResourceMark rm;
  const int code_size = 2 * BytesPerInstWord;
  CodeBuffer buf(instruction_addr, code_size);
  MacroAssembler masm(&buf);
  if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) {
    // Far branch to next instruction: Optimize it by patching nops (produce variant 3).
    masm.nop();
    masm.endgroup();
  } else {
    if (is_bc_far_variant1_at(instruction_addr)) {
      // variant 1, the 1st instruction contains the destination address:
      //
      //    bcxx  DEST
      //    nop
      //
      const int instruction_1 = *(int*)(instruction_addr);
      boint = inv_bo_field(instruction_1);
      biint = inv_bi_field(instruction_1);
    } else if (is_bc_far_variant2_at(instruction_addr)) {
      // variant 2, the 2nd instruction contains the destination address:
      //
      //    b!cxx SKIP
      //    bxx   DEST
      //  SKIP:
      //
      const int instruction_1 = *(int*)(instruction_addr);
      boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))),
                                 opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1))));
      biint = inv_bi_field(instruction_1);
    } else {
      // variant 4???
      ShouldNotReachHere();
    }

    // second, set the new branch destination and optimize the code
    if (dest != instruction_addr + 4 && // the bc_far is still unbound!
        masm.is_within_range_of_bcxx(dest, instruction_addr)) {
      // variant 1:
      //
      //    bcxx  DEST
      //    nop
      //
      masm.bc(boint, biint, dest);
      masm.nop();
    } else {
      // variant 2:
      //
      //    b!cxx SKIP
      //    bxx   DEST
      //  SKIP:
      //
      const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
                                                    opposite_bcond(inv_boint_bcond(boint)));
      const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord;
      masm.bc(opposite_boint, biint, not_taken_pc);
      masm.b(dest);
    }
  }
  ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
}

// Emit a NOT mt-safe patchable 64 bit absolute call/jump.
void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) {
  // get current pc
  uint64_t start_pc = (uint64_t) pc();

  const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last
  const address pc_of_b  = (address) (start_pc + (0*BytesPerInstWord)); // b is first

  // relocate here
  if (rt != relocInfo::none) {
    relocate(rt);
  }

  if ( ReoptimizeCallSequences &&
       (( link && is_within_range_of_b(dest, pc_of_bl)) ||
        (!link && is_within_range_of_b(dest, pc_of_b)))) {
    // variant 2:
    // Emit an optimized, pc-relative call/jump.

    if (link) {
      // some padding
      nop();
      nop();
      nop();
      nop();
      nop();
      nop();

      // do the call
      assert(pc() == pc_of_bl, "just checking");
      bl(dest, relocInfo::none);
    } else {
      // do the jump
      assert(pc() == pc_of_b, "just checking");
      b(dest, relocInfo::none);

      // some padding
      nop();
      nop();
      nop();
      nop();
      nop();
      nop();
    }

    // Assert that we can identify the emitted call/jump.
    assert(is_bxx64_patchable_variant2_at((address)start_pc, link),
           "can't identify emitted call");
  } else {
    // variant 1:
    mr(R0, R11); // spill R11 -> R0.

    // Load the destination address into CTR,
    // calculate destination relative to global toc.
    calculate_address_from_global_toc(R11, dest, true, true, false);

    mtctr(R11);
    mr(R11, R0); // restore R11 <- R0.
    nop();

    // do the call/jump
    if (link) {
      bctrl();
    } else {
      bctr();
    }
    // Assert that we can identify the emitted call/jump.
    assert(is_bxx64_patchable_variant1b_at((address)start_pc, link),
           "can't identify emitted call");
  }

  // Assert that we can identify the emitted call/jump.
  assert(is_bxx64_patchable_at((address)start_pc, link),
         "can't identify emitted call");
  assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest,
         "wrong encoding of dest address");
}
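// Note (added): both variants emitted above occupy the same seven instruction
// slots (instr[0]..instr[6] in the recognizers below), which is what allows
// set_dest_of_bxx64_patchable_at() to re-emit either form in place.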
// Identify a bxx64_patchable instruction.
bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) {
  return is_bxx64_patchable_variant1b_at(instruction_addr, link)
    //|| is_bxx64_patchable_variant1_at(instruction_addr, link)
      || is_bxx64_patchable_variant2_at(instruction_addr, link);
}

// Does the call64_patchable instruction use a pc-relative encoding of
// the call destination?
bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) {
  // variant 2 is pc-relative
  return is_bxx64_patchable_variant2_at(instruction_addr, link);
}

// Identify variant 1.
bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) {
  unsigned int* instr = (unsigned int*) instruction_addr;
  return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
      && is_mtctr(instr[5]) // mtctr
      && is_load_const_at(instruction_addr);
}

// Identify variant 1b: load destination relative to global toc.
bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) {
  unsigned int* instr = (unsigned int*) instruction_addr;
  return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
      && is_mtctr(instr[3]) // mtctr
      && is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr);
}

// Identify variant 2.
bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) {
  unsigned int* instr = (unsigned int*) instruction_addr;
  if (link) {
    return is_bl (instr[6])  // bl dest is last
        && is_nop(instr[0])  // nop
        && is_nop(instr[1])  // nop
        && is_nop(instr[2])  // nop
        && is_nop(instr[3])  // nop
        && is_nop(instr[4])  // nop
        && is_nop(instr[5]); // nop
  } else {
    return is_b  (instr[0])  // b dest is first
        && is_nop(instr[1])  // nop
        && is_nop(instr[2])  // nop
        && is_nop(instr[3])  // nop
        && is_nop(instr[4])  // nop
        && is_nop(instr[5])  // nop
        && is_nop(instr[6]); // nop
  }
}

// Set dest address of a bxx64_patchable instruction.
void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) {
  ResourceMark rm;
  int code_size = MacroAssembler::bxx64_patchable_size;
  CodeBuffer buf(instruction_addr, code_size);
  MacroAssembler masm(&buf);
  masm.bxx64_patchable(dest, relocInfo::none, link);
  ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
}

// Get dest address of a bxx64_patchable instruction.
address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) {
  if (is_bxx64_patchable_variant1_at(instruction_addr, link)) {
    return (address) (unsigned long) get_const(instruction_addr);
  } else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) {
    unsigned int* instr = (unsigned int*) instruction_addr;
    if (link) {
      const int instr_idx = 6; // bl is last
      int branchoffset = branch_destination(instr[instr_idx], 0);
      return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
    } else {
      const int instr_idx = 0; // b is first
      int branchoffset = branch_destination(instr[instr_idx], 0);
      return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
    }
  // Load dest relative to global toc.
  } else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) {
    return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord,
                                                               instruction_addr);
  } else {
    ShouldNotReachHere();
    return NULL;
  }
}
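// Note (added): the two routines below spill/reload 18 GPRs (R14..R31) and
// 18 FPRs (F14..F31), i.e. 36 * 8 = 288 bytes, in the slot order used by the
// ABI's _savegpr0_*/_restgpr0_* helpers quoted in their header comments.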
// Uses ordering which corresponds to ABI:
//    _savegpr0_14:  std  r14,-144(r1)
//    _savegpr0_15:  std  r15,-136(r1)
//    _savegpr0_16:  std  r16,-128(r1)
void MacroAssembler::save_nonvolatile_gprs(Register dst, int offset) {
  std(R14, offset, dst);   offset += 8;
  std(R15, offset, dst);   offset += 8;
  std(R16, offset, dst);   offset += 8;
  std(R17, offset, dst);   offset += 8;
  std(R18, offset, dst);   offset += 8;
  std(R19, offset, dst);   offset += 8;
  std(R20, offset, dst);   offset += 8;
  std(R21, offset, dst);   offset += 8;
  std(R22, offset, dst);   offset += 8;
  std(R23, offset, dst);   offset += 8;
  std(R24, offset, dst);   offset += 8;
  std(R25, offset, dst);   offset += 8;
  std(R26, offset, dst);   offset += 8;
  std(R27, offset, dst);   offset += 8;
  std(R28, offset, dst);   offset += 8;
  std(R29, offset, dst);   offset += 8;
  std(R30, offset, dst);   offset += 8;
  std(R31, offset, dst);   offset += 8;

  stfd(F14, offset, dst);   offset += 8;
  stfd(F15, offset, dst);   offset += 8;
  stfd(F16, offset, dst);   offset += 8;
  stfd(F17, offset, dst);   offset += 8;
  stfd(F18, offset, dst);   offset += 8;
  stfd(F19, offset, dst);   offset += 8;
  stfd(F20, offset, dst);   offset += 8;
  stfd(F21, offset, dst);   offset += 8;
  stfd(F22, offset, dst);   offset += 8;
  stfd(F23, offset, dst);   offset += 8;
  stfd(F24, offset, dst);   offset += 8;
  stfd(F25, offset, dst);   offset += 8;
  stfd(F26, offset, dst);   offset += 8;
  stfd(F27, offset, dst);   offset += 8;
  stfd(F28, offset, dst);   offset += 8;
  stfd(F29, offset, dst);   offset += 8;
  stfd(F30, offset, dst);   offset += 8;
  stfd(F31, offset, dst);
}

// Uses ordering which corresponds to ABI:
//    _restgpr0_14:  ld   r14,-144(r1)
//    _restgpr0_15:  ld   r15,-136(r1)
//    _restgpr0_16:  ld   r16,-128(r1)
void MacroAssembler::restore_nonvolatile_gprs(Register src, int offset) {
  ld(R14, offset, src);   offset += 8;
  ld(R15, offset, src);   offset += 8;
  ld(R16, offset, src);   offset += 8;
  ld(R17, offset, src);   offset += 8;
  ld(R18, offset, src);   offset += 8;
  ld(R19, offset, src);   offset += 8;
  ld(R20, offset, src);   offset += 8;
  ld(R21, offset, src);   offset += 8;
  ld(R22, offset, src);   offset += 8;
  ld(R23, offset, src);   offset += 8;
  ld(R24, offset, src);   offset += 8;
  ld(R25, offset, src);   offset += 8;
  ld(R26, offset, src);   offset += 8;
  ld(R27, offset, src);   offset += 8;
  ld(R28, offset, src);   offset += 8;
  ld(R29, offset, src);   offset += 8;
  ld(R30, offset, src);   offset += 8;
  ld(R31, offset, src);   offset += 8;

  // FP registers
  lfd(F14, offset, src);   offset += 8;
  lfd(F15, offset, src);   offset += 8;
  lfd(F16, offset, src);   offset += 8;
  lfd(F17, offset, src);   offset += 8;
  lfd(F18, offset, src);   offset += 8;
  lfd(F19, offset, src);   offset += 8;
  lfd(F20, offset, src);   offset += 8;
  lfd(F21, offset, src);   offset += 8;
  lfd(F22, offset, src);   offset += 8;
  lfd(F23, offset, src);   offset += 8;
  lfd(F24, offset, src);   offset += 8;
  lfd(F25, offset, src);   offset += 8;
  lfd(F26, offset, src);   offset += 8;
  lfd(F27, offset, src);   offset += 8;
  lfd(F28, offset, src);   offset += 8;
  lfd(F29, offset, src);   offset += 8;
  lfd(F30, offset, src);   offset += 8;
  lfd(F31, offset, src);
}

// For verify_oops.
void MacroAssembler::save_volatile_gprs(Register dst, int offset) {
  std(R2,  offset, dst);   offset += 8;
  std(R3,  offset, dst);   offset += 8;
  std(R4,  offset, dst);   offset += 8;
  std(R5,  offset, dst);   offset += 8;
  std(R6,  offset, dst);   offset += 8;
  std(R7,  offset, dst);   offset += 8;
  std(R8,  offset, dst);   offset += 8;
  std(R9,  offset, dst);   offset += 8;
  std(R10, offset, dst);   offset += 8;
  std(R11, offset, dst);   offset += 8;
  std(R12, offset, dst);   offset += 8;

  stfd(F0,  offset, dst);   offset += 8;
  stfd(F1,  offset, dst);   offset += 8;
  stfd(F2,  offset, dst);   offset += 8;
  stfd(F3,  offset, dst);   offset += 8;
  stfd(F4,  offset, dst);   offset += 8;
  stfd(F5,  offset, dst);   offset += 8;
  stfd(F6,  offset, dst);   offset += 8;
  stfd(F7,  offset, dst);   offset += 8;
  stfd(F8,  offset, dst);   offset += 8;
  stfd(F9,  offset, dst);   offset += 8;
  stfd(F10, offset, dst);   offset += 8;
  stfd(F11, offset, dst);   offset += 8;
  stfd(F12, offset, dst);   offset += 8;
  stfd(F13, offset, dst);
}

// For verify_oops.
void MacroAssembler::restore_volatile_gprs(Register src, int offset) {
  ld(R2,  offset, src);   offset += 8;
  ld(R3,  offset, src);   offset += 8;
  ld(R4,  offset, src);   offset += 8;
  ld(R5,  offset, src);   offset += 8;
  ld(R6,  offset, src);   offset += 8;
  ld(R7,  offset, src);   offset += 8;
  ld(R8,  offset, src);   offset += 8;
  ld(R9,  offset, src);   offset += 8;
  ld(R10, offset, src);   offset += 8;
  ld(R11, offset, src);   offset += 8;
  ld(R12, offset, src);   offset += 8;

  lfd(F0,  offset, src);   offset += 8;
  lfd(F1,  offset, src);   offset += 8;
  lfd(F2,  offset, src);   offset += 8;
  lfd(F3,  offset, src);   offset += 8;
  lfd(F4,  offset, src);   offset += 8;
  lfd(F5,  offset, src);   offset += 8;
  lfd(F6,  offset, src);   offset += 8;
  lfd(F7,  offset, src);   offset += 8;
  lfd(F8,  offset, src);   offset += 8;
  lfd(F9,  offset, src);   offset += 8;
  lfd(F10, offset, src);   offset += 8;
  lfd(F11, offset, src);   offset += 8;
  lfd(F12, offset, src);   offset += 8;
  lfd(F13, offset, src);
}
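// Note (added): the volatile set saved/restored above covers R2..R12 and
// F0..F13 only; R0 and R13 (thread) are not part of it.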
void MacroAssembler::save_LR_CR(Register tmp) {
  mfcr(tmp);
  std(tmp, _abi(cr), R1_SP);
  mflr(tmp);
  std(tmp, _abi(lr), R1_SP);
  // Tmp must contain lr on exit! (see return_addr and prolog in ppc64.ad)
}

void MacroAssembler::restore_LR_CR(Register tmp) {
  assert(tmp != R1_SP, "must be distinct");
  ld(tmp, _abi(lr), R1_SP);
  mtlr(tmp);
  ld(tmp, _abi(cr), R1_SP);
  mtcr(tmp);
}

address MacroAssembler::get_PC_trash_LR(Register result) {
  Label L;
  bl(L);
  bind(L);
  address lr_pc = pc();
  mflr(result);
  return lr_pc;
}

void MacroAssembler::resize_frame(Register offset, Register tmp) {
#ifdef ASSERT
  assert_different_registers(offset, tmp, R1_SP);
  andi_(tmp, offset, frame::alignment_in_bytes-1);
  asm_assert_eq("resize_frame: unaligned", 0x204);
#endif

  // tmp <- *(SP)
  ld(tmp, _abi(callers_sp), R1_SP);
  // addr <- SP + offset;
  // *(addr) <- tmp;
  // SP <- addr
  stdux(tmp, R1_SP, offset);
}

void MacroAssembler::resize_frame(int offset, Register tmp) {
  assert(is_simm(offset, 16), "too big an offset");
  assert_different_registers(tmp, R1_SP);
  assert((offset & (frame::alignment_in_bytes-1)) == 0, "resize_frame: unaligned");
  // tmp <- *(SP)
  ld(tmp, _abi(callers_sp), R1_SP);
  // addr <- SP + offset;
  // *(addr) <- tmp;
  // SP <- addr
  stdu(tmp, offset, R1_SP);
}

void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) {
  // (addr == tmp1) || (addr == tmp2) is allowed here!
  assert(tmp1 != tmp2, "must be distinct");

  // compute offset w.r.t. current stack pointer
  // tmp_1 <- addr - SP (!)
  subf(tmp1, R1_SP, addr);

  // atomically update SP keeping back link.
  resize_frame(tmp1/* offset */, tmp2/* tmp */);
}

void MacroAssembler::push_frame(Register bytes, Register tmp) {
#ifdef ASSERT
  assert(bytes != R0, "r0 not allowed here");
  andi_(R0, bytes, frame::alignment_in_bytes-1);
  asm_assert_eq("push_frame(Reg, Reg): unaligned", 0x203);
#endif
  neg(tmp, bytes);
  stdux(R1_SP, R1_SP, tmp);
}

// Push a frame of size `bytes'.
void MacroAssembler::push_frame(unsigned int bytes, Register tmp) {
  long offset = align_addr(bytes, frame::alignment_in_bytes);
  if (is_simm(-offset, 16)) {
    stdu(R1_SP, -offset, R1_SP);
  } else {
    load_const_optimized(tmp, -offset);
    stdux(R1_SP, R1_SP, tmp);
  }
}

// Push a frame of size `bytes' plus abi_reg_args on top.
void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) {
  push_frame(bytes + frame::abi_reg_args_size, tmp);
}

// Set up a new C frame with a spill area for non-volatile GPRs and
// additional space for local variables.
void MacroAssembler::push_frame_reg_args_nonvolatiles(unsigned int bytes,
                                                      Register tmp) {
  push_frame(bytes + frame::abi_reg_args_size + frame::spill_nonvolatiles_size, tmp);
}

// Pop current C frame.
void MacroAssembler::pop_frame() {
  ld(R1_SP, _abi(callers_sp), R1_SP);
}
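// Usage sketch (added; values hypothetical):
//   push_frame_reg_args(0, R11_scratch1); // frame with just the ABI argument area
//   ...
//   pop_frame();                          // pop via the stored back link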
#if defined(ABI_ELFv2)
address MacroAssembler::branch_to(Register r_function_entry, bool and_link) {
  // TODO(asmundak): make sure the caller uses R12 as function descriptor
  // most of the time.
  if (R12 != r_function_entry) {
    mr(R12, r_function_entry);
  }
  mtctr(R12);
  // Do a call or a branch.
  if (and_link) {
    bctrl();
  } else {
    bctr();
  }
  _last_calls_return_pc = pc();

  return _last_calls_return_pc;
}

// Call a C function via a function descriptor and use full C
// calling conventions. Updates and returns _last_calls_return_pc.
address MacroAssembler::call_c(Register r_function_entry) {
  return branch_to(r_function_entry, /*and_link=*/true);
}

// For tail calls: only branch, don't link, so callee returns to caller of this function.
address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) {
  return branch_to(r_function_entry, /*and_link=*/false);
}

address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) {
  load_const(R12, function_entry, R0);
  return branch_to(R12, /*and_link=*/true);
}

#else
// Generic version of a call to C function via a function descriptor
// with variable support for C calling conventions (TOC, ENV, etc.).
// Updates and returns _last_calls_return_pc.
address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call,
                                  bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) {
  // we emit standard ptrgl glue code here
  assert((function_descriptor != R0), "function_descriptor cannot be R0");

  // retrieve necessary entries from the function descriptor
  ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor);
  mtctr(R0);

  if (load_toc_of_callee) {
    ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor);
  }
  if (load_env_of_callee) {
    ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor);
  } else if (load_toc_of_callee) {
    li(R11, 0);
  }

  // do a call or a branch
  if (and_link) {
    bctrl();
  } else {
    bctr();
  }
  _last_calls_return_pc = pc();

  return _last_calls_return_pc;
}

// Call a C function via a function descriptor and use full C calling
// conventions.
// We don't use the TOC in generated code, so there is no need to save
// and restore its value.
address MacroAssembler::call_c(Register fd) {
  return branch_to(fd, /*and_link=*/true,
                       /*save toc=*/false,
                       /*restore toc=*/false,
                       /*load toc=*/true,
                       /*load env=*/true);
}

address MacroAssembler::call_c_and_return_to_caller(Register fd) {
  return branch_to(fd, /*and_link=*/false,
                       /*save toc=*/false,
                       /*restore toc=*/false,
                       /*load toc=*/true,
                       /*load env=*/true);
}

address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) {
  if (rt != relocInfo::none) {
    // this call needs to be relocatable
    if (!ReoptimizeCallSequences
        || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
        || fd == NULL   // support code-size estimation
        || !fd->is_friend_function()
        || fd->entry() == NULL) {
      // It's not a friend function as defined by class FunctionDescriptor,
      // so do a full call-c here.
      load_const(R11, (address)fd, R0);

      bool has_env = (fd != NULL && fd->env() != NULL);
      return branch_to(R11, /*and_link=*/true,
                            /*save toc=*/false,
                            /*restore toc=*/false,
                            /*load toc=*/true,
                            /*load env=*/has_env);
    } else {
      // It's a friend function. Load the entry point and don't care about
      // toc and env. Use an optimizable call instruction, but ensure the
      // same code-size as in the case of a non-friend function.
      nop();
      nop();
      nop();
      bl64_patchable(fd->entry(), rt);
      _last_calls_return_pc = pc();
      return _last_calls_return_pc;
    }
  } else {
    // This call does not need to be relocatable, do more aggressive
    // optimizations.
    if (!ReoptimizeCallSequences
        || !fd->is_friend_function()) {
      // It's not a friend function as defined by class FunctionDescriptor,
      // so do a full call-c here.
      load_const(R11, (address)fd, R0);
      return branch_to(R11, /*and_link=*/true,
                            /*save toc=*/false,
                            /*restore toc=*/false,
                            /*load toc=*/true,
                            /*load env=*/true);
    } else {
      // It's a friend function. Load the entry point and don't care about
      // toc and env.
      address dest = fd->entry();
      if (is_within_range_of_b(dest, pc())) {
        bl(dest);
      } else {
        bl64_patchable(dest, rt);
      }
      _last_calls_return_pc = pc();
      return _last_calls_return_pc;
    }
  }
}

// Call a C function. All constants needed reside in TOC.
//
// Read the address to call from the TOC.
// Read env from TOC, if fd specifies an env.
// Read new TOC from TOC.
address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd,
                                         relocInfo::relocType rt, Register toc) {
  if (!ReoptimizeCallSequences
      || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
      || !fd->is_friend_function()) {
    // It's not a friend function as defined by class FunctionDescriptor,
    // so do a full call-c here.
    assert(fd->entry() != NULL, "function must be linked");

    AddressLiteral fd_entry(fd->entry());
    bool success = load_const_from_method_toc(R11, fd_entry, toc, /*fixed_size*/ true);
    mtctr(R11);
    if (fd->env() == NULL) {
      li(R11, 0);
      nop();
    } else {
      AddressLiteral fd_env(fd->env());
      success = success && load_const_from_method_toc(R11, fd_env, toc, /*fixed_size*/ true);
    }
    AddressLiteral fd_toc(fd->toc());
    // Set R2_TOC (load from toc)
    success = success && load_const_from_method_toc(R2_TOC, fd_toc, toc, /*fixed_size*/ true);
    bctrl();
    _last_calls_return_pc = pc();
    if (!success) { return NULL; }
  } else {
    // It's a friend function, load the entry point and don't care about
    // toc and env. Use an optimizable call instruction, but ensure the
    // same code-size as in the case of a non-friend function.
    nop();
    bl64_patchable(fd->entry(), rt);
    _last_calls_return_pc = pc();
  }
  return _last_calls_return_pc;
}
#endif // ABI_ELFv2

void MacroAssembler::call_VM_base(Register oop_result,
                                  Register last_java_sp,
                                  address entry_point,
                                  bool check_exceptions) {
  BLOCK_COMMENT("call_VM {");
  // Determine last_java_sp register.
  if (!last_java_sp->is_valid()) {
    last_java_sp = R1_SP;
  }
  set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1);

  // ARG1 must hold thread address.
  mr(R3_ARG1, R16_thread);
#if defined(ABI_ELFv2)
  address return_pc = call_c(entry_point, relocInfo::none);
#else
  address return_pc = call_c((FunctionDescriptor*)entry_point, relocInfo::none);
#endif

  reset_last_Java_frame();

  // Check for pending exceptions.
  if (check_exceptions) {
    // We don't check for exceptions here.
    ShouldNotReachHere();
  }

  // Get oop result if there is one and reset the value in the thread.
  if (oop_result->is_valid()) {
    get_vm_result(oop_result);
  }

  _last_calls_return_pc = return_pc;
  BLOCK_COMMENT("} call_VM");
}

void MacroAssembler::call_VM_leaf_base(address entry_point) {
  BLOCK_COMMENT("call_VM_leaf {");
#if defined(ABI_ELFv2)
  call_c(entry_point, relocInfo::none);
#else
  call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, entry_point), relocInfo::none);
#endif
  BLOCK_COMMENT("} call_VM_leaf");
}
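// Note (added): in the call_VM overloads below, R3_ARG1 is reserved for the
// thread, so Java arguments are shuffled into R4_ARG2..R6_ARG4 first; e.g.
// (hypothetical) call_VM(noreg, entry, Rx, chk) expands to
//   mr_if_needed(R4_ARG2, Rx); call_VM(noreg, entry, chk);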
void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions) {
  call_VM_base(oop_result, noreg, entry_point, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread.
  mr_if_needed(R4_ARG2, arg_1);
  call_VM(oop_result, entry_point, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread.
  mr_if_needed(R4_ARG2, arg_1);
  assert(arg_2 != R4_ARG2, "smashed argument");
  mr_if_needed(R5_ARG3, arg_2);
  call_VM(oop_result, entry_point, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread.
  mr_if_needed(R4_ARG2, arg_1);
  assert(arg_2 != R4_ARG2, "smashed argument");
  mr_if_needed(R5_ARG3, arg_2);
  mr_if_needed(R6_ARG4, arg_3);
  call_VM(oop_result, entry_point, check_exceptions);
}

void MacroAssembler::call_VM_leaf(address entry_point) {
  call_VM_leaf_base(entry_point);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) {
  mr_if_needed(R3_ARG1, arg_1);
  call_VM_leaf(entry_point);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) {
  mr_if_needed(R3_ARG1, arg_1);
  assert(arg_2 != R3_ARG1, "smashed argument");
  mr_if_needed(R4_ARG2, arg_2);
  call_VM_leaf(entry_point);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) {
  mr_if_needed(R3_ARG1, arg_1);
  assert(arg_2 != R3_ARG1, "smashed argument");
  mr_if_needed(R4_ARG2, arg_2);
  assert(arg_3 != R3_ARG1 && arg_3 != R4_ARG2, "smashed argument");
  mr_if_needed(R5_ARG3, arg_3);
  call_VM_leaf(entry_point);
}

// Check whether instruction is a read access to the polling page
// which was emitted by load_from_polling_page(..).
bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext,
                                               address* polling_address_ptr) {
  if (!is_ld(instruction))
    return false; // It's not a ld. Fail.

  int rt = inv_rt_field(instruction);
  int ra = inv_ra_field(instruction);
  int ds = inv_ds_field(instruction);
  if (!(ds == 0 && ra != 0 && rt == 0)) {
    return false; // It's not a ld(r0, X, ra). Fail.
  }

  if (!ucontext) {
    // Set polling address.
    if (polling_address_ptr != NULL) {
      *polling_address_ptr = NULL;
    }
    return true; // No ucontext given. Can't check value of ra. Assume true.
  }

#ifdef LINUX
  // Ucontext given. Check that register ra contains the address of
  // the safepoint polling page.
  ucontext_t* uc = (ucontext_t*) ucontext;
  // Set polling address.
  address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds;
  if (polling_address_ptr != NULL) {
    *polling_address_ptr = addr;
  }
  return os::is_poll_address(addr);
#else
  // Not on Linux, ucontext must be NULL.
  ShouldNotReachHere();
  return false;
#endif
}
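// Shape recognized above (added for illustration): the poll emitted by
// load_from_polling_page(..) is effectively
//   ld R0, 0(Rpolling_page)
// i.e. rt == 0, ds == 0 and ra != 0, with Rpolling_page hypothetical here.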
bool MacroAssembler::is_memory_serialization(int instruction, JavaThread* thread, void* ucontext) {
#ifdef LINUX
  ucontext_t* uc = (ucontext_t*) ucontext;

  if (is_stwx(instruction) || is_stwux(instruction)) {
    int ra = inv_ra_field(instruction);
    int rb = inv_rb_field(instruction);

    // look up content of ra and rb in ucontext
    address ra_val = (address)uc->uc_mcontext.regs->gpr[ra];
    long    rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
    return os::is_memory_serialize_page(thread, ra_val + rb_val);
  } else if (is_stw(instruction) || is_stwu(instruction)) {
    int ra = inv_ra_field(instruction);
    int d1 = inv_d1_field(instruction);

    // look up content of ra in ucontext
    address ra_val = (address)uc->uc_mcontext.regs->gpr[ra];
    return os::is_memory_serialize_page(thread, ra_val + d1);
  } else {
    return false;
  }
#else
  // workaround not needed on !LINUX :-)
  ShouldNotCallThis();
  return false;
#endif
}

void MacroAssembler::bang_stack_with_offset(int offset) {
  // When increasing the stack, the old stack pointer will be written
  // to the new top of stack according to the PPC64 ABI.
  // Therefore, stack banging is not necessary when increasing
  // the stack by <= os::vm_page_size() bytes.
  // When increasing the stack by a larger amount, this method is
  // called repeatedly to bang the intermediate pages.

  // Stack grows down, caller passes positive offset.
  assert(offset > 0, "must bang with positive offset");

  long stdoffset = -offset;

  if (is_simm(stdoffset, 16)) {
    // Signed 16 bit offset, a simple std is ok.
    if (UseLoadInstructionsForStackBangingPPC64) {
      ld(R0, (int)(signed short)stdoffset, R1_SP);
    } else {
      std(R0, (int)(signed short)stdoffset, R1_SP);
    }
  } else if (is_simm(stdoffset, 31)) {
    const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset);
    const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset);

    Register tmp = R11;
    addis(tmp, R1_SP, hi);
    if (UseLoadInstructionsForStackBangingPPC64) {
      ld(R0, lo, tmp);
    } else {
      std(R0, lo, tmp);
    }
  } else {
    ShouldNotReachHere();
  }
}
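// Example (added): assuming 4K pages, bang_stack_with_offset(4096) emits
//   std R0, -4096(R1_SP)
// (or the corresponding ld if UseLoadInstructionsForStackBangingPPC64 is set).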
// If instruction is a stack bang of the form
//    std    R0,    x(Ry),       (see bang_stack_with_offset())
//    stdu   R1_SP, x(R1_SP),    (see push_frame(), resize_frame())
// or stdux  R1_SP, Rx, R1_SP    (see push_frame(), resize_frame())
// return the banged address. Otherwise, return 0.
address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) {
#ifdef LINUX
  ucontext_t* uc = (ucontext_t*) ucontext;
  int rs = inv_rs_field(instruction);
  int ra = inv_ra_field(instruction);
  if (   (is_ld(instruction)   && rs == 0 &&  UseLoadInstructionsForStackBangingPPC64)
      || (is_std(instruction)  && rs == 0 && !UseLoadInstructionsForStackBangingPPC64)
      || (is_stdu(instruction) && rs == 1)) {
    int ds = inv_ds_field(instruction);
    // return banged address
    return ds + (address)uc->uc_mcontext.regs->gpr[ra];
  } else if (is_stdux(instruction) && rs == 1) {
    int rb = inv_rb_field(instruction);
    address sp = (address)uc->uc_mcontext.regs->gpr[1];
    long rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
    return ra != 1 || rb_val >= 0 ? NULL         // not a stack bang
                                  : sp + rb_val; // banged address
  }
  return NULL; // not a stack bang
#else
  // workaround not needed on !LINUX :-)
  ShouldNotCallThis();
  return NULL;
#endif
}

// CmpxchgX sets condition register to cmpX(current, compare).
void MacroAssembler::cmpxchgw(ConditionRegister flag, Register dest_current_value,
                              Register compare_value, Register exchange_value,
                              Register addr_base, int semantics, bool cmpxchgx_hint,
                              Register int_flag_success, bool contention_hint) {
  Label retry;
  Label failed;
  Label done;

  // Save one branch if result is returned via register and
  // result register is different from the other ones.
  bool use_result_reg    = (int_flag_success != noreg);
  bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value &&
                            int_flag_success != exchange_value && int_flag_success != addr_base);

  if (use_result_reg && preset_result_reg) {
    li(int_flag_success, 0); // preset (assume cas failed)
  }

  // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
  if (contention_hint) { // Don't try to reserve if cmp fails.
    lwz(dest_current_value, 0, addr_base);
    cmpw(flag, dest_current_value, compare_value);
    bne(flag, failed);
  }

  // release/fence semantics
  if (semantics & MemBarRel) {
    release();
  }

  // atomic emulation loop
  bind(retry);

  lwarx(dest_current_value, addr_base, cmpxchgx_hint);
  cmpw(flag, dest_current_value, compare_value);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(flag, failed);
  } else {
    bne(                  flag, failed);
  }
  // branch to failed => (flag == ne), (dest_current_value != compare_value)
  // fall through     => (flag == eq), (dest_current_value == compare_value)

  stwcx_(exchange_value, addr_base);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
  } else {
    bne(                  CCR0, retry); // StXcx_ sets CCR0.
  }
  // fall through => (flag == eq), (dest_current_value == compare_value), (swapped)

  // Result in register (must do this at the end because int_flag_success can be the
  // same register as one above).
  if (use_result_reg) {
    li(int_flag_success, 1);
  }

  if (semantics & MemBarFenceAfter) {
    fence();
  } else if (semantics & MemBarAcq) {
    isync();
  }

  if (use_result_reg && !preset_result_reg) {
    b(done);
  }

  bind(failed);
  if (use_result_reg && !preset_result_reg) {
    li(int_flag_success, 0);
  }

  bind(done);
  // (flag == ne) => (dest_current_value != compare_value), (!swapped)
  // (flag == eq) => (dest_current_value == compare_value), ( swapped)
}
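// Usage sketch (added; register choices hypothetical):
//   cmpxchgw(CCR0, Rcur, Rcmp, Rnew, Raddr,
//            MacroAssembler::MemBarFenceAfter, /*cmpxchgx_hint=*/false, Rres);
// leaves Rres = 1 and CCR0 "eq" if the swap happened, Rres = 0 and "ne" otherwise.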
// Performs atomic compare exchange:
//   if (compare_value == *addr_base)
//     *addr_base = exchange_value
//     int_flag_success = 1;
//   else
//     int_flag_success = 0;
//
// ConditionRegister flag       = cmp(compare_value, *addr_base)
// Register dest_current_value  = *addr_base
// Register compare_value       Used to compare with value in memory
// Register exchange_value      Written to memory if compare_value == *addr_base
// Register addr_base           The memory location to compareXChange
// Register int_flag_success    Set to 1 if exchange_value was written to *addr_base
//
// To avoid the costly compare exchange, the value can be tested beforehand (contention_hint).
// Several special cases exist to avoid generating unnecessary code.
//
void MacroAssembler::cmpxchgd(ConditionRegister flag,
                              Register dest_current_value, RegisterOrConstant compare_value, Register exchange_value,
                              Register addr_base, int semantics, bool cmpxchgx_hint,
                              Register int_flag_success, Label* failed_ext, bool contention_hint) {
  Label retry;
  Label failed_int;
  Label& failed = (failed_ext != NULL) ? *failed_ext : failed_int;
  Label done;

  // Save one branch if result is returned via register and result register is different from the other ones.
  bool use_result_reg    = (int_flag_success != noreg);
  bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value.register_or_noreg() &&
                            int_flag_success != exchange_value && int_flag_success != addr_base);
  assert(int_flag_success == noreg || failed_ext == NULL, "cannot have both");

  if (use_result_reg && preset_result_reg) {
    li(int_flag_success, 0); // preset (assume cas failed)
  }

  // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
  if (contention_hint) { // Don't try to reserve if cmp fails.
    ld(dest_current_value, 0, addr_base);
    cmpd(flag, compare_value, dest_current_value);
    bne(flag, failed);
  }

  // release/fence semantics
  if (semantics & MemBarRel) {
    release();
  }

  // atomic emulation loop
  bind(retry);

  ldarx(dest_current_value, addr_base, cmpxchgx_hint);
  cmpd(flag, compare_value, dest_current_value);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(flag, failed);
  } else {
    bne(                  flag, failed);
  }

  stdcx_(exchange_value, addr_base);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
  } else {
    bne(                  CCR0, retry); // stXcx_ sets CCR0
  }

  // result in register (must do this at the end because int_flag_success can be the same register as one above)
  if (use_result_reg) {
    li(int_flag_success, 1);
  }

  if (semantics & MemBarFenceAfter) {
    fence();
  } else if (semantics & MemBarAcq) {
    isync();
  }

  if (use_result_reg && !preset_result_reg) {
    b(done);
  }

  bind(failed_int);
  if (use_result_reg && !preset_result_reg) {
    li(int_flag_success, 0);
  }

  bind(done);
  // (flag == ne) => (dest_current_value != compare_value), (!swapped)
  // (flag == eq) => (dest_current_value == compare_value), ( swapped)
}
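// Note (added): cmpxchgd writes its compare as cmpd(flag, compare_value,
// dest_current_value), the mirror image of cmpxchgw above; since only eq/ne
// are consumed here, the operand order does not change the outcome.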
1604 if (itable_index.is_register()) { 1605 Register itable_offset = itable_index.as_register(); 1606 sldi(itable_offset, itable_offset, logMEsize); 1607 if (itentry_off) addi(itable_offset, itable_offset, itentry_off); 1608 add(recv_klass, itable_offset, recv_klass); 1609 } else { 1610 long itable_offset = (long)itable_index.as_constant(); 1611 load_const_optimized(sethi_temp, (itable_offset<<logMEsize)+itentry_off); // static address, no relocation 1612 add(recv_klass, sethi_temp, recv_klass); 1613 } 1614 1615 // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) { 1616 // if (scan->interface() == intf) { 1617 // result = (klass + scan->offset() + itable_index); 1618 // } 1619 // } 1620 Label search, found_method; 1621 1622 for (int peel = 1; peel >= 0; peel--) { 1623 // %%%% Could load both offset and interface in one ldx, if they were 1624 // in the opposite order. This would save a load. 1625 ld(method_result, itableOffsetEntry::interface_offset_in_bytes(), scan_temp); 1626 1627 // Check that this entry is non-null. A null entry means that 1628 // the receiver class doesn't implement the interface, and wasn't the 1629 // same as when the caller was compiled. 1630 cmpd(CCR0, method_result, intf_klass); 1631 1632 if (peel) { 1633 beq(CCR0, found_method); 1634 } else { 1635 bne(CCR0, search); 1636 // (invert the test to fall through to found_method...) 1637 } 1638 1639 if (!peel) break; 1640 1641 bind(search); 1642 1643 cmpdi(CCR0, method_result, 0); 1644 beq(CCR0, L_no_such_interface); 1645 addi(scan_temp, scan_temp, scan_step); 1646 } 1647 1648 bind(found_method); 1649 1650 // Got a hit. 1651 int ito_offset = itableOffsetEntry::offset_offset_in_bytes(); 1652 lwz(scan_temp, ito_offset, scan_temp); 1653 ldx(method_result, scan_temp, recv_klass); 1654 } 1655 1656 // virtual method calling 1657 void MacroAssembler::lookup_virtual_method(Register recv_klass, 1658 RegisterOrConstant vtable_index, 1659 Register method_result) { 1660 1661 assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg()); 1662 1663 const int base = in_bytes(Klass::vtable_start_offset()); 1664 assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below"); 1665 1666 if (vtable_index.is_register()) { 1667 sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord); 1668 add(recv_klass, vtable_index.as_register(), recv_klass); 1669 } else { 1670 addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord); 1671 } 1672 ld(R19_method, base + vtableEntry::method_offset_in_bytes(), recv_klass); 1673 } 1674 1675 /////////////////////////////////////////// subtype checking //////////////////////////////////////////// 1676 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass, 1677 Register super_klass, 1678 Register temp1_reg, 1679 Register temp2_reg, 1680 Label* L_success, 1681 Label* L_failure, 1682 Label* L_slow_path, 1683 RegisterOrConstant super_check_offset) { 1684 1685 const Register check_cache_offset = temp1_reg; 1686 const Register cached_super = temp2_reg; 1687 1688 assert_different_registers(sub_klass, super_klass, check_cache_offset, cached_super); 1689 1690 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1691 int sc_offset = in_bytes(Klass::secondary_super_cache_offset()); 1692 1693 bool must_load_sco = (super_check_offset.constant_or_zero() == -1); 1694 bool need_slow_path = (must_load_sco || super_check_offset.constant_or_zero() == sco_offset); 1695 1696 Label 
L_fallthrough; 1697 int label_nulls = 0; 1698 if (L_success == NULL) { L_success = &L_fallthrough; label_nulls++; } 1699 if (L_failure == NULL) { L_failure = &L_fallthrough; label_nulls++; } 1700 if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; } 1701 assert(label_nulls <= 1 || 1702 (L_slow_path == &L_fallthrough && label_nulls <= 2 && !need_slow_path), 1703 "at most one NULL in the batch, usually"); 1704 1705 // If the pointers are equal, we are done (e.g., String[] elements). 1706 // This self-check enables sharing of secondary supertype arrays among 1707 // non-primary types such as array-of-interface. Otherwise, each such 1708 // type would need its own customized SSA. 1709 // We move this check to the front of the fast path because many 1710 // type checks are in fact trivially successful in this manner, 1711 // so we get a nicely predicted branch right at the start of the check. 1712 cmpd(CCR0, sub_klass, super_klass); 1713 beq(CCR0, *L_success); 1714 1715 // Check the supertype display: 1716 if (must_load_sco) { 1717 // The super check offset is always positive... 1718 lwz(check_cache_offset, sco_offset, super_klass); 1719 super_check_offset = RegisterOrConstant(check_cache_offset); 1720 // super_check_offset is register. 1721 assert_different_registers(sub_klass, super_klass, cached_super, super_check_offset.as_register()); 1722 } 1723 // The loaded value is the offset from KlassOopDesc. 1724 1725 ld(cached_super, super_check_offset, sub_klass); 1726 cmpd(CCR0, cached_super, super_klass); 1727 1728 // This check has worked decisively for primary supers. 1729 // Secondary supers are sought in the super_cache ('super_cache_addr'). 1730 // (Secondary supers are interfaces and very deeply nested subtypes.) 1731 // This works in the same check above because of a tricky aliasing 1732 // between the super_cache and the primary super display elements. 1733 // (The 'super_check_addr' can address either, as the case requires.) 1734 // Note that the cache is updated below if it does not help us find 1735 // what we need immediately. 1736 // So if it was a primary super, we can just fail immediately. 1737 // Otherwise, it's the slow path for us (no success at this point). 1738 1739 #define FINAL_JUMP(label) if (&(label) != &L_fallthrough) { b(label); } 1740 1741 if (super_check_offset.is_register()) { 1742 beq(CCR0, *L_success); 1743 cmpwi(CCR0, super_check_offset.as_register(), sc_offset); 1744 if (L_failure == &L_fallthrough) { 1745 beq(CCR0, *L_slow_path); 1746 } else { 1747 bne(CCR0, *L_failure); 1748 FINAL_JUMP(*L_slow_path); 1749 } 1750 } else { 1751 if (super_check_offset.as_constant() == sc_offset) { 1752 // Need a slow path; fast failure is impossible. 1753 if (L_slow_path == &L_fallthrough) { 1754 beq(CCR0, *L_success); 1755 } else { 1756 bne(CCR0, *L_slow_path); 1757 FINAL_JUMP(*L_success); 1758 } 1759 } else { 1760 // No slow path; it's a fast decision. 
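      // Only success and failure remain; the branch shape is chosen so that
      // a NULL (fall-through) label costs no extra unconditional jump.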
1761 if (L_failure == &L_fallthrough) { 1762 beq(CCR0, *L_success); 1763 } else { 1764 bne(CCR0, *L_failure); 1765 FINAL_JUMP(*L_success); 1766 } 1767 } 1768 } 1769 1770 bind(L_fallthrough); 1771 #undef FINAL_JUMP 1772 } 1773 1774 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass, 1775 Register super_klass, 1776 Register temp1_reg, 1777 Register temp2_reg, 1778 Label* L_success, 1779 Register result_reg) { 1780 const Register array_ptr = temp1_reg; // current value from cache array 1781 const Register temp = temp2_reg; 1782 1783 assert_different_registers(sub_klass, super_klass, array_ptr, temp); 1784 1785 int source_offset = in_bytes(Klass::secondary_supers_offset()); 1786 int target_offset = in_bytes(Klass::secondary_super_cache_offset()); 1787 1788 int length_offset = Array<Klass*>::length_offset_in_bytes(); 1789 int base_offset = Array<Klass*>::base_offset_in_bytes(); 1790 1791 Label hit, loop, failure, fallthru; 1792 1793 ld(array_ptr, source_offset, sub_klass); 1794 1795 // TODO: PPC port: assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated."); 1796 lwz(temp, length_offset, array_ptr); 1797 cmpwi(CCR0, temp, 0); 1798 beq(CCR0, result_reg!=noreg ? failure : fallthru); // length 0 1799 1800 mtctr(temp); // load ctr 1801 1802 bind(loop); 1803 // Oops in table are NO MORE compressed. 1804 ld(temp, base_offset, array_ptr); 1805 cmpd(CCR0, temp, super_klass); 1806 beq(CCR0, hit); 1807 addi(array_ptr, array_ptr, BytesPerWord); 1808 bdnz(loop); 1809 1810 bind(failure); 1811 if (result_reg!=noreg) li(result_reg, 1); // load non-zero result (indicates a miss) 1812 b(fallthru); 1813 1814 bind(hit); 1815 std(super_klass, target_offset, sub_klass); // save result to cache 1816 if (result_reg != noreg) { li(result_reg, 0); } // load zero result (indicates a hit) 1817 if (L_success != NULL) { b(*L_success); } 1818 else if (result_reg == noreg) { blr(); } // return with CR0.eq if neither label nor result reg provided 1819 1820 bind(fallthru); 1821 } 1822 1823 // Try fast path, then go to slow one if not successful 1824 void MacroAssembler::check_klass_subtype(Register sub_klass, 1825 Register super_klass, 1826 Register temp1_reg, 1827 Register temp2_reg, 1828 Label& L_success) { 1829 Label L_failure; 1830 check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success, &L_failure); 1831 check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success); 1832 bind(L_failure); // Fallthru if not successful. 1833 } 1834 1835 void MacroAssembler::check_method_handle_type(Register mtype_reg, Register mh_reg, 1836 Register temp_reg, 1837 Label& wrong_method_type) { 1838 assert_different_registers(mtype_reg, mh_reg, temp_reg); 1839 // Compare method type against that of the receiver. 1840 load_heap_oop_not_null(temp_reg, delayed_value(java_lang_invoke_MethodHandle::type_offset_in_bytes, temp_reg), mh_reg); 1841 cmpd(CCR0, temp_reg, mtype_reg); 1842 bne(CCR0, wrong_method_type); 1843 } 1844 1845 RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot, 1846 Register temp_reg, 1847 int extra_slot_offset) { 1848 // cf. TemplateTable::prepare_invoke(), if (load_receiver). 
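  // Result is (arg_slot + extra_slot_offset) * stackElementSize; when arg_slot
  // is a compile-time constant the whole expression folds into one immediate.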
1849 int stackElementSize = Interpreter::stackElementSize; 1850 int offset = extra_slot_offset * stackElementSize; 1851 if (arg_slot.is_constant()) { 1852 offset += arg_slot.as_constant() * stackElementSize; 1853 return offset; 1854 } else { 1855 assert(temp_reg != noreg, "must specify"); 1856 sldi(temp_reg, arg_slot.as_register(), exact_log2(stackElementSize)); 1857 if (offset != 0) 1858 addi(temp_reg, temp_reg, offset); 1859 return temp_reg; 1860 } 1861 } 1862 1863 // Supports temp2_reg = R0. 1864 void MacroAssembler::biased_locking_enter(ConditionRegister cr_reg, Register obj_reg, 1865 Register mark_reg, Register temp_reg, 1866 Register temp2_reg, Label& done, Label* slow_case) { 1867 assert(UseBiasedLocking, "why call this otherwise?"); 1868 1869 #ifdef ASSERT 1870 assert_different_registers(obj_reg, mark_reg, temp_reg, temp2_reg); 1871 #endif 1872 1873 Label cas_label; 1874 1875 // Branch to done if fast path fails and no slow_case provided. 1876 Label *slow_case_int = (slow_case != NULL) ? slow_case : &done; 1877 1878 // Biased locking 1879 // See whether the lock is currently biased toward our thread and 1880 // whether the epoch is still valid 1881 // Note that the runtime guarantees sufficient alignment of JavaThread 1882 // pointers to allow age to be placed into low bits 1883 assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, 1884 "biased locking makes assumptions about bit layout"); 1885 1886 if (PrintBiasedLockingStatistics) { 1887 load_const(temp2_reg, (address) BiasedLocking::total_entry_count_addr(), temp_reg); 1888 lwzx(temp_reg, temp2_reg); 1889 addi(temp_reg, temp_reg, 1); 1890 stwx(temp_reg, temp2_reg); 1891 } 1892 1893 andi(temp_reg, mark_reg, markOopDesc::biased_lock_mask_in_place); 1894 cmpwi(cr_reg, temp_reg, markOopDesc::biased_lock_pattern); 1895 bne(cr_reg, cas_label); 1896 1897 load_klass(temp_reg, obj_reg); 1898 1899 load_const_optimized(temp2_reg, ~((int) markOopDesc::age_mask_in_place)); 1900 ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg); 1901 orr(temp_reg, R16_thread, temp_reg); 1902 xorr(temp_reg, mark_reg, temp_reg); 1903 andr(temp_reg, temp_reg, temp2_reg); 1904 cmpdi(cr_reg, temp_reg, 0); 1905 if (PrintBiasedLockingStatistics) { 1906 Label l; 1907 bne(cr_reg, l); 1908 load_const(temp2_reg, (address) BiasedLocking::biased_lock_entry_count_addr()); 1909 lwzx(mark_reg, temp2_reg); 1910 addi(mark_reg, mark_reg, 1); 1911 stwx(mark_reg, temp2_reg); 1912 // restore mark_reg 1913 ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg); 1914 bind(l); 1915 } 1916 beq(cr_reg, done); 1917 1918 Label try_revoke_bias; 1919 Label try_rebias; 1920 1921 // At this point we know that the header has the bias pattern and 1922 // that we are not the bias owner in the current epoch. We need to 1923 // figure out more details about the state of the header in order to 1924 // know what operations can be legally performed on the object's 1925 // header. 1926 1927 // If the low three bits in the xor result aren't clear, that means 1928 // the prototype header is no longer biased and we have to revoke 1929 // the bias on this object. 1930 andi(temp2_reg, temp_reg, markOopDesc::biased_lock_mask_in_place); 1931 cmpwi(cr_reg, temp2_reg, 0); 1932 bne(cr_reg, try_revoke_bias); 1933 1934 // Biasing is still enabled for this data type. See whether the 1935 // epoch of the current bias is still valid, meaning that the epoch 1936 // bits of the mark word are equal to the epoch bits of the 1937 // prototype header. 
(Note that the prototype header's epoch bits 1938 // only change at a safepoint.) If not, attempt to rebias the object 1939 // toward the current thread. Note that we must be absolutely sure 1940 // that the current epoch is invalid in order to do this because 1941 // otherwise the manipulations it performs on the mark word are 1942 // illegal. 1943 1944 int shift_amount = 64 - markOopDesc::epoch_shift; 1945 // rotate epoch bits to right (little) end and set other bits to 0 1946 // [ big part | epoch | little part ] -> [ 0..0 | epoch ] 1947 rldicl_(temp2_reg, temp_reg, shift_amount, 64 - markOopDesc::epoch_bits); 1948 // branch if epoch bits are != 0, i.e. they differ, because the epoch has been incremented 1949 bne(CCR0, try_rebias); 1950 1951 // The epoch of the current bias is still valid but we know nothing 1952 // about the owner; it might be set or it might be clear. Try to 1953 // acquire the bias of the object using an atomic operation. If this 1954 // fails we will go into the runtime to revoke the object's bias. 1955 // Note that we first construct the presumed unbiased header so we 1956 // don't accidentally blow away another thread's valid bias. 1957 andi(mark_reg, mark_reg, (markOopDesc::biased_lock_mask_in_place | 1958 markOopDesc::age_mask_in_place | 1959 markOopDesc::epoch_mask_in_place)); 1960 orr(temp_reg, R16_thread, mark_reg); 1961 1962 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 1963 1964 // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg). 1965 cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg, 1966 /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg, 1967 /*where=*/obj_reg, 1968 MacroAssembler::MemBarAcq, 1969 MacroAssembler::cmpxchgx_hint_acquire_lock(), 1970 noreg, slow_case_int); // bail out if failed 1971 1972 // If the biasing toward our thread failed, this means that 1973 // another thread succeeded in biasing it toward itself and we 1974 // need to revoke that bias. The revocation will occur in the 1975 // interpreter runtime in the slow case. 1976 if (PrintBiasedLockingStatistics) { 1977 load_const(temp2_reg, (address) BiasedLocking::anonymously_biased_lock_entry_count_addr(), temp_reg); 1978 lwzx(temp_reg, temp2_reg); 1979 addi(temp_reg, temp_reg, 1); 1980 stwx(temp_reg, temp2_reg); 1981 } 1982 b(done); 1983 1984 bind(try_rebias); 1985 // At this point we know the epoch has expired, meaning that the 1986 // current "bias owner", if any, is actually invalid. Under these 1987 // circumstances _only_, we are allowed to use the current header's 1988 // value as the comparison value when doing the cas to acquire the 1989 // bias in the current epoch. In other words, we allow transfer of 1990 // the bias from one thread to another directly in this situation.
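  // Rebias sketch: new header = prototype_header(klass) | thread | age(mark),
  // installed by CAS against the complete current mark word.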
1991 load_klass(temp_reg, obj_reg); 1992 andi(temp2_reg, mark_reg, markOopDesc::age_mask_in_place); 1993 orr(temp2_reg, R16_thread, temp2_reg); 1994 ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg); 1995 orr(temp_reg, temp2_reg, temp_reg); 1996 1997 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 1998 1999 cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg, 2000 /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg, 2001 /*where=*/obj_reg, 2002 MacroAssembler::MemBarAcq, 2003 MacroAssembler::cmpxchgx_hint_acquire_lock(), 2004 noreg, slow_case_int); // bail out if failed 2005 2006 // If the biasing toward our thread failed, this means that 2007 // another thread succeeded in biasing it toward itself and we 2008 // need to revoke that bias. The revocation will occur in the 2009 // interpreter runtime in the slow case. 2010 if (PrintBiasedLockingStatistics) { 2011 load_const(temp2_reg, (address) BiasedLocking::rebiased_lock_entry_count_addr(), temp_reg); 2012 lwzx(temp_reg, temp2_reg); 2013 addi(temp_reg, temp_reg, 1); 2014 stwx(temp_reg, temp2_reg); 2015 } 2016 b(done); 2017 2018 bind(try_revoke_bias); 2019 // The prototype mark in the klass doesn't have the bias bit set any 2020 // more, indicating that objects of this data type are not supposed 2021 // to be biased any more. We are going to try to reset the mark of 2022 // this object to the prototype value and fall through to the 2023 // CAS-based locking scheme. Note that if our CAS fails, it means 2024 // that another thread raced us for the privilege of revoking the 2025 // bias of this particular object, so it's okay to continue in the 2026 // normal locking code. 2027 load_klass(temp_reg, obj_reg); 2028 ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg); 2029 andi(temp2_reg, mark_reg, markOopDesc::age_mask_in_place); 2030 orr(temp_reg, temp_reg, temp2_reg); 2031 2032 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 2033 2034 // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg). 2035 cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg, 2036 /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg, 2037 /*where=*/obj_reg, 2038 MacroAssembler::MemBarAcq, 2039 MacroAssembler::cmpxchgx_hint_acquire_lock()); 2040 2041 // reload markOop in mark_reg before continuing with lightweight locking 2042 ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg); 2043 2044 // Fall through to the normal CAS-based lock, because no matter what 2045 // the result of the above CAS, some thread must have succeeded in 2046 // removing the bias bit from the object's header. 2047 if (PrintBiasedLockingStatistics) { 2048 Label l; 2049 bne(cr_reg, l); 2050 load_const(temp2_reg, (address) BiasedLocking::revoked_lock_entry_count_addr(), temp_reg); 2051 lwzx(temp_reg, temp2_reg); 2052 addi(temp_reg, temp_reg, 1); 2053 stwx(temp_reg, temp2_reg); 2054 bind(l); 2055 } 2056 2057 bind(cas_label); 2058 } 2059 2060 void MacroAssembler::biased_locking_exit (ConditionRegister cr_reg, Register mark_addr, Register temp_reg, Label& done) { 2061 // Check for biased locking unlock case, which is a no-op 2062 // Note: we do not have to check the thread ID for two reasons. 2063 // First, the interpreter checks for IllegalMonitorStateException at 2064 // a higher level. Second, if the bias was revoked while we held the 2065 // lock, the object could not be rebiased toward another thread, so 2066 // the bias bit would be clear. 
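  // In effect: done if (*mark_addr & biased_lock_mask_in_place) == biased_lock_pattern.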
2067 2068 ld(temp_reg, 0, mark_addr); 2069 andi(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place); 2070 2071 cmpwi(cr_reg, temp_reg, markOopDesc::biased_lock_pattern); 2072 beq(cr_reg, done); 2073 } 2074 2075 // allocation (for C1) 2076 void MacroAssembler::eden_allocate( 2077 Register obj, // result: pointer to object after successful allocation 2078 Register var_size_in_bytes, // object size in bytes if unknown at compile time; invalid otherwise 2079 int con_size_in_bytes, // object size in bytes if known at compile time 2080 Register t1, // temp register 2081 Register t2, // temp register 2082 Label& slow_case // continuation point if fast allocation fails 2083 ) { 2084 b(slow_case); 2085 } 2086 2087 void MacroAssembler::tlab_allocate( 2088 Register obj, // result: pointer to object after successful allocation 2089 Register var_size_in_bytes, // object size in bytes if unknown at compile time; invalid otherwise 2090 int con_size_in_bytes, // object size in bytes if known at compile time 2091 Register t1, // temp register 2092 Label& slow_case // continuation point if fast allocation fails 2093 ) { 2094 // make sure arguments make sense 2095 assert_different_registers(obj, var_size_in_bytes, t1); 2096 assert(0 <= con_size_in_bytes && is_simm13(con_size_in_bytes), "illegal object size"); 2097 assert((con_size_in_bytes & MinObjAlignmentInBytesMask) == 0, "object size is not multiple of alignment"); 2098 2099 const Register new_top = t1; 2100 //verify_tlab(); not implemented 2101 2102 ld(obj, in_bytes(JavaThread::tlab_top_offset()), R16_thread); 2103 ld(R0, in_bytes(JavaThread::tlab_end_offset()), R16_thread); 2104 if (var_size_in_bytes == noreg) { 2105 addi(new_top, obj, con_size_in_bytes); 2106 } else { 2107 add(new_top, obj, var_size_in_bytes); 2108 } 2109 cmpld(CCR0, new_top, R0); 2110 bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_case); 2111 2112 #ifdef ASSERT 2113 // make sure new free pointer is properly aligned 2114 { 2115 Label L; 2116 andi_(R0, new_top, MinObjAlignmentInBytesMask); 2117 beq(CCR0, L); 2118 stop("updated TLAB free is not properly aligned", 0x934); 2119 bind(L); 2120 } 2121 #endif // ASSERT 2122 2123 // update the tlab top pointer 2124 std(new_top, in_bytes(JavaThread::tlab_top_offset()), R16_thread); 2125 //verify_tlab(); not implemented 2126 } 2127 void MacroAssembler::tlab_refill(Label& retry_tlab, Label& try_eden, Label& slow_case) { 2128 unimplemented("tlab_refill"); 2129 } 2130 void MacroAssembler::incr_allocated_bytes(RegisterOrConstant size_in_bytes, Register t1, Register t2) { 2131 unimplemented("incr_allocated_bytes"); 2132 } 2133 2134 address MacroAssembler::emit_trampoline_stub(int destination_toc_offset, 2135 int insts_call_instruction_offset, Register Rtoc) { 2136 // Start the stub. 2137 address stub = start_a_stub(64); 2138 if (stub == NULL) { return NULL; } // CodeCache full: bail out 2139 2140 // Create a trampoline stub relocation which relates this trampoline stub 2141 // with the call instruction at insts_call_instruction_offset in the 2142 // instructions code-section. 2143 relocate(trampoline_stub_Relocation::spec(code()->insts()->start() + insts_call_instruction_offset)); 2144 const int stub_start_offset = offset(); 2145 2146 // For java_to_interp stubs we use R11_scratch1 as scratch register 2147 // and in call trampoline stubs we use R12_scratch2. This way we 2148 // can distinguish them (see is_NativeCallTrampolineStub_at()). 
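  // Shape of the emitted stub (sketch; the TOC calculation below is only
  // emitted when Rtoc is unknown):
  //   ld    R12, <destination_toc_offset>(Rtoc)   // load call target
  //   mtctr R12
  //   bctr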
2149 Register reg_scratch = R12_scratch2; 2150 2151 // Now, create the trampoline stub's code: 2152 // - load the TOC 2153 // - load the call target from the constant pool 2154 // - call 2155 if (Rtoc == noreg) { 2156 calculate_address_from_global_toc(reg_scratch, method_toc()); 2157 Rtoc = reg_scratch; 2158 } 2159 2160 ld_largeoffset_unchecked(reg_scratch, destination_toc_offset, Rtoc, false); 2161 mtctr(reg_scratch); 2162 bctr(); 2163 2164 const address stub_start_addr = addr_at(stub_start_offset); 2165 2166 // Assert that the encoded destination_toc_offset can be identified and that it is correct. 2167 assert(destination_toc_offset == NativeCallTrampolineStub_at(stub_start_addr)->destination_toc_offset(), 2168 "encoded offset into the constant pool must match"); 2169 // Trampoline_stub_size should be good. 2170 assert((uint)(offset() - stub_start_offset) <= trampoline_stub_size, "should be good size"); 2171 assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline"); 2172 2173 // End the stub. 2174 end_a_stub(); 2175 return stub; 2176 } 2177 2178 // TM on PPC64. 2179 void MacroAssembler::atomic_inc_ptr(Register addr, Register result, int simm16) { 2180 Label retry; 2181 bind(retry); 2182 ldarx(result, addr, /*hint*/ false); 2183 addi(result, result, simm16); 2184 stdcx_(result, addr); 2185 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 2186 bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0 2187 } else { 2188 bne( CCR0, retry); // stXcx_ sets CCR0 2189 } 2190 } 2191 2192 void MacroAssembler::atomic_ori_int(Register addr, Register result, int uimm16) { 2193 Label retry; 2194 bind(retry); 2195 lwarx(result, addr, /*hint*/ false); 2196 ori(result, result, uimm16); 2197 stwcx_(result, addr); 2198 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 2199 bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0 2200 } else { 2201 bne( CCR0, retry); // stXcx_ sets CCR0 2202 } 2203 } 2204 2205 #if INCLUDE_RTM_OPT 2206 2207 // Update rtm_counters based on abort status 2208 // input: abort_status 2209 // rtm_counters (RTMLockingCounters*) 2210 void MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters_Reg) { 2211 // Mapping to keep PreciseRTMLockingStatistics similar to x86. 2212 // x86 ppc (! means inverted, ? means not the same) 2213 // 0 31 Set if abort caused by XABORT instruction. 2214 // 1 ! 7 If set, the transaction may succeed on a retry. This bit is always clear if bit 0 is set. 2215 // 2 13 Set if another logical processor conflicted with a memory address that was part of the transaction that aborted. 2216 // 3 10 Set if an internal buffer overflowed. 2217 // 4 ?12 Set if a debug breakpoint was hit. 2218 // 5 ?32 Set if an abort occurred during execution of a nested transaction. 2219 const int tm_failure_bit[] = {Assembler::tm_tabort, // Note: Seems like signal handler sets this, too. 2220 Assembler::tm_failure_persistent, // inverted: transient 2221 Assembler::tm_trans_cf, 2222 Assembler::tm_footprint_of, 2223 Assembler::tm_non_trans_cf, 2224 Assembler::tm_suspended}; 2225 const bool tm_failure_inv[] = {false, true, false, false, false, false}; 2226 assert(sizeof(tm_failure_bit)/sizeof(int) == RTMLockingCounters::ABORT_STATUS_LIMIT, "adapt mapping!"); 2227 2228 const Register addr_Reg = R0; 2229 // Keep track of offset to where rtm_counters_Reg had pointed to. 
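  // The counter bumps below are deliberately plain load/add/store sequences:
  // exact counts are not required, so no larx/stcx. loop is paid for.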
2230 int counters_offs = RTMLockingCounters::abort_count_offset(); 2231 addi(addr_Reg, rtm_counters_Reg, counters_offs); 2232 const Register temp_Reg = rtm_counters_Reg; 2233 2234 //atomic_inc_ptr(addr_Reg, temp_Reg); We don't increment atomically 2235 ldx(temp_Reg, addr_Reg); 2236 addi(temp_Reg, temp_Reg, 1); 2237 stdx(temp_Reg, addr_Reg); 2238 2239 if (PrintPreciseRTMLockingStatistics) { 2240 int counters_offs_delta = RTMLockingCounters::abortX_count_offset() - counters_offs; 2241 2242 //mftexasr(abort_status); done by caller 2243 for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) { 2244 counters_offs += counters_offs_delta; 2245 li(temp_Reg, counters_offs_delta); // can't use addi with R0 2246 add(addr_Reg, addr_Reg, temp_Reg); // point to next counter 2247 counters_offs_delta = sizeof(uintx); 2248 2249 Label check_abort; 2250 rldicr_(temp_Reg, abort_status, tm_failure_bit[i], 0); 2251 if (tm_failure_inv[i]) { 2252 bne(CCR0, check_abort); 2253 } else { 2254 beq(CCR0, check_abort); 2255 } 2256 //atomic_inc_ptr(addr_Reg, temp_Reg); We don't increment atomically 2257 ldx(temp_Reg, addr_Reg); 2258 addi(temp_Reg, temp_Reg, 1); 2259 stdx(temp_Reg, addr_Reg); 2260 bind(check_abort); 2261 } 2262 } 2263 li(temp_Reg, -counters_offs); // can't use addi with R0 2264 add(rtm_counters_Reg, addr_Reg, temp_Reg); // restore 2265 } 2266 2267 // Branch if (random & (count-1) != 0), count is 2^n 2268 // tmp and CR0 are killed 2269 void MacroAssembler::branch_on_random_using_tb(Register tmp, int count, Label& brLabel) { 2270 mftb(tmp); 2271 andi_(tmp, tmp, count-1); 2272 bne(CCR0, brLabel); 2273 } 2274 2275 // Perform abort ratio calculation, set no_rtm bit if high ratio. 2276 // input: rtm_counters_Reg (RTMLockingCounters* address) - KILLED 2277 void MacroAssembler::rtm_abort_ratio_calculation(Register rtm_counters_Reg, 2278 RTMLockingCounters* rtm_counters, 2279 Metadata* method_data) { 2280 Label L_done, L_check_always_rtm1, L_check_always_rtm2; 2281 2282 if (RTMLockingCalculationDelay > 0) { 2283 // Delay calculation. 2284 ld(rtm_counters_Reg, (RegisterOrConstant)(intptr_t)RTMLockingCounters::rtm_calculation_flag_addr()); 2285 cmpdi(CCR0, rtm_counters_Reg, 0); 2286 beq(CCR0, L_done); 2287 load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload 2288 } 2289 // Abort ratio calculation only if abort_count > RTMAbortThreshold. 2290 // Aborted transactions = abort_count * 100 2291 // All transactions = total_count * RTMTotalCountIncrRate 2292 // Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio) 2293 ld(R0, RTMLockingCounters::abort_count_offset(), rtm_counters_Reg); 2294 cmpdi(CCR0, R0, RTMAbortThreshold); 2295 blt(CCR0, L_check_always_rtm2); 2296 mulli(R0, R0, 100); 2297 2298 const Register tmpReg = rtm_counters_Reg; 2299 ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg); 2300 mulli(tmpReg, tmpReg, RTMTotalCountIncrRate); 2301 mulli(tmpReg, tmpReg, RTMAbortRatio); 2302 cmpd(CCR0, R0, tmpReg); 2303 blt(CCR0, L_check_always_rtm1); // jump to reload 2304 if (method_data != NULL) { 2305 // Set rtm_state to "no rtm" in MDO. 2306 // Not using a metadata relocation. Method and Class Loader are kept alive anyway. 2307 // (See nmethod::metadata_do and CodeBuffer::finalize_oop_references.) 
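    // In effect: method_data->rtm_state |= NoRTM, done with an atomic OR so
    // a concurrent update of the state word is not lost.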
2308 load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg); 2309 atomic_ori_int(R0, tmpReg, NoRTM); 2310 } 2311 b(L_done); 2312 2313 bind(L_check_always_rtm1); 2314 load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload 2315 bind(L_check_always_rtm2); 2316 ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg); 2317 cmpdi(CCR0, tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate); 2318 blt(CCR0, L_done); 2319 if (method_data != NULL) { 2320 // Set rtm_state to "always rtm" in MDO. 2321 // Not using a metadata relocation. See above. 2322 load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg); 2323 atomic_ori_int(R0, tmpReg, UseRTM); 2324 } 2325 bind(L_done); 2326 } 2327 2328 // Update counters and perform abort ratio calculation. 2329 // input: abort_status_Reg 2330 void MacroAssembler::rtm_profiling(Register abort_status_Reg, Register temp_Reg, 2331 RTMLockingCounters* rtm_counters, 2332 Metadata* method_data, 2333 bool profile_rtm) { 2334 2335 assert(rtm_counters != NULL, "should not be NULL when profiling RTM"); 2336 // Update rtm counters based on state at abort. 2337 // Reads abort_status_Reg, updates flags. 2338 assert_different_registers(abort_status_Reg, temp_Reg); 2339 load_const_optimized(temp_Reg, (address)rtm_counters, R0); 2340 rtm_counters_update(abort_status_Reg, temp_Reg); 2341 if (profile_rtm) { 2342 assert(rtm_counters != NULL, "should not be NULL when profiling RTM"); 2343 rtm_abort_ratio_calculation(temp_Reg, rtm_counters, method_data); 2344 } 2345 } 2346 2347 // Retry on abort if abort's status indicates non-persistent failure. 2348 // inputs: retry_count_Reg 2349 // : abort_status_Reg 2350 // output: retry_count_Reg decremented by 1 2351 void MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, 2352 Label& retryLabel, Label* checkRetry) { 2353 Label doneRetry; 2354 rldicr_(R0, abort_status_Reg, tm_failure_persistent, 0); 2355 bne(CCR0, doneRetry); 2356 if (checkRetry) { bind(*checkRetry); } 2357 addic_(retry_count_Reg, retry_count_Reg, -1); 2358 blt(CCR0, doneRetry); 2359 smt_yield(); // Can't use wait(). No permission (SIGILL). 2360 b(retryLabel); 2361 bind(doneRetry); 2362 } 2363 2364 // Spin and retry if lock is busy. 2365 // inputs: box_Reg (monitor address) 2366 // : retry_count_Reg 2367 // output: retry_count_Reg decremented by 1 2368 // CTR is killed 2369 void MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register owner_addr_Reg, Label& retryLabel) { 2370 Label SpinLoop, doneRetry; 2371 addic_(retry_count_Reg, retry_count_Reg, -1); 2372 blt(CCR0, doneRetry); 2373 li(R0, RTMSpinLoopCount); 2374 mtctr(R0); 2375 2376 bind(SpinLoop); 2377 smt_yield(); // Can't use waitrsv(). No permission (SIGILL). 2378 bdz(retryLabel); 2379 ld(R0, 0, owner_addr_Reg); 2380 cmpdi(CCR0, R0, 0); 2381 bne(CCR0, SpinLoop); 2382 b(retryLabel); 2383 2384 bind(doneRetry); 2385 } 2386 2387 // Use RTM for normal stack locks. 
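// On the success path the transaction is left open: the mark word is re-read
// inside the transaction and must show the unlocked bit pattern before
// control reaches DONE_LABEL. Inflated locks branch to IsInflated before tbegin.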
2388 // Input: objReg (object to lock) 2389 void MacroAssembler::rtm_stack_locking(ConditionRegister flag, 2390 Register obj, Register mark_word, Register tmp, 2391 Register retry_on_abort_count_Reg, 2392 RTMLockingCounters* stack_rtm_counters, 2393 Metadata* method_data, bool profile_rtm, 2394 Label& DONE_LABEL, Label& IsInflated) { 2395 assert(UseRTMForStackLocks, "why call this otherwise?"); 2396 assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking"); 2397 Label L_rtm_retry, L_decrement_retry, L_on_abort; 2398 2399 if (RTMRetryCount > 0) { 2400 load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort 2401 bind(L_rtm_retry); 2402 } 2403 andi_(R0, mark_word, markOopDesc::monitor_value); // inflated vs stack-locked|neutral|biased 2404 bne(CCR0, IsInflated); 2405 2406 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 2407 Label L_noincrement; 2408 if (RTMTotalCountIncrRate > 1) { 2409 branch_on_random_using_tb(tmp, (int)RTMTotalCountIncrRate, L_noincrement); 2410 } 2411 assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM"); 2412 load_const_optimized(tmp, (address)stack_rtm_counters->total_count_addr(), R0); 2413 //atomic_inc_ptr(tmp, /*temp, will be reloaded*/mark_word); We don't increment atomically 2414 ldx(mark_word, tmp); 2415 addi(mark_word, mark_word, 1); 2416 stdx(mark_word, tmp); 2417 bind(L_noincrement); 2418 } 2419 tbegin_(); 2420 beq(CCR0, L_on_abort); 2421 ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // Reload in transaction, conflicts need to be tracked. 2422 andi(R0, mark_word, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits 2423 cmpwi(flag, R0, markOopDesc::unlocked_value); // bits = 001 unlocked 2424 beq(flag, DONE_LABEL); // all done if unlocked 2425 2426 if (UseRTMXendForLockBusy) { 2427 tend_(); 2428 b(L_decrement_retry); 2429 } else { 2430 tabort_(); 2431 } 2432 bind(L_on_abort); 2433 const Register abort_status_Reg = tmp; 2434 mftexasr(abort_status_Reg); 2435 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 2436 rtm_profiling(abort_status_Reg, /*temp*/mark_word, stack_rtm_counters, method_data, profile_rtm); 2437 } 2438 ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // reload 2439 if (RTMRetryCount > 0) { 2440 // Retry on lock abort if abort status is not permanent. 2441 rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry, &L_decrement_retry); 2442 } else { 2443 bind(L_decrement_retry); 2444 } 2445 } 2446 2447 // Use RTM for inflating locks 2448 // inputs: obj (object to lock) 2449 // mark_word (current header - KILLED) 2450 // boxReg (on-stack box address (displaced header location) - KILLED) 2451 void MacroAssembler::rtm_inflated_locking(ConditionRegister flag, 2452 Register obj, Register mark_word, Register boxReg, 2453 Register retry_on_busy_count_Reg, Register retry_on_abort_count_Reg, 2454 RTMLockingCounters* rtm_counters, 2455 Metadata* method_data, bool profile_rtm, 2456 Label& DONE_LABEL) { 2457 assert(UseRTMLocking, "why call this otherwise?"); 2458 Label L_rtm_retry, L_decrement_retry, L_on_abort; 2459 // Clean monitor_value bit to get valid pointer. 2460 int owner_offset = ObjectMonitor::owner_offset_in_bytes() - markOopDesc::monitor_value; 2461 2462 // Store non-null, using boxReg instead of (intptr_t)markOopDesc::unused_mark(). 
2463 std(boxReg, BasicLock::displaced_header_offset_in_bytes(), boxReg); 2464 const Register tmpReg = boxReg; 2465 const Register owner_addr_Reg = mark_word; 2466 addi(owner_addr_Reg, mark_word, owner_offset); 2467 2468 if (RTMRetryCount > 0) { 2469 load_const_optimized(retry_on_busy_count_Reg, RTMRetryCount); // Retry on lock busy. 2470 load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort. 2471 bind(L_rtm_retry); 2472 } 2473 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 2474 Label L_noincrement; 2475 if (RTMTotalCountIncrRate > 1) { 2476 branch_on_random_using_tb(R0, (int)RTMTotalCountIncrRate, L_noincrement); 2477 } 2478 assert(rtm_counters != NULL, "should not be NULL when profiling RTM"); 2479 load_const(R0, (address)rtm_counters->total_count_addr(), tmpReg); 2480 //atomic_inc_ptr(R0, tmpReg); We don't increment atomically 2481 ldx(tmpReg, R0); 2482 addi(tmpReg, tmpReg, 1); 2483 stdx(tmpReg, R0); 2484 bind(L_noincrement); 2485 } 2486 tbegin_(); 2487 beq(CCR0, L_on_abort); 2488 // We don't reload mark word. Will only be reset at safepoint. 2489 ld(R0, 0, owner_addr_Reg); // Load in transaction, conflicts need to be tracked. 2490 cmpdi(flag, R0, 0); 2491 beq(flag, DONE_LABEL); 2492 2493 if (UseRTMXendForLockBusy) { 2494 tend_(); 2495 b(L_decrement_retry); 2496 } else { 2497 tabort_(); 2498 } 2499 bind(L_on_abort); 2500 const Register abort_status_Reg = tmpReg; 2501 mftexasr(abort_status_Reg); 2502 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 2503 rtm_profiling(abort_status_Reg, /*temp*/ owner_addr_Reg, rtm_counters, method_data, profile_rtm); 2504 // Restore owner_addr_Reg 2505 ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); 2506 #ifdef ASSERT 2507 andi_(R0, mark_word, markOopDesc::monitor_value); 2508 asm_assert_ne("must be inflated", 0xa754); // Deflating only allowed at safepoint. 2509 #endif 2510 addi(owner_addr_Reg, mark_word, owner_offset); 2511 } 2512 if (RTMRetryCount > 0) { 2513 // Retry on lock abort if abort status is not permanent. 2514 rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry); 2515 } 2516 2517 // Appears unlocked - try to swing _owner from null to non-null. 2518 cmpxchgd(flag, /*current val*/ R0, (intptr_t)0, /*new val*/ R16_thread, owner_addr_Reg, 2519 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq, 2520 MacroAssembler::cmpxchgx_hint_acquire_lock(), noreg, &L_decrement_retry, true); 2521 2522 if (RTMRetryCount > 0) { 2523 // success done else retry 2524 b(DONE_LABEL); 2525 bind(L_decrement_retry); 2526 // Spin and retry if lock is busy. 2527 rtm_retry_lock_on_busy(retry_on_busy_count_Reg, owner_addr_Reg, L_rtm_retry); 2528 } else { 2529 bind(L_decrement_retry); 2530 } 2531 } 2532 2533 #endif // INCLUDE_RTM_OPT 2534 2535 // "The box" is the space on the stack where we copy the object mark. 2536 void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register oop, Register box, 2537 Register temp, Register displaced_header, Register current_header, 2538 bool try_bias, 2539 RTMLockingCounters* rtm_counters, 2540 RTMLockingCounters* stack_rtm_counters, 2541 Metadata* method_data, 2542 bool use_rtm, bool profile_rtm) { 2543 assert_different_registers(oop, box, temp, displaced_header, current_header); 2544 assert(flag != CCR0, "bad condition register"); 2545 Label cont; 2546 Label object_has_monitor; 2547 Label cas_failed; 2548 2549 // Load markOop from object into displaced_header. 
2550 ld(displaced_header, oopDesc::mark_offset_in_bytes(), oop); 2551 2552 2553 // Always do locking in runtime. 2554 if (EmitSync & 0x01) { 2555 cmpdi(flag, oop, 0); // Oop can't be 0 here => always false. 2556 return; 2557 } 2558 2559 if (try_bias) { 2560 biased_locking_enter(flag, oop, displaced_header, temp, current_header, cont); 2561 } 2562 2563 #if INCLUDE_RTM_OPT 2564 if (UseRTMForStackLocks && use_rtm) { 2565 rtm_stack_locking(flag, oop, displaced_header, temp, /*temp*/ current_header, 2566 stack_rtm_counters, method_data, profile_rtm, 2567 cont, object_has_monitor); 2568 } 2569 #endif // INCLUDE_RTM_OPT 2570 2571 // Handle existing monitor. 2572 if ((EmitSync & 0x02) == 0) { 2573 // The object has an existing monitor iff (mark & monitor_value) != 0. 2574 andi_(temp, displaced_header, markOopDesc::monitor_value); 2575 bne(CCR0, object_has_monitor); 2576 } 2577 2578 // Set displaced_header to be (markOop of object | UNLOCK_VALUE). 2579 ori(displaced_header, displaced_header, markOopDesc::unlocked_value); 2580 2581 // Load Compare Value application register. 2582 2583 // Initialize the box. (Must happen before we update the object mark!) 2584 std(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box); 2585 2586 // Must fence, otherwise, preceding store(s) may float below cmpxchg. 2587 // Compare object markOop with mark and if equal exchange scratch1 with object markOop. 2588 cmpxchgd(/*flag=*/flag, 2589 /*current_value=*/current_header, 2590 /*compare_value=*/displaced_header, 2591 /*exchange_value=*/box, 2592 /*where=*/oop, 2593 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq, 2594 MacroAssembler::cmpxchgx_hint_acquire_lock(), 2595 noreg, 2596 &cas_failed, 2597 /*check without membar and ldarx first*/true); 2598 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 2599 2600 // If the compare-and-exchange succeeded, then we found an unlocked 2601 // object and we have now locked it. 2602 b(cont); 2603 2604 bind(cas_failed); 2605 // We did not see an unlocked object so try the fast recursive case. 2606 2607 // Check if the owner is self by comparing the value in the markOop of object 2608 // (current_header) with the stack pointer. 2609 sub(current_header, current_header, R1_SP); 2610 load_const_optimized(temp, ~(os::vm_page_size()-1) | markOopDesc::lock_mask_in_place); 2611 2612 and_(R0/*==0?*/, current_header, temp); 2613 // If condition is true we are cont and hence we can store 0 as the 2614 // displaced header in the box, which indicates that it is a recursive lock. 2615 mcrf(flag,CCR0); 2616 std(R0/*==0, perhaps*/, BasicLock::displaced_header_offset_in_bytes(), box); 2617 2618 // Handle existing monitor. 2619 if ((EmitSync & 0x02) == 0) { 2620 b(cont); 2621 2622 bind(object_has_monitor); 2623 // The object's monitor m is unlocked iff m->owner == NULL, 2624 // otherwise m->owner may contain a thread or a stack address. 2625 2626 #if INCLUDE_RTM_OPT 2627 // Use the same RTM locking code in 32- and 64-bit VM. 2628 if (use_rtm) { 2629 rtm_inflated_locking(flag, oop, displaced_header, box, temp, /*temp*/ current_header, 2630 rtm_counters, method_data, profile_rtm, cont); 2631 } else { 2632 #endif // INCLUDE_RTM_OPT 2633 2634 // Try to CAS m->owner from NULL to current thread. 
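  // (displaced_header holds the tagged monitor pointer here; adding
  // owner_offset_in_bytes() - monitor_value makes temp point at m->owner.)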
2635 addi(temp, displaced_header, ObjectMonitor::owner_offset_in_bytes()-markOopDesc::monitor_value); 2636 cmpxchgd(/*flag=*/flag, 2637 /*current_value=*/current_header, 2638 /*compare_value=*/(intptr_t)0, 2639 /*exchange_value=*/R16_thread, 2640 /*where=*/temp, 2641 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq, 2642 MacroAssembler::cmpxchgx_hint_acquire_lock()); 2643 2644 // Store a non-null value into the box. 2645 std(box, BasicLock::displaced_header_offset_in_bytes(), box); 2646 2647 # ifdef ASSERT 2648 bne(flag, cont); 2649 // We have acquired the monitor, check some invariants. 2650 addi(/*monitor=*/temp, temp, -ObjectMonitor::owner_offset_in_bytes()); 2651 // Invariant 1: _recursions should be 0. 2652 //assert(ObjectMonitor::recursions_size_in_bytes() == 8, "unexpected size"); 2653 asm_assert_mem8_is_zero(ObjectMonitor::recursions_offset_in_bytes(), temp, 2654 "monitor->_recursions should be 0", -1); 2655 // Invariant 2: OwnerIsThread shouldn't be 0. 2656 //assert(ObjectMonitor::OwnerIsThread_size_in_bytes() == 4, "unexpected size"); 2657 //asm_assert_mem4_isnot_zero(ObjectMonitor::OwnerIsThread_offset_in_bytes(), temp, 2658 // "monitor->OwnerIsThread shouldn't be 0", -1); 2659 # endif 2660 2661 #if INCLUDE_RTM_OPT 2662 } // use_rtm() 2663 #endif 2664 } 2665 2666 bind(cont); 2667 // flag == EQ indicates success 2668 // flag == NE indicates failure 2669 } 2670 2671 void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register oop, Register box, 2672 Register temp, Register displaced_header, Register current_header, 2673 bool try_bias, bool use_rtm) { 2674 assert_different_registers(oop, box, temp, displaced_header, current_header); 2675 assert(flag != CCR0, "bad condition register"); 2676 Label cont; 2677 Label object_has_monitor; 2678 2679 // Always do locking in runtime. 2680 if (EmitSync & 0x01) { 2681 cmpdi(flag, oop, 0); // Oop can't be 0 here => always false. 2682 return; 2683 } 2684 2685 if (try_bias) { 2686 biased_locking_exit(flag, oop, current_header, cont); 2687 } 2688 2689 #if INCLUDE_RTM_OPT 2690 if (UseRTMForStackLocks && use_rtm) { 2691 assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking"); 2692 Label L_regular_unlock; 2693 ld(current_header, oopDesc::mark_offset_in_bytes(), oop); // fetch markword 2694 andi(R0, current_header, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits 2695 cmpwi(flag, R0, markOopDesc::unlocked_value); // bits = 001 unlocked 2696 bne(flag, L_regular_unlock); // else RegularLock 2697 tend_(); // otherwise end... 2698 b(cont); // ... and we're done 2699 bind(L_regular_unlock); 2700 } 2701 #endif 2702 2703 // Find the lock address and load the displaced header from the stack. 2704 ld(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box); 2705 2706 // If the displaced header is 0, we have a recursive unlock. 2707 cmpdi(flag, displaced_header, 0); 2708 beq(flag, cont); 2709 2710 // Handle existing monitor. 2711 if ((EmitSync & 0x02) == 0) { 2712 // The object has an existing monitor iff (mark & monitor_value) != 0. 2713 RTM_OPT_ONLY( if (!(UseRTMForStackLocks && use_rtm)) ) // skip load if already done 2714 ld(current_header, oopDesc::mark_offset_in_bytes(), oop); 2715 andi_(R0, current_header, markOopDesc::monitor_value); 2716 bne(CCR0, object_has_monitor); 2717 } 2718 2719 // Check if it is still a lightweight lock; this is true if we see 2720 // the stack address of the basicLock in the markOop of the object.
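// I.e. the unlock succeeds only if the mark word still points at our box; the
// displaced header is then written back with release semantics.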
2721 // Cmpxchg sets flag to cmpd(current_header, box). 2722 cmpxchgd(/*flag=*/flag, 2723 /*current_value=*/current_header, 2724 /*compare_value=*/box, 2725 /*exchange_value=*/displaced_header, 2726 /*where=*/oop, 2727 MacroAssembler::MemBarRel, 2728 MacroAssembler::cmpxchgx_hint_release_lock(), 2729 noreg, 2730 &cont); 2731 2732 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 2733 2734 // Handle existing monitor. 2735 if ((EmitSync & 0x02) == 0) { 2736 b(cont); 2737 2738 bind(object_has_monitor); 2739 addi(current_header, current_header, -markOopDesc::monitor_value); // monitor 2740 ld(temp, ObjectMonitor::owner_offset_in_bytes(), current_header); 2741 2742 // It's inflated. 2743 #if INCLUDE_RTM_OPT 2744 if (use_rtm) { 2745 Label L_regular_inflated_unlock; 2746 // Clean monitor_value bit to get valid pointer 2747 cmpdi(flag, temp, 0); 2748 bne(flag, L_regular_inflated_unlock); 2749 tend_(); 2750 b(cont); 2751 bind(L_regular_inflated_unlock); 2752 } 2753 #endif 2754 2755 ld(displaced_header, ObjectMonitor::recursions_offset_in_bytes(), current_header); 2756 xorr(temp, R16_thread, temp); // Will be 0 if we are the owner. 2757 orr(temp, temp, displaced_header); // Will be 0 if there are 0 recursions. 2758 cmpdi(flag, temp, 0); 2759 bne(flag, cont); 2760 2761 ld(temp, ObjectMonitor::EntryList_offset_in_bytes(), current_header); 2762 ld(displaced_header, ObjectMonitor::cxq_offset_in_bytes(), current_header); 2763 orr(temp, temp, displaced_header); // Will be 0 if both are 0. 2764 cmpdi(flag, temp, 0); 2765 bne(flag, cont); 2766 release(); 2767 std(temp, ObjectMonitor::owner_offset_in_bytes(), current_header); 2768 } 2769 2770 bind(cont); 2771 // flag == EQ indicates success 2772 // flag == NE indicates failure 2773 } 2774 2775 // Write serialization page so VM thread can do a pseudo remote membar. 2776 // We use the current thread pointer to calculate a thread specific 2777 // offset to write to within the page. This minimizes bus traffic 2778 // due to cache line collision. 2779 void MacroAssembler::serialize_memory(Register thread, Register tmp1, Register tmp2) { 2780 srdi(tmp2, thread, os::get_serialize_page_shift_count()); 2781 2782 int mask = os::vm_page_size() - sizeof(int); 2783 if (Assembler::is_simm(mask, 16)) { 2784 andi(tmp2, tmp2, mask); 2785 } else { 2786 lis(tmp1, (int)((signed short) (mask >> 16))); 2787 ori(tmp1, tmp1, mask & 0x0000ffff); 2788 andr(tmp2, tmp2, tmp1); 2789 } 2790 2791 load_const(tmp1, (long) os::get_memory_serialize_page()); 2792 release(); 2793 stwx(R0, tmp1, tmp2); 2794 } 2795 2796 2797 // GC barrier helper macros 2798 2799 // Write the card table byte if needed. 2800 void MacroAssembler::card_write_barrier_post(Register Rstore_addr, Register Rnew_val, Register Rtmp) { 2801 CardTableModRefBS* bs = 2802 barrier_set_cast<CardTableModRefBS>(Universe::heap()->barrier_set()); 2803 assert(bs->kind() == BarrierSet::CardTableForRS || 2804 bs->kind() == BarrierSet::CardTableExtension, "wrong barrier"); 2805 #ifdef ASSERT 2806 cmpdi(CCR0, Rnew_val, 0); 2807 asm_assert_ne("null oop not allowed", 0x321); 2808 #endif 2809 card_table_write(bs->byte_map_base, Rtmp, Rstore_addr); 2810 } 2811 2812 // Write the card table byte. 
2813 void MacroAssembler::card_table_write(jbyte* byte_map_base, Register Rtmp, Register Robj) { 2814 assert_different_registers(Robj, Rtmp, R0); 2815 load_const_optimized(Rtmp, (address)byte_map_base, R0); 2816 srdi(Robj, Robj, CardTableModRefBS::card_shift); 2817 li(R0, 0); // dirty 2818 if (UseConcMarkSweepGC) membar(Assembler::StoreStore); 2819 stbx(R0, Rtmp, Robj); 2820 } 2821 2822 #if INCLUDE_ALL_GCS 2823 // General G1 pre-barrier generator. 2824 // Goal: record the previous value if it is not null. 2825 void MacroAssembler::g1_write_barrier_pre(Register Robj, RegisterOrConstant offset, Register Rpre_val, 2826 Register Rtmp1, Register Rtmp2, bool needs_frame) { 2827 Label runtime, filtered; 2828 2829 // Is marking active? 2830 if (in_bytes(SATBMarkQueue::byte_width_of_active()) == 4) { 2831 lwz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_active()), R16_thread); 2832 } else { 2833 guarantee(in_bytes(SATBMarkQueue::byte_width_of_active()) == 1, "Assumption"); 2834 lbz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_active()), R16_thread); 2835 } 2836 cmpdi(CCR0, Rtmp1, 0); 2837 beq(CCR0, filtered); 2838 2839 // Do we need to load the previous value? 2840 if (Robj != noreg) { 2841 // Load the previous value... 2842 if (UseCompressedOops) { 2843 lwz(Rpre_val, offset, Robj); 2844 } else { 2845 ld(Rpre_val, offset, Robj); 2846 } 2847 // Previous value has been loaded into Rpre_val. 2848 } 2849 assert(Rpre_val != noreg, "must have a real register"); 2850 2851 // Is the previous value null? 2852 cmpdi(CCR0, Rpre_val, 0); 2853 beq(CCR0, filtered); 2854 2855 if (Robj != noreg && UseCompressedOops) { 2856 decode_heap_oop_not_null(Rpre_val); 2857 } 2858 2859 // OK, it's not filtered, so we'll need to call enqueue. In the normal 2860 // case, pre_val will be a scratch G-reg, but there are some cases in 2861 // which it's an O-reg. In the first case, do a normal call. In the 2862 // latter, do a save here and call the frameless version. 2863 2864 // Can we store original value in the thread's buffer? 2865 // Is index == 0? 2866 // (The index field is typed as size_t.) 2867 const Register Rbuffer = Rtmp1, Rindex = Rtmp2; 2868 2869 ld(Rindex, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_index()), R16_thread); 2870 cmpdi(CCR0, Rindex, 0); 2871 beq(CCR0, runtime); // If index == 0, goto runtime. 2872 ld(Rbuffer, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_buf()), R16_thread); 2873 2874 addi(Rindex, Rindex, -wordSize); // Decrement index. 2875 std(Rindex, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_index()), R16_thread); 2876 2877 // Record the previous value. 2878 stdx(Rpre_val, Rbuffer, Rindex); 2879 b(filtered); 2880 2881 bind(runtime); 2882 2883 // The VM call needs a frame to access (write) O registers. 2884 if (needs_frame) { 2885 save_LR_CR(Rtmp1); 2886 push_frame_reg_args(0, Rtmp2); 2887 } 2888 2889 if (Rpre_val->is_volatile() && Robj == noreg) mr(R31, Rpre_val); // Save pre_val across C call if it was preloaded. 2890 call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), Rpre_val, R16_thread); 2891 if (Rpre_val->is_volatile() && Robj == noreg) mr(Rpre_val, R31); // restore 2892 2893 if (needs_frame) { 2894 pop_frame(); 2895 restore_LR_CR(Rtmp1); 2896 } 2897 2898 bind(filtered); 2899 } 2900 2901 // General G1 post-barrier generator. 2902 // Store cross-region card.
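// Filters applied in order: same-region stores (G1RSBarrierRegionFilter),
// young-gen cards, already-dirty cards. Only a clean cross-region card is
// dirtied and enqueued.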
2903 void MacroAssembler::g1_write_barrier_post(Register Rstore_addr, Register Rnew_val, Register Rtmp1, Register Rtmp2, Register Rtmp3, Label *filtered_ext) { 2904 Label runtime, filtered_int; 2905 Label& filtered = (filtered_ext != NULL) ? *filtered_ext : filtered_int; 2906 assert_different_registers(Rstore_addr, Rnew_val, Rtmp1, Rtmp2); 2907 2908 G1SATBCardTableLoggingModRefBS* bs = 2909 barrier_set_cast<G1SATBCardTableLoggingModRefBS>(Universe::heap()->barrier_set()); 2910 2911 // Does store cross heap regions? 2912 if (G1RSBarrierRegionFilter) { 2913 xorr(Rtmp1, Rstore_addr, Rnew_val); 2914 srdi_(Rtmp1, Rtmp1, HeapRegion::LogOfHRGrainBytes); 2915 beq(CCR0, filtered); 2916 } 2917 2918 // Crosses regions, storing NULL? 2919 #ifdef ASSERT 2920 cmpdi(CCR0, Rnew_val, 0); 2921 asm_assert_ne("null oop not allowed (G1)", 0x322); // Checked by caller on PPC64, so following branch is obsolete: 2922 //beq(CCR0, filtered); 2923 #endif 2924 2925 // Storing region crossing non-NULL, is card already dirty? 2926 assert(sizeof(*bs->byte_map_base) == sizeof(jbyte), "adjust this code"); 2927 const Register Rcard_addr = Rtmp1; 2928 Register Rbase = Rtmp2; 2929 load_const_optimized(Rbase, (address)bs->byte_map_base, /*temp*/ Rtmp3); 2930 2931 srdi(Rcard_addr, Rstore_addr, CardTableModRefBS::card_shift); 2932 2933 // Get the address of the card. 2934 lbzx(/*card value*/ Rtmp3, Rbase, Rcard_addr); 2935 cmpwi(CCR0, Rtmp3, (int)G1SATBCardTableModRefBS::g1_young_card_val()); 2936 beq(CCR0, filtered); 2937 2938 membar(Assembler::StoreLoad); 2939 lbzx(/*card value*/ Rtmp3, Rbase, Rcard_addr); // Reload after membar. 2940 cmpwi(CCR0, Rtmp3 /* card value */, CardTableModRefBS::dirty_card_val()); 2941 beq(CCR0, filtered); 2942 2943 // Storing a region crossing, non-NULL oop, card is clean. 2944 // Dirty card and log. 2945 li(Rtmp3, CardTableModRefBS::dirty_card_val()); 2946 //release(); // G1: oops are allowed to get visible after dirty marking. 2947 stbx(Rtmp3, Rbase, Rcard_addr); 2948 2949 add(Rcard_addr, Rbase, Rcard_addr); // This is the address which needs to get enqueued. 2950 Rbase = noreg; // end of lifetime 2951 2952 const Register Rqueue_index = Rtmp2, 2953 Rqueue_buf = Rtmp3; 2954 ld(Rqueue_index, in_bytes(JavaThread::dirty_card_queue_offset() + DirtyCardQueue::byte_offset_of_index()), R16_thread); 2955 cmpdi(CCR0, Rqueue_index, 0); 2956 beq(CCR0, runtime); // index == 0 then jump to runtime 2957 ld(Rqueue_buf, in_bytes(JavaThread::dirty_card_queue_offset() + DirtyCardQueue::byte_offset_of_buf()), R16_thread); 2958 2959 addi(Rqueue_index, Rqueue_index, -wordSize); // decrement index 2960 std(Rqueue_index, in_bytes(JavaThread::dirty_card_queue_offset() + DirtyCardQueue::byte_offset_of_index()), R16_thread); 2961 2962 stdx(Rcard_addr, Rqueue_buf, Rqueue_index); // store card 2963 b(filtered); 2964 2965 bind(runtime); 2966 2967 // Save the live input values. 2968 call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), Rcard_addr, R16_thread); 2969 2970 bind(filtered_int); 2971 } 2972 #endif // INCLUDE_ALL_GCS 2973 2974 // Values for last_Java_pc, and last_Java_sp must comply to the rules 2975 // in frame_ppc.hpp. 2976 void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) { 2977 // Always set last_Java_pc and flags first because once last_Java_sp 2978 // is visible has_last_Java_frame is true and users will look at the 2979 // rest of the fields. (Note: flags should always be zero before we 2980 // get here so doesn't need to be set.) 
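// Sketch of the required store order:
//   thread->last_Java_pc = pc;  // first
//   thread->last_Java_sp = sp;  // last: a non-zero SP makes the anchor visible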
2981 2982 // Verify that last_Java_pc was zeroed on return to Java 2983 asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread, 2984 "last_Java_pc not zeroed before leaving Java", 0x200); 2985 2986 // When returning from calling out from Java mode the frame anchor's 2987 // last_Java_pc will always be set to NULL. It is set here so that 2988 // if we are doing a call to native (not VM) that we capture the 2989 // known pc and don't have to rely on the native call having a 2990 // standard frame linkage where we can find the pc. 2991 if (last_Java_pc != noreg) 2992 std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread); 2993 2994 // Set last_Java_sp last. 2995 std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread); 2996 } 2997 2998 void MacroAssembler::reset_last_Java_frame(void) { 2999 asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()), 3000 R16_thread, "SP was not set, still zero", 0x202); 3001 3002 BLOCK_COMMENT("reset_last_Java_frame {"); 3003 li(R0, 0); 3004 3005 // _last_Java_sp = 0 3006 std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread); 3007 3008 // _last_Java_pc = 0 3009 std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread); 3010 BLOCK_COMMENT("} reset_last_Java_frame"); 3011 } 3012 3013 void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1) { 3014 assert_different_registers(sp, tmp1); 3015 3016 // sp points to a TOP_IJAVA_FRAME, retrieve frame's PC via 3017 // TOP_IJAVA_FRAME_ABI. 3018 // FIXME: assert that we really have a TOP_IJAVA_FRAME here! 3019 address entry = pc(); 3020 load_const_optimized(tmp1, entry); 3021 3022 set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1); 3023 } 3024 3025 void MacroAssembler::get_vm_result(Register oop_result) { 3026 // Read: 3027 // R16_thread 3028 // R16_thread->in_bytes(JavaThread::vm_result_offset()) 3029 // 3030 // Updated: 3031 // oop_result 3032 // R16_thread->in_bytes(JavaThread::vm_result_offset()) 3033 3034 verify_thread(); 3035 3036 ld(oop_result, in_bytes(JavaThread::vm_result_offset()), R16_thread); 3037 li(R0, 0); 3038 std(R0, in_bytes(JavaThread::vm_result_offset()), R16_thread); 3039 3040 verify_oop(oop_result); 3041 } 3042 3043 void MacroAssembler::get_vm_result_2(Register metadata_result) { 3044 // Read: 3045 // R16_thread 3046 // R16_thread->in_bytes(JavaThread::vm_result_2_offset()) 3047 // 3048 // Updated: 3049 // metadata_result 3050 // R16_thread->in_bytes(JavaThread::vm_result_2_offset()) 3051 3052 ld(metadata_result, in_bytes(JavaThread::vm_result_2_offset()), R16_thread); 3053 li(R0, 0); 3054 std(R0, in_bytes(JavaThread::vm_result_2_offset()), R16_thread); 3055 } 3056 3057 Register MacroAssembler::encode_klass_not_null(Register dst, Register src) { 3058 Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided. 3059 if (Universe::narrow_klass_base() != 0) { 3060 // Use dst as temp if it is free. 
3061 sub_const_optimized(dst, current, Universe::narrow_klass_base(), R0); 3062 current = dst; 3063 } 3064 if (Universe::narrow_klass_shift() != 0) { 3065 srdi(dst, current, Universe::narrow_klass_shift()); 3066 current = dst; 3067 } 3068 return current; 3069 } 3070 3071 void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) { 3072 if (UseCompressedClassPointers) { 3073 Register compressedKlass = encode_klass_not_null(ck, klass); 3074 stw(compressedKlass, oopDesc::klass_offset_in_bytes(), dst_oop); 3075 } else { 3076 std(klass, oopDesc::klass_offset_in_bytes(), dst_oop); 3077 } 3078 } 3079 3080 void MacroAssembler::store_klass_gap(Register dst_oop, Register val) { 3081 if (UseCompressedClassPointers) { 3082 if (val == noreg) { 3083 val = R0; 3084 li(val, 0); 3085 } 3086 stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop); // klass gap if compressed 3087 } 3088 } 3089 3090 int MacroAssembler::instr_size_for_decode_klass_not_null() { 3091 if (!UseCompressedClassPointers) return 0; 3092 int num_instrs = 1; // shift or move 3093 if (Universe::narrow_klass_base() != 0) num_instrs = 7; // shift + load const + add 3094 return num_instrs * BytesPerInstWord; 3095 } 3096 3097 void MacroAssembler::decode_klass_not_null(Register dst, Register src) { 3098 assert(dst != R0, "Dst reg may not be R0, as R0 is used here."); 3099 if (src == noreg) src = dst; 3100 Register shifted_src = src; 3101 if (Universe::narrow_klass_shift() != 0 || 3102 Universe::narrow_klass_base() == 0 && src != dst) { // Move required. 3103 shifted_src = dst; 3104 sldi(shifted_src, src, Universe::narrow_klass_shift()); 3105 } 3106 if (Universe::narrow_klass_base() != 0) { 3107 add_const_optimized(dst, shifted_src, Universe::narrow_klass_base(), R0); 3108 } 3109 } 3110 3111 void MacroAssembler::load_klass(Register dst, Register src) { 3112 if (UseCompressedClassPointers) { 3113 lwz(dst, oopDesc::klass_offset_in_bytes(), src); 3114 // Attention: no null check here! 3115 decode_klass_not_null(dst, dst); 3116 } else { 3117 ld(dst, oopDesc::klass_offset_in_bytes(), src); 3118 } 3119 } 3120 3121 // Clear Array 3122 // Kills both input registers. tmp == R0 is allowed. 3123 void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp) { 3124 // Procedure for large arrays (uses data cache block zero instruction). 3125 Label startloop, fast, fastloop, small_rest, restloop, done; 3126 const int cl_size = VM_Version::L1_data_cache_line_size(), 3127 cl_dwords = cl_size>>3, 3128 cl_dw_addr_bits = exact_log2(cl_dwords), 3129 dcbz_min = 1; // Min count of dcbz executions, needs to be >0. 3130 3131 //2: 3132 cmpdi(CCR1, cnt_dwords, ((dcbz_min+1)<<cl_dw_addr_bits)-1); // Big enough? (ensure >=dcbz_min lines included). 3133 blt(CCR1, small_rest); // Too small. 3134 rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits); // Extract dword offset within first cache line. 3135 beq(CCR0, fast); // Already 128byte aligned. 3136 3137 subfic(tmp, tmp, cl_dwords); 3138 mtctr(tmp); // Set ctr to hit 128byte boundary (0<ctr<cl_dwords). 3139 subf(cnt_dwords, tmp, cnt_dwords); // rest. 3140 li(tmp, 0); 3141 //10: 3142 bind(startloop); // Clear at the beginning to reach 128byte boundary. 3143 std(tmp, 0, base_ptr); // Clear 8byte aligned block. 3144 addi(base_ptr, base_ptr, 8); 3145 bdnz(startloop); 3146 //13: 3147 bind(fast); // Clear 128byte blocks. 3148 srdi(tmp, cnt_dwords, cl_dw_addr_bits); // Loop count for 128byte loop (>0). 3149 andi(cnt_dwords, cnt_dwords, cl_dwords-1); // Rest in dwords. 
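  // State at this point (a sketch): startloop has cleared dwords up to a
  // cache-line boundary, tmp holds the dcbz loop count, and cnt_dwords the
  // tail dwords handled by restloop below.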
3150 mtctr(tmp); // Load counter. 3151 //16: 3152 bind(fastloop); 3153 dcbz(base_ptr); // Clear 128byte aligned block. 3154 addi(base_ptr, base_ptr, cl_size); 3155 bdnz(fastloop); 3156 if (InsertEndGroupPPC64) { endgroup(); } else { nop(); } 3157 //20: 3158 bind(small_rest); 3159 cmpdi(CCR0, cnt_dwords, 0); // size 0? 3160 beq(CCR0, done); // rest == 0 3161 li(tmp, 0); 3162 mtctr(cnt_dwords); // Load counter. 3163 //24: 3164 bind(restloop); // Clear rest. 3165 std(tmp, 0, base_ptr); // Clear 8byte aligned block. 3166 addi(base_ptr, base_ptr, 8); 3167 bdnz(restloop); 3168 //27: 3169 bind(done); 3170 } 3171 3172 /////////////////////////////////////////// String intrinsics //////////////////////////////////////////// 3173 3174 #ifdef COMPILER2 3175 // Intrinsics for CompactStrings 3176 3177 // Compress char[] to byte[] by compressing 16 bytes at once. 3178 void MacroAssembler::string_compress_16(Register src, Register dst, Register cnt, 3179 Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, 3180 Label& Lfailure) { 3181 3182 const Register tmp0 = R0; 3183 assert_different_registers(src, dst, cnt, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5); 3184 Label Lloop, Lslow; 3185 3186 // Check if cnt >= 8 (= 16 bytes) 3187 lis(tmp1, 0xFF); // tmp1 = 0x00FF00FF00FF00FF 3188 srwi_(tmp2, cnt, 3); 3189 beq(CCR0, Lslow); 3190 ori(tmp1, tmp1, 0xFF); 3191 rldimi(tmp1, tmp1, 32, 0); 3192 mtctr(tmp2); 3193 3194 // 2x unrolled loop 3195 bind(Lloop); 3196 ld(tmp2, 0, src); // _0_1_2_3 (Big Endian) 3197 ld(tmp4, 8, src); // _4_5_6_7 3198 3199 orr(tmp0, tmp2, tmp4); 3200 rldicl(tmp3, tmp2, 6*8, 64-24); // _____1_2 3201 rldimi(tmp2, tmp2, 2*8, 2*8); // _0_2_3_3 3202 rldicl(tmp5, tmp4, 6*8, 64-24); // _____5_6 3203 rldimi(tmp4, tmp4, 2*8, 2*8); // _4_6_7_7 3204 3205 andc_(tmp0, tmp0, tmp1); 3206 bne(CCR0, Lfailure); // Not latin1. 3207 addi(src, src, 16); 3208 3209 rlwimi(tmp3, tmp2, 0*8, 24, 31);// _____1_3 3210 srdi(tmp2, tmp2, 3*8); // ____0_2_ 3211 rlwimi(tmp5, tmp4, 0*8, 24, 31);// _____5_7 3212 srdi(tmp4, tmp4, 3*8); // ____4_6_ 3213 3214 orr(tmp2, tmp2, tmp3); // ____0123 3215 orr(tmp4, tmp4, tmp5); // ____4567 3216 3217 stw(tmp2, 0, dst); 3218 stw(tmp4, 4, dst); 3219 addi(dst, dst, 8); 3220 bdnz(Lloop); 3221 3222 bind(Lslow); // Fallback to slow version 3223 } 3224 3225 // Compress char[] to byte[]. cnt must be positive int. 3226 void MacroAssembler::string_compress(Register src, Register dst, Register cnt, Register tmp, Label& Lfailure) { 3227 Label Lloop; 3228 mtctr(cnt); 3229 3230 bind(Lloop); 3231 lhz(tmp, 0, src); 3232 cmplwi(CCR0, tmp, 0xff); 3233 bgt(CCR0, Lfailure); // Not latin1. 3234 addi(src, src, 2); 3235 stb(tmp, 0, dst); 3236 addi(dst, dst, 1); 3237 bdnz(Lloop); 3238 } 3239 3240 // Inflate byte[] to char[] by inflating 16 bytes at once. 
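// A scalar C sketch of the operation (illustrative; the loop below instead
// widens 8 characters per iteration using rotate-and-mask sequences):
//   for (int i = 0; i < cnt; i++) { dst[i] = (jchar)(src[i] & 0xFF); }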
3241 void MacroAssembler::string_inflate_16(Register src, Register dst, Register cnt, 3242 Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) { 3243 const Register tmp0 = R0; 3244 assert_different_registers(src, dst, cnt, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5); 3245 Label Lloop, Lslow; 3246 3247 // Check if cnt >= 8 3248 srwi_(tmp2, cnt, 3); 3249 beq(CCR0, Lslow); 3250 lis(tmp1, 0xFF); // tmp1 = 0x00FF00FF 3251 ori(tmp1, tmp1, 0xFF); 3252 mtctr(tmp2); 3253 3254 // 2x unrolled loop 3255 bind(Lloop); 3256 lwz(tmp2, 0, src); // ____0123 (Big Endian) 3257 lwz(tmp4, 4, src); // ____4567 3258 addi(src, src, 8); 3259 3260 rldicl(tmp3, tmp2, 7*8, 64-8); // _______2 3261 rlwimi(tmp2, tmp2, 3*8, 16, 23);// ____0113 3262 rldicl(tmp5, tmp4, 7*8, 64-8); // _______6 3263 rlwimi(tmp4, tmp4, 3*8, 16, 23);// ____4557 3264 3265 andc(tmp0, tmp2, tmp1); // ____0_1_ 3266 rlwimi(tmp2, tmp3, 2*8, 0, 23); // _____2_3 3267 andc(tmp3, tmp4, tmp1); // ____4_5_ 3268 rlwimi(tmp4, tmp5, 2*8, 0, 23); // _____6_7 3269 3270 rldimi(tmp2, tmp0, 3*8, 0*8); // _0_1_2_3 3271 rldimi(tmp4, tmp3, 3*8, 0*8); // _4_5_6_7 3272 3273 std(tmp2, 0, dst); 3274 std(tmp4, 8, dst); 3275 addi(dst, dst, 16); 3276 bdnz(Lloop); 3277 3278 bind(Lslow); // Fallback to slow version 3279 } 3280 3281 // Inflate byte[] to char[]. cnt must be positive int. 3282 void MacroAssembler::string_inflate(Register src, Register dst, Register cnt, Register tmp) { 3283 Label Lloop; 3284 mtctr(cnt); 3285 3286 bind(Lloop); 3287 lbz(tmp, 0, src); 3288 addi(src, src, 1); 3289 sth(tmp, 0, dst); 3290 addi(dst, dst, 2); 3291 bdnz(Lloop); 3292 } 3293 3294 void MacroAssembler::string_compare(Register str1, Register str2, 3295 Register cnt1, Register cnt2, 3296 Register tmp1, Register result, int ae) { 3297 const Register tmp0 = R0, 3298 diff = tmp1; 3299 3300 assert_different_registers(str1, str2, cnt1, cnt2, tmp0, tmp1, result); 3301 Label Ldone, Lslow, Lloop, Lreturn_diff; 3302 3303 // Note: Making use of the fact that compareTo(a, b) == -compareTo(b, a) 3304 // we interchange str1 and str2 in the UL case and negate the result. 3305 // Like this, str1 is always latin1 encoded, except for the UU case. 3306 // In addition, we need 0 (or sign which is 0) extend. 3307 3308 if (ae == StrIntrinsicNode::UU) { 3309 srwi(cnt1, cnt1, 1); 3310 } else { 3311 clrldi(cnt1, cnt1, 32); 3312 } 3313 3314 if (ae != StrIntrinsicNode::LL) { 3315 srwi(cnt2, cnt2, 1); 3316 } else { 3317 clrldi(cnt2, cnt2, 32); 3318 } 3319 3320 // See if the lengths are different, and calculate min in cnt1. 3321 // Save diff in case we need it for a tie-breaker. 3322 subf_(diff, cnt2, cnt1); // diff = cnt1 - cnt2 3323 // if (diff > 0) { cnt1 = cnt2; } 3324 if (VM_Version::has_isel()) { 3325 isel(cnt1, CCR0, Assembler::greater, /*invert*/ false, cnt2); 3326 } else { 3327 Label Lskip; 3328 blt(CCR0, Lskip); 3329 mr(cnt1, cnt2); 3330 bind(Lskip); 3331 } 3332 3333 // Rename registers 3334 Register chr1 = result; 3335 Register chr2 = tmp0; 3336 3337 // Compare multiple characters in fast loop (only implemented for same encoding). 3338 int stride1 = 8, stride2 = 8; 3339 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3340 int log2_chars_per_iter = (ae == StrIntrinsicNode::LL) ? 3 : 2; 3341 Label Lfastloop, Lskipfast; 3342 3343 srwi_(tmp0, cnt1, log2_chars_per_iter); 3344 beq(CCR0, Lskipfast); 3345 rldicl(cnt2, cnt1, 0, 64 - log2_chars_per_iter); // Remaining characters. 
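  // A sketch of the split: tmp0 = cnt1 >> log2_chars_per_iter is the fast-loop
  // iteration count (moved to ctr below); cnt2 = cnt1 & (chars_per_iter - 1)
  // is the tail handled by the character loop.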
3346 li(cnt1, 1 << log2_chars_per_iter); // Initialize for failure case: Rescan characters from current iteration. 3347 mtctr(tmp0); 3348 3349 bind(Lfastloop); 3350 ld(chr1, 0, str1); 3351 ld(chr2, 0, str2); 3352 cmpd(CCR0, chr1, chr2); 3353 bne(CCR0, Lslow); 3354 addi(str1, str1, stride1); 3355 addi(str2, str2, stride2); 3356 bdnz(Lfastloop); 3357 mr(cnt1, cnt2); // Remaining characters. 3358 bind(Lskipfast); 3359 } 3360 3361 // Loop which searches the first difference character by character. 3362 cmpwi(CCR0, cnt1, 0); 3363 beq(CCR0, Lreturn_diff); 3364 bind(Lslow); 3365 mtctr(cnt1); 3366 3367 switch (ae) { 3368 case StrIntrinsicNode::LL: stride1 = 1; stride2 = 1; break; 3369 case StrIntrinsicNode::UL: // fallthru (see comment above) 3370 case StrIntrinsicNode::LU: stride1 = 1; stride2 = 2; break; 3371 case StrIntrinsicNode::UU: stride1 = 2; stride2 = 2; break; 3372 default: ShouldNotReachHere(); break; 3373 } 3374 3375 bind(Lloop); 3376 if (stride1 == 1) { lbz(chr1, 0, str1); } else { lhz(chr1, 0, str1); } 3377 if (stride2 == 1) { lbz(chr2, 0, str2); } else { lhz(chr2, 0, str2); } 3378 subf_(result, chr2, chr1); // result = chr1 - chr2 3379 bne(CCR0, Ldone); 3380 addi(str1, str1, stride1); 3381 addi(str2, str2, stride2); 3382 bdnz(Lloop); 3383 3384 // If strings are equal up to min length, return the length difference. 3385 bind(Lreturn_diff); 3386 mr(result, diff); 3387 3388 // Otherwise, return the difference between the first mismatched chars. 3389 bind(Ldone); 3390 if (ae == StrIntrinsicNode::UL) { 3391 neg(result, result); // Negate result (see note above). 3392 } 3393 } 3394 3395 void MacroAssembler::array_equals(bool is_array_equ, Register ary1, Register ary2, 3396 Register limit, Register tmp1, Register result, bool is_byte) { 3397 const Register tmp0 = R0; 3398 assert_different_registers(ary1, ary2, limit, tmp0, tmp1, result); 3399 Label Ldone, Lskiploop, Lloop, Lfastloop, Lskipfast; 3400 bool limit_needs_shift = false; 3401 3402 if (is_array_equ) { 3403 const int length_offset = arrayOopDesc::length_offset_in_bytes(); 3404 const int base_offset = arrayOopDesc::base_offset_in_bytes(is_byte ? T_BYTE : T_CHAR); 3405 3406 // Return true if the same array. 3407 cmpd(CCR0, ary1, ary2); 3408 beq(CCR0, Lskiploop); 3409 3410 // Return false if one of them is NULL. 3411 cmpdi(CCR0, ary1, 0); 3412 cmpdi(CCR1, ary2, 0); 3413 li(result, 0); 3414 cror(CCR0, Assembler::equal, CCR1, Assembler::equal); 3415 beq(CCR0, Ldone); 3416 3417 // Load the lengths of arrays. 3418 lwz(limit, length_offset, ary1); 3419 lwz(tmp0, length_offset, ary2); 3420 3421 // Return false if the two arrays are not equal length. 3422 cmpw(CCR0, limit, tmp0); 3423 bne(CCR0, Ldone); 3424 3425 // Load array addresses. 3426 addi(ary1, ary1, base_offset); 3427 addi(ary2, ary2, base_offset); 3428 } else { 3429 limit_needs_shift = !is_byte; 3430 li(result, 0); // Assume not equal. 3431 } 3432 3433 // Rename registers 3434 Register chr1 = tmp0; 3435 Register chr2 = tmp1; 3436 3437 // Compare 8 bytes per iteration in fast loop. 3438 const int log2_chars_per_iter = is_byte ? 3 : 2; 3439 3440 srwi_(tmp0, limit, log2_chars_per_iter + (limit_needs_shift ? 1 : 0)); 3441 beq(CCR0, Lskipfast); 3442 mtctr(tmp0); 3443 3444 bind(Lfastloop); 3445 ld(chr1, 0, ary1); 3446 ld(chr2, 0, ary2); 3447 addi(ary1, ary1, 8); 3448 addi(ary2, ary2, 8); 3449 cmpd(CCR0, chr1, chr2); 3450 bne(CCR0, Ldone); 3451 bdnz(Lfastloop); 3452 3453 bind(Lskipfast); 3454 rldicl_(limit, limit, limit_needs_shift ? 
64 - 1 : 0, 64 - log2_chars_per_iter); // Remaining characters. 3455 beq(CCR0, Lskiploop); 3456 mtctr(limit); 3457 3458 // Character by character. 3459 bind(Lloop); 3460 if (is_byte) { 3461 lbz(chr1, 0, ary1); 3462 lbz(chr2, 0, ary2); 3463 addi(ary1, ary1, 1); 3464 addi(ary2, ary2, 1); 3465 } else { 3466 lhz(chr1, 0, ary1); 3467 lhz(chr2, 0, ary2); 3468 addi(ary1, ary1, 2); 3469 addi(ary2, ary2, 2); 3470 } 3471 cmpw(CCR0, chr1, chr2); 3472 bne(CCR0, Ldone); 3473 bdnz(Lloop); 3474 3475 bind(Lskiploop); 3476 li(result, 1); // All characters are equal. 3477 bind(Ldone); 3478 } 3479 3480 void MacroAssembler::string_indexof(Register result, Register haystack, Register haycnt, 3481 Register needle, ciTypeArray* needle_values, Register needlecnt, int needlecntval, 3482 Register tmp1, Register tmp2, Register tmp3, Register tmp4, int ae) { 3483 3484 // Ensure 0<needlecnt<=haycnt in ideal graph as prerequisite! 3485 Label L_TooShort, L_Found, L_NotFound, L_End; 3486 Register last_addr = haycnt, // Kill haycnt at the beginning. 3487 addr = tmp1, 3488 n_start = tmp2, 3489 ch1 = tmp3, 3490 ch2 = R0; 3491 3492 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 3493 const int h_csize = (ae == StrIntrinsicNode::LL) ? 1 : 2; 3494 const int n_csize = (ae == StrIntrinsicNode::UU) ? 2 : 1; 3495 3496 // ************************************************************************************************** 3497 // Prepare for main loop: optimized for needle count >=2, bail out otherwise. 3498 // ************************************************************************************************** 3499 3500 // Compute last haystack addr to use if no match gets found. 3501 clrldi(haycnt, haycnt, 32); // Ensure positive int is valid as 64 bit value. 3502 addi(addr, haystack, -h_csize); // Accesses use pre-increment. 3503 if (needlecntval == 0) { // variable needlecnt 3504 cmpwi(CCR6, needlecnt, 2); 3505 clrldi(needlecnt, needlecnt, 32); // Ensure positive int is valid as 64 bit value. 3506 blt(CCR6, L_TooShort); // Variable needlecnt: handle short needle separately. 3507 } 3508 3509 if (n_csize == 2) { lwz(n_start, 0, needle); } else { lhz(n_start, 0, needle); } // Load first 2 characters of needle. 3510 3511 if (needlecntval == 0) { // variable needlecnt 3512 subf(ch1, needlecnt, haycnt); // Last character index to compare is haycnt-needlecnt. 3513 addi(needlecnt, needlecnt, -2); // Rest of needle. 3514 } else { // constant needlecnt 3515 guarantee(needlecntval != 1, "IndexOf with single-character needle must be handled separately"); 3516 assert((needlecntval & 0x7fff) == needlecntval, "wrong immediate"); 3517 addi(ch1, haycnt, -needlecntval); // Last character index to compare is haycnt-needlecnt. 3518 if (needlecntval > 3) { li(needlecnt, needlecntval - 2); } // Rest of needle. 3519 } 3520 3521 if (h_csize == 2) { slwi(ch1, ch1, 1); } // Scale to number of bytes. 3522 3523 if (ae ==StrIntrinsicNode::UL) { 3524 srwi(tmp4, n_start, 1*8); // ___0 3525 rlwimi(n_start, tmp4, 2*8, 0, 23); // _0_1 3526 } 3527 3528 add(last_addr, haystack, ch1); // Point to last address to compare (haystack+2*(haycnt-needlecnt)). 3529 3530 // Main Loop (now we have at least 2 characters). 3531 Label L_OuterLoop, L_InnerLoop, L_FinalCheck, L_Comp1, L_Comp2; 3532 bind(L_OuterLoop); // Search for 1st 2 characters. 3533 Register addr_diff = tmp4; 3534 subf(addr_diff, addr, last_addr); // Difference between already checked address and last address to check. 3535 addi(addr, addr, h_csize); // This is the new address we want to use for comparing. 
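  // Sketch: ch2 = addr_diff >> (log2(h_csize) + 1) is half of the remaining
  // positions, i.e. the iteration count of the 2x unrolled inner loop.
  // (For h_csize in {1, 2}, the shift amount h_csize equals log2(h_csize) + 1.)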
3536 srdi_(ch2, addr_diff, h_csize); 3537 beq(CCR0, L_FinalCheck); // 2 characters left? 3538 mtctr(ch2); // num of characters / 2 3539 bind(L_InnerLoop); // Main work horse (2x unrolled search loop) 3540 if (h_csize == 2) { // Load 2 characters of haystack (ignore alignment). 3541 lwz(ch1, 0, addr); 3542 lwz(ch2, 2, addr); 3543 } else { 3544 lhz(ch1, 0, addr); 3545 lhz(ch2, 1, addr); 3546 } 3547 cmpw(CCR0, ch1, n_start); // Compare 2 characters (1 would be sufficient but try to reduce branches to CompLoop). 3548 cmpw(CCR1, ch2, n_start); 3549 beq(CCR0, L_Comp1); // Did we find the needle start? 3550 beq(CCR1, L_Comp2); 3551 addi(addr, addr, 2 * h_csize); 3552 bdnz(L_InnerLoop); 3553 bind(L_FinalCheck); 3554 andi_(addr_diff, addr_diff, h_csize); // Remaining characters not covered by InnerLoop: (num of characters) & 1. 3555 beq(CCR0, L_NotFound); 3556 if (h_csize == 2) { lwz(ch1, 0, addr); } else { lhz(ch1, 0, addr); } // One position left at which we have to compare. 3557 cmpw(CCR1, ch1, n_start); 3558 beq(CCR1, L_Comp1); 3559 bind(L_NotFound); 3560 li(result, -1); // not found 3561 b(L_End); 3562 3563 // ************************************************************************************************** 3564 // Special Case: unfortunately, the variable needle case can be called with needlecnt<2 3565 // ************************************************************************************************** 3566 if (needlecntval == 0) { // We have to handle these cases separately. 3567 Label L_OneCharLoop; 3568 bind(L_TooShort); 3569 mtctr(haycnt); 3570 if (n_csize == 2) { lhz(n_start, 0, needle); } else { lbz(n_start, 0, needle); } // First character of needle 3571 bind(L_OneCharLoop); 3572 if (h_csize == 2) { lhzu(ch1, 2, addr); } else { lbzu(ch1, 1, addr); } 3573 cmpw(CCR1, ch1, n_start); 3574 beq(CCR1, L_Found); // Did we find the one character needle? 3575 bdnz(L_OneCharLoop); 3576 li(result, -1); // Not found. 3577 b(L_End); 3578 } 3579 3580 // ************************************************************************************************** 3581 // Regular Case Part II: compare rest of needle (first 2 characters have been compared already) 3582 // ************************************************************************************************** 3583 3584 // Compare the rest 3585 bind(L_Comp2); 3586 addi(addr, addr, h_csize); // First comparison has failed, 2nd one hit. 3587 bind(L_Comp1); // Addr points to possible needle start. 3588 if (needlecntval != 2) { // Const needlecnt==2? 3589 if (needlecntval != 3) { 3590 if (needlecntval == 0) { beq(CCR6, L_Found); } // Variable needlecnt==2? 3591 Register n_ind = tmp4, 3592 h_ind = n_ind; 3593 li(n_ind, 2 * n_csize); // First 2 characters are already compared, use index 2. 3594 mtctr(needlecnt); // Decremented by 2, still > 0. 3595 Label L_CompLoop; 3596 bind(L_CompLoop); 3597 if (ae ==StrIntrinsicNode::UL) { 3598 h_ind = ch1; 3599 sldi(h_ind, n_ind, 1); 3600 } 3601 if (n_csize == 2) { lhzx(ch2, needle, n_ind); } else { lbzx(ch2, needle, n_ind); } 3602 if (h_csize == 2) { lhzx(ch1, addr, h_ind); } else { lbzx(ch1, addr, h_ind); } 3603 cmpw(CCR1, ch1, ch2); 3604 bne(CCR1, L_OuterLoop); 3605 addi(n_ind, n_ind, n_csize); 3606 bdnz(L_CompLoop); 3607 } else { // No loop required if there's only one needle character left. 
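      // Constant needlecnt == 3: exactly one character remains after the first
      // two, so compare it directly at index 2 (scaled by the element size).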
3608 if (n_csize == 2) { lhz(ch2, 2 * 2, needle); } else { lbz(ch2, 2 * 1, needle); } 3609 if (h_csize == 2) { lhz(ch1, 2 * 2, addr); } else { lbz(ch1, 2 * 1, addr); } 3610 cmpw(CCR1, ch1, ch2); 3611 bne(CCR1, L_OuterLoop); 3612 } 3613 } 3614 // Return index ... 3615 bind(L_Found); 3616 subf(result, haystack, addr); // relative to haystack, ... 3617 if (h_csize == 2) { srdi(result, result, 1); } // in characters. 3618 bind(L_End); 3619 } // string_indexof 3620 3621 void MacroAssembler::string_indexof_char(Register result, Register haystack, Register haycnt, 3622 Register needle, jchar needleChar, Register tmp1, Register tmp2, bool is_byte) { 3623 assert_different_registers(haystack, haycnt, needle, tmp1, tmp2); 3624 3625 Label L_InnerLoop, L_FinalCheck, L_Found1, L_Found2, L_NotFound, L_End; 3626 Register addr = tmp1, 3627 ch1 = tmp2, 3628 ch2 = R0; 3629 3630 const int h_csize = is_byte ? 1 : 2; 3631 3632 //4: 3633 srwi_(tmp2, haycnt, 1); // Shift right by exact_log2(UNROLL_FACTOR). 3634 mr(addr, haystack); 3635 beq(CCR0, L_FinalCheck); 3636 mtctr(tmp2); // Move to count register. 3637 //8: 3638 bind(L_InnerLoop); // Main work horse (2x unrolled search loop). 3639 if (!is_byte) { 3640 lhz(ch1, 0, addr); 3641 lhz(ch2, 2, addr); 3642 } else { 3643 lbz(ch1, 0, addr); 3644 lbz(ch2, 1, addr); 3645 } 3646 (needle != R0) ? cmpw(CCR0, ch1, needle) : cmplwi(CCR0, ch1, (unsigned int)needleChar); 3647 (needle != R0) ? cmpw(CCR1, ch2, needle) : cmplwi(CCR1, ch2, (unsigned int)needleChar); 3648 beq(CCR0, L_Found1); // Did we find the needle? 3649 beq(CCR1, L_Found2); 3650 addi(addr, addr, 2 * h_csize); 3651 bdnz(L_InnerLoop); 3652 //16: 3653 bind(L_FinalCheck); 3654 andi_(R0, haycnt, 1); 3655 beq(CCR0, L_NotFound); 3656 if (!is_byte) { lhz(ch1, 0, addr); } else { lbz(ch1, 0, addr); } // One position left at which we have to compare. 3657 (needle != R0) ? cmpw(CCR1, ch1, needle) : cmplwi(CCR1, ch1, (unsigned int)needleChar); 3658 beq(CCR1, L_Found1); 3659 //21: 3660 bind(L_NotFound); 3661 li(result, -1); // Not found. 3662 b(L_End); 3663 3664 bind(L_Found2); 3665 addi(addr, addr, h_csize); 3666 //24: 3667 bind(L_Found1); // Return index ... 3668 subf(result, haystack, addr); // relative to haystack, ... 3669 if (!is_byte) { srdi(result, result, 1); } // in characters. 3670 bind(L_End); 3671 } // string_indexof_char 3672 3673 3674 void MacroAssembler::has_negatives(Register src, Register cnt, Register result, 3675 Register tmp1, Register tmp2) { 3676 const Register tmp0 = R0; 3677 assert_different_registers(src, result, cnt, tmp0, tmp1, tmp2); 3678 Label Lfastloop, Lslow, Lloop, Lnoneg, Ldone; 3679 3680 // Check if cnt >= 8 (= 16 bytes) 3681 lis(tmp1, (int)(short)0x8080); // tmp1 = 0x8080808080808080 3682 srwi_(tmp2, cnt, 4); 3683 li(result, 1); // Assume there's a negative byte. 3684 beq(CCR0, Lslow); 3685 ori(tmp1, tmp1, 0x8080); 3686 rldimi(tmp1, tmp1, 32, 0); 3687 mtctr(tmp2); 3688 3689 // 2x unrolled loop 3690 bind(Lfastloop); 3691 ld(tmp2, 0, src); 3692 ld(tmp0, 8, src); 3693 3694 orr(tmp0, tmp2, tmp0); 3695 3696 and_(tmp0, tmp0, tmp1); 3697 bne(CCR0, Ldone); // Found negative byte. 3698 addi(src, src, 16); 3699 3700 bdnz(Lfastloop); 3701 3702 bind(Lslow); // Fallback to slow version 3703 rldicl_(tmp0, cnt, 0, 64-4); 3704 beq(CCR0, Lnoneg); 3705 mtctr(tmp0); 3706 bind(Lloop); 3707 lbz(tmp0, 0, src); 3708 addi(src, src, 1); 3709 andi_(tmp0, tmp0, 0x80); 3710 bne(CCR0, Ldone); // Found negative byte. 
3711   bdnz(Lloop);
3712   bind(Lnoneg);
3713   li(result, 0);
3714
3715   bind(Ldone);
3716 }
3717
3718
3719 // Intrinsics for non-CompactStrings
3720
3721 // Search for a single jchar in a jchar[].
3722 //
3723 // Assumes that result differs from all other registers.
3724 //
3725 // 'haystack' is the address of a jchar-array.
3726 // 'needle' is either the character to search for or R0.
3727 // 'needleChar' is the character to search for if 'needle' == R0.
3728 // 'haycnt' is the length of the haystack. We assume 'haycnt' >= 1.
3729 //
3730 // Preserves haystack, haycnt, needle and kills all other registers.
3731 //
3732 // If needle == R0, we search for the constant needleChar.
3733 void MacroAssembler::string_indexof_1(Register result, Register haystack, Register haycnt,
3734                                       Register needle, jchar needleChar,
3735                                       Register tmp1, Register tmp2) {
3736
3737   assert_different_registers(result, haystack, haycnt, needle, tmp1, tmp2);
3738
3739   Label L_InnerLoop, L_FinalCheck, L_Found1, L_Found2, L_Found3, L_NotFound, L_End;
3740   Register addr = tmp1,
3741            ch1  = tmp2,
3742            ch2  = R0;
3743
3744   //3:
3745   dcbtct(haystack, 0x00);    // Indicate R/O access to haystack.
3746
3747   srwi_(tmp2, haycnt, 1);    // Shift right by exact_log2(UNROLL_FACTOR).
3748   mr(addr, haystack);
3749   beq(CCR0, L_FinalCheck);
3750   mtctr(tmp2);               // Move to count register.
3751   //8:
3752   bind(L_InnerLoop);         // Main work horse (2x unrolled search loop).
3753   lhz(ch1, 0, addr);         // Load characters from haystack.
3754   lhz(ch2, 2, addr);
3755   (needle != R0) ? cmpw(CCR0, ch1, needle) : cmplwi(CCR0, ch1, needleChar);
3756   (needle != R0) ? cmpw(CCR1, ch2, needle) : cmplwi(CCR1, ch2, needleChar);
3757   beq(CCR0, L_Found1);       // Did we find the needle?
3758   beq(CCR1, L_Found2);
3759   addi(addr, addr, 4);
3760   bdnz(L_InnerLoop);
3761   //16:
3762   bind(L_FinalCheck);
3763   andi_(R0, haycnt, 1);
3764   beq(CCR0, L_NotFound);
3765   lhz(ch1, 0, addr);         // One position left at which we have to compare.
3766   (needle != R0) ? cmpw(CCR1, ch1, needle) : cmplwi(CCR1, ch1, needleChar);
3767   beq(CCR1, L_Found3);
3768   //21:
3769   bind(L_NotFound);
3770   li(result, -1);            // Not found.
3771   b(L_End);
3772
3773   bind(L_Found2);
3774   addi(addr, addr, 2);
3775   //24:
3776   bind(L_Found1);
3777   bind(L_Found3);                // Return index ...
3778   subf(addr, haystack, addr);    // relative to haystack, ...
3779   srdi(result, addr, 1);         // ... in characters.
3780   bind(L_End);
3781 }
3782
3783
3784 // Implementation of IndexOf for jchar arrays.
3785 //
3786 // The lengths of haystack and needle are not constant, i.e. they are passed in registers.
3787 //
3788 // Preserves registers haystack, needle.
3789 // Kills registers haycnt, needlecnt.
3790 // Assumes that result differs from all other registers.
3791 // Haystack, needle are the addresses of jchar-arrays.
3792 // Haycnt, needlecnt are their respective lengths.
3793 //
3794 // Needlecntval must be zero or a 15-bit unsigned immediate greater than 1.
3795 void MacroAssembler::string_indexof(Register result, Register haystack, Register haycnt,
3796                                     Register needle, ciTypeArray* needle_values, Register needlecnt, int needlecntval,
3797                                     Register tmp1, Register tmp2, Register tmp3, Register tmp4) {
3798
3799   // Ensure 0<needlecnt<=haycnt in ideal graph as prerequisite!
3800   Label L_TooShort, L_Found, L_NotFound, L_End;
3801   Register last_addr = haycnt, // Kill haycnt at the beginning.
3802 addr = tmp1, 3803 n_start = tmp2, 3804 ch1 = tmp3, 3805 ch2 = R0; 3806 3807 // ************************************************************************************************** 3808 // Prepare for main loop: optimized for needle count >=2, bail out otherwise. 3809 // ************************************************************************************************** 3810 3811 //1 (variable) or 3 (const): 3812 dcbtct(needle, 0x00); // Indicate R/O access to str1. 3813 dcbtct(haystack, 0x00); // Indicate R/O access to str2. 3814 3815 // Compute last haystack addr to use if no match gets found. 3816 if (needlecntval == 0) { // variable needlecnt 3817 //3: 3818 subf(ch1, needlecnt, haycnt); // Last character index to compare is haycnt-needlecnt. 3819 addi(addr, haystack, -2); // Accesses use pre-increment. 3820 cmpwi(CCR6, needlecnt, 2); 3821 blt(CCR6, L_TooShort); // Variable needlecnt: handle short needle separately. 3822 slwi(ch1, ch1, 1); // Scale to number of bytes. 3823 lwz(n_start, 0, needle); // Load first 2 characters of needle. 3824 add(last_addr, haystack, ch1); // Point to last address to compare (haystack+2*(haycnt-needlecnt)). 3825 addi(needlecnt, needlecnt, -2); // Rest of needle. 3826 } else { // constant needlecnt 3827 guarantee(needlecntval != 1, "IndexOf with single-character needle must be handled separately"); 3828 assert((needlecntval & 0x7fff) == needlecntval, "wrong immediate"); 3829 //5: 3830 addi(ch1, haycnt, -needlecntval); // Last character index to compare is haycnt-needlecnt. 3831 lwz(n_start, 0, needle); // Load first 2 characters of needle. 3832 addi(addr, haystack, -2); // Accesses use pre-increment. 3833 slwi(ch1, ch1, 1); // Scale to number of bytes. 3834 add(last_addr, haystack, ch1); // Point to last address to compare (haystack+2*(haycnt-needlecnt)). 3835 li(needlecnt, needlecntval-2); // Rest of needle. 3836 } 3837 3838 // Main Loop (now we have at least 3 characters). 3839 //11: 3840 Label L_OuterLoop, L_InnerLoop, L_FinalCheck, L_Comp1, L_Comp2, L_Comp3; 3841 bind(L_OuterLoop); // Search for 1st 2 characters. 3842 Register addr_diff = tmp4; 3843 subf(addr_diff, addr, last_addr); // Difference between already checked address and last address to check. 3844 addi(addr, addr, 2); // This is the new address we want to use for comparing. 3845 srdi_(ch2, addr_diff, 2); 3846 beq(CCR0, L_FinalCheck); // 2 characters left? 3847 mtctr(ch2); // addr_diff/4 3848 //16: 3849 bind(L_InnerLoop); // Main work horse (2x unrolled search loop) 3850 lwz(ch1, 0, addr); // Load 2 characters of haystack (ignore alignment). 3851 lwz(ch2, 2, addr); 3852 cmpw(CCR0, ch1, n_start); // Compare 2 characters (1 would be sufficient but try to reduce branches to CompLoop). 3853 cmpw(CCR1, ch2, n_start); 3854 beq(CCR0, L_Comp1); // Did we find the needle start? 3855 beq(CCR1, L_Comp2); 3856 addi(addr, addr, 4); 3857 bdnz(L_InnerLoop); 3858 //24: 3859 bind(L_FinalCheck); 3860 rldicl_(addr_diff, addr_diff, 64-1, 63); // Remaining characters not covered by InnerLoop: (addr_diff>>1)&1. 3861 beq(CCR0, L_NotFound); 3862 lwz(ch1, 0, addr); // One position left at which we have to compare. 
3863   cmpw(CCR1, ch1, n_start);
3864   beq(CCR1, L_Comp3);
3865   //29:
3866   bind(L_NotFound);
3867   li(result, -1); // Not found.
3868   b(L_End);
3869
3870
3871   // **************************************************************************************************
3872   // Special Case: unfortunately, the variable needle case can be called with needlecnt<2
3873   // **************************************************************************************************
3874   //31:
3875   if ((needlecntval >> 1) != 1) { // Const needlecnt is 2 or 3? Reduce code size.
3876     int nopcnt = 5;
3877     if (needlecntval != 0) ++nopcnt; // Balance alignment (other case: see below).
3878     if (needlecntval == 0) {         // We have to handle these cases separately.
3879       Label L_OneCharLoop;
3880       bind(L_TooShort);
3881       mtctr(haycnt);
3882       lhz(n_start, 0, needle);       // First character of needle.
3883       bind(L_OneCharLoop);
3884       lhzu(ch1, 2, addr);
3885       cmpw(CCR1, ch1, n_start);
3886       beq(CCR1, L_Found);            // Did we find the one-character needle?
3887       bdnz(L_OneCharLoop);
3888       li(result, -1);                // Not found.
3889       b(L_End);
3890     } // 8 instructions, so no impact on alignment.
3891     for (int x = 0; x < nopcnt; ++x) nop();
3892   }
3893
3894   // **************************************************************************************************
3895   // Regular Case Part II: compare rest of needle (first 2 characters have been compared already)
3896   // **************************************************************************************************
3897
3898   // Compare the rest.
3899   //36 if needlecntval==0, else 37:
3900   bind(L_Comp2);
3901   addi(addr, addr, 2); // First comparison has failed, 2nd one hit.
3902   bind(L_Comp1);       // Addr points to possible needle start.
3903   bind(L_Comp3);       // Could have created a copy and used a different return address, but we save code size here.
3904   if (needlecntval != 2) { // Const needlecnt==2?
3905     if (needlecntval != 3) {
3906       if (needlecntval == 0) beq(CCR6, L_Found); // Variable needlecnt==2?
3907       Register ind_reg = tmp4;
3908       li(ind_reg, 2*2);   // First 2 characters are already compared, use index 2.
3909       mtctr(needlecnt);   // Decremented by 2, still > 0.
3910       //40:
3911       Label L_CompLoop;
3912       bind(L_CompLoop);
3913       lhzx(ch2, needle, ind_reg);
3914       lhzx(ch1, addr, ind_reg);
3915       cmpw(CCR1, ch1, ch2);
3916       bne(CCR1, L_OuterLoop);
3917       addi(ind_reg, ind_reg, 2);
3918       bdnz(L_CompLoop);
3919     } else { // No loop required if there's only one needle character left.
3920       lhz(ch2, 2*2, needle);
3921       lhz(ch1, 2*2, addr);
3922       cmpw(CCR1, ch1, ch2);
3923       bne(CCR1, L_OuterLoop);
3924     }
3925   }
3926   // Return index ...
3927   //46:
3928   bind(L_Found);
3929   subf(addr, haystack, addr); // relative to haystack, ...
3930   srdi(result, addr, 1);      // ... in characters.
3931   //48:
3932   bind(L_End);
3933 }
3934
3935 // Implementation of Compare for jchar arrays.
3936 //
3937 // Kills the registers str1, str2, cnt1, cnt2.
3938 // Kills cr0, ctr.
3939 // Assumes that result differs from the input registers.
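//
// A C sketch of the contract implemented below:
//   for (int i = 0; i < min(cnt1, cnt2); i++)
//     if (str1[i] != str2[i]) return str1[i] - str2[i];
//   return cnt1 - cnt2;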
3940 void MacroAssembler::string_compare(Register str1_reg, Register str2_reg, Register cnt1_reg, Register cnt2_reg, 3941 Register result_reg, Register tmp_reg) { 3942 assert_different_registers(result_reg, str1_reg, str2_reg, cnt1_reg, cnt2_reg, tmp_reg); 3943 3944 Label Ldone, Lslow_case, Lslow_loop, Lfast_loop; 3945 Register cnt_diff = R0, 3946 limit_reg = cnt1_reg, 3947 chr1_reg = result_reg, 3948 chr2_reg = cnt2_reg, 3949 addr_diff = str2_reg; 3950 3951 // 'cnt_reg' contains the number of characters in the string's character array for the 3952 // pre-CompactStrings strings implementation and the number of bytes in the string's 3953 // byte array for the CompactStrings strings implementation. 3954 const int HAS_COMPACT_STRING = java_lang_String::has_coder_field() ? 1 : 0; // '1' = byte array, '0' = char array 3955 3956 // Offset 0 should be 32 byte aligned. 3957 //-6: 3958 srawi(cnt1_reg, cnt1_reg, HAS_COMPACT_STRING); 3959 srawi(cnt2_reg, cnt2_reg, HAS_COMPACT_STRING); 3960 //-4: 3961 dcbtct(str1_reg, 0x00); // Indicate R/O access to str1. 3962 dcbtct(str2_reg, 0x00); // Indicate R/O access to str2. 3963 //-2: 3964 // Compute min(cnt1, cnt2) and check if 0 (bail out if we don't need to compare characters). 3965 subf(result_reg, cnt2_reg, cnt1_reg); // difference between cnt1/2 3966 subf_(addr_diff, str1_reg, str2_reg); // alias? 3967 beq(CCR0, Ldone); // return cnt difference if both ones are identical 3968 srawi(limit_reg, result_reg, 31); // generate signmask (cnt1/2 must be non-negative so cnt_diff can't overflow) 3969 mr(cnt_diff, result_reg); 3970 andr(limit_reg, result_reg, limit_reg); // difference or zero (negative): cnt1<cnt2 ? cnt1-cnt2 : 0 3971 add_(limit_reg, cnt2_reg, limit_reg); // min(cnt1, cnt2)==0? 3972 beq(CCR0, Ldone); // return cnt difference if one has 0 length 3973 3974 lhz(chr1_reg, 0, str1_reg); // optional: early out if first characters mismatch 3975 lhzx(chr2_reg, str1_reg, addr_diff); // optional: early out if first characters mismatch 3976 addi(tmp_reg, limit_reg, -1); // min(cnt1, cnt2)-1 3977 subf_(result_reg, chr2_reg, chr1_reg); // optional: early out if first characters mismatch 3978 bne(CCR0, Ldone); // optional: early out if first characters mismatch 3979 3980 // Set loop counter by scaling down tmp_reg 3981 srawi_(chr2_reg, tmp_reg, exact_log2(4)); // (min(cnt1, cnt2)-1)/4 3982 ble(CCR0, Lslow_case); // need >4 characters for fast loop 3983 andi(limit_reg, tmp_reg, 4-1); // remaining characters 3984 3985 // Adapt str1_reg str2_reg for the first loop iteration 3986 mtctr(chr2_reg); // (min(cnt1, cnt2)-1)/4 3987 addi(limit_reg, limit_reg, 4+1); // compare last 5-8 characters in slow_case if mismatch found in fast_loop 3988 //16: 3989 // Compare the rest of the characters 3990 bind(Lfast_loop); 3991 ld(chr1_reg, 0, str1_reg); 3992 ldx(chr2_reg, str1_reg, addr_diff); 3993 cmpd(CCR0, chr2_reg, chr1_reg); 3994 bne(CCR0, Lslow_case); // return chr1_reg 3995 addi(str1_reg, str1_reg, 4*2); 3996 bdnz(Lfast_loop); 3997 addi(limit_reg, limit_reg, -4); // no mismatch found in fast_loop, only 1-4 characters missing 3998 //23: 3999 bind(Lslow_case); 4000 mtctr(limit_reg); 4001 //24: 4002 bind(Lslow_loop); 4003 lhz(chr1_reg, 0, str1_reg); 4004 lhzx(chr2_reg, str1_reg, addr_diff); 4005 subf_(result_reg, chr2_reg, chr1_reg); 4006 bne(CCR0, Ldone); // return chr1_reg 4007 addi(str1_reg, str1_reg, 1*2); 4008 bdnz(Lslow_loop); 4009 //30: 4010 // If strings are equal up to min length, return the length difference. 
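  // (cnt_diff, i.e. R0, still holds cnt1 - cnt2 as saved in the prologue;
  // neither the fast nor the slow loop clobbers it.)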
4011 mr(result_reg, cnt_diff); 4012 nop(); // alignment 4013 //32: 4014 // Otherwise, return the difference between the first mismatched chars. 4015 bind(Ldone); 4016 } 4017 4018 4019 // Compare char[] arrays. 4020 // 4021 // str1_reg USE only 4022 // str2_reg USE only 4023 // cnt_reg USE_DEF, due to tmp reg shortage 4024 // result_reg DEF only, might compromise USE only registers 4025 void MacroAssembler::char_arrays_equals(Register str1_reg, Register str2_reg, Register cnt_reg, Register result_reg, 4026 Register tmp1_reg, Register tmp2_reg, Register tmp3_reg, Register tmp4_reg, 4027 Register tmp5_reg) { 4028 4029 // Str1 may be the same register as str2 which can occur e.g. after scalar replacement. 4030 assert_different_registers(result_reg, str1_reg, cnt_reg, tmp1_reg, tmp2_reg, tmp3_reg, tmp4_reg, tmp5_reg); 4031 assert_different_registers(result_reg, str2_reg, cnt_reg, tmp1_reg, tmp2_reg, tmp3_reg, tmp4_reg, tmp5_reg); 4032 4033 // Offset 0 should be 32 byte aligned. 4034 Label Linit_cbc, Lcbc, Lloop, Ldone_true, Ldone_false; 4035 Register index_reg = tmp5_reg; 4036 Register cbc_iter = tmp4_reg; 4037 4038 // 'cnt_reg' contains the number of characters in the string's character array for the 4039 // pre-CompactStrings strings implementation and the number of bytes in the string's 4040 // byte array for the CompactStrings strings implementation. 4041 const int HAS_COMPACT_STRING = java_lang_String::has_coder_field() ? 1 : 0; // '1' = byte array, '0' = char array 4042 4043 //-1: 4044 dcbtct(str1_reg, 0x00); // Indicate R/O access to str1. 4045 dcbtct(str2_reg, 0x00); // Indicate R/O access to str2. 4046 //1: 4047 // cbc_iter: remaining characters after the '4 java characters per iteration' loop. 4048 rlwinm(cbc_iter, cnt_reg, 32 - HAS_COMPACT_STRING, 30, 31); // (cnt_reg % (HAS_COMPACT_STRING ? 8 : 4)) >> HAS_COMPACT_STRING 4049 li(index_reg, 0); // init 4050 li(result_reg, 0); // assume false 4051 // tmp2_reg: units of 4 java characters (i.e. 8 bytes) per iteration (main loop). 4052 srwi_(tmp2_reg, cnt_reg, exact_log2(4 << HAS_COMPACT_STRING)); // cnt_reg / (HAS_COMPACT_STRING ? 8 : 4) 4053 4054 cmpwi(CCR1, cbc_iter, 0); // CCR1 = (cbc_iter==0) 4055 beq(CCR0, Linit_cbc); // too short 4056 mtctr(tmp2_reg); 4057 //8: 4058 bind(Lloop); 4059 ldx(tmp1_reg, str1_reg, index_reg); 4060 ldx(tmp2_reg, str2_reg, index_reg); 4061 cmpd(CCR0, tmp1_reg, tmp2_reg); 4062 bne(CCR0, Ldone_false); // Unequal char pair found -> done. 4063 addi(index_reg, index_reg, 4*sizeof(jchar)); 4064 bdnz(Lloop); 4065 //14: 4066 bind(Linit_cbc); 4067 beq(CCR1, Ldone_true); 4068 mtctr(cbc_iter); 4069 //16: 4070 bind(Lcbc); 4071 lhzx(tmp1_reg, str1_reg, index_reg); 4072 lhzx(tmp2_reg, str2_reg, index_reg); 4073 cmpw(CCR0, tmp1_reg, tmp2_reg); 4074 bne(CCR0, Ldone_false); // Unequal char pair found -> done. 4075 addi(index_reg, index_reg, 1*sizeof(jchar)); 4076 bdnz(Lcbc); 4077 nop(); 4078 bind(Ldone_true); 4079 li(result_reg, 1); 4080 //24: 4081 bind(Ldone_false); 4082 } 4083 4084 4085 void MacroAssembler::char_arrays_equalsImm(Register str1_reg, Register str2_reg, int cntval, Register result_reg, 4086 Register tmp1_reg, Register tmp2_reg) { 4087 // Str1 may be the same register as str2 which can occur e.g. after scalar replacement. 
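  // With the element count known at compile time, the short case below is
  // fully unrolled. A C sketch (s1, s2, num_bytes are illustrative names):
  //   for (int i = 0; i + 8 <= num_bytes; i += 8)
  //     if (*(const uint64_t*)(s1 + i) != *(const uint64_t*)(s2 + i)) return false;
  // followed by optional 4-byte and 2-byte tail compares.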
4088 assert_different_registers(result_reg, str1_reg, tmp1_reg, tmp2_reg); 4089 assert_different_registers(result_reg, str2_reg, tmp1_reg, tmp2_reg); 4090 assert(sizeof(jchar) == 2, "must be"); 4091 assert(cntval >= 0 && ((cntval & 0x7fff) == cntval), "wrong immediate"); 4092 4093 // 'cntval' contains the number of characters in the string's character array for the 4094 // pre-CompactStrings strings implementation and the number of bytes in the string's 4095 // byte array for the CompactStrings strings implementation. 4096 cntval >>= (java_lang_String::has_coder_field() ? 1 : 0); // '1' = byte array strings, '0' = char array strings 4097 4098 Label Ldone_false; 4099 4100 if (cntval < 16) { // short case 4101 if (cntval != 0) li(result_reg, 0); // assume false 4102 4103 const int num_bytes = cntval*sizeof(jchar); 4104 int index = 0; 4105 for (int next_index; (next_index = index + 8) <= num_bytes; index = next_index) { 4106 ld(tmp1_reg, index, str1_reg); 4107 ld(tmp2_reg, index, str2_reg); 4108 cmpd(CCR0, tmp1_reg, tmp2_reg); 4109 bne(CCR0, Ldone_false); 4110 } 4111 if (cntval & 2) { 4112 lwz(tmp1_reg, index, str1_reg); 4113 lwz(tmp2_reg, index, str2_reg); 4114 cmpw(CCR0, tmp1_reg, tmp2_reg); 4115 bne(CCR0, Ldone_false); 4116 index += 4; 4117 } 4118 if (cntval & 1) { 4119 lhz(tmp1_reg, index, str1_reg); 4120 lhz(tmp2_reg, index, str2_reg); 4121 cmpw(CCR0, tmp1_reg, tmp2_reg); 4122 bne(CCR0, Ldone_false); 4123 } 4124 // fallthrough: true 4125 } else { 4126 Label Lloop; 4127 Register index_reg = tmp1_reg; 4128 const int loopcnt = cntval/4; 4129 assert(loopcnt > 0, "must be"); 4130 // Offset 0 should be 32 byte aligned. 4131 //2: 4132 dcbtct(str1_reg, 0x00); // Indicate R/O access to str1. 4133 dcbtct(str2_reg, 0x00); // Indicate R/O access to str2. 4134 li(tmp2_reg, loopcnt); 4135 li(index_reg, 0); // init 4136 li(result_reg, 0); // assume false 4137 mtctr(tmp2_reg); 4138 //8: 4139 bind(Lloop); 4140 ldx(R0, str1_reg, index_reg); 4141 ldx(tmp2_reg, str2_reg, index_reg); 4142 cmpd(CCR0, R0, tmp2_reg); 4143 bne(CCR0, Ldone_false); // Unequal char pair found -> done. 4144 addi(index_reg, index_reg, 4*sizeof(jchar)); 4145 bdnz(Lloop); 4146 //14: 4147 if (cntval & 2) { 4148 lwzx(R0, str1_reg, index_reg); 4149 lwzx(tmp2_reg, str2_reg, index_reg); 4150 cmpw(CCR0, R0, tmp2_reg); 4151 bne(CCR0, Ldone_false); 4152 if (cntval & 1) addi(index_reg, index_reg, 2*sizeof(jchar)); 4153 } 4154 if (cntval & 1) { 4155 lhzx(R0, str1_reg, index_reg); 4156 lhzx(tmp2_reg, str2_reg, index_reg); 4157 cmpw(CCR0, R0, tmp2_reg); 4158 bne(CCR0, Ldone_false); 4159 } 4160 // fallthru: true 4161 } 4162 li(result_reg, 1); 4163 bind(Ldone_false); 4164 } 4165 4166 #endif // Compiler2 4167 4168 // Helpers for Intrinsic Emitters 4169 // 4170 // Revert the byte order of a 32bit value in a register 4171 // src: 0x44556677 4172 // dst: 0x77665544 4173 // Three steps to obtain the result: 4174 // 1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word 4175 // into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared. 4176 // This value initializes dst. 4177 // 2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost 4178 // byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go. 4179 // This value is mask inserted into dst with a [0..23] mask of 1s. 4180 // 3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position. 
4181 // This value is mask inserted into dst with a [8..15] mask of 1s. 4182 void MacroAssembler::load_reverse_32(Register dst, Register src) { 4183 assert_different_registers(dst, src); 4184 4185 rldicl(dst, src, (4+1)*8, 56); // Rotate byte 4 into position 7 (rightmost), clear all to the left. 4186 rlwimi(dst, src, 3*8, 0, 23); // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone. 4187 rlwimi(dst, src, 1*8, 8, 15); // Insert byte 6 into position 5, leave the rest alone. 4188 } 4189 4190 // Calculate the column addresses of the crc32 lookup table into distinct registers. 4191 // This loop-invariant calculation is moved out of the loop body, reducing the loop 4192 // body size from 20 to 16 instructions. 4193 // Returns the offset that was used to calculate the address of column tc3. 4194 // Due to register shortage, setting tc3 may overwrite table. With the return offset 4195 // at hand, the original table address can be easily reconstructed. 4196 int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) { 4197 4198 #ifdef VM_LITTLE_ENDIAN 4199 // This is what we implement (the DOLIT4 part): 4200 // ========================================================================= */ 4201 // #define DOLIT4 c ^= *buf4++; \ 4202 // c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^ \ 4203 // crc_table[1][(c >> 16) & 0xff] ^ crc_table[0][c >> 24] 4204 // #define DOLIT32 DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4 4205 // ========================================================================= */ 4206 const int ix0 = 3*(4*CRC32_COLUMN_SIZE); 4207 const int ix1 = 2*(4*CRC32_COLUMN_SIZE); 4208 const int ix2 = 1*(4*CRC32_COLUMN_SIZE); 4209 const int ix3 = 0*(4*CRC32_COLUMN_SIZE); 4210 #else 4211 // This is what we implement (the DOBIG4 part): 4212 // ========================================================================= 4213 // #define DOBIG4 c ^= *++buf4; \ 4214 // c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \ 4215 // crc_table[6][(c >> 16) & 0xff] ^ crc_table[7][c >> 24] 4216 // #define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4 4217 // ========================================================================= 4218 const int ix0 = 4*(4*CRC32_COLUMN_SIZE); 4219 const int ix1 = 5*(4*CRC32_COLUMN_SIZE); 4220 const int ix2 = 6*(4*CRC32_COLUMN_SIZE); 4221 const int ix3 = 7*(4*CRC32_COLUMN_SIZE); 4222 #endif 4223 assert_different_registers(table, tc0, tc1, tc2); 4224 assert(table == tc3, "must be!"); 4225 4226 addi(tc0, table, ix0); 4227 addi(tc1, table, ix1); 4228 addi(tc2, table, ix2); 4229 if (ix3 != 0) addi(tc3, table, ix3); 4230 4231 return ix3; 4232 } 4233 4234 /** 4235 * uint32_t crc; 4236 * timesXtoThe32[crc & 0xFF] ^ (crc >> 8); 4237 */ 4238 void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) { 4239 assert_different_registers(crc, table, tmp); 4240 assert_different_registers(val, table); 4241 4242 if (crc == val) { // Must rotate first to use the unmodified value. 4243 rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest. 4244 // As we use a word (4-byte) instruction, we have to adapt the mask bit positions. 4245 srwi(crc, crc, 8); // Unsigned shift, clear leftmost 8 bits. 4246 } else { 4247 srwi(crc, crc, 8); // Unsigned shift, clear leftmost 8 bits. 
4248     rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
4249   }
4250   lwzx(tmp, table, tmp);
4251   xorr(crc, crc, tmp);
4252 }
4253
4254 /**
4255  * uint32_t crc;
4256  * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
4257  */
4258 void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
4259   fold_byte_crc32(crc, crc, table, tmp);
4260 }
4261
4262 /**
4263  * Emits code to update CRC-32 with a byte value according to constants in table.
4264  *
4265  * @param [in,out]crc Register containing the crc.
4266  * @param [in]val     Register containing the byte to fold into the CRC.
4267  * @param [in]table   Register containing the table of crc constants.
4268  *
4269  * uint32_t crc;
4270  * val = crc_table[(val ^ crc) & 0xFF];
4271  * crc = val ^ (crc >> 8);
4272  */
4273 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
4274   BLOCK_COMMENT("update_byte_crc32:");
4275   xorr(val, val, crc);
4276   fold_byte_crc32(crc, val, table, val);
4277 }
4278
4279 /**
4280  * @param crc   register containing existing CRC (32-bit)
4281  * @param buf   register pointing to input byte buffer (byte*)
4282  * @param len   register containing number of bytes
4283  * @param table register pointing to CRC table
4284  */
4285 void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table,
4286                                            Register data, bool loopAlignment, bool invertCRC) {
4287   assert_different_registers(crc, buf, len, table, data);
4288
4289   Label L_mainLoop, L_done;
4290   const int mainLoop_stepping  = 1;
4291   const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4;
4292
4293   // Process all bytes in a single-byte loop.
4294   clrldi_(len, len, 32); // Enforce 32 bit. Anything to do?
4295   beq(CCR0, L_done);
4296
4297   if (invertCRC) {
4298     nand(crc, crc, crc); // ~c
4299   }
4300
4301   mtctr(len);
4302   align(mainLoop_alignment);
4303   BIND(L_mainLoop);
4304   lbz(data, 0, buf);                 // Byte from buffer, zero-extended.
4305   addi(buf, buf, mainLoop_stepping); // Advance buffer position.
4306   update_byte_crc32(crc, data, table);
4307   bdnz(L_mainLoop);                  // Iterate.
4308
4309   if (invertCRC) {
4310     nand(crc, crc, crc); // ~c
4311   }
4312
4313   bind(L_done);
4314 }
4315
4316 /**
4317  * Emits code to update CRC-32 with a 4-byte value according to constants in table.
4318  * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c.
4319  */
4320 // A note on the lookup table address(es):
4321 // The lookup table consists of two sets of four columns each.
4322 // The columns {0..3} are used for little-endian machines.
4323 // The columns {4..7} are used for big-endian machines.
4324 // To save the effort of adding the column offset to the table address each time
4325 // a table element is looked up, it is possible to pass the pre-calculated
4326 // column addresses.
4327 // Uses R9..R12 as work registers. Must be saved/restored by caller, if necessary.
4328 void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc,
4329                                         Register t0,  Register t1,  Register t2,  Register t3,
4330                                         Register tc0, Register tc1, Register tc2, Register tc3) {
4331   assert_different_registers(crc, t3);
4332
4333   // XOR crc with next four bytes of buffer.
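  // (A sketch in C: c ^= *(const uint32_t*)(buf + bufDisp); buf += bufInc;)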
4334   lwz(t3, bufDisp, buf);
4335   if (bufInc != 0) {
4336     addi(buf, buf, bufInc);
4337   }
4338   xorr(t3, t3, crc);
4339
4340   // Chop crc into 4 single-byte pieces, shifted left 2 bits, to form the table indices.
4341   rlwinm(t0, t3, 2,         24-2, 31-2);  // ((t3 >>  0) & 0xff) << 2
4342   rlwinm(t1, t3, 32+(2- 8), 24-2, 31-2);  // ((t3 >>  8) & 0xff) << 2
4343   rlwinm(t2, t3, 32+(2-16), 24-2, 31-2);  // ((t3 >> 16) & 0xff) << 2
4344   rlwinm(t3, t3, 32+(2-24), 24-2, 31-2);  // ((t3 >> 24) & 0xff) << 2
4345
4346   // Use the pre-calculated column addresses.
4347   // Load pre-calculated table values.
4348   lwzx(t0, tc0, t0);
4349   lwzx(t1, tc1, t1);
4350   lwzx(t2, tc2, t2);
4351   lwzx(t3, tc3, t3);
4352
4353   // Calculate new crc from table values.
4354   xorr(t0,  t0, t1);
4355   xorr(t2,  t2, t3);
4356   xorr(crc, t0, t2); // Now crc contains the final checksum value.
4357 }
4358
4359 /**
4360  * @param crc   register containing existing CRC (32-bit)
4361  * @param buf   register pointing to input byte buffer (byte*)
4362  * @param len   register containing number of bytes
4363  * @param table register pointing to CRC table
4364  *
4365  * Uses R9..R12 as work registers. Must be saved/restored by caller!
4366  */
4367 void MacroAssembler::kernel_crc32_2word(Register crc, Register buf, Register len, Register table,
4368                                         Register t0,  Register t1,  Register t2,  Register t3,
4369                                         Register tc0, Register tc1, Register tc2, Register tc3) {
4370   assert_different_registers(crc, buf, len, table);
4371
4372   Label L_mainLoop, L_tail;
4373   Register tmp  = t0;
4374   Register data = t0;
4375   Register tmp2 = t1;
4376   const int mainLoop_stepping  = 8;
4377   const int tailLoop_stepping  = 1;
4378   const int log_stepping       = exact_log2(mainLoop_stepping);
4379   const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
4380   const int complexThreshold   = 2*mainLoop_stepping;
4381
4382   // Don't test for len <= 0 here. This pathological case should not occur anyway.
4383   // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles.
4384   // The situation itself is detected and handled correctly by the conditional branches
4385   // following the length adjustments by -stepping and +stepping below.
4386   assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
4387
4388   BLOCK_COMMENT("kernel_crc32_2word {");
4389
4390   nand(crc, crc, crc); // ~c
4391
4392   // Check for short (<complexThreshold) buffer.
4393   cmpdi(CCR0, len, complexThreshold);
4394   blt(CCR0, L_tail);
4395
4396   // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
4397   // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
4398   {
4399     // Align buf addr to mainLoop_stepping boundary.
4400     neg(tmp2, buf);                          // Calculate # preLoop iterations for alignment.
4401     rldicl(tmp2, tmp2, 0, 64-log_stepping);  // tmp2 &= mainLoop_stepping-1: bytes to the next 8-byte boundary (mask keeps bits 61..63).
4402
4403     if (complexThreshold > mainLoop_stepping) {
4404       sub(len, len, tmp2);                   // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4405     } else {
4406       sub(tmp, len, tmp2);                   // Remaining bytes for main loop.
4407       cmpdi(CCR0, tmp, mainLoop_stepping);
4408       blt(CCR0, L_tail);                     // If less than one mainLoop_stepping is left, do only tail processing.
4409       mr(len, tmp);                          // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4410     }
4411     update_byteLoop_crc32(crc, buf, tmp2, table, data, false, false);
4412   }
4413
4414   srdi(tmp2, len, log_stepping);        // #iterations for mainLoop
4415   andi(len, len, mainLoop_stepping-1);  // remaining bytes for tailLoop
4416   mtctr(tmp2);
4417
4418 #ifdef VM_LITTLE_ENDIAN
4419   Register crc_rv = crc;
4420 #else
4421   Register crc_rv = tmp;                // Load_reverse needs separate registers to work on.
4422                                         // Occupies tmp, but frees up crc.
4423   load_reverse_32(crc_rv, crc);         // Revert byte order because we are dealing with big-endian data.
4424   tmp = crc;
4425 #endif
4426
4427   int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
4428
4429   align(mainLoop_alignment);            // Octoword-aligned loop address. Shows 2% improvement.
4430   BIND(L_mainLoop);
4431   update_1word_crc32(crc_rv, buf, table, 0, 0, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
4432   update_1word_crc32(crc_rv, buf, table, 4, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
4433   bdnz(L_mainLoop);
4434
4435 #ifndef VM_LITTLE_ENDIAN
4436   load_reverse_32(crc, crc_rv);         // Revert byte order because we are dealing with big-endian data.
4437   tmp = crc_rv;                         // Tmp uses its original register again.
4438 #endif
4439
4440   // Restore original table address for tailLoop.
4441   if (reconstructTableOffset != 0) {
4442     addi(table, table, -reconstructTableOffset);
4443   }
4444
4445   // Process last few (<complexThreshold) bytes of buffer.
4446   BIND(L_tail);
4447   update_byteLoop_crc32(crc, buf, len, table, data, false, false);
4448
4449   nand(crc, crc, crc); // ~c
4450   BLOCK_COMMENT("} kernel_crc32_2word");
4451 }
4452
4453 /**
4454  * @param crc   register containing existing CRC (32-bit)
4455  * @param buf   register pointing to input byte buffer (byte*)
4456  * @param len   register containing number of bytes
4457  * @param table register pointing to CRC table
4458  *
4459  * Uses R9..R12 as work registers. Must be saved/restored by caller!
4460  */
4461 void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table,
4462                                         Register t0,  Register t1,  Register t2,  Register t3,
4463                                         Register tc0, Register tc1, Register tc2, Register tc3) {
4464   assert_different_registers(crc, buf, len, table);
4465
4466   Label L_mainLoop, L_tail;
4467   Register tmp  = t0;
4468   Register data = t0;
4469   Register tmp2 = t1;
4470   const int mainLoop_stepping  = 4;
4471   const int tailLoop_stepping  = 1;
4472   const int log_stepping       = exact_log2(mainLoop_stepping);
4473   const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
4474   const int complexThreshold   = 2*mainLoop_stepping;
4475
4476   // Don't test for len <= 0 here. This pathological case should not occur anyway.
4477   // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles.
4478   // The situation itself is detected and handled correctly by the conditional branches
4479   // following the length adjustments by -stepping and +stepping below.
4480   assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
4481
4482   BLOCK_COMMENT("kernel_crc32_1word {");
4483
4484   nand(crc, crc, crc); // ~c
4485
4486   // Check for short (<complexThreshold) buffer.
4487   cmpdi(CCR0, len, complexThreshold);
4488   blt(CCR0, L_tail);
4489
4490   // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
4491   // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
4492   {
4493     // Align buf addr to mainLoop_stepping boundary.
4494     neg(tmp2, buf);                          // Calculate # preLoop iterations for alignment.
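    // Together with the rldicl below: tmp2 = (-buf) & (mainLoop_stepping - 1),
    // the number of bytes up to the next mainLoop_stepping boundary.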
    rldicl(tmp2, tmp2, 0, 64-log_stepping);  // Keep only the low log_stepping bits, i.e. (-buf) % mainLoop_stepping.

    if (complexThreshold > mainLoop_stepping) {
      sub(len, len, tmp2);                   // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
    } else {
      sub(tmp, len, tmp2);                   // Remaining bytes for main loop.
      cmpdi(CCR0, tmp, mainLoop_stepping);
      blt(CCR0, L_tail);                     // For less than one mainLoop_stepping left, do only tail processing.
      mr(len, tmp);                          // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
    }
    update_byteLoop_crc32(crc, buf, tmp2, table, data, false, false);
  }

  srdi(tmp2, len, log_stepping);        // #iterations for mainLoop
  andi(len, len, mainLoop_stepping-1);  // remaining bytes for tailLoop
  mtctr(tmp2);

#ifdef VM_LITTLE_ENDIAN
  Register crc_rv = crc;
#else
  Register crc_rv = tmp;                // Load_reverse needs separate registers to work on.
                                        // Occupies tmp, but frees up crc.
  load_reverse_32(crc_rv, crc);         // Reverse byte order because we are dealing with big-endian data.
  tmp = crc;
#endif

  int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);

  align(mainLoop_alignment);            // Octoword-aligned loop address. Shows 2% improvement.
  BIND(L_mainLoop);
    update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
    bdnz(L_mainLoop);

#ifndef VM_LITTLE_ENDIAN
  load_reverse_32(crc, crc_rv);         // Reverse byte order because we are dealing with big-endian data.
  tmp = crc_rv;                         // Tmp uses its original register again.
#endif

  // Restore original table address for tailLoop.
  if (reconstructTableOffset != 0) {
    addi(table, table, -reconstructTableOffset);
  }

  // Process last few (<complexThreshold) bytes of buffer.
  BIND(L_tail);
  update_byteLoop_crc32(crc, buf, len, table, data, false, false);

  nand(crc, crc, crc);  // ~c
  BLOCK_COMMENT("} kernel_crc32_1word");
}

/**
 * @param crc   register containing existing CRC (32-bit)
 * @param buf   register pointing to input byte buffer (byte*)
 * @param len   register containing number of bytes
 * @param table register pointing to CRC table
 *
 * Uses R7_ARG5, R8_ARG6 as work registers.
 */
void MacroAssembler::kernel_crc32_1byte(Register crc, Register buf, Register len, Register table,
                                        Register t0,  Register t1,  Register t2,  Register t3) {
  assert_different_registers(crc, buf, len, table);

  Register data = t0;  // Holds the current byte to be folded into crc.

  BLOCK_COMMENT("kernel_crc32_1byte {");

  // Process all bytes in a single-byte loop.
  update_byteLoop_crc32(crc, buf, len, table, data, true, true);

  BLOCK_COMMENT("} kernel_crc32_1byte");
}

void MacroAssembler::kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp) {
  assert_different_registers(crc, buf, /* len, not used!! */ table, tmp);

  BLOCK_COMMENT("kernel_crc32_singleByte:");
  nand(crc, crc, crc);  // ~c

  lbz(tmp, 0, buf);     // Byte from buffer, zero-extended.
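  // For reference: the table-driven byte update performed next corresponds
  // to the standard (reflected) CRC-32 recurrence. A hedged C sketch,
  // assuming the usual 256-entry table layout:
  //   crc = (crc >> 8) ^ table[(crc ^ byte) & 0xff];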
  update_byte_crc32(crc, tmp, table);

  nand(crc, crc, crc);  // ~c
}

// dest_lo += src1 + src2
// dest_hi += carry out of the two additions above
void MacroAssembler::add2_with_carry(Register dest_hi,
                                     Register dest_lo,
                                     Register src1, Register src2) {
  li(R0, 0);
  addc(dest_lo, dest_lo, src1);
  adde(dest_hi, dest_hi, R0);
  addc(dest_lo, dest_lo, src2);
  adde(dest_hi, dest_hi, R0);
}

// Multiply 64 bit by 64 bit first loop.
void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart,
                                           Register x_xstart,
                                           Register y, Register y_idx,
                                           Register z,
                                           Register carry,
                                           Register product_high, Register product,
                                           Register idx, Register kdx,
                                           Register tmp) {
  // jlong carry, x[], y[], z[];
  // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //   huge_128 product = y[idx] * x[xstart] + carry;
  //   z[kdx] = (jlong)product;
  //   carry  = (jlong)(product >>> 64);
  // }
  // z[xstart] = carry;

  Label L_first_loop, L_first_loop_exit;
  Label L_one_x, L_one_y, L_multiply;

  addic_(xstart, xstart, -1);
  blt(CCR0, L_one_x);  // Special case: length of x is 1.

  // Load next two integers of x.
  sldi(tmp, xstart, LogBytesPerInt);
  ldx(x_xstart, x, tmp);
#ifdef VM_LITTLE_ENDIAN
  rldicl(x_xstart, x_xstart, 32, 0);
#endif

  align(32, 16);
  bind(L_first_loop);

  cmpdi(CCR0, idx, 1);
  blt(CCR0, L_first_loop_exit);
  addi(idx, idx, -2);
  beq(CCR0, L_one_y);

  // Load next two integers of y.
  sldi(tmp, idx, LogBytesPerInt);
  ldx(y_idx, y, tmp);
#ifdef VM_LITTLE_ENDIAN
  rldicl(y_idx, y_idx, 32, 0);
#endif

  bind(L_multiply);
  multiply64(product_high, product, x_xstart, y_idx);

  li(tmp, 0);
  addc(product, product, carry);          // Add carry to result.
  adde(product_high, product_high, tmp);  // Add carry of the last addition.
  addi(kdx, kdx, -2);

  // Store result.
#ifdef VM_LITTLE_ENDIAN
  rldicl(product, product, 32, 0);
#endif
  sldi(tmp, kdx, LogBytesPerInt);
  stdx(product, z, tmp);
  mr_if_needed(carry, product_high);
  b(L_first_loop);

  bind(L_one_y);  // Load one 32 bit portion of y as (0,value).
  lwz(y_idx, 0, y);
  b(L_multiply);

  bind(L_one_x);  // Load one 32 bit portion of x as (0,value).
  lwz(x_xstart, 0, x);
  b(L_first_loop);

  bind(L_first_loop_exit);
}

// Multiply 64 bit by 64 bit and add 128 bit.
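// In C terms, a hedged sketch of one such step (huge_128 stands for an
// unsigned 128-bit type, as in the pseudocode comments in this file):
//   huge_128 product = (huge_128)x_xstart * y[idx] + z[idx] + carry;
//   z[idx] = (uint64_t)product;
// with the upper half of product left in product_high as the next carry.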
void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y,
                                            Register z, Register yz_idx,
                                            Register idx, Register carry,
                                            Register product_high, Register product,
                                            Register tmp, int offset) {

  // huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
  // z[kdx] = (jlong)product;

  sldi(tmp, idx, LogBytesPerInt);
  if (offset) {
    addi(tmp, tmp, offset);
  }
  ldx(yz_idx, y, tmp);
#ifdef VM_LITTLE_ENDIAN
  rldicl(yz_idx, yz_idx, 32, 0);
#endif

  multiply64(product_high, product, x_xstart, yz_idx);
  ldx(yz_idx, z, tmp);
#ifdef VM_LITTLE_ENDIAN
  rldicl(yz_idx, yz_idx, 32, 0);
#endif

  add2_with_carry(product_high, product, carry, yz_idx);

  sldi(tmp, idx, LogBytesPerInt);
  if (offset) {
    addi(tmp, tmp, offset);
  }
#ifdef VM_LITTLE_ENDIAN
  rldicl(product, product, 32, 0);
#endif
  stdx(product, z, tmp);
}

// Multiply 128 bit by 128 bit. Unrolled inner loop.
void MacroAssembler::multiply_128_x_128_loop(Register x_xstart,
                                             Register y, Register z,
                                             Register yz_idx, Register idx, Register carry,
                                             Register product_high, Register product,
                                             Register carry2, Register tmp) {

  // jlong carry, x[], y[], z[];
  // int kdx = ystart+1;
  // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
  //   huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
  //   z[kdx+idx+1] = (jlong)product;
  //   jlong carry2  = (jlong)(product >>> 64);
  //   product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
  //   z[kdx+idx] = (jlong)product;
  //   carry = (jlong)(product >>> 64);
  // }
  // idx += 2;
  // if (idx > 0) {
  //   product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
  //   z[kdx+idx] = (jlong)product;
  //   carry = (jlong)(product >>> 64);
  // }

  Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
  const Register jdx = R0;

  // Scale the index.
  srdi_(jdx, idx, 2);
  beq(CCR0, L_third_loop_exit);
  mtctr(jdx);

  align(32, 16);
  bind(L_third_loop);

  addi(idx, idx, -4);

  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8);
  mr_if_needed(carry2, product_high);

  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0);
  mr_if_needed(carry, product_high);
  bdnz(L_third_loop);

  bind(L_third_loop_exit);  // Handle any left-over operand parts.
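  // After the unrolled loop, 0..3 32-bit digits of y remain (idx & 3).
  // A remaining pair is handled by one more 128-bit multiply-add; a final
  // single digit is handled by the 32-bit path below.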

  andi_(idx, idx, 0x3);
  beq(CCR0, L_post_third_loop_done);

  Label L_check_1;

  addic_(idx, idx, -2);
  blt(CCR0, L_check_1);

  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0);
  mr_if_needed(carry, product_high);

  bind(L_check_1);

  addi(idx, idx, 0x2);
  andi_(idx, idx, 0x1);
  addic_(idx, idx, -1);
  blt(CCR0, L_post_third_loop_done);

  sldi(tmp, idx, LogBytesPerInt);
  lwzx(yz_idx, y, tmp);
  multiply64(product_high, product, x_xstart, yz_idx);
  lwzx(yz_idx, z, tmp);

  add2_with_carry(product_high, product, yz_idx, carry);

  sldi(tmp, idx, LogBytesPerInt);
  stwx(product, z, tmp);
  srdi(product, product, 32);

  sldi(product_high, product_high, 32);
  orr(product, product, product_high);
  mr_if_needed(carry, product);

  bind(L_post_third_loop_done);
}  // multiply_128_x_128_loop

void MacroAssembler::multiply_to_len(Register x, Register xlen,
                                     Register y, Register ylen,
                                     Register z, Register zlen,
                                     Register tmp1, Register tmp2,
                                     Register tmp3, Register tmp4,
                                     Register tmp5, Register tmp6,
                                     Register tmp7, Register tmp8,
                                     Register tmp9, Register tmp10,
                                     Register tmp11, Register tmp12,
                                     Register tmp13) {

  ShortBranchVerifier sbv(this);

  assert_different_registers(x, xlen, y, ylen, z, zlen,
                             tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
  assert_different_registers(x, xlen, y, ylen, z, zlen,
                             tmp1, tmp2, tmp3, tmp4, tmp5, tmp7);
  assert_different_registers(x, xlen, y, ylen, z, zlen,
                             tmp1, tmp2, tmp3, tmp4, tmp5, tmp8);

  const Register idx = tmp1;
  const Register kdx = tmp2;
  const Register xstart = tmp3;

  const Register y_idx = tmp4;
  const Register carry = tmp5;
  const Register product = tmp6;
  const Register product_high = tmp7;
  const Register x_xstart = tmp8;
  const Register tmp = tmp9;

  // First Loop.
  //
  // final static long LONG_MASK = 0xffffffffL;
  // int xstart = xlen - 1;
  // int ystart = ylen - 1;
  // long carry = 0;
  // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //   long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
  //   z[kdx] = (int)product;
  //   carry = product >>> 32;
  // }
  // z[xstart] = (int)carry;

  mr_if_needed(idx, ylen);  // idx = ylen
  mr_if_needed(kdx, zlen);  // kdx = xlen + ylen
  li(carry, 0);             // carry = 0

  Label L_done;

  addic_(xstart, xlen, -1);
  blt(CCR0, L_done);

  multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z,
                        carry, product_high, product, idx, kdx, tmp);

  Label L_second_loop;

  cmpdi(CCR0, kdx, 0);
  beq(CCR0, L_second_loop);

  Label L_carry;

  addic_(kdx, kdx, -1);
  beq(CCR0, L_carry);

  // Store lower 32 bits of carry.
  sldi(tmp, kdx, LogBytesPerInt);
  stwx(carry, z, tmp);
  srdi(carry, carry, 32);
  addi(kdx, kdx, -1);

  bind(L_carry);

  // Store upper 32 bits of carry.
  sldi(tmp, kdx, LogBytesPerInt);
  stwx(carry, z, tmp);

  // Second and third (nested) loops.
  //
  // for (int i = xstart-1; i >= 0; i--) { // Second loop
  //   carry = 0;
  //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
  //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
  //                    (z[k] & LONG_MASK) + carry;
  //     z[k] = (int)product;
  //     carry = product >>> 32;
  //   }
  //   z[i] = (int)carry;
  // }
  //
  // Register mapping: i = xlen, j = tmp1, k = tmp2, carry = tmp5;
  // x[i] is kept in x_xstart (the 'rdx' of the x86 original).

  bind(L_second_loop);

  li(carry, 0);                // carry = 0;

  addic_(xstart, xstart, -1);  // i = xstart-1;
  blt(CCR0, L_done);

  Register zsave = tmp10;

  mr(zsave, z);

  Label L_last_x;

  sldi(tmp, xstart, LogBytesPerInt);
  add(z, z, tmp);              // z = z + k - j
  addi(z, z, 4);
  addic_(xstart, xstart, -1);  // i = xstart-1;
  blt(CCR0, L_last_x);

  sldi(tmp, xstart, LogBytesPerInt);
  ldx(x_xstart, x, tmp);
#ifdef VM_LITTLE_ENDIAN
  rldicl(x_xstart, x_xstart, 32, 0);
#endif

  Label L_third_loop_prologue;

  bind(L_third_loop_prologue);

  Register xsave = tmp11;
  Register xlensave = tmp12;
  Register ylensave = tmp13;

  mr(xsave, x);
  mr(xlensave, xstart);
  mr(ylensave, ylen);

  multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen,
                          carry, product_high, product, x, tmp);

  mr(z, zsave);
  mr(x, xsave);
  mr(xlen, xlensave);  // This is the decrement of the loop counter!
  mr(ylen, ylensave);

  addi(tmp3, xlen, 1);
  sldi(tmp, tmp3, LogBytesPerInt);
  stwx(carry, z, tmp);
  addic_(tmp3, tmp3, -1);
  blt(CCR0, L_done);

  srdi(carry, carry, 32);
  sldi(tmp, tmp3, LogBytesPerInt);
  stwx(carry, z, tmp);
  b(L_second_loop);

  // Next infrequent code is moved outside loops.
  bind(L_last_x);

  lwz(x_xstart, 0, x);
  b(L_third_loop_prologue);

  bind(L_done);
}  // multiply_to_len

void MacroAssembler::asm_assert(bool check_equal, const char *msg, int id) {
#ifdef ASSERT
  Label ok;
  if (check_equal) {
    beq(CCR0, ok);
  } else {
    bne(CCR0, ok);
  }
  stop(msg, id);
  bind(ok);
#endif
}

void MacroAssembler::asm_assert_mems_zero(bool check_equal, int size, int mem_offset,
                                          Register mem_base, const char* msg, int id) {
#ifdef ASSERT
  switch (size) {
    case 4:
      lwz(R0, mem_offset, mem_base);
      cmpwi(CCR0, R0, 0);
      break;
    case 8:
      ld(R0, mem_offset, mem_base);
      cmpdi(CCR0, R0, 0);
      break;
    default:
      ShouldNotReachHere();
  }
  asm_assert(check_equal, msg, id);
#endif // ASSERT
}

void MacroAssembler::verify_thread() {
  if (VerifyThread) {
    unimplemented("'VerifyThread' currently not implemented on PPC");
  }
}

// READ: oop. KILL: R0. Volatile floats perhaps.
void MacroAssembler::verify_oop(Register oop, const char* msg) {
  if (!VerifyOops) {
    return;
  }

  address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
  const Register tmp = R11;  // Will be preserved.
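  // The verify stub is reached via a C call, so all volatile GPRs must
  // survive: they are spilled below the current SP and restored after the
  // call. A hedged sketch of the sequence in C terms (illustrative only):
  //   save volatiles; save LR/CR; push ABI frame;
  //   (*verify_oop_subroutine)(msg /*R3_ARG1*/, oop /*R4_ARG2*/);
  //   pop frame; restore LR/CR; restore volatiles.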
  const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
  save_volatile_gprs(R1_SP, -nbytes_save);  // except R0

  mr_if_needed(R4_ARG2, oop);
  save_LR_CR(tmp);  // save in old frame
  push_frame_reg_args(nbytes_save, tmp);
  // load FunctionDescriptor** / entry_address *
  load_const_optimized(tmp, fd, R0);
  // load FunctionDescriptor* / entry_address
  ld(tmp, 0, tmp);
  load_const_optimized(R3_ARG1, (address)msg, R0);
  // Call destination for its side effect.
  call_c(tmp);

  pop_frame();
  restore_LR_CR(tmp);
  restore_volatile_gprs(R1_SP, -nbytes_save);  // except R0
}

void MacroAssembler::verify_oop_addr(RegisterOrConstant offs, Register base, const char* msg) {
  if (!VerifyOops) {
    return;
  }

  address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
  const Register tmp = R11;  // Will be preserved.
  const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
  save_volatile_gprs(R1_SP, -nbytes_save);  // except R0

  ld(R4_ARG2, offs, base);
  save_LR_CR(tmp);  // save in old frame
  push_frame_reg_args(nbytes_save, tmp);
  // load FunctionDescriptor** / entry_address *
  load_const_optimized(tmp, fd, R0);
  // load FunctionDescriptor* / entry_address
  ld(tmp, 0, tmp);
  load_const_optimized(R3_ARG1, (address)msg, R0);
  // Call destination for its side effect.
  call_c(tmp);

  pop_frame();
  restore_LR_CR(tmp);
  restore_volatile_gprs(R1_SP, -nbytes_save);  // except R0
}

const char* stop_types[] = {
  "stop",
  "untested",
  "unimplemented",
  "shouldnotreachhere"
};

static void stop_on_request(int tp, const char* msg) {
  tty->print("PPC assembly code requires stop: (%s) %s\n", stop_types[tp%/*stop_end*/4], msg);
  guarantee(false, "PPC assembly code requires stop: %s", msg);
}

// Call a C-function that prints output.
void MacroAssembler::stop(int type, const char* msg, int id) {
#ifndef PRODUCT
  block_comment(err_msg("stop: %s %s {", stop_types[type%stop_end], msg));
#else
  block_comment("stop {");
#endif

  // setup arguments
  load_const_optimized(R3_ARG1, type);
  load_const_optimized(R4_ARG2, (void *)msg, /*tmp=*/R0);
  call_VM_leaf(CAST_FROM_FN_PTR(address, stop_on_request), R3_ARG1, R4_ARG2);
  illtrap();
  emit_int32(id);
  block_comment("} stop;");
}

#ifndef PRODUCT
// Write the pattern 0x0101010101010101 over the memory region
// [low - before, high + after], with before/after scaled by BytesPerWord.
// Val and addr are temp registers.
// If low == addr, addr is killed.
// High is preserved.
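// A hedged C sketch of the intended effect (names illustrative):
//   for (intptr_t* p = low - before; p <= high + after; p++)
//     *p = 0x0101010101010101;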
void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) {
  if (!ZapMemory) return;

  assert_different_registers(low, val);

  BLOCK_COMMENT("zap memory region {");
  load_const_optimized(val, 0x0101010101010101);
  int size = before + after;
  if (low == high && size < 5 && size > 0) {
    int offset = -before*BytesPerWord;
    for (int i = 0; i < size; ++i) {
      std(val, offset, low);
      offset += (1*BytesPerWord);
    }
  } else {
    addi(addr, low, -before*BytesPerWord);
    assert_different_registers(high, val);
    if (after) addi(high, high, after * BytesPerWord);
    Label loop;
    bind(loop);
    std(val, 0, addr);
    addi(addr, addr, 8);
    cmpd(CCR6, addr, high);
    ble(CCR6, loop);
    if (after) addi(high, high, -after * BytesPerWord);  // Correct back to old value.
  }
  BLOCK_COMMENT("} zap memory region");
}

#endif // !PRODUCT

SkipIfEqualZero::SkipIfEqualZero(MacroAssembler* masm, Register temp, const bool* flag_addr) : _masm(masm), _label() {
  int simm16_offset = masm->load_const_optimized(temp, (address)flag_addr, R0, true);
  assert(sizeof(bool) == 1, "PowerPC ABI");
  masm->lbz(temp, simm16_offset, temp);
  masm->cmpwi(CCR0, temp, 0);
  masm->beq(CCR0, _label);
}

SkipIfEqualZero::~SkipIfEqualZero() {
  _masm->bind(_label);
}
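
// Usage sketch for SkipIfEqualZero (hedged; 'masm', 'R11_scratch1' and the
// flag name are illustrative, not taken from this file): the RAII object
// emits the flag test at construction and binds the skip target in its
// destructor, so code emitted inside the scope only runs when the flag
// byte is non-zero.
//
//   {
//     SkipIfEqualZero skip_if(masm, R11_scratch1, &SomeVMFlag);
//     // ... code emitted here executes only if SomeVMFlag != 0 ...
//   } // destructor binds the label; execution joins here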