/*
 * Copyright (c) 1997, 2016, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2016 SAP SE. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "compiler/disassembler.hpp"
#include "gc/shared/cardTableModRefBS.hpp"
#include "gc/shared/collectedHeap.inline.hpp"
#include "interpreter/interpreter.hpp"
#include "memory/resourceArea.hpp"
#include "nativeInst_ppc.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/biasedLocking.hpp"
#include "runtime/icache.hpp"
#include "runtime/interfaceSupport.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/os.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/macros.hpp"
#if INCLUDE_ALL_GCS
#include "gc/g1/g1CollectedHeap.inline.hpp"
#include "gc/g1/g1SATBCardTableModRefBS.hpp"
#include "gc/g1/heapRegion.hpp"
#endif // INCLUDE_ALL_GCS
#ifdef COMPILER2
#include "opto/intrinsicnode.hpp"
#endif

#ifdef PRODUCT
#define BLOCK_COMMENT(str) // nothing
#else
#define BLOCK_COMMENT(str) block_comment(str)
#endif
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

#ifdef ASSERT
// On RISC, there's no benefit to verifying instruction boundaries.
bool AbstractAssembler::pd_check_instruction_mark() { return false; }
#endif

void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) {
  assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range");
  if (Assembler::is_simm(si31, 16)) {
    ld(d, si31, a);
    if (emit_filler_nop) nop();
  } else {
    const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31);
    const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31);
    addis(d, a, hi);
    ld(d, lo, d);
  }
}

void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) {
  assert_different_registers(d, a);
  ld_largeoffset_unchecked(d, si31, a, emit_filler_nop);
}
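
// Note on the split above: largeoffset_si16_si16_hi/_lo decompose a
// non-negative 31-bit offset into two signed 16-bit halves such that
// (hi << 16) + (int16_t)lo == offset; hi is rounded up by one whenever the
// sign-extended lo half is negative. The same pair also feeds the
// addis/addi sequence in calculate_address_from_global_toc below.
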
void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base,
                                      size_t size_in_bytes, bool is_signed) {
  switch (size_in_bytes) {
  case 8: ld(dst, offs, base); break;
  case 4: is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break;
  case 2: is_signed ? lha(dst, offs, base) : lhz(dst, offs, base); break;
  case 1: lbz(dst, offs, base); if (is_signed) extsb(dst, dst); break; // lba doesn't exist :(
  default: ShouldNotReachHere();
  }
}

void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base,
                                       size_t size_in_bytes) {
  switch (size_in_bytes) {
  case 8: std(dst, offs, base); break;
  case 4: stw(dst, offs, base); break;
  case 2: sth(dst, offs, base); break;
  case 1: stb(dst, offs, base); break;
  default: ShouldNotReachHere();
  }
}

void MacroAssembler::align(int modulus, int max, int rem) {
  int padding = (rem + modulus - (offset() % modulus)) % modulus;
  if (padding > max) return;
  for (int c = (padding >> 2); c > 0; --c) { nop(); }
}

// Issue instructions that calculate given TOC from global TOC.
void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16,
                                                       bool add_relocation, bool emit_dummy_addr) {
  int offset = -1;
  if (emit_dummy_addr) {
    offset = -128; // dummy address
  } else if (addr != (address)(intptr_t)-1) {
    offset = MacroAssembler::offset_to_global_toc(addr);
  }

  if (hi16) {
    addis(dst, R29_TOC, MacroAssembler::largeoffset_si16_si16_hi(offset));
  }
  if (lo16) {
    if (add_relocation) {
      // Relocate at the addi to avoid confusion with a load from the method's TOC.
      relocate(internal_word_Relocation::spec(addr));
    }
    addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset));
  }
}

int MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) {
  const int offset = MacroAssembler::offset_to_global_toc(addr);

  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the addi,
  // and the addi reads and writes the same register dst.
  const int dst = inv_rt_field(inst2);
  assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");

  // Now, find the preceding addis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  while (inst1_addr >= bound) {
    inst1 = *(int *) inst1_addr;
    if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
      // Stop, found the addis which writes dst.
      break;
    }
    inst1_addr -= BytesPerInstWord;
  }

  assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
  set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset));
  set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset));
  return (int)((intptr_t)addr - (intptr_t)inst1_addr);
}
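
// The sequence patched above (and decoded below) has the shape
//   addis dst, R29_TOC, offset_hi   // dst = global TOC + (hi16 << 16)
//   addi  dst, dst,     offset_lo   // dst += sign-extended lo16
// possibly with unrelated instructions scheduled between the addis and the
// addi, which is why both functions scan backwards for the addis.
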
address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) {
  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the addi,
  // and the addi reads and writes the same register dst.
  const int dst = inv_rt_field(inst2);
  assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");

  // Now, find the preceding addis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  while (inst1_addr >= bound) {
    inst1 = *(int *) inst1_addr;
    if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
      // stop, found the addis which writes dst
      break;
    }
    inst1_addr -= BytesPerInstWord;
  }

  assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");

  int offset = (get_imm(inst1_addr, 0) << 16) + get_imm(inst2_addr, 0);
  // -1 is a special case
  if (offset == -1) {
    return (address)(intptr_t)-1;
  } else {
    return global_toc() + offset;
  }
}

#ifdef _LP64
// Patch compressed oops or klass constants.
// Assembler sequence is
// 1) compressed oops:
//    lis  rx = const.hi
//    ori  rx = rx | const.lo
// 2) compressed klass:
//    lis  rx = const.hi
//    clrldi rx = rx & 0xFFFFffff // clearMS32b, optional
//    ori  rx = rx | const.lo
// Clrldi will be passed by.
int MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) {
  assert(UseCompressedOops, "Should only patch compressed oops");

  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the ori,
  // and the ori reads and writes the same register dst.
  const int dst = inv_rta_field(inst2);
  assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
  // Now, find the preceding addis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  bool inst1_found = false;
  while (inst1_addr >= bound) {
    inst1 = *(int *)inst1_addr;
    if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
    inst1_addr -= BytesPerInstWord;
  }
  assert(inst1_found, "inst is not lis");

  int xc = (data >> 16) & 0xffff;
  int xd = (data >>  0) & 0xffff;

  set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo
  set_imm((int *)inst2_addr, (xd));        // unsigned int
  return (int)((intptr_t)inst2_addr - (intptr_t)inst1_addr);
}

// Get compressed oop or klass constant.
narrowOop MacroAssembler::get_narrow_oop(address a, address bound) {
  assert(UseCompressedOops, "Should only patch compressed oops");

  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the ori,
  // and the ori reads and writes the same register dst.
  const int dst = inv_rta_field(inst2);
  assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
  // Now, find the preceding lis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  bool inst1_found = false;

  while (inst1_addr >= bound) {
    inst1 = *(int *) inst1_addr;
    if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
    inst1_addr -= BytesPerInstWord;
  }
  assert(inst1_found, "inst is not lis");

  uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff));
  uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16);

  return (int) (xl | xh);
}
#endif // _LP64
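
// The load emitted by load_const_from_method_toc below is either a single
//   ld dst, toc_offset(toc)
// or, when toc_offset does not fit into a signed 16-bit immediate,
//   addis dst, toc, hi16
//   ld    dst, lo16(dst)
// (see ld_largeoffset_unchecked above).
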
// Returns true if successful.
bool MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a,
                                                Register toc, bool fixed_size) {
  int toc_offset = 0;
  // Use RelocationHolder::none for the constant pool entry, otherwise
  // we will end up with a failing NativeCall::verify(x) where x is
  // the address of the constant pool entry.
  // FIXME: We should insert relocation information for oops at the constant
  // pool entries instead of inserting it at the loads; patching of a constant
  // pool entry should be less expensive.
  address const_address = address_constant((address)a.value(), RelocationHolder::none);
  if (const_address == NULL) { return false; } // allocation failure
  // Relocate at the pc of the load.
  relocate(a.rspec());
  toc_offset = (int)(const_address - code()->consts()->start());
  ld_largeoffset_unchecked(dst, toc_offset, toc, fixed_size);
  return true;
}

bool MacroAssembler::is_load_const_from_method_toc_at(address a) {
  const address inst1_addr = a;
  const int inst1 = *(int *)inst1_addr;

  // The relocation points to the ld or the addis.
  return (is_ld(inst1)) ||
         (is_addis(inst1) && inv_ra_field(inst1) != 0);
}

int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) {
  assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc");

  const address inst1_addr = a;
  const int inst1 = *(int *)inst1_addr;

  if (is_ld(inst1)) {
    return inv_d1_field(inst1);
  } else if (is_addis(inst1)) {
    const int dst = inv_rt_field(inst1);

    // Now, find the succeeding ld which reads and writes to dst.
    address inst2_addr = inst1_addr + BytesPerInstWord;
    int inst2 = 0;
    while (true) {
      inst2 = *(int *) inst2_addr;
      if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) {
        // Stop, found the ld which reads and writes dst.
        break;
      }
      inst2_addr += BytesPerInstWord;
    }
    return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2);
  }
  ShouldNotReachHere();
  return 0;
}

// Get the constant from a `load_const' sequence.
long MacroAssembler::get_const(address a) {
  assert(is_load_const_at(a), "not a load of a constant");
  const int *p = (const int*) a;
  unsigned long x = (((unsigned long) (get_imm(a,0) & 0xffff)) << 48);
  if (is_ori(*(p+1))) {
    x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 32);
    x |= (((unsigned long) (get_imm(a,3) & 0xffff)) << 16);
    x |= (((unsigned long) (get_imm(a,4) & 0xffff)));
  } else if (is_lis(*(p+1))) {
    x |= (((unsigned long) (get_imm(a,2) & 0xffff)) << 32);
    x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 16);
    x |= (((unsigned long) (get_imm(a,3) & 0xffff)));
  } else {
    ShouldNotReachHere();
    return (long) 0;
  }
  return (long) x;
}
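
// A `load_const' sequence consists of five instructions. The two shapes
// decoded above and patched below are, roughly:
//   lis d, a; ori d, d, b; sldi d, d, 32; oris d, d, c; ori d, d, e
// (the is_ori(*(p+1)) case), and a variant using a scratch register in
// which both 32-bit halves are built with lis/ori pairs before being
// combined (the is_lis(*(p+1)) case); see load_const for the encodings.
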
// Patch the 64 bit constant of a `load_const' sequence. This is a low
// level procedure. It neither flushes the instruction cache nor is it
// mt safe.
void MacroAssembler::patch_const(address a, long x) {
  assert(is_load_const_at(a), "not a load of a constant");
  int *p = (int*) a;
  if (is_ori(*(p+1))) {
    set_imm(0 + p, (x >> 48) & 0xffff);
    set_imm(1 + p, (x >> 32) & 0xffff);
    set_imm(3 + p, (x >> 16) & 0xffff);
    set_imm(4 + p, x & 0xffff);
  } else if (is_lis(*(p+1))) {
    set_imm(0 + p, (x >> 48) & 0xffff);
    set_imm(2 + p, (x >> 32) & 0xffff);
    set_imm(1 + p, (x >> 16) & 0xffff);
    set_imm(3 + p, x & 0xffff);
  } else {
    ShouldNotReachHere();
  }
}

AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) {
  assert(oop_recorder() != NULL, "this assembler needs a Recorder");
  int index = oop_recorder()->allocate_metadata_index(obj);
  RelocationHolder rspec = metadata_Relocation::spec(index);
  return AddressLiteral((address)obj, rspec);
}

AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) {
  assert(oop_recorder() != NULL, "this assembler needs a Recorder");
  int index = oop_recorder()->find_index(obj);
  RelocationHolder rspec = metadata_Relocation::spec(index);
  return AddressLiteral((address)obj, rspec);
}

AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) {
  assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int oop_index = oop_recorder()->allocate_oop_index(obj);
  return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
}

AddressLiteral MacroAssembler::constant_oop_address(jobject obj) {
  assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int oop_index = oop_recorder()->find_index(obj);
  return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
}

RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
                                                      Register tmp, int offset) {
  intptr_t value = *delayed_value_addr;
  if (value != 0) {
    return RegisterOrConstant(value + offset);
  }

  // Load indirectly to solve generation ordering problem.
  // static address, no relocation
  int simm16_offset = load_const_optimized(tmp, delayed_value_addr, noreg, true);
  ld(tmp, simm16_offset, tmp); // must be aligned ((xa & 3) == 0)

  if (offset != 0) {
    addi(tmp, tmp, offset);
  }

  return RegisterOrConstant(tmp);
}

#ifndef PRODUCT
void MacroAssembler::pd_print_patched_instruction(address branch) {
  Unimplemented(); // TODO: PPC port
}
#endif // ndef PRODUCT
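
// bc_far emits a conditional branch that can reach any destination in the
// code cache. The shapes it can take ("variants", see also
// set_dest_of_bc_far_at below) are:
//   variant 1: bcxx  DEST; nop              (DEST reachable by a bcxx)
//   variant 2: b!cxx SKIP; bxx DEST; SKIP:  (generic case, emitted here)
//   variant 3: nop; endgroup                (branch to the next instruction,
//                                            patched away entirely)
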
// Conditional far branch for destinations encodable in 24+2 bits.
void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) {

  // If requested by flag optimize, relocate the bc_far as a
  // runtime_call and prepare for optimizing it when the code gets
  // relocated.
  if (optimize == bc_far_optimize_on_relocate) {
    relocate(relocInfo::runtime_call_type);
  }

  // variant 2:
  //
  //   b!cxx SKIP
  //   bxx   DEST
  // SKIP:
  //

  const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
                                                opposite_bcond(inv_boint_bcond(boint)));

  // We emit two branches.
  // First, a conditional branch which jumps around the far branch.
  const address not_taken_pc = pc() + 2 * BytesPerInstWord;
  const address bc_pc = pc();
  bc(opposite_boint, biint, not_taken_pc);

  const int bc_instr = *(int*)bc_pc;
  assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition");
  assert(opposite_boint == inv_bo_field(bc_instr), "postcondition");
  assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))),
                                     opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))),
         "postcondition");
  assert(biint == inv_bi_field(bc_instr), "postcondition");

  // Second, an unconditional far branch which jumps to dest.
  // Note: target(dest) remembers the current pc (see CodeSection::target)
  //       and returns the current pc if the label is not bound yet; when
  //       the label gets bound, the unconditional far branch will be patched.
  const address target_pc = target(dest);
  const address b_pc = pc();
  b(target_pc);

  assert(not_taken_pc == pc(),                 "postcondition");
  assert(dest.is_bound() || target_pc == b_pc, "postcondition");
}

// 1 or 2 instructions
void MacroAssembler::bc_far_optimized(int boint, int biint, Label& dest) {
  if (dest.is_bound() && is_within_range_of_bcxx(target(dest), pc())) {
    bc(boint, biint, dest);
  } else {
    bc_far(boint, biint, dest, MacroAssembler::bc_far_optimize_on_relocate);
  }
}

bool MacroAssembler::is_bc_far_at(address instruction_addr) {
  return is_bc_far_variant1_at(instruction_addr) ||
         is_bc_far_variant2_at(instruction_addr) ||
         is_bc_far_variant3_at(instruction_addr);
}

address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) {
  if (is_bc_far_variant1_at(instruction_addr)) {
    const address instruction_1_addr = instruction_addr;
    const int instruction_1 = *(int*)instruction_1_addr;
    return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr);
  } else if (is_bc_far_variant2_at(instruction_addr)) {
    const address instruction_2_addr = instruction_addr + 4;
    return bxx_destination(instruction_2_addr);
  } else if (is_bc_far_variant3_at(instruction_addr)) {
    return instruction_addr + 8;
  }
  // variant 4 ???
  ShouldNotReachHere();
  return NULL;
}
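
// set_dest_of_bc_far_at (below) repatches the two-instruction sequence in
// place: a small CodeBuffer is pointed at the existing instructions, the
// appropriate variant is re-emitted over them, and the icache is flushed
// explicitly afterwards.
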
void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) {

  if (is_bc_far_variant3_at(instruction_addr)) {
    // variant 3, far cond branch to the next instruction, already patched to nops:
    //
    //   nop
    //   endgroup
    // SKIP/DEST:
    //
    return;
  }

  // first, extract boint and biint from the current branch
  int boint = 0;
  int biint = 0;

  ResourceMark rm;
  const int code_size = 2 * BytesPerInstWord;
  CodeBuffer buf(instruction_addr, code_size);
  MacroAssembler masm(&buf);
  if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) {
    // Far branch to next instruction: Optimize it by patching nops (produce variant 3).
    masm.nop();
    masm.endgroup();
  } else {
    if (is_bc_far_variant1_at(instruction_addr)) {
      // variant 1, the 1st instruction contains the destination address:
      //
      //   bcxx DEST
      //   nop
      //
      const int instruction_1 = *(int*)(instruction_addr);
      boint = inv_bo_field(instruction_1);
      biint = inv_bi_field(instruction_1);
    } else if (is_bc_far_variant2_at(instruction_addr)) {
      // variant 2, the 2nd instruction contains the destination address:
      //
      //   b!cxx SKIP
      //   bxx   DEST
      // SKIP:
      //
      const int instruction_1 = *(int*)(instruction_addr);
      boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))),
                                 opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1))));
      biint = inv_bi_field(instruction_1);
    } else {
      // variant 4???
      ShouldNotReachHere();
    }

    // second, set the new branch destination and optimize the code
    if (dest != instruction_addr + 4 && // the bc_far is still unbound!
        masm.is_within_range_of_bcxx(dest, instruction_addr)) {
      // variant 1:
      //
      //   bcxx DEST
      //   nop
      //
      masm.bc(boint, biint, dest);
      masm.nop();
    } else {
      // variant 2:
      //
      //   b!cxx SKIP
      //   bxx   DEST
      // SKIP:
      //
      const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
                                                    opposite_bcond(inv_boint_bcond(boint)));
      const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord;
      masm.bc(opposite_boint, biint, not_taken_pc);
      masm.b(dest);
    }
  }
  ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
}
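
// bxx64_patchable (below) emits a fixed-size, 7-instruction call/jump so
// the destination can be repatched in place later:
//   variant 1b: mr R0,R11; addis+addi R11 (dest via global TOC);
//               mtctr R11; mr R11,R0; nop; bctr[l]
//   variant 2:  a plain b/bl to the destination, padded with six nops
// Both shapes occupy bxx64_patchable_size bytes.
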
// Emit a NOT mt-safe patchable 64 bit absolute call/jump.
void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) {
  // get current pc
  uint64_t start_pc = (uint64_t) pc();

  const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last
  const address pc_of_b  = (address) (start_pc + (0*BytesPerInstWord)); // b is first

  // relocate here
  if (rt != relocInfo::none) {
    relocate(rt);
  }

  if ( ReoptimizeCallSequences &&
       (( link && is_within_range_of_b(dest, pc_of_bl)) ||
        (!link && is_within_range_of_b(dest, pc_of_b)))) {
    // variant 2:
    // Emit an optimized, pc-relative call/jump.

    if (link) {
      // some padding
      nop();
      nop();
      nop();
      nop();
      nop();
      nop();

      // do the call
      assert(pc() == pc_of_bl, "just checking");
      bl(dest, relocInfo::none);
    } else {
      // do the jump
      assert(pc() == pc_of_b, "just checking");
      b(dest, relocInfo::none);

      // some padding
      nop();
      nop();
      nop();
      nop();
      nop();
      nop();
    }

    // Assert that we can identify the emitted call/jump.
    assert(is_bxx64_patchable_variant2_at((address)start_pc, link),
           "can't identify emitted call");
  } else {
    // variant 1:
    mr(R0, R11); // spill R11 -> R0.

    // Load the destination address into CTR,
    // calculate destination relative to global toc.
    calculate_address_from_global_toc(R11, dest, true, true, false);

    mtctr(R11);
    mr(R11, R0); // spill R11 <- R0.
    nop();

    // do the call/jump
    if (link) {
      bctrl();
    } else {
      bctr();
    }
    // Assert that we can identify the emitted call/jump.
    assert(is_bxx64_patchable_variant1b_at((address)start_pc, link),
           "can't identify emitted call");
  }

  // Assert that we can identify the emitted call/jump.
  assert(is_bxx64_patchable_at((address)start_pc, link),
         "can't identify emitted call");
  assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest,
         "wrong encoding of dest address");
}

// Identify a bxx64_patchable instruction.
bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) {
  return is_bxx64_patchable_variant1b_at(instruction_addr, link)
    //|| is_bxx64_patchable_variant1_at(instruction_addr, link)
      || is_bxx64_patchable_variant2_at(instruction_addr, link);
}

// Does the call64_patchable instruction use a pc-relative encoding of
// the call destination?
bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) {
  // variant 2 is pc-relative
  return is_bxx64_patchable_variant2_at(instruction_addr, link);
}

// Identify variant 1.
bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) {
  unsigned int* instr = (unsigned int*) instruction_addr;
  return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
      && is_mtctr(instr[5]) // mtctr
      && is_load_const_at(instruction_addr);
}

// Identify variant 1b: load destination relative to global toc.
bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) {
  unsigned int* instr = (unsigned int*) instruction_addr;
  return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
      && is_mtctr(instr[3]) // mtctr
      && is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr);
}

// Identify variant 2.
bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) {
  unsigned int* instr = (unsigned int*) instruction_addr;
  if (link) {
    return is_bl (instr[6])  // bl dest is last
        && is_nop(instr[0])  // nop
        && is_nop(instr[1])  // nop
        && is_nop(instr[2])  // nop
        && is_nop(instr[3])  // nop
        && is_nop(instr[4])  // nop
        && is_nop(instr[5]); // nop
  } else {
    return is_b  (instr[0])  // b dest is first
        && is_nop(instr[1])  // nop
        && is_nop(instr[2])  // nop
        && is_nop(instr[3])  // nop
        && is_nop(instr[4])  // nop
        && is_nop(instr[5])  // nop
        && is_nop(instr[6]); // nop
  }
}

// Set dest address of a bxx64_patchable instruction.
void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) {
  ResourceMark rm;
  int code_size = MacroAssembler::bxx64_patchable_size;
  CodeBuffer buf(instruction_addr, code_size);
  MacroAssembler masm(&buf);
  masm.bxx64_patchable(dest, relocInfo::none, link);
  ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
}

// Get dest address of a bxx64_patchable instruction.
address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) {
  if (is_bxx64_patchable_variant1_at(instruction_addr, link)) {
    return (address) (unsigned long) get_const(instruction_addr);
  } else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) {
    unsigned int* instr = (unsigned int*) instruction_addr;
    if (link) {
      const int instr_idx = 6; // bl is last
      int branchoffset = branch_destination(instr[instr_idx], 0);
      return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
    } else {
      const int instr_idx = 0; // b is first
      int branchoffset = branch_destination(instr[instr_idx], 0);
      return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
    }
  // Load dest relative to global toc.
  } else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) {
    return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord,
                                                               instruction_addr);
  } else {
    ShouldNotReachHere();
    return NULL;
  }
}

// Uses ordering which corresponds to ABI:
//    _savegpr0_14: std r14,-144(r1)
//    _savegpr0_15: std r15,-136(r1)
//    _savegpr0_16: std r16,-128(r1)
void MacroAssembler::save_nonvolatile_gprs(Register dst, int offset) {
  std(R14, offset, dst); offset += 8;
  std(R15, offset, dst); offset += 8;
  std(R16, offset, dst); offset += 8;
  std(R17, offset, dst); offset += 8;
  std(R18, offset, dst); offset += 8;
  std(R19, offset, dst); offset += 8;
  std(R20, offset, dst); offset += 8;
  std(R21, offset, dst); offset += 8;
  std(R22, offset, dst); offset += 8;
  std(R23, offset, dst); offset += 8;
  std(R24, offset, dst); offset += 8;
  std(R25, offset, dst); offset += 8;
  std(R26, offset, dst); offset += 8;
  std(R27, offset, dst); offset += 8;
  std(R28, offset, dst); offset += 8;
  std(R29, offset, dst); offset += 8;
  std(R30, offset, dst); offset += 8;
  std(R31, offset, dst); offset += 8;

  stfd(F14, offset, dst); offset += 8;
  stfd(F15, offset, dst); offset += 8;
  stfd(F16, offset, dst); offset += 8;
  stfd(F17, offset, dst); offset += 8;
  stfd(F18, offset, dst); offset += 8;
  stfd(F19, offset, dst); offset += 8;
  stfd(F20, offset, dst); offset += 8;
  stfd(F21, offset, dst); offset += 8;
  stfd(F22, offset, dst); offset += 8;
  stfd(F23, offset, dst); offset += 8;
  stfd(F24, offset, dst); offset += 8;
  stfd(F25, offset, dst); offset += 8;
  stfd(F26, offset, dst); offset += 8;
  stfd(F27, offset, dst); offset += 8;
  stfd(F28, offset, dst); offset += 8;
  stfd(F29, offset, dst); offset += 8;
  stfd(F30, offset, dst); offset += 8;
  stfd(F31, offset, dst);
}
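
// The save area written above (and read back below) holds 18 GPRs
// (R14-R31) followed by 18 FPRs (F14-F31), 8 bytes each:
// 36 slots * 8 bytes = 288 bytes starting at 'offset'.
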
// Uses ordering which corresponds to ABI:
//    _restgpr0_14: ld r14,-144(r1)
//    _restgpr0_15: ld r15,-136(r1)
//    _restgpr0_16: ld r16,-128(r1)
void MacroAssembler::restore_nonvolatile_gprs(Register src, int offset) {
  ld(R14, offset, src); offset += 8;
  ld(R15, offset, src); offset += 8;
  ld(R16, offset, src); offset += 8;
  ld(R17, offset, src); offset += 8;
  ld(R18, offset, src); offset += 8;
  ld(R19, offset, src); offset += 8;
  ld(R20, offset, src); offset += 8;
  ld(R21, offset, src); offset += 8;
  ld(R22, offset, src); offset += 8;
  ld(R23, offset, src); offset += 8;
  ld(R24, offset, src); offset += 8;
  ld(R25, offset, src); offset += 8;
  ld(R26, offset, src); offset += 8;
  ld(R27, offset, src); offset += 8;
  ld(R28, offset, src); offset += 8;
  ld(R29, offset, src); offset += 8;
  ld(R30, offset, src); offset += 8;
  ld(R31, offset, src); offset += 8;

  // FP registers
  lfd(F14, offset, src); offset += 8;
  lfd(F15, offset, src); offset += 8;
  lfd(F16, offset, src); offset += 8;
  lfd(F17, offset, src); offset += 8;
  lfd(F18, offset, src); offset += 8;
  lfd(F19, offset, src); offset += 8;
  lfd(F20, offset, src); offset += 8;
  lfd(F21, offset, src); offset += 8;
  lfd(F22, offset, src); offset += 8;
  lfd(F23, offset, src); offset += 8;
  lfd(F24, offset, src); offset += 8;
  lfd(F25, offset, src); offset += 8;
  lfd(F26, offset, src); offset += 8;
  lfd(F27, offset, src); offset += 8;
  lfd(F28, offset, src); offset += 8;
  lfd(F29, offset, src); offset += 8;
  lfd(F30, offset, src); offset += 8;
  lfd(F31, offset, src);
}

// For verify_oops.
void MacroAssembler::save_volatile_gprs(Register dst, int offset) {
  std(R2,  offset, dst); offset += 8;
  std(R3,  offset, dst); offset += 8;
  std(R4,  offset, dst); offset += 8;
  std(R5,  offset, dst); offset += 8;
  std(R6,  offset, dst); offset += 8;
  std(R7,  offset, dst); offset += 8;
  std(R8,  offset, dst); offset += 8;
  std(R9,  offset, dst); offset += 8;
  std(R10, offset, dst); offset += 8;
  std(R11, offset, dst); offset += 8;
  std(R12, offset, dst); offset += 8;

  stfd(F0,  offset, dst); offset += 8;
  stfd(F1,  offset, dst); offset += 8;
  stfd(F2,  offset, dst); offset += 8;
  stfd(F3,  offset, dst); offset += 8;
  stfd(F4,  offset, dst); offset += 8;
  stfd(F5,  offset, dst); offset += 8;
  stfd(F6,  offset, dst); offset += 8;
  stfd(F7,  offset, dst); offset += 8;
  stfd(F8,  offset, dst); offset += 8;
  stfd(F9,  offset, dst); offset += 8;
  stfd(F10, offset, dst); offset += 8;
  stfd(F11, offset, dst); offset += 8;
  stfd(F12, offset, dst); offset += 8;
  stfd(F13, offset, dst);
}

// For verify_oops.
void MacroAssembler::restore_volatile_gprs(Register src, int offset) {
  ld(R2,  offset, src); offset += 8;
  ld(R3,  offset, src); offset += 8;
  ld(R4,  offset, src); offset += 8;
  ld(R5,  offset, src); offset += 8;
  ld(R6,  offset, src); offset += 8;
  ld(R7,  offset, src); offset += 8;
  ld(R8,  offset, src); offset += 8;
  ld(R9,  offset, src); offset += 8;
  ld(R10, offset, src); offset += 8;
  ld(R11, offset, src); offset += 8;
  ld(R12, offset, src); offset += 8;

  lfd(F0,  offset, src); offset += 8;
  lfd(F1,  offset, src); offset += 8;
  lfd(F2,  offset, src); offset += 8;
  lfd(F3,  offset, src); offset += 8;
  lfd(F4,  offset, src); offset += 8;
  lfd(F5,  offset, src); offset += 8;
  lfd(F6,  offset, src); offset += 8;
  lfd(F7,  offset, src); offset += 8;
  lfd(F8,  offset, src); offset += 8;
  lfd(F9,  offset, src); offset += 8;
  lfd(F10, offset, src); offset += 8;
  lfd(F11, offset, src); offset += 8;
  lfd(F12, offset, src); offset += 8;
  lfd(F13, offset, src);
}
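
// Typical pairing in frame-managing code (sketch, not prescriptive):
// save_LR_CR(tmp) before push_frame(...), and pop_frame() followed by
// restore_LR_CR(tmp) before returning; tmp is clobbered in both directions.
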
void MacroAssembler::save_LR_CR(Register tmp) {
  mfcr(tmp);
  std(tmp, _abi(cr), R1_SP);
  mflr(tmp);
  std(tmp, _abi(lr), R1_SP);
  // Tmp must contain lr on exit! (see return_addr and prolog in ppc64.ad)
}

void MacroAssembler::restore_LR_CR(Register tmp) {
  assert(tmp != R1_SP, "must be distinct");
  ld(tmp, _abi(lr), R1_SP);
  mtlr(tmp);
  ld(tmp, _abi(cr), R1_SP);
  mtcr(tmp);
}

address MacroAssembler::get_PC_trash_LR(Register result) {
  Label L;
  bl(L);
  bind(L);
  address lr_pc = pc();
  mflr(result);
  return lr_pc;
}

void MacroAssembler::resize_frame(Register offset, Register tmp) {
#ifdef ASSERT
  assert_different_registers(offset, tmp, R1_SP);
  andi_(tmp, offset, frame::alignment_in_bytes-1);
  asm_assert_eq("resize_frame: unaligned", 0x204);
#endif

  // tmp <- *(SP)
  ld(tmp, _abi(callers_sp), R1_SP);
  // addr <- SP + offset;
  // *(addr) <- tmp;
  // SP <- addr
  stdux(tmp, R1_SP, offset);
}

void MacroAssembler::resize_frame(int offset, Register tmp) {
  assert(is_simm(offset, 16), "too big an offset");
  assert_different_registers(tmp, R1_SP);
  assert((offset & (frame::alignment_in_bytes-1)) == 0, "resize_frame: unaligned");
  // tmp <- *(SP)
  ld(tmp, _abi(callers_sp), R1_SP);
  // addr <- SP + offset;
  // *(addr) <- tmp;
  // SP <- addr
  stdu(tmp, offset, R1_SP);
}

void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) {
  // (addr == tmp1) || (addr == tmp2) is allowed here!
  assert(tmp1 != tmp2, "must be distinct");

  // compute offset w.r.t. current stack pointer
  // tmp_1 <- addr - SP (!)
  subf(tmp1, R1_SP, addr);

  // atomically update SP keeping back link.
  resize_frame(tmp1 /* offset */, tmp2 /* tmp */);
}

void MacroAssembler::push_frame(Register bytes, Register tmp) {
#ifdef ASSERT
  assert(bytes != R0, "r0 not allowed here");
  andi_(R0, bytes, frame::alignment_in_bytes-1);
  asm_assert_eq("push_frame(Reg, Reg): unaligned", 0x203);
#endif
  neg(tmp, bytes);
  stdux(R1_SP, R1_SP, tmp);
}

// Push a frame of size `bytes'.
void MacroAssembler::push_frame(unsigned int bytes, Register tmp) {
  long offset = align_addr(bytes, frame::alignment_in_bytes);
  if (is_simm(-offset, 16)) {
    stdu(R1_SP, -offset, R1_SP);
  } else {
    load_const_optimized(tmp, -offset);
    stdux(R1_SP, R1_SP, tmp);
  }
}

// Push a frame of size `bytes' plus abi_reg_args on top.
void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) {
  push_frame(bytes + frame::abi_reg_args_size, tmp);
}

// Set up a new C frame with a spill area for non-volatile GPRs and
// additional space for local variables.
void MacroAssembler::push_frame_reg_args_nonvolatiles(unsigned int bytes,
                                                      Register tmp) {
  push_frame(bytes + frame::abi_reg_args_size + frame::spill_nonvolatiles_size, tmp);
}

// Pop current C frame.
void MacroAssembler::pop_frame() {
  ld(R1_SP, _abi(callers_sp), R1_SP);
}
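
// The frame helpers above depend on stdu/stdux writing the back link and
// updating R1_SP in a single instruction, so the chain of frames stays
// walkable at every instruction boundary.
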
#if defined(ABI_ELFv2)
address MacroAssembler::branch_to(Register r_function_entry, bool and_link) {
  // TODO(asmundak): make sure the caller uses R12 as function descriptor
  // most of the time.
  if (R12 != r_function_entry) {
    mr(R12, r_function_entry);
  }
  mtctr(R12);
  // Do a call or a branch.
  if (and_link) {
    bctrl();
  } else {
    bctr();
  }
  _last_calls_return_pc = pc();

  return _last_calls_return_pc;
}

// Call a C function via a function descriptor and use full C
// calling conventions. Updates and returns _last_calls_return_pc.
address MacroAssembler::call_c(Register r_function_entry) {
  return branch_to(r_function_entry, /*and_link=*/true);
}

// For tail calls: only branch, don't link, so callee returns to caller of this function.
address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) {
  return branch_to(r_function_entry, /*and_link=*/false);
}

address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) {
  load_const(R12, function_entry, R0);
  return branch_to(R12, /*and_link=*/true);
}

#else
// Generic version of a call to C function via a function descriptor
// with variable support for C calling conventions (TOC, ENV, etc.).
// Updates and returns _last_calls_return_pc.
address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call,
                                  bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) {
  // we emit standard ptrgl glue code here
  assert((function_descriptor != R0), "function_descriptor cannot be R0");

  // retrieve necessary entries from the function descriptor
  ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor);
  mtctr(R0);

  if (load_toc_of_callee) {
    ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor);
  }
  if (load_env_of_callee) {
    ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor);
  } else if (load_toc_of_callee) {
    li(R11, 0);
  }

  // do a call or a branch
  if (and_link) {
    bctrl();
  } else {
    bctr();
  }
  _last_calls_return_pc = pc();

  return _last_calls_return_pc;
}

// Call a C function via a function descriptor and use full C calling
// conventions.
// We don't use the TOC in generated code, so there is no need to save
// and restore its value.
address MacroAssembler::call_c(Register fd) {
  return branch_to(fd, /*and_link=*/true,
                       /*save toc=*/false,
                       /*restore toc=*/false,
                       /*load toc=*/true,
                       /*load env=*/true);
}

address MacroAssembler::call_c_and_return_to_caller(Register fd) {
  return branch_to(fd, /*and_link=*/false,
                       /*save toc=*/false,
                       /*restore toc=*/false,
                       /*load toc=*/true,
                       /*load env=*/true);
}
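
// A "friend function" in the FunctionDescriptor sense shares our TOC and
// needs no environment pointer, so a call to it can be shortened to a
// plain bl / bl64_patchable without reloading R2_TOC or R11 (see below).
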
address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) {
  if (rt != relocInfo::none) {
    // this call needs to be relocatable
    if (!ReoptimizeCallSequences
        || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
        || fd == NULL // support code-size estimation
        || !fd->is_friend_function()
        || fd->entry() == NULL) {
      // it's not a friend function as defined by class FunctionDescriptor,
      // so do a full call-c here.
      load_const(R11, (address)fd, R0);

      bool has_env = (fd != NULL && fd->env() != NULL);
      return branch_to(R11, /*and_link=*/true,
                            /*save toc=*/false,
                            /*restore toc=*/false,
                            /*load toc=*/true,
                            /*load env=*/has_env);
    } else {
      // It's a friend function. Load the entry point and don't care about
      // toc and env. Use an optimizable call instruction, but ensure the
      // same code-size as in the case of a non-friend function.
      nop();
      nop();
      nop();
      bl64_patchable(fd->entry(), rt);
      _last_calls_return_pc = pc();
      return _last_calls_return_pc;
    }
  } else {
    // This call does not need to be relocatable, do more aggressive
    // optimizations.
    if (!ReoptimizeCallSequences
        || !fd->is_friend_function()) {
      // It's not a friend function as defined by class FunctionDescriptor,
      // so do a full call-c here.
      load_const(R11, (address)fd, R0);
      return branch_to(R11, /*and_link=*/true,
                            /*save toc=*/false,
                            /*restore toc=*/false,
                            /*load toc=*/true,
                            /*load env=*/true);
    } else {
      // it's a friend function, load the entry point and don't care about
      // toc and env.
      address dest = fd->entry();
      if (is_within_range_of_b(dest, pc())) {
        bl(dest);
      } else {
        bl64_patchable(dest, rt);
      }
      _last_calls_return_pc = pc();
      return _last_calls_return_pc;
    }
  }
}

// Call a C function. All constants needed reside in TOC.
//
// Read the address to call from the TOC.
// Read env from TOC, if fd specifies an env.
// Read new TOC from TOC.
address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd,
                                         relocInfo::relocType rt, Register toc) {
  if (!ReoptimizeCallSequences
      || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
      || !fd->is_friend_function()) {
    // It's not a friend function as defined by class FunctionDescriptor,
    // so do a full call-c here.
    assert(fd->entry() != NULL, "function must be linked");

    AddressLiteral fd_entry(fd->entry());
    bool success = load_const_from_method_toc(R11, fd_entry, toc, /*fixed_size*/ true);
    mtctr(R11);
    if (fd->env() == NULL) {
      li(R11, 0);
      nop();
    } else {
      AddressLiteral fd_env(fd->env());
      success = success && load_const_from_method_toc(R11, fd_env, toc, /*fixed_size*/ true);
    }
    AddressLiteral fd_toc(fd->toc());
    // Set R2_TOC (load from toc)
    success = success && load_const_from_method_toc(R2_TOC, fd_toc, toc, /*fixed_size*/ true);
    bctrl();
    _last_calls_return_pc = pc();
    if (!success) { return NULL; }
  } else {
    // It's a friend function, load the entry point and don't care about
    // toc and env. Use an optimizable call instruction, but ensure the
    // same code-size as in the case of a non-friend function.
    nop();
    bl64_patchable(fd->entry(), rt);
    _last_calls_return_pc = pc();
  }
  return _last_calls_return_pc;
}
#endif // ABI_ELFv2
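
// call_VM_base implements the shared tail of all call_VM variants below:
// it records the last Java frame, passes the current thread in R3_ARG1
// (VM entry points take JavaThread* as their first argument), performs the
// call, resets the frame info, and fetches a returned oop, if any, from
// the thread.
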
void MacroAssembler::call_VM_base(Register oop_result,
                                  Register last_java_sp,
                                  address entry_point,
                                  bool check_exceptions) {
  BLOCK_COMMENT("call_VM {");
  // Determine last_java_sp register.
  if (!last_java_sp->is_valid()) {
    last_java_sp = R1_SP;
  }
  set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1);

  // ARG1 must hold thread address.
  mr(R3_ARG1, R16_thread);
#if defined(ABI_ELFv2)
  address return_pc = call_c(entry_point, relocInfo::none);
#else
  address return_pc = call_c((FunctionDescriptor*)entry_point, relocInfo::none);
#endif

  reset_last_Java_frame();

  // Check for pending exceptions.
  if (check_exceptions) {
    // We don't check for exceptions here.
    ShouldNotReachHere();
  }

  // Get oop result if there is one and reset the value in the thread.
  if (oop_result->is_valid()) {
    get_vm_result(oop_result);
  }

  _last_calls_return_pc = return_pc;
  BLOCK_COMMENT("} call_VM");
}

void MacroAssembler::call_VM_leaf_base(address entry_point) {
  BLOCK_COMMENT("call_VM_leaf {");
#if defined(ABI_ELFv2)
  call_c(entry_point, relocInfo::none);
#else
  call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, entry_point), relocInfo::none);
#endif
  BLOCK_COMMENT("} call_VM_leaf");
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions) {
  call_VM_base(oop_result, noreg, entry_point, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread.
  mr_if_needed(R4_ARG2, arg_1);
  call_VM(oop_result, entry_point, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread
  mr_if_needed(R4_ARG2, arg_1);
  assert(arg_2 != R4_ARG2, "smashed argument");
  mr_if_needed(R5_ARG3, arg_2);
  call_VM(oop_result, entry_point, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread
  mr_if_needed(R4_ARG2, arg_1);
  assert(arg_2 != R4_ARG2, "smashed argument");
  mr_if_needed(R5_ARG3, arg_2);
  mr_if_needed(R6_ARG4, arg_3);
  call_VM(oop_result, entry_point, check_exceptions);
}

void MacroAssembler::call_VM_leaf(address entry_point) {
  call_VM_leaf_base(entry_point);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) {
  mr_if_needed(R3_ARG1, arg_1);
  call_VM_leaf(entry_point);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) {
  mr_if_needed(R3_ARG1, arg_1);
  assert(arg_2 != R3_ARG1, "smashed argument");
  mr_if_needed(R4_ARG2, arg_2);
  call_VM_leaf(entry_point);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) {
  mr_if_needed(R3_ARG1, arg_1);
  assert(arg_2 != R3_ARG1, "smashed argument");
  mr_if_needed(R4_ARG2, arg_2);
  assert(arg_3 != R3_ARG1 && arg_3 != R4_ARG2, "smashed argument");
  mr_if_needed(R5_ARG3, arg_3);
  call_VM_leaf(entry_point);
}
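
// load_from_polling_page effectively emits 'ld R0, 0(Rpoll)'; the matcher
// below therefore requires rt == 0 (R0), ds == 0 and ra != 0.
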
// Check whether instruction is a read access to the polling page
// which was emitted by load_from_polling_page(..).
bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext,
                                               address* polling_address_ptr) {
  if (!is_ld(instruction))
    return false; // It's not a ld. Fail.

  int rt = inv_rt_field(instruction);
  int ra = inv_ra_field(instruction);
  int ds = inv_ds_field(instruction);
  if (!(ds == 0 && ra != 0 && rt == 0)) {
    return false; // It's not a ld(r0, X, ra). Fail.
  }

  if (!ucontext) {
    // Set polling address.
    if (polling_address_ptr != NULL) {
      *polling_address_ptr = NULL;
    }
    return true; // No ucontext given. Can't check value of ra. Assume true.
  }

#ifdef LINUX
  // Ucontext given. Check that register ra contains the address of
  // the safepoint polling page.
  ucontext_t* uc = (ucontext_t*) ucontext;
  // Set polling address.
  address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds;
  if (polling_address_ptr != NULL) {
    *polling_address_ptr = addr;
  }
  return os::is_poll_address(addr);
#else
  // Not on Linux, ucontext must be NULL.
  ShouldNotReachHere();
  return false;
#endif
}

bool MacroAssembler::is_memory_serialization(int instruction, JavaThread* thread, void* ucontext) {
#ifdef LINUX
  ucontext_t* uc = (ucontext_t*) ucontext;

  if (is_stwx(instruction) || is_stwux(instruction)) {
    int ra = inv_ra_field(instruction);
    int rb = inv_rb_field(instruction);

    // look up content of ra and rb in ucontext
    address ra_val = (address)uc->uc_mcontext.regs->gpr[ra];
    long    rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
    return os::is_memory_serialize_page(thread, ra_val + rb_val);
  } else if (is_stw(instruction) || is_stwu(instruction)) {
    int ra = inv_ra_field(instruction);
    int d1 = inv_d1_field(instruction);

    // look up content of ra in ucontext
    address ra_val = (address)uc->uc_mcontext.regs->gpr[ra];
    return os::is_memory_serialize_page(thread, ra_val + d1);
  } else {
    return false;
  }
#else
  // workaround not needed on !LINUX :-)
  ShouldNotCallThis();
  return false;
#endif
}

void MacroAssembler::bang_stack_with_offset(int offset) {
  // When increasing the stack, the old stack pointer will be written
  // to the new top of stack according to the PPC64 abi.
  // Therefore, stack banging is not necessary when increasing
  // the stack by <= os::vm_page_size() bytes.
  // When increasing the stack by a larger amount, this method is
  // called repeatedly to bang the intermediate pages.

  // Stack grows down, caller passes positive offset.
  assert(offset > 0, "must bang with positive offset");

  long stdoffset = -offset;

  if (is_simm(stdoffset, 16)) {
    // Signed 16 bit offset, a simple std is ok.
    if (UseLoadInstructionsForStackBangingPPC64) {
      ld(R0, (int)(signed short)stdoffset, R1_SP);
    } else {
      std(R0, (int)(signed short)stdoffset, R1_SP);
    }
  } else if (is_simm(stdoffset, 31)) {
    const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset);
    const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset);

    Register tmp = R11;
    addis(tmp, R1_SP, hi);
    if (UseLoadInstructionsForStackBangingPPC64) {
      ld(R0, lo, tmp);
    } else {
      std(R0, lo, tmp);
    }
  } else {
    ShouldNotReachHere();
  }
}
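
// Note: with UseLoadInstructionsForStackBangingPPC64 the bang above uses a
// load instead of a store; either access faults on a protected guard page,
// which is all the bang needs to achieve.
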
// If instruction is a stack bang of the form
//    std    R0,    x(Ry)        (see bang_stack_with_offset())
//    stdu   R1_SP, x(R1_SP)     (see push_frame(), resize_frame())
// or stdux  R1_SP, Rx, R1_SP    (see push_frame(), resize_frame())
// return the banged address. Otherwise, return 0.
address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) {
#ifdef LINUX
  ucontext_t* uc = (ucontext_t*) ucontext;
  int rs = inv_rs_field(instruction);
  int ra = inv_ra_field(instruction);
  if (   (is_ld(instruction)   && rs == 0 &&  UseLoadInstructionsForStackBangingPPC64)
      || (is_std(instruction)  && rs == 0 && !UseLoadInstructionsForStackBangingPPC64)
      || (is_stdu(instruction) && rs == 1)) {
    int ds = inv_ds_field(instruction);
    // return banged address
    return ds+(address)uc->uc_mcontext.regs->gpr[ra];
  } else if (is_stdux(instruction) && rs == 1) {
    int rb = inv_rb_field(instruction);
    address sp = (address)uc->uc_mcontext.regs->gpr[1];
    long rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
    return ra != 1 || rb_val >= 0 ? NULL         // not a stack bang
                                  : sp + rb_val; // banged address
  }
  return NULL; // not a stack bang
#else
  // workaround not needed on !LINUX :-)
  ShouldNotCallThis();
  return NULL;
#endif
}

void MacroAssembler::reserved_stack_check(Register return_pc) {
  // Test if reserved zone needs to be enabled.
  Label no_reserved_zone_enabling;

  ld_ptr(R0, JavaThread::reserved_stack_activation_offset(), R16_thread);
  cmpld(CCR0, R1_SP, R0);
  blt_predict_taken(CCR0, no_reserved_zone_enabling);

  // Enable reserved zone again, throw stack overflow exception.
  push_frame_reg_args(0, R0);
  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), R16_thread);
  pop_frame();
  mtlr(return_pc);
  load_const_optimized(R0, StubRoutines::throw_delayed_StackOverflowError_entry());
  mtctr(R0);
  bctr();

  should_not_reach_here();

  bind(no_reserved_zone_enabling);
}
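
// The cmpxchg* helpers below emulate compare-and-swap with a lwarx/stwcx_
// (resp. ldarx/stdcx_) retry loop. Memory ordering is chosen via the
// 'semantics' bits: MemBarRel emits a release barrier before the update,
// MemBarAcq an isync afterwards, and MemBarFenceAfter a full fence.
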
// CmpxchgX sets condition register to cmpX(current, compare).
void MacroAssembler::cmpxchgw(ConditionRegister flag, Register dest_current_value,
                              Register compare_value, Register exchange_value,
                              Register addr_base, int semantics, bool cmpxchgx_hint,
                              Register int_flag_success, bool contention_hint, bool weak) {
  Label retry;
  Label failed;
  Label done;

  // Save one branch if result is returned via register and
  // result register is different from the other ones.
  bool use_result_reg    = (int_flag_success != noreg);
  bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value &&
                            int_flag_success != exchange_value && int_flag_success != addr_base);
  assert(!weak || flag == CCR0, "weak only supported with CCR0");

  if (use_result_reg && preset_result_reg) {
    li(int_flag_success, 0); // preset (assume cas failed)
  }

  // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
  if (contention_hint) { // Don't try to reserve if cmp fails.
    lwz(dest_current_value, 0, addr_base);
    cmpw(flag, dest_current_value, compare_value);
    bne(flag, failed);
  }

  // release/fence semantics
  if (semantics & MemBarRel) {
    release();
  }

  // atomic emulation loop
  bind(retry);

  lwarx(dest_current_value, addr_base, cmpxchgx_hint);
  cmpw(flag, dest_current_value, compare_value);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(flag, failed);
  } else {
    bne(                  flag, failed);
  }
  // branch to done  => (flag == ne), (dest_current_value != compare_value)
  // fall through    => (flag == eq), (dest_current_value == compare_value)

  stwcx_(exchange_value, addr_base);
  if (!weak || use_result_reg) {
    if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
      bne_predict_not_taken(CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
    } else {
      bne(                  CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
    }
  }
  // fall through    => (flag == eq), (dest_current_value == compare_value), (swapped)

  // Result in register (must do this at the end because int_flag_success can be the
  // same register as one above).
  if (use_result_reg) {
    li(int_flag_success, 1);
  }

  if (semantics & MemBarFenceAfter) {
    fence();
  } else if (semantics & MemBarAcq) {
    isync();
  }

  if (use_result_reg && !preset_result_reg) {
    b(done);
  }

  bind(failed);
  if (use_result_reg && !preset_result_reg) {
    li(int_flag_success, 0);
  }

  bind(done);
  // (flag == ne) => (dest_current_value != compare_value), (!swapped)
  // (flag == eq) => (dest_current_value == compare_value), ( swapped)
}
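
// Usage sketch (register names Rcur/Rcmp/Rnew/Raddr are placeholders,
// trailing parameters keep their defaults):
//   cmpxchgw(CCR0, Rcur, Rcmp, Rnew, Raddr,
//            MacroAssembler::MemBarFenceAfter,
//            MacroAssembler::cmpxchgx_hint_atomic_update());
// Afterwards, CCR0 'eq' indicates that the exchange took place.
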
// Performs atomic compare exchange:
//   if (compare_value == *addr_base)
//     *addr_base = exchange_value
//     int_flag_success = 1;
//   else
//     int_flag_success = 0;
//
// ConditionRegister flag      = cmp(compare_value, *addr_base)
// Register dest_current_value = *addr_base
// Register compare_value      Used to compare with value in memory
// Register exchange_value     Written to memory if compare_value == *addr_base
// Register addr_base          The memory location to compareXChange
// Register int_flag_success   Set to 1 if exchange_value was written to *addr_base
//
// To avoid the costly compare exchange the value is tested beforehand.
// Several special cases exist to avoid that unnecessary information is generated.
//
void MacroAssembler::cmpxchgd(ConditionRegister flag,
                              Register dest_current_value, RegisterOrConstant compare_value, Register exchange_value,
                              Register addr_base, int semantics, bool cmpxchgx_hint,
                              Register int_flag_success, Label* failed_ext, bool contention_hint, bool weak) {
  Label retry;
  Label failed_int;
  Label& failed = (failed_ext != NULL) ? *failed_ext : failed_int;
  Label done;

  // Save one branch if result is returned via register and result register is different from the other ones.
  bool use_result_reg    = (int_flag_success != noreg);
  bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value.register_or_noreg() &&
                            int_flag_success != exchange_value && int_flag_success != addr_base);
  assert(!weak || flag == CCR0, "weak only supported with CCR0");
  assert(int_flag_success == noreg || failed_ext == NULL, "cannot have both");

  if (use_result_reg && preset_result_reg) {
    li(int_flag_success, 0); // preset (assume cas failed)
  }

  // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
  if (contention_hint) { // Don't try to reserve if cmp fails.
    ld(dest_current_value, 0, addr_base);
    cmpd(flag, compare_value, dest_current_value);
    bne(flag, failed);
  }

  // release/fence semantics
  if (semantics & MemBarRel) {
    release();
  }

  // atomic emulation loop
  bind(retry);

  ldarx(dest_current_value, addr_base, cmpxchgx_hint);
  cmpd(flag, compare_value, dest_current_value);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(flag, failed);
  } else {
    bne(                  flag, failed);
  }

  stdcx_(exchange_value, addr_base);
  if (!weak || use_result_reg || failed_ext) {
    if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
      bne_predict_not_taken(CCR0, weak ? failed : retry); // stXcx_ sets CCR0
    } else {
      bne(                  CCR0, weak ? failed : retry); // stXcx_ sets CCR0
    }
  }

  // result in register (must do this at the end because int_flag_success can be the same register as one above)
  if (use_result_reg) {
    li(int_flag_success, 1);
  }

  if (semantics & MemBarFenceAfter) {
    fence();
  } else if (semantics & MemBarAcq) {
    isync();
  }

  if (use_result_reg && !preset_result_reg) {
    b(done);
  }

  bind(failed_int);
  if (use_result_reg && !preset_result_reg) {
    li(int_flag_success, 0);
  }

  bind(done);
  // (flag == ne) => (dest_current_value != compare_value), (!swapped)
  // (flag == eq) => (dest_current_value == compare_value), ( swapped)
}
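
// Itable layout (conceptual): after the embedded vtable comes an array of
// itableOffsetEntries (interface klass, offset), terminated by a NULL
// interface; each offset points to the per-interface array of
// itableMethodEntries holding the Method*s.
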
1617 int vtable_base = in_bytes(Klass::vtable_start_offset()); 1618 int itentry_off = itableMethodEntry::method_offset_in_bytes(); 1619 int logMEsize = exact_log2(itableMethodEntry::size() * wordSize); 1620 int scan_step = itableOffsetEntry::size() * wordSize; 1621 int log_vte_size= exact_log2(vtableEntry::size_in_bytes()); 1622 1623 lwz(scan_temp, in_bytes(Klass::vtable_length_offset()), recv_klass); 1624 // %%% We should store the aligned, prescaled offset in the klassoop. 1625 // Then the next several instructions would fold away. 1626 1627 sldi(scan_temp, scan_temp, log_vte_size); 1628 addi(scan_temp, scan_temp, vtable_base); 1629 add(scan_temp, recv_klass, scan_temp); 1630 1631 // Adjust recv_klass by scaled itable_index, so we can free itable_index. 1632 if (itable_index.is_register()) { 1633 Register itable_offset = itable_index.as_register(); 1634 sldi(itable_offset, itable_offset, logMEsize); 1635 if (itentry_off) addi(itable_offset, itable_offset, itentry_off); 1636 add(recv_klass, itable_offset, recv_klass); 1637 } else { 1638 long itable_offset = (long)itable_index.as_constant(); 1639 load_const_optimized(sethi_temp, (itable_offset<<logMEsize)+itentry_off); // static address, no relocation 1640 add(recv_klass, sethi_temp, recv_klass); 1641 } 1642 1643 // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) { 1644 // if (scan->interface() == intf) { 1645 // result = (klass + scan->offset() + itable_index); 1646 // } 1647 // } 1648 Label search, found_method; 1649 1650 for (int peel = 1; peel >= 0; peel--) { 1651 // %%%% Could load both offset and interface in one ldx, if they were 1652 // in the opposite order. This would save a load. 1653 ld(method_result, itableOffsetEntry::interface_offset_in_bytes(), scan_temp); 1654 1655 // Check that this entry is non-null. A null entry means that 1656 // the receiver class doesn't implement the interface, and wasn't the 1657 // same as when the caller was compiled. 1658 cmpd(CCR0, method_result, intf_klass); 1659 1660 if (peel) { 1661 beq(CCR0, found_method); 1662 } else { 1663 bne(CCR0, search); 1664 // (invert the test to fall through to found_method...) 1665 } 1666 1667 if (!peel) break; 1668 1669 bind(search); 1670 1671 cmpdi(CCR0, method_result, 0); 1672 beq(CCR0, L_no_such_interface); 1673 addi(scan_temp, scan_temp, scan_step); 1674 } 1675 1676 bind(found_method); 1677 1678 // Got a hit. 
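// (sketch, names illustrative) method_result =
//     *(recv_klass + itable_index * itableMethodEntry_size + itentry_off + scan->offset());
// recv_klass was pre-adjusted by the scaled itable_index above; scan_temp points at the hit entry.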
1679 int ito_offset = itableOffsetEntry::offset_offset_in_bytes(); 1680 lwz(scan_temp, ito_offset, scan_temp); 1681 ldx(method_result, scan_temp, recv_klass); 1682 } 1683 1684 // virtual method calling 1685 void MacroAssembler::lookup_virtual_method(Register recv_klass, 1686 RegisterOrConstant vtable_index, 1687 Register method_result) { 1688 1689 assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg()); 1690 1691 const int base = in_bytes(Klass::vtable_start_offset()); 1692 assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below"); 1693 1694 if (vtable_index.is_register()) { 1695 sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord); 1696 add(recv_klass, vtable_index.as_register(), recv_klass); 1697 } else { 1698 addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord); 1699 } 1700 ld(R19_method, base + vtableEntry::method_offset_in_bytes(), recv_klass); 1701 } 1702 1703 /////////////////////////////////////////// subtype checking //////////////////////////////////////////// 1704 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass, 1705 Register super_klass, 1706 Register temp1_reg, 1707 Register temp2_reg, 1708 Label* L_success, 1709 Label* L_failure, 1710 Label* L_slow_path, 1711 RegisterOrConstant super_check_offset) { 1712 1713 const Register check_cache_offset = temp1_reg; 1714 const Register cached_super = temp2_reg; 1715 1716 assert_different_registers(sub_klass, super_klass, check_cache_offset, cached_super); 1717 1718 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1719 int sc_offset = in_bytes(Klass::secondary_super_cache_offset()); 1720 1721 bool must_load_sco = (super_check_offset.constant_or_zero() == -1); 1722 bool need_slow_path = (must_load_sco || super_check_offset.constant_or_zero() == sco_offset); 1723 1724 Label L_fallthrough; 1725 int label_nulls = 0; 1726 if (L_success == NULL) { L_success = &L_fallthrough; label_nulls++; } 1727 if (L_failure == NULL) { L_failure = &L_fallthrough; label_nulls++; } 1728 if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; } 1729 assert(label_nulls <= 1 || 1730 (L_slow_path == &L_fallthrough && label_nulls <= 2 && !need_slow_path), 1731 "at most one NULL in the batch, usually"); 1732 1733 // If the pointers are equal, we are done (e.g., String[] elements). 1734 // This self-check enables sharing of secondary supertype arrays among 1735 // non-primary types such as array-of-interface. Otherwise, each such 1736 // type would need its own customized SSA. 1737 // We move this check to the front of the fast path because many 1738 // type checks are in fact trivially successful in this manner, 1739 // so we get a nicely predicted branch right at the start of the check. 1740 cmpd(CCR0, sub_klass, super_klass); 1741 beq(CCR0, *L_success); 1742 1743 // Check the supertype display: 1744 if (must_load_sco) { 1745 // The super check offset is always positive... 1746 lwz(check_cache_offset, sco_offset, super_klass); 1747 super_check_offset = RegisterOrConstant(check_cache_offset); 1748 // super_check_offset is register. 1749 assert_different_registers(sub_klass, super_klass, cached_super, super_check_offset.as_register()); 1750 } 1751 // The loaded value is the offset from KlassOopDesc. 1752 1753 ld(cached_super, super_check_offset, sub_klass); 1754 cmpd(CCR0, cached_super, super_klass); 1755 1756 // This check has worked decisively for primary supers. 
1757 // Secondary supers are sought in the super_cache ('super_cache_addr'). 1758 // (Secondary supers are interfaces and very deeply nested subtypes.) 1759 // This works in the same check above because of a tricky aliasing 1760 // between the super_cache and the primary super display elements. 1761 // (The 'super_check_addr' can address either, as the case requires.) 1762 // Note that the cache is updated below if it does not help us find 1763 // what we need immediately. 1764 // So if it was a primary super, we can just fail immediately. 1765 // Otherwise, it's the slow path for us (no success at this point). 1766 1767 #define FINAL_JUMP(label) if (&(label) != &L_fallthrough) { b(label); } 1768 1769 if (super_check_offset.is_register()) { 1770 beq(CCR0, *L_success); 1771 cmpwi(CCR0, super_check_offset.as_register(), sc_offset); 1772 if (L_failure == &L_fallthrough) { 1773 beq(CCR0, *L_slow_path); 1774 } else { 1775 bne(CCR0, *L_failure); 1776 FINAL_JUMP(*L_slow_path); 1777 } 1778 } else { 1779 if (super_check_offset.as_constant() == sc_offset) { 1780 // Need a slow path; fast failure is impossible. 1781 if (L_slow_path == &L_fallthrough) { 1782 beq(CCR0, *L_success); 1783 } else { 1784 bne(CCR0, *L_slow_path); 1785 FINAL_JUMP(*L_success); 1786 } 1787 } else { 1788 // No slow path; it's a fast decision. 1789 if (L_failure == &L_fallthrough) { 1790 beq(CCR0, *L_success); 1791 } else { 1792 bne(CCR0, *L_failure); 1793 FINAL_JUMP(*L_success); 1794 } 1795 } 1796 } 1797 1798 bind(L_fallthrough); 1799 #undef FINAL_JUMP 1800 } 1801 1802 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass, 1803 Register super_klass, 1804 Register temp1_reg, 1805 Register temp2_reg, 1806 Label* L_success, 1807 Register result_reg) { 1808 const Register array_ptr = temp1_reg; // current value from cache array 1809 const Register temp = temp2_reg; 1810 1811 assert_different_registers(sub_klass, super_klass, array_ptr, temp); 1812 1813 int source_offset = in_bytes(Klass::secondary_supers_offset()); 1814 int target_offset = in_bytes(Klass::secondary_super_cache_offset()); 1815 1816 int length_offset = Array<Klass*>::length_offset_in_bytes(); 1817 int base_offset = Array<Klass*>::base_offset_in_bytes(); 1818 1819 Label hit, loop, failure, fallthru; 1820 1821 ld(array_ptr, source_offset, sub_klass); 1822 1823 // TODO: PPC port: assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated."); 1824 lwz(temp, length_offset, array_ptr); 1825 cmpwi(CCR0, temp, 0); 1826 beq(CCR0, result_reg!=noreg ? failure : fallthru); // length 0 1827 1828 mtctr(temp); // load ctr 1829 1830 bind(loop); 1831 // Oops in table are NO MORE compressed. 
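// (sketch of this scan; ctr was loaded with the array length)
//   do {
//     if (*array_ptr == super_klass) goto hit;
//     array_ptr += BytesPerWord;
//   } while (--ctr != 0);   // falls through to failure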
1832 ld(temp, base_offset, array_ptr); 1833 cmpd(CCR0, temp, super_klass); 1834 beq(CCR0, hit); 1835 addi(array_ptr, array_ptr, BytesPerWord); 1836 bdnz(loop); 1837 1838 bind(failure); 1839 if (result_reg!=noreg) li(result_reg, 1); // load non-zero result (indicates a miss) 1840 b(fallthru); 1841 1842 bind(hit); 1843 std(super_klass, target_offset, sub_klass); // save result to cache 1844 if (result_reg != noreg) { li(result_reg, 0); } // load zero result (indicates a hit) 1845 if (L_success != NULL) { b(*L_success); } 1846 else if (result_reg == noreg) { blr(); } // return with CR0.eq if neither label nor result reg provided 1847 1848 bind(fallthru); 1849 } 1850 1851 // Try fast path, then go to slow one if not successful 1852 void MacroAssembler::check_klass_subtype(Register sub_klass, 1853 Register super_klass, 1854 Register temp1_reg, 1855 Register temp2_reg, 1856 Label& L_success) { 1857 Label L_failure; 1858 check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success, &L_failure); 1859 check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success); 1860 bind(L_failure); // Fallthru if not successful. 1861 } 1862 1863 void MacroAssembler::check_method_handle_type(Register mtype_reg, Register mh_reg, 1864 Register temp_reg, 1865 Label& wrong_method_type) { 1866 assert_different_registers(mtype_reg, mh_reg, temp_reg); 1867 // Compare method type against that of the receiver. 1868 load_heap_oop_not_null(temp_reg, delayed_value(java_lang_invoke_MethodHandle::type_offset_in_bytes, temp_reg), mh_reg); 1869 cmpd(CCR0, temp_reg, mtype_reg); 1870 bne(CCR0, wrong_method_type); 1871 } 1872 1873 RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot, 1874 Register temp_reg, 1875 int extra_slot_offset) { 1876 // cf. TemplateTable::prepare_invoke(), if (load_receiver). 1877 int stackElementSize = Interpreter::stackElementSize; 1878 int offset = extra_slot_offset * stackElementSize; 1879 if (arg_slot.is_constant()) { 1880 offset += arg_slot.as_constant() * stackElementSize; 1881 return offset; 1882 } else { 1883 assert(temp_reg != noreg, "must specify"); 1884 sldi(temp_reg, arg_slot.as_register(), exact_log2(stackElementSize)); 1885 if (offset != 0) 1886 addi(temp_reg, temp_reg, offset); 1887 return temp_reg; 1888 } 1889 } 1890 1891 // Supports temp2_reg = R0. 1892 void MacroAssembler::biased_locking_enter(ConditionRegister cr_reg, Register obj_reg, 1893 Register mark_reg, Register temp_reg, 1894 Register temp2_reg, Label& done, Label* slow_case) { 1895 assert(UseBiasedLocking, "why call this otherwise?"); 1896 1897 #ifdef ASSERT 1898 assert_different_registers(obj_reg, mark_reg, temp_reg, temp2_reg); 1899 #endif 1900 1901 Label cas_label; 1902 1903 // Branch to done if fast path fails and no slow_case provided. 1904 Label *slow_case_int = (slow_case != NULL) ? 
slow_case : &done; 1905 1906 // Biased locking 1907 // See whether the lock is currently biased toward our thread and 1908 // whether the epoch is still valid 1909 // Note that the runtime guarantees sufficient alignment of JavaThread 1910 // pointers to allow age to be placed into low bits 1911 assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, 1912 "biased locking makes assumptions about bit layout"); 1913 1914 if (PrintBiasedLockingStatistics) { 1915 load_const(temp2_reg, (address) BiasedLocking::total_entry_count_addr(), temp_reg); 1916 lwzx(temp_reg, temp2_reg); 1917 addi(temp_reg, temp_reg, 1); 1918 stwx(temp_reg, temp2_reg); 1919 } 1920 1921 andi(temp_reg, mark_reg, markOopDesc::biased_lock_mask_in_place); 1922 cmpwi(cr_reg, temp_reg, markOopDesc::biased_lock_pattern); 1923 bne(cr_reg, cas_label); 1924 1925 load_klass(temp_reg, obj_reg); 1926 1927 load_const_optimized(temp2_reg, ~((int) markOopDesc::age_mask_in_place)); 1928 ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg); 1929 orr(temp_reg, R16_thread, temp_reg); 1930 xorr(temp_reg, mark_reg, temp_reg); 1931 andr(temp_reg, temp_reg, temp2_reg); 1932 cmpdi(cr_reg, temp_reg, 0); 1933 if (PrintBiasedLockingStatistics) { 1934 Label l; 1935 bne(cr_reg, l); 1936 load_const(temp2_reg, (address) BiasedLocking::biased_lock_entry_count_addr()); 1937 lwzx(mark_reg, temp2_reg); 1938 addi(mark_reg, mark_reg, 1); 1939 stwx(mark_reg, temp2_reg); 1940 // restore mark_reg 1941 ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg); 1942 bind(l); 1943 } 1944 beq(cr_reg, done); 1945 1946 Label try_revoke_bias; 1947 Label try_rebias; 1948 1949 // At this point we know that the header has the bias pattern and 1950 // that we are not the bias owner in the current epoch. We need to 1951 // figure out more details about the state of the header in order to 1952 // know what operations can be legally performed on the object's 1953 // header. 1954 1955 // If the low three bits in the xor result aren't clear, that means 1956 // the prototype header is no longer biased and we have to revoke 1957 // the bias on this object. 1958 andi(temp2_reg, temp_reg, markOopDesc::biased_lock_mask_in_place); 1959 cmpwi(cr_reg, temp2_reg, 0); 1960 bne(cr_reg, try_revoke_bias); 1961 1962 // Biasing is still enabled for this data type. See whether the 1963 // epoch of the current bias is still valid, meaning that the epoch 1964 // bits of the mark word are equal to the epoch bits of the 1965 // prototype header. (Note that the prototype header's epoch bits 1966 // only change at a safepoint.) If not, attempt to rebias the object 1967 // toward the current thread. Note that we must be absolutely sure 1968 // that the current epoch is invalid in order to do this because 1969 // otherwise the manipulations it performs on the mark word are 1970 // illegal. 1971 1972 int shift_amount = 64 - markOopDesc::epoch_shift; 1973 // rotate epoch bits to right (little) end and set other bits to 0 1974 // [ big part | epoch | little part ] -> [ 0..0 | epoch ] 1975 rldicl_(temp2_reg, temp_reg, shift_amount, 64 - markOopDesc::epoch_bits); 1976 // branch if epoch bits are != 0, i.e. they differ, because the epoch has been incremented 1977 bne(CCR0, try_rebias); 1978 1979 // The epoch of the current bias is still valid but we know nothing 1980 // about the owner; it might be set or it might be clear. Try to 1981 // acquire the bias of the object using an atomic operation. 
If this
1982 // fails we will go into the runtime to revoke the object's bias.
1983 // Note that we first construct the presumed unbiased header so we
1984 // don't accidentally blow away another thread's valid bias.
1985 andi(mark_reg, mark_reg, (markOopDesc::biased_lock_mask_in_place |
1986 markOopDesc::age_mask_in_place |
1987 markOopDesc::epoch_mask_in_place));
1988 orr(temp_reg, R16_thread, mark_reg);
1989
1990 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
1991
1992 // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg).
1993 cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
1994 /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
1995 /*where=*/obj_reg,
1996 MacroAssembler::MemBarAcq,
1997 MacroAssembler::cmpxchgx_hint_acquire_lock(),
1998 noreg, slow_case_int); // bail out if failed
1999
2000 // If the biasing toward our thread failed, this means that
2001 // another thread succeeded in biasing it toward itself and we
2002 // need to revoke that bias. The revocation will occur in the
2003 // interpreter runtime in the slow case.
2004 if (PrintBiasedLockingStatistics) {
2005 load_const(temp2_reg, (address) BiasedLocking::anonymously_biased_lock_entry_count_addr(), temp_reg);
2006 lwzx(temp_reg, temp2_reg);
2007 addi(temp_reg, temp_reg, 1);
2008 stwx(temp_reg, temp2_reg);
2009 }
2010 b(done);
2011
2012 bind(try_rebias);
2013 // At this point we know the epoch has expired, meaning that the
2014 // current "bias owner", if any, is actually invalid. Under these
2015 // circumstances _only_, we are allowed to use the current header's
2016 // value as the comparison value when doing the CAS to acquire the
2017 // bias in the current epoch. In other words, we allow transfer of
2018 // the bias from one thread to another directly in this situation.
2019 load_klass(temp_reg, obj_reg);
2020 andi(temp2_reg, mark_reg, markOopDesc::age_mask_in_place);
2021 orr(temp2_reg, R16_thread, temp2_reg);
2022 ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
2023 orr(temp_reg, temp2_reg, temp_reg);
2024
2025 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2026
2027 cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
2028 /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
2029 /*where=*/obj_reg,
2030 MacroAssembler::MemBarAcq,
2031 MacroAssembler::cmpxchgx_hint_acquire_lock(),
2032 noreg, slow_case_int); // bail out if failed
2033
2034 // If the biasing toward our thread failed, this means that
2035 // another thread succeeded in biasing it toward itself and we
2036 // need to revoke that bias. The revocation will occur in the
2037 // interpreter runtime in the slow case.
2038 if (PrintBiasedLockingStatistics) {
2039 load_const(temp2_reg, (address) BiasedLocking::rebiased_lock_entry_count_addr(), temp_reg);
2040 lwzx(temp_reg, temp2_reg);
2041 addi(temp_reg, temp_reg, 1);
2042 stwx(temp_reg, temp2_reg);
2043 }
2044 b(done);
2045
2046 bind(try_revoke_bias);
2047 // The prototype mark in the klass doesn't have the bias bit set any
2048 // more, indicating that objects of this data type are not supposed
2049 // to be biased any more. We are going to try to reset the mark of
2050 // this object to the prototype value and fall through to the
2051 // CAS-based locking scheme. Note that if our CAS fails, it means
2052 // that another thread raced us for the privilege of revoking the
2053 // bias of this particular object, so it's okay to continue in the
2054 // normal locking code.
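// (sketch of the revoking CAS below) expected: mark_reg, the current biased
// header; new: klass->prototype_header() | (mark_reg & age_mask), i.e. the
// unbiased prototype with the age bits preserved.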
2055 load_klass(temp_reg, obj_reg); 2056 ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg); 2057 andi(temp2_reg, mark_reg, markOopDesc::age_mask_in_place); 2058 orr(temp_reg, temp_reg, temp2_reg); 2059 2060 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 2061 2062 // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg). 2063 cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg, 2064 /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg, 2065 /*where=*/obj_reg, 2066 MacroAssembler::MemBarAcq, 2067 MacroAssembler::cmpxchgx_hint_acquire_lock()); 2068 2069 // reload markOop in mark_reg before continuing with lightweight locking 2070 ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg); 2071 2072 // Fall through to the normal CAS-based lock, because no matter what 2073 // the result of the above CAS, some thread must have succeeded in 2074 // removing the bias bit from the object's header. 2075 if (PrintBiasedLockingStatistics) { 2076 Label l; 2077 bne(cr_reg, l); 2078 load_const(temp2_reg, (address) BiasedLocking::revoked_lock_entry_count_addr(), temp_reg); 2079 lwzx(temp_reg, temp2_reg); 2080 addi(temp_reg, temp_reg, 1); 2081 stwx(temp_reg, temp2_reg); 2082 bind(l); 2083 } 2084 2085 bind(cas_label); 2086 } 2087 2088 void MacroAssembler::biased_locking_exit (ConditionRegister cr_reg, Register mark_addr, Register temp_reg, Label& done) { 2089 // Check for biased locking unlock case, which is a no-op 2090 // Note: we do not have to check the thread ID for two reasons. 2091 // First, the interpreter checks for IllegalMonitorStateException at 2092 // a higher level. Second, if the bias was revoked while we held the 2093 // lock, the object could not be rebiased toward another thread, so 2094 // the bias bit would be clear. 
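// (sketch) biased unlock is a no-op:
//   if ((*mark_addr & biased_lock_mask_in_place) == biased_lock_pattern) goto done;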
2095 2096 ld(temp_reg, 0, mark_addr); 2097 andi(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place); 2098 2099 cmpwi(cr_reg, temp_reg, markOopDesc::biased_lock_pattern); 2100 beq(cr_reg, done); 2101 } 2102 2103 // allocation (for C1) 2104 void MacroAssembler::eden_allocate( 2105 Register obj, // result: pointer to object after successful allocation 2106 Register var_size_in_bytes, // object size in bytes if unknown at compile time; invalid otherwise 2107 int con_size_in_bytes, // object size in bytes if known at compile time 2108 Register t1, // temp register 2109 Register t2, // temp register 2110 Label& slow_case // continuation point if fast allocation fails 2111 ) { 2112 b(slow_case); 2113 } 2114 2115 void MacroAssembler::tlab_allocate( 2116 Register obj, // result: pointer to object after successful allocation 2117 Register var_size_in_bytes, // object size in bytes if unknown at compile time; invalid otherwise 2118 int con_size_in_bytes, // object size in bytes if known at compile time 2119 Register t1, // temp register 2120 Label& slow_case // continuation point if fast allocation fails 2121 ) { 2122 // make sure arguments make sense 2123 assert_different_registers(obj, var_size_in_bytes, t1); 2124 assert(0 <= con_size_in_bytes && is_simm13(con_size_in_bytes), "illegal object size"); 2125 assert((con_size_in_bytes & MinObjAlignmentInBytesMask) == 0, "object size is not multiple of alignment"); 2126 2127 const Register new_top = t1; 2128 //verify_tlab(); not implemented 2129 2130 ld(obj, in_bytes(JavaThread::tlab_top_offset()), R16_thread); 2131 ld(R0, in_bytes(JavaThread::tlab_end_offset()), R16_thread); 2132 if (var_size_in_bytes == noreg) { 2133 addi(new_top, obj, con_size_in_bytes); 2134 } else { 2135 add(new_top, obj, var_size_in_bytes); 2136 } 2137 cmpld(CCR0, new_top, R0); 2138 bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_case); 2139 2140 #ifdef ASSERT 2141 // make sure new free pointer is properly aligned 2142 { 2143 Label L; 2144 andi_(R0, new_top, MinObjAlignmentInBytesMask); 2145 beq(CCR0, L); 2146 stop("updated TLAB free is not properly aligned", 0x934); 2147 bind(L); 2148 } 2149 #endif // ASSERT 2150 2151 // update the tlab top pointer 2152 std(new_top, in_bytes(JavaThread::tlab_top_offset()), R16_thread); 2153 //verify_tlab(); not implemented 2154 } 2155 void MacroAssembler::tlab_refill(Label& retry_tlab, Label& try_eden, Label& slow_case) { 2156 unimplemented("tlab_refill"); 2157 } 2158 void MacroAssembler::incr_allocated_bytes(RegisterOrConstant size_in_bytes, Register t1, Register t2) { 2159 unimplemented("incr_allocated_bytes"); 2160 } 2161 2162 address MacroAssembler::emit_trampoline_stub(int destination_toc_offset, 2163 int insts_call_instruction_offset, Register Rtoc) { 2164 // Start the stub. 2165 address stub = start_a_stub(64); 2166 if (stub == NULL) { return NULL; } // CodeCache full: bail out 2167 2168 // Create a trampoline stub relocation which relates this trampoline stub 2169 // with the call instruction at insts_call_instruction_offset in the 2170 // instructions code-section. 2171 relocate(trampoline_stub_Relocation::spec(code()->insts()->start() + insts_call_instruction_offset)); 2172 const int stub_start_offset = offset(); 2173 2174 // For java_to_interp stubs we use R11_scratch1 as scratch register 2175 // and in call trampoline stubs we use R12_scratch2. This way we 2176 // can distinguish them (see is_NativeCallTrampolineStub_at()). 
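// Roughly, the stub emitted below is (illustration; a large TOC offset is
// loaded with an addis/ld pair):
//   ld    R12, destination_toc_offset(Rtoc)
//   mtctr R12
//   bctr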
2177 Register reg_scratch = R12_scratch2; 2178 2179 // Now, create the trampoline stub's code: 2180 // - load the TOC 2181 // - load the call target from the constant pool 2182 // - call 2183 if (Rtoc == noreg) { 2184 calculate_address_from_global_toc(reg_scratch, method_toc()); 2185 Rtoc = reg_scratch; 2186 } 2187 2188 ld_largeoffset_unchecked(reg_scratch, destination_toc_offset, Rtoc, false); 2189 mtctr(reg_scratch); 2190 bctr(); 2191 2192 const address stub_start_addr = addr_at(stub_start_offset); 2193 2194 // Assert that the encoded destination_toc_offset can be identified and that it is correct. 2195 assert(destination_toc_offset == NativeCallTrampolineStub_at(stub_start_addr)->destination_toc_offset(), 2196 "encoded offset into the constant pool must match"); 2197 // Trampoline_stub_size should be good. 2198 assert((uint)(offset() - stub_start_offset) <= trampoline_stub_size, "should be good size"); 2199 assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline"); 2200 2201 // End the stub. 2202 end_a_stub(); 2203 return stub; 2204 } 2205 2206 // TM on PPC64. 2207 void MacroAssembler::atomic_inc_ptr(Register addr, Register result, int simm16) { 2208 Label retry; 2209 bind(retry); 2210 ldarx(result, addr, /*hint*/ false); 2211 addi(result, result, simm16); 2212 stdcx_(result, addr); 2213 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 2214 bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0 2215 } else { 2216 bne( CCR0, retry); // stXcx_ sets CCR0 2217 } 2218 } 2219 2220 void MacroAssembler::atomic_ori_int(Register addr, Register result, int uimm16) { 2221 Label retry; 2222 bind(retry); 2223 lwarx(result, addr, /*hint*/ false); 2224 ori(result, result, uimm16); 2225 stwcx_(result, addr); 2226 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 2227 bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0 2228 } else { 2229 bne( CCR0, retry); // stXcx_ sets CCR0 2230 } 2231 } 2232 2233 #if INCLUDE_RTM_OPT 2234 2235 // Update rtm_counters based on abort status 2236 // input: abort_status 2237 // rtm_counters (RTMLockingCounters*) 2238 void MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters_Reg) { 2239 // Mapping to keep PreciseRTMLockingStatistics similar to x86. 2240 // x86 ppc (! means inverted, ? means not the same) 2241 // 0 31 Set if abort caused by XABORT instruction. 2242 // 1 ! 7 If set, the transaction may succeed on a retry. This bit is always clear if bit 0 is set. 2243 // 2 13 Set if another logical processor conflicted with a memory address that was part of the transaction that aborted. 2244 // 3 10 Set if an internal buffer overflowed. 2245 // 4 ?12 Set if a debug breakpoint was hit. 2246 // 5 ?32 Set if an abort occurred during execution of a nested transaction. 2247 const int tm_failure_bit[] = {Assembler::tm_tabort, // Note: Seems like signal handler sets this, too. 2248 Assembler::tm_failure_persistent, // inverted: transient 2249 Assembler::tm_trans_cf, 2250 Assembler::tm_footprint_of, 2251 Assembler::tm_non_trans_cf, 2252 Assembler::tm_suspended}; 2253 const bool tm_failure_inv[] = {false, true, false, false, false, false}; 2254 assert(sizeof(tm_failure_bit)/sizeof(int) == RTMLockingCounters::ABORT_STATUS_LIMIT, "adapt mapping!"); 2255 2256 const Register addr_Reg = R0; 2257 // Keep track of offset to where rtm_counters_Reg had pointed to. 
2258 int counters_offs = RTMLockingCounters::abort_count_offset(); 2259 addi(addr_Reg, rtm_counters_Reg, counters_offs); 2260 const Register temp_Reg = rtm_counters_Reg; 2261 2262 //atomic_inc_ptr(addr_Reg, temp_Reg); We don't increment atomically 2263 ldx(temp_Reg, addr_Reg); 2264 addi(temp_Reg, temp_Reg, 1); 2265 stdx(temp_Reg, addr_Reg); 2266 2267 if (PrintPreciseRTMLockingStatistics) { 2268 int counters_offs_delta = RTMLockingCounters::abortX_count_offset() - counters_offs; 2269 2270 //mftexasr(abort_status); done by caller 2271 for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) { 2272 counters_offs += counters_offs_delta; 2273 li(temp_Reg, counters_offs_delta); // can't use addi with R0 2274 add(addr_Reg, addr_Reg, temp_Reg); // point to next counter 2275 counters_offs_delta = sizeof(uintx); 2276 2277 Label check_abort; 2278 rldicr_(temp_Reg, abort_status, tm_failure_bit[i], 0); 2279 if (tm_failure_inv[i]) { 2280 bne(CCR0, check_abort); 2281 } else { 2282 beq(CCR0, check_abort); 2283 } 2284 //atomic_inc_ptr(addr_Reg, temp_Reg); We don't increment atomically 2285 ldx(temp_Reg, addr_Reg); 2286 addi(temp_Reg, temp_Reg, 1); 2287 stdx(temp_Reg, addr_Reg); 2288 bind(check_abort); 2289 } 2290 } 2291 li(temp_Reg, -counters_offs); // can't use addi with R0 2292 add(rtm_counters_Reg, addr_Reg, temp_Reg); // restore 2293 } 2294 2295 // Branch if (random & (count-1) != 0), count is 2^n 2296 // tmp and CR0 are killed 2297 void MacroAssembler::branch_on_random_using_tb(Register tmp, int count, Label& brLabel) { 2298 mftb(tmp); 2299 andi_(tmp, tmp, count-1); 2300 bne(CCR0, brLabel); 2301 } 2302 2303 // Perform abort ratio calculation, set no_rtm bit if high ratio. 2304 // input: rtm_counters_Reg (RTMLockingCounters* address) - KILLED 2305 void MacroAssembler::rtm_abort_ratio_calculation(Register rtm_counters_Reg, 2306 RTMLockingCounters* rtm_counters, 2307 Metadata* method_data) { 2308 Label L_done, L_check_always_rtm1, L_check_always_rtm2; 2309 2310 if (RTMLockingCalculationDelay > 0) { 2311 // Delay calculation. 2312 ld(rtm_counters_Reg, (RegisterOrConstant)(intptr_t)RTMLockingCounters::rtm_calculation_flag_addr()); 2313 cmpdi(CCR0, rtm_counters_Reg, 0); 2314 beq(CCR0, L_done); 2315 load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload 2316 } 2317 // Abort ratio calculation only if abort_count > RTMAbortThreshold. 2318 // Aborted transactions = abort_count * 100 2319 // All transactions = total_count * RTMTotalCountIncrRate 2320 // Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio) 2321 ld(R0, RTMLockingCounters::abort_count_offset(), rtm_counters_Reg); 2322 cmpdi(CCR0, R0, RTMAbortThreshold); 2323 blt(CCR0, L_check_always_rtm2); 2324 mulli(R0, R0, 100); 2325 2326 const Register tmpReg = rtm_counters_Reg; 2327 ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg); 2328 mulli(tmpReg, tmpReg, RTMTotalCountIncrRate); 2329 mulli(tmpReg, tmpReg, RTMAbortRatio); 2330 cmpd(CCR0, R0, tmpReg); 2331 blt(CCR0, L_check_always_rtm1); // jump to reload 2332 if (method_data != NULL) { 2333 // Set rtm_state to "no rtm" in MDO. 2334 // Not using a metadata relocation. Method and Class Loader are kept alive anyway. 2335 // (See nmethod::metadata_do and CodeBuffer::finalize_oop_references.) 
2336 load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg); 2337 atomic_ori_int(R0, tmpReg, NoRTM); 2338 } 2339 b(L_done); 2340 2341 bind(L_check_always_rtm1); 2342 load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload 2343 bind(L_check_always_rtm2); 2344 ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg); 2345 cmpdi(CCR0, tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate); 2346 blt(CCR0, L_done); 2347 if (method_data != NULL) { 2348 // Set rtm_state to "always rtm" in MDO. 2349 // Not using a metadata relocation. See above. 2350 load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg); 2351 atomic_ori_int(R0, tmpReg, UseRTM); 2352 } 2353 bind(L_done); 2354 } 2355 2356 // Update counters and perform abort ratio calculation. 2357 // input: abort_status_Reg 2358 void MacroAssembler::rtm_profiling(Register abort_status_Reg, Register temp_Reg, 2359 RTMLockingCounters* rtm_counters, 2360 Metadata* method_data, 2361 bool profile_rtm) { 2362 2363 assert(rtm_counters != NULL, "should not be NULL when profiling RTM"); 2364 // Update rtm counters based on state at abort. 2365 // Reads abort_status_Reg, updates flags. 2366 assert_different_registers(abort_status_Reg, temp_Reg); 2367 load_const_optimized(temp_Reg, (address)rtm_counters, R0); 2368 rtm_counters_update(abort_status_Reg, temp_Reg); 2369 if (profile_rtm) { 2370 assert(rtm_counters != NULL, "should not be NULL when profiling RTM"); 2371 rtm_abort_ratio_calculation(temp_Reg, rtm_counters, method_data); 2372 } 2373 } 2374 2375 // Retry on abort if abort's status indicates non-persistent failure. 2376 // inputs: retry_count_Reg 2377 // : abort_status_Reg 2378 // output: retry_count_Reg decremented by 1 2379 void MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, 2380 Label& retryLabel, Label* checkRetry) { 2381 Label doneRetry; 2382 rldicr_(R0, abort_status_Reg, tm_failure_persistent, 0); 2383 bne(CCR0, doneRetry); 2384 if (checkRetry) { bind(*checkRetry); } 2385 addic_(retry_count_Reg, retry_count_Reg, -1); 2386 blt(CCR0, doneRetry); 2387 smt_yield(); // Can't use wait(). No permission (SIGILL). 2388 b(retryLabel); 2389 bind(doneRetry); 2390 } 2391 2392 // Spin and retry if lock is busy. 2393 // inputs: box_Reg (monitor address) 2394 // : retry_count_Reg 2395 // output: retry_count_Reg decremented by 1 2396 // CTR is killed 2397 void MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register owner_addr_Reg, Label& retryLabel) { 2398 Label SpinLoop, doneRetry; 2399 addic_(retry_count_Reg, retry_count_Reg, -1); 2400 blt(CCR0, doneRetry); 2401 li(R0, RTMSpinLoopCount); 2402 mtctr(R0); 2403 2404 bind(SpinLoop); 2405 smt_yield(); // Can't use waitrsv(). No permission (SIGILL). 2406 bdz(retryLabel); 2407 ld(R0, 0, owner_addr_Reg); 2408 cmpdi(CCR0, R0, 0); 2409 bne(CCR0, SpinLoop); 2410 b(retryLabel); 2411 2412 bind(doneRetry); 2413 } 2414 2415 // Use RTM for normal stack locks. 
2416 // Input: objReg (object to lock) 2417 void MacroAssembler::rtm_stack_locking(ConditionRegister flag, 2418 Register obj, Register mark_word, Register tmp, 2419 Register retry_on_abort_count_Reg, 2420 RTMLockingCounters* stack_rtm_counters, 2421 Metadata* method_data, bool profile_rtm, 2422 Label& DONE_LABEL, Label& IsInflated) { 2423 assert(UseRTMForStackLocks, "why call this otherwise?"); 2424 assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking"); 2425 Label L_rtm_retry, L_decrement_retry, L_on_abort; 2426 2427 if (RTMRetryCount > 0) { 2428 load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort 2429 bind(L_rtm_retry); 2430 } 2431 andi_(R0, mark_word, markOopDesc::monitor_value); // inflated vs stack-locked|neutral|biased 2432 bne(CCR0, IsInflated); 2433 2434 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 2435 Label L_noincrement; 2436 if (RTMTotalCountIncrRate > 1) { 2437 branch_on_random_using_tb(tmp, (int)RTMTotalCountIncrRate, L_noincrement); 2438 } 2439 assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM"); 2440 load_const_optimized(tmp, (address)stack_rtm_counters->total_count_addr(), R0); 2441 //atomic_inc_ptr(tmp, /*temp, will be reloaded*/mark_word); We don't increment atomically 2442 ldx(mark_word, tmp); 2443 addi(mark_word, mark_word, 1); 2444 stdx(mark_word, tmp); 2445 bind(L_noincrement); 2446 } 2447 tbegin_(); 2448 beq(CCR0, L_on_abort); 2449 ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // Reload in transaction, conflicts need to be tracked. 2450 andi(R0, mark_word, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits 2451 cmpwi(flag, R0, markOopDesc::unlocked_value); // bits = 001 unlocked 2452 beq(flag, DONE_LABEL); // all done if unlocked 2453 2454 if (UseRTMXendForLockBusy) { 2455 tend_(); 2456 b(L_decrement_retry); 2457 } else { 2458 tabort_(); 2459 } 2460 bind(L_on_abort); 2461 const Register abort_status_Reg = tmp; 2462 mftexasr(abort_status_Reg); 2463 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 2464 rtm_profiling(abort_status_Reg, /*temp*/mark_word, stack_rtm_counters, method_data, profile_rtm); 2465 } 2466 ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // reload 2467 if (RTMRetryCount > 0) { 2468 // Retry on lock abort if abort status is not permanent. 2469 rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry, &L_decrement_retry); 2470 } else { 2471 bind(L_decrement_retry); 2472 } 2473 } 2474 2475 // Use RTM for inflating locks 2476 // inputs: obj (object to lock) 2477 // mark_word (current header - KILLED) 2478 // boxReg (on-stack box address (displaced header location) - KILLED) 2479 void MacroAssembler::rtm_inflated_locking(ConditionRegister flag, 2480 Register obj, Register mark_word, Register boxReg, 2481 Register retry_on_busy_count_Reg, Register retry_on_abort_count_Reg, 2482 RTMLockingCounters* rtm_counters, 2483 Metadata* method_data, bool profile_rtm, 2484 Label& DONE_LABEL) { 2485 assert(UseRTMLocking, "why call this otherwise?"); 2486 Label L_rtm_retry, L_decrement_retry, L_on_abort; 2487 // Clean monitor_value bit to get valid pointer. 2488 int owner_offset = ObjectMonitor::owner_offset_in_bytes() - markOopDesc::monitor_value; 2489 2490 // Store non-null, using boxReg instead of (intptr_t)markOopDesc::unused_mark(). 
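// (note, inferred from compiler_fast_unlock_object) a non-null displaced
// header keeps the unlock path from treating this as a recursive stack lock;
// boxReg itself is merely a convenient non-null value.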
2491 std(boxReg, BasicLock::displaced_header_offset_in_bytes(), boxReg); 2492 const Register tmpReg = boxReg; 2493 const Register owner_addr_Reg = mark_word; 2494 addi(owner_addr_Reg, mark_word, owner_offset); 2495 2496 if (RTMRetryCount > 0) { 2497 load_const_optimized(retry_on_busy_count_Reg, RTMRetryCount); // Retry on lock busy. 2498 load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort. 2499 bind(L_rtm_retry); 2500 } 2501 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 2502 Label L_noincrement; 2503 if (RTMTotalCountIncrRate > 1) { 2504 branch_on_random_using_tb(R0, (int)RTMTotalCountIncrRate, L_noincrement); 2505 } 2506 assert(rtm_counters != NULL, "should not be NULL when profiling RTM"); 2507 load_const(R0, (address)rtm_counters->total_count_addr(), tmpReg); 2508 //atomic_inc_ptr(R0, tmpReg); We don't increment atomically 2509 ldx(tmpReg, R0); 2510 addi(tmpReg, tmpReg, 1); 2511 stdx(tmpReg, R0); 2512 bind(L_noincrement); 2513 } 2514 tbegin_(); 2515 beq(CCR0, L_on_abort); 2516 // We don't reload mark word. Will only be reset at safepoint. 2517 ld(R0, 0, owner_addr_Reg); // Load in transaction, conflicts need to be tracked. 2518 cmpdi(flag, R0, 0); 2519 beq(flag, DONE_LABEL); 2520 2521 if (UseRTMXendForLockBusy) { 2522 tend_(); 2523 b(L_decrement_retry); 2524 } else { 2525 tabort_(); 2526 } 2527 bind(L_on_abort); 2528 const Register abort_status_Reg = tmpReg; 2529 mftexasr(abort_status_Reg); 2530 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 2531 rtm_profiling(abort_status_Reg, /*temp*/ owner_addr_Reg, rtm_counters, method_data, profile_rtm); 2532 // Restore owner_addr_Reg 2533 ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); 2534 #ifdef ASSERT 2535 andi_(R0, mark_word, markOopDesc::monitor_value); 2536 asm_assert_ne("must be inflated", 0xa754); // Deflating only allowed at safepoint. 2537 #endif 2538 addi(owner_addr_Reg, mark_word, owner_offset); 2539 } 2540 if (RTMRetryCount > 0) { 2541 // Retry on lock abort if abort status is not permanent. 2542 rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry); 2543 } 2544 2545 // Appears unlocked - try to swing _owner from null to non-null. 2546 cmpxchgd(flag, /*current val*/ R0, (intptr_t)0, /*new val*/ R16_thread, owner_addr_Reg, 2547 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq, 2548 MacroAssembler::cmpxchgx_hint_acquire_lock(), noreg, &L_decrement_retry, true); 2549 2550 if (RTMRetryCount > 0) { 2551 // success done else retry 2552 b(DONE_LABEL); 2553 bind(L_decrement_retry); 2554 // Spin and retry if lock is busy. 2555 rtm_retry_lock_on_busy(retry_on_busy_count_Reg, owner_addr_Reg, L_rtm_retry); 2556 } else { 2557 bind(L_decrement_retry); 2558 } 2559 } 2560 2561 #endif // INCLUDE_RTM_OPT 2562 2563 // "The box" is the space on the stack where we copy the object mark. 2564 void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register oop, Register box, 2565 Register temp, Register displaced_header, Register current_header, 2566 bool try_bias, 2567 RTMLockingCounters* rtm_counters, 2568 RTMLockingCounters* stack_rtm_counters, 2569 Metadata* method_data, 2570 bool use_rtm, bool profile_rtm) { 2571 assert_different_registers(oop, box, temp, displaced_header, current_header); 2572 assert(flag != CCR0, "bad condition register"); 2573 Label cont; 2574 Label object_has_monitor; 2575 Label cas_failed; 2576 2577 // Load markOop from object into displaced_header. 
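// Sketch of the CAS-based fast path that follows (illustration only):
//   displaced = obj->mark() | unlocked_value;
//   box->displaced_header = displaced;
//   if (CAS(&obj->mark, displaced, box)) goto locked;              // got the lock
//   if (((obj->mark() - SP) & (~(page_size-1) | lock_mask)) == 0)  // our own stack?
//     { box->displaced_header = 0; goto locked; }                  // recursive case
//   // otherwise flag stays NE and we fall out to the slow path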
2578 ld(displaced_header, oopDesc::mark_offset_in_bytes(), oop); 2579 2580 2581 // Always do locking in runtime. 2582 if (EmitSync & 0x01) { 2583 cmpdi(flag, oop, 0); // Oop can't be 0 here => always false. 2584 return; 2585 } 2586 2587 if (try_bias) { 2588 biased_locking_enter(flag, oop, displaced_header, temp, current_header, cont); 2589 } 2590 2591 #if INCLUDE_RTM_OPT 2592 if (UseRTMForStackLocks && use_rtm) { 2593 rtm_stack_locking(flag, oop, displaced_header, temp, /*temp*/ current_header, 2594 stack_rtm_counters, method_data, profile_rtm, 2595 cont, object_has_monitor); 2596 } 2597 #endif // INCLUDE_RTM_OPT 2598 2599 // Handle existing monitor. 2600 if ((EmitSync & 0x02) == 0) { 2601 // The object has an existing monitor iff (mark & monitor_value) != 0. 2602 andi_(temp, displaced_header, markOopDesc::monitor_value); 2603 bne(CCR0, object_has_monitor); 2604 } 2605 2606 // Set displaced_header to be (markOop of object | UNLOCK_VALUE). 2607 ori(displaced_header, displaced_header, markOopDesc::unlocked_value); 2608 2609 // Load Compare Value application register. 2610 2611 // Initialize the box. (Must happen before we update the object mark!) 2612 std(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box); 2613 2614 // Must fence, otherwise, preceding store(s) may float below cmpxchg. 2615 // Compare object markOop with mark and if equal exchange scratch1 with object markOop. 2616 cmpxchgd(/*flag=*/flag, 2617 /*current_value=*/current_header, 2618 /*compare_value=*/displaced_header, 2619 /*exchange_value=*/box, 2620 /*where=*/oop, 2621 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq, 2622 MacroAssembler::cmpxchgx_hint_acquire_lock(), 2623 noreg, 2624 &cas_failed, 2625 /*check without membar and ldarx first*/true); 2626 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 2627 2628 // If the compare-and-exchange succeeded, then we found an unlocked 2629 // object and we have now locked it. 2630 b(cont); 2631 2632 bind(cas_failed); 2633 // We did not see an unlocked object so try the fast recursive case. 2634 2635 // Check if the owner is self by comparing the value in the markOop of object 2636 // (current_header) with the stack pointer. 2637 sub(current_header, current_header, R1_SP); 2638 load_const_optimized(temp, ~(os::vm_page_size()-1) | markOopDesc::lock_mask_in_place); 2639 2640 and_(R0/*==0?*/, current_header, temp); 2641 // If condition is true we are cont and hence we can store 0 as the 2642 // displaced header in the box, which indicates that it is a recursive lock. 2643 mcrf(flag,CCR0); 2644 std(R0/*==0, perhaps*/, BasicLock::displaced_header_offset_in_bytes(), box); 2645 2646 // Handle existing monitor. 2647 if ((EmitSync & 0x02) == 0) { 2648 b(cont); 2649 2650 bind(object_has_monitor); 2651 // The object's monitor m is unlocked iff m->owner == NULL, 2652 // otherwise m->owner may contain a thread or a stack address. 2653 2654 #if INCLUDE_RTM_OPT 2655 // Use the same RTM locking code in 32- and 64-bit VM. 2656 if (use_rtm) { 2657 rtm_inflated_locking(flag, oop, displaced_header, box, temp, /*temp*/ current_header, 2658 rtm_counters, method_data, profile_rtm, cont); 2659 } else { 2660 #endif // INCLUDE_RTM_OPT 2661 2662 // Try to CAS m->owner from NULL to current thread. 
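// (sketch) temp = (address)monitor + owner_offset (monitor_value bit stripped);
//          locked iff CAS(temp, NULL, R16_thread) succeeds, i.e. flag == eq.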
2663 addi(temp, displaced_header, ObjectMonitor::owner_offset_in_bytes()-markOopDesc::monitor_value);
2664 cmpxchgd(/*flag=*/flag,
2665 /*current_value=*/current_header,
2666 /*compare_value=*/(intptr_t)0,
2667 /*exchange_value=*/R16_thread,
2668 /*where=*/temp,
2669 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2670 MacroAssembler::cmpxchgx_hint_acquire_lock());
2671
2672 // Store a non-null value into the box.
2673 std(box, BasicLock::displaced_header_offset_in_bytes(), box);
2674
2675 # ifdef ASSERT
2676 bne(flag, cont);
2677 // We have acquired the monitor, check some invariants.
2678 addi(/*monitor=*/temp, temp, -ObjectMonitor::owner_offset_in_bytes());
2679 // Invariant 1: _recursions should be 0.
2680 //assert(ObjectMonitor::recursions_size_in_bytes() == 8, "unexpected size");
2681 asm_assert_mem8_is_zero(ObjectMonitor::recursions_offset_in_bytes(), temp,
2682 "monitor->_recursions should be 0", -1);
2683 // Invariant 2: OwnerIsThread shouldn't be 0.
2684 //assert(ObjectMonitor::OwnerIsThread_size_in_bytes() == 4, "unexpected size");
2685 //asm_assert_mem4_isnot_zero(ObjectMonitor::OwnerIsThread_offset_in_bytes(), temp,
2686 // "monitor->OwnerIsThread shouldn't be 0", -1);
2687 # endif
2688
2689 #if INCLUDE_RTM_OPT
2690 } // use_rtm()
2691 #endif
2692 }
2693
2694 bind(cont);
2695 // flag == EQ indicates success
2696 // flag == NE indicates failure
2697 }
2698
2699 void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register oop, Register box,
2700 Register temp, Register displaced_header, Register current_header,
2701 bool try_bias, bool use_rtm) {
2702 assert_different_registers(oop, box, temp, displaced_header, current_header);
2703 assert(flag != CCR0, "bad condition register");
2704 Label cont;
2705 Label object_has_monitor;
2706
2707 // Always do locking in runtime.
2708 if (EmitSync & 0x01) {
2709 cmpdi(flag, oop, 0); // Oop can't be 0 here => always false.
2710 return;
2711 }
2712
2713 if (try_bias) {
2714 biased_locking_exit(flag, oop, current_header, cont);
2715 }
2716
2717 #if INCLUDE_RTM_OPT
2718 if (UseRTMForStackLocks && use_rtm) {
2719 assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
2720 Label L_regular_unlock;
2721 ld(current_header, oopDesc::mark_offset_in_bytes(), oop); // fetch markword
2722 andi(R0, current_header, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
2723 cmpwi(flag, R0, markOopDesc::unlocked_value); // bits = 001 unlocked
2724 bne(flag, L_regular_unlock); // else RegularLock
2725 tend_(); // otherwise end...
2726 b(cont); // ... and we're done
2727 bind(L_regular_unlock);
2728 }
2729 #endif
2730
2731 // Find the lock address and load the displaced header from the stack.
2732 ld(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2733
2734 // If the displaced header is 0, we have a recursive unlock.
2735 cmpdi(flag, displaced_header, 0);
2736 beq(flag, cont);
2737
2738 // Handle existing monitor.
2739 if ((EmitSync & 0x02) == 0) {
2740 // The object has an existing monitor iff (mark & monitor_value) != 0.
2741 RTM_OPT_ONLY( if (!(UseRTMForStackLocks && use_rtm)) ) // skip load if already done
2742 ld(current_header, oopDesc::mark_offset_in_bytes(), oop);
2743 andi_(R0, current_header, markOopDesc::monitor_value);
2744 bne(CCR0, object_has_monitor);
2745 }
2746
2747 // Check if it is still a lightweight lock; this is true if we see
2748 // the stack address of the basicLock in the markOop of the object.
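// (sketch) fast unlock: CAS(&obj->mark, box, displaced_header) restores the
// original header and succeeds only while the mark still points at our box.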
2749 // Cmpxchg sets flag to cmpd(current_header, box). 2750 cmpxchgd(/*flag=*/flag, 2751 /*current_value=*/current_header, 2752 /*compare_value=*/box, 2753 /*exchange_value=*/displaced_header, 2754 /*where=*/oop, 2755 MacroAssembler::MemBarRel, 2756 MacroAssembler::cmpxchgx_hint_release_lock(), 2757 noreg, 2758 &cont); 2759 2760 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 2761 2762 // Handle existing monitor. 2763 if ((EmitSync & 0x02) == 0) { 2764 b(cont); 2765 2766 bind(object_has_monitor); 2767 addi(current_header, current_header, -markOopDesc::monitor_value); // monitor 2768 ld(temp, ObjectMonitor::owner_offset_in_bytes(), current_header); 2769 2770 // It's inflated. 2771 #if INCLUDE_RTM_OPT 2772 if (use_rtm) { 2773 Label L_regular_inflated_unlock; 2774 // Clean monitor_value bit to get valid pointer 2775 cmpdi(flag, temp, 0); 2776 bne(flag, L_regular_inflated_unlock); 2777 tend_(); 2778 b(cont); 2779 bind(L_regular_inflated_unlock); 2780 } 2781 #endif 2782 2783 ld(displaced_header, ObjectMonitor::recursions_offset_in_bytes(), current_header); 2784 xorr(temp, R16_thread, temp); // Will be 0 if we are the owner. 2785 orr(temp, temp, displaced_header); // Will be 0 if there are 0 recursions. 2786 cmpdi(flag, temp, 0); 2787 bne(flag, cont); 2788 2789 ld(temp, ObjectMonitor::EntryList_offset_in_bytes(), current_header); 2790 ld(displaced_header, ObjectMonitor::cxq_offset_in_bytes(), current_header); 2791 orr(temp, temp, displaced_header); // Will be 0 if both are 0. 2792 cmpdi(flag, temp, 0); 2793 bne(flag, cont); 2794 release(); 2795 std(temp, ObjectMonitor::owner_offset_in_bytes(), current_header); 2796 } 2797 2798 bind(cont); 2799 // flag == EQ indicates success 2800 // flag == NE indicates failure 2801 } 2802 2803 // Write serialization page so VM thread can do a pseudo remote membar. 2804 // We use the current thread pointer to calculate a thread specific 2805 // offset to write to within the page. This minimizes bus traffic 2806 // due to cache line collision. 2807 void MacroAssembler::serialize_memory(Register thread, Register tmp1, Register tmp2) { 2808 srdi(tmp2, thread, os::get_serialize_page_shift_count()); 2809 2810 int mask = os::vm_page_size() - sizeof(int); 2811 if (Assembler::is_simm(mask, 16)) { 2812 andi(tmp2, tmp2, mask); 2813 } else { 2814 lis(tmp1, (int)((signed short) (mask >> 16))); 2815 ori(tmp1, tmp1, mask & 0x0000ffff); 2816 andr(tmp2, tmp2, tmp1); 2817 } 2818 2819 load_const(tmp1, (long) os::get_memory_serialize_page()); 2820 release(); 2821 stwx(R0, tmp1, tmp2); 2822 } 2823 2824 2825 // GC barrier helper macros 2826 2827 // Write the card table byte if needed. 2828 void MacroAssembler::card_write_barrier_post(Register Rstore_addr, Register Rnew_val, Register Rtmp) { 2829 CardTableModRefBS* bs = 2830 barrier_set_cast<CardTableModRefBS>(Universe::heap()->barrier_set()); 2831 assert(bs->kind() == BarrierSet::CardTableForRS || 2832 bs->kind() == BarrierSet::CardTableExtension, "wrong barrier"); 2833 #ifdef ASSERT 2834 cmpdi(CCR0, Rnew_val, 0); 2835 asm_assert_ne("null oop not allowed", 0x321); 2836 #endif 2837 card_table_write(bs->byte_map_base, Rtmp, Rstore_addr); 2838 } 2839 2840 // Write the card table byte. 
2841 void MacroAssembler::card_table_write(jbyte* byte_map_base, Register Rtmp, Register Robj) {
2842 assert_different_registers(Robj, Rtmp, R0);
2843 load_const_optimized(Rtmp, (address)byte_map_base, R0);
2844 srdi(Robj, Robj, CardTableModRefBS::card_shift);
2845 li(R0, 0); // dirty
2846 if (UseConcMarkSweepGC) membar(Assembler::StoreStore);
2847 stbx(R0, Rtmp, Robj);
2848 }
2849
2850 #if INCLUDE_ALL_GCS
2851 // General G1 pre-barrier generator.
2852 // Goal: record the previous value if it is not null.
2853 void MacroAssembler::g1_write_barrier_pre(Register Robj, RegisterOrConstant offset, Register Rpre_val,
2854 Register Rtmp1, Register Rtmp2, bool needs_frame) {
2855 Label runtime, filtered;
2856
2857 // Is marking active?
2858 if (in_bytes(SATBMarkQueue::byte_width_of_active()) == 4) {
2859 lwz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_active()), R16_thread);
2860 } else {
2861 guarantee(in_bytes(SATBMarkQueue::byte_width_of_active()) == 1, "Assumption");
2862 lbz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_active()), R16_thread);
2863 }
2864 cmpdi(CCR0, Rtmp1, 0);
2865 beq(CCR0, filtered);
2866
2867 // Do we need to load the previous value?
2868 if (Robj != noreg) {
2869 // Load the previous value...
2870 if (UseCompressedOops) {
2871 lwz(Rpre_val, offset, Robj);
2872 } else {
2873 ld(Rpre_val, offset, Robj);
2874 }
2875 // Previous value has been loaded into Rpre_val.
2876 }
2877 assert(Rpre_val != noreg, "must have a real register");
2878
2879 // Is the previous value null?
2880 cmpdi(CCR0, Rpre_val, 0);
2881 beq(CCR0, filtered);
2882
2883 if (Robj != noreg && UseCompressedOops) {
2884 decode_heap_oop_not_null(Rpre_val);
2885 }
2886
2887 // OK, it's not filtered, so we'll need to call enqueue. In the normal
2888 // case we can record the previous value in the thread-local SATB buffer;
2889 // only if the buffer is full do we make a runtime call, which may require
2890 // building a frame first (see needs_frame).
2891
2892 // Can we store original value in the thread's buffer?
2893 // Is index == 0?
2894 // (The index field is typed as size_t.)
2895 const Register Rbuffer = Rtmp1, Rindex = Rtmp2;
2896
2897 ld(Rindex, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_index()), R16_thread);
2898 cmpdi(CCR0, Rindex, 0);
2899 beq(CCR0, runtime); // If index == 0, goto runtime.
2900 ld(Rbuffer, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_buf()), R16_thread);
2901
2902 addi(Rindex, Rindex, -wordSize); // Decrement index.
2903 std(Rindex, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_index()), R16_thread);
2904
2905 // Record the previous value.
2906 stdx(Rpre_val, Rbuffer, Rindex);
2907 b(filtered);
2908
2909 bind(runtime);
2910
2911 // The VM call needs a frame to preserve LR/CR and to pass register arguments.
2912 if (needs_frame) {
2913 save_LR_CR(Rtmp1);
2914 push_frame_reg_args(0, Rtmp2);
2915 }
2916
2917 if (Rpre_val->is_volatile() && Robj == noreg) mr(R31, Rpre_val); // Save pre_val across C call if it was preloaded.
2918 call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), Rpre_val, R16_thread);
2919 if (Rpre_val->is_volatile() && Robj == noreg) mr(Rpre_val, R31); // restore
2920
2921 if (needs_frame) {
2922 pop_frame();
2923 restore_LR_CR(Rtmp1);
2924 }
2925
2926 bind(filtered);
2927 }
2928
2929 // General G1 post-barrier generator.
2930 // Store cross-region card.
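// Sketch of the filtering done below (illustration only):
//   if (region(store_addr) == region(new_val)) goto filtered;   // same region
//   if (*card_addr == g1_young_card_val)       goto filtered;
//   StoreLoad; reload card;
//   if (*card_addr == dirty_card_val)          goto filtered;
//   *card_addr = dirty; enqueue(card_addr);                     // runtime call if queue is full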
2931 void MacroAssembler::g1_write_barrier_post(Register Rstore_addr, Register Rnew_val, Register Rtmp1, Register Rtmp2, Register Rtmp3, Label *filtered_ext) { 2932 Label runtime, filtered_int; 2933 Label& filtered = (filtered_ext != NULL) ? *filtered_ext : filtered_int; 2934 assert_different_registers(Rstore_addr, Rnew_val, Rtmp1, Rtmp2); 2935 2936 G1SATBCardTableLoggingModRefBS* bs = 2937 barrier_set_cast<G1SATBCardTableLoggingModRefBS>(Universe::heap()->barrier_set()); 2938 2939 // Does store cross heap regions? 2940 if (G1RSBarrierRegionFilter) { 2941 xorr(Rtmp1, Rstore_addr, Rnew_val); 2942 srdi_(Rtmp1, Rtmp1, HeapRegion::LogOfHRGrainBytes); 2943 beq(CCR0, filtered); 2944 } 2945 2946 // Crosses regions, storing NULL? 2947 #ifdef ASSERT 2948 cmpdi(CCR0, Rnew_val, 0); 2949 asm_assert_ne("null oop not allowed (G1)", 0x322); // Checked by caller on PPC64, so following branch is obsolete: 2950 //beq(CCR0, filtered); 2951 #endif 2952 2953 // Storing region crossing non-NULL, is card already dirty? 2954 assert(sizeof(*bs->byte_map_base) == sizeof(jbyte), "adjust this code"); 2955 const Register Rcard_addr = Rtmp1; 2956 Register Rbase = Rtmp2; 2957 load_const_optimized(Rbase, (address)bs->byte_map_base, /*temp*/ Rtmp3); 2958 2959 srdi(Rcard_addr, Rstore_addr, CardTableModRefBS::card_shift); 2960 2961 // Get the address of the card. 2962 lbzx(/*card value*/ Rtmp3, Rbase, Rcard_addr); 2963 cmpwi(CCR0, Rtmp3, (int)G1SATBCardTableModRefBS::g1_young_card_val()); 2964 beq(CCR0, filtered); 2965 2966 membar(Assembler::StoreLoad); 2967 lbzx(/*card value*/ Rtmp3, Rbase, Rcard_addr); // Reload after membar. 2968 cmpwi(CCR0, Rtmp3 /* card value */, CardTableModRefBS::dirty_card_val()); 2969 beq(CCR0, filtered); 2970 2971 // Storing a region crossing, non-NULL oop, card is clean. 2972 // Dirty card and log. 2973 li(Rtmp3, CardTableModRefBS::dirty_card_val()); 2974 //release(); // G1: oops are allowed to get visible after dirty marking. 2975 stbx(Rtmp3, Rbase, Rcard_addr); 2976 2977 add(Rcard_addr, Rbase, Rcard_addr); // This is the address which needs to get enqueued. 2978 Rbase = noreg; // end of lifetime 2979 2980 const Register Rqueue_index = Rtmp2, 2981 Rqueue_buf = Rtmp3; 2982 ld(Rqueue_index, in_bytes(JavaThread::dirty_card_queue_offset() + DirtyCardQueue::byte_offset_of_index()), R16_thread); 2983 cmpdi(CCR0, Rqueue_index, 0); 2984 beq(CCR0, runtime); // index == 0 then jump to runtime 2985 ld(Rqueue_buf, in_bytes(JavaThread::dirty_card_queue_offset() + DirtyCardQueue::byte_offset_of_buf()), R16_thread); 2986 2987 addi(Rqueue_index, Rqueue_index, -wordSize); // decrement index 2988 std(Rqueue_index, in_bytes(JavaThread::dirty_card_queue_offset() + DirtyCardQueue::byte_offset_of_index()), R16_thread); 2989 2990 stdx(Rcard_addr, Rqueue_buf, Rqueue_index); // store card 2991 b(filtered); 2992 2993 bind(runtime); 2994 2995 // Save the live input values. 2996 call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), Rcard_addr, R16_thread); 2997 2998 bind(filtered_int); 2999 } 3000 #endif // INCLUDE_ALL_GCS 3001 3002 // Values for last_Java_pc, and last_Java_sp must comply to the rules 3003 // in frame_ppc.hpp. 3004 void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) { 3005 // Always set last_Java_pc and flags first because once last_Java_sp 3006 // is visible has_last_Java_frame is true and users will look at the 3007 // rest of the fields. (Note: flags should always be zero before we 3008 // get here so doesn't need to be set.) 
3009 3010 // Verify that last_Java_pc was zeroed on return to Java 3011 asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread, 3012 "last_Java_pc not zeroed before leaving Java", 0x200); 3013 3014 // When returning from calling out from Java mode the frame anchor's 3015 // last_Java_pc will always be set to NULL. It is set here so that 3016 // if we are doing a call to native (not VM) that we capture the 3017 // known pc and don't have to rely on the native call having a 3018 // standard frame linkage where we can find the pc. 3019 if (last_Java_pc != noreg) 3020 std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread); 3021 3022 // Set last_Java_sp last. 3023 std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread); 3024 } 3025 3026 void MacroAssembler::reset_last_Java_frame(void) { 3027 asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()), 3028 R16_thread, "SP was not set, still zero", 0x202); 3029 3030 BLOCK_COMMENT("reset_last_Java_frame {"); 3031 li(R0, 0); 3032 3033 // _last_Java_sp = 0 3034 std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread); 3035 3036 // _last_Java_pc = 0 3037 std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread); 3038 BLOCK_COMMENT("} reset_last_Java_frame"); 3039 } 3040 3041 void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1) { 3042 assert_different_registers(sp, tmp1); 3043 3044 // sp points to a TOP_IJAVA_FRAME, retrieve frame's PC via 3045 // TOP_IJAVA_FRAME_ABI. 3046 // FIXME: assert that we really have a TOP_IJAVA_FRAME here! 3047 address entry = pc(); 3048 load_const_optimized(tmp1, entry); 3049 3050 set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1); 3051 } 3052 3053 void MacroAssembler::get_vm_result(Register oop_result) { 3054 // Read: 3055 // R16_thread 3056 // R16_thread->in_bytes(JavaThread::vm_result_offset()) 3057 // 3058 // Updated: 3059 // oop_result 3060 // R16_thread->in_bytes(JavaThread::vm_result_offset()) 3061 3062 verify_thread(); 3063 3064 ld(oop_result, in_bytes(JavaThread::vm_result_offset()), R16_thread); 3065 li(R0, 0); 3066 std(R0, in_bytes(JavaThread::vm_result_offset()), R16_thread); 3067 3068 verify_oop(oop_result); 3069 } 3070 3071 void MacroAssembler::get_vm_result_2(Register metadata_result) { 3072 // Read: 3073 // R16_thread 3074 // R16_thread->in_bytes(JavaThread::vm_result_2_offset()) 3075 // 3076 // Updated: 3077 // metadata_result 3078 // R16_thread->in_bytes(JavaThread::vm_result_2_offset()) 3079 3080 ld(metadata_result, in_bytes(JavaThread::vm_result_2_offset()), R16_thread); 3081 li(R0, 0); 3082 std(R0, in_bytes(JavaThread::vm_result_2_offset()), R16_thread); 3083 } 3084 3085 Register MacroAssembler::encode_klass_not_null(Register dst, Register src) { 3086 Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided. 3087 if (Universe::narrow_klass_base() != 0) { 3088 // Use dst as temp if it is free. 
3089 sub_const_optimized(dst, current, Universe::narrow_klass_base(), R0); 3090 current = dst; 3091 } 3092 if (Universe::narrow_klass_shift() != 0) { 3093 srdi(dst, current, Universe::narrow_klass_shift()); 3094 current = dst; 3095 } 3096 return current; 3097 } 3098 3099 void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) { 3100 if (UseCompressedClassPointers) { 3101 Register compressedKlass = encode_klass_not_null(ck, klass); 3102 stw(compressedKlass, oopDesc::klass_offset_in_bytes(), dst_oop); 3103 } else { 3104 std(klass, oopDesc::klass_offset_in_bytes(), dst_oop); 3105 } 3106 } 3107 3108 void MacroAssembler::store_klass_gap(Register dst_oop, Register val) { 3109 if (UseCompressedClassPointers) { 3110 if (val == noreg) { 3111 val = R0; 3112 li(val, 0); 3113 } 3114 stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop); // klass gap if compressed 3115 } 3116 } 3117 3118 int MacroAssembler::instr_size_for_decode_klass_not_null() { 3119 if (!UseCompressedClassPointers) return 0; 3120 int num_instrs = 1; // shift or move 3121 if (Universe::narrow_klass_base() != 0) num_instrs = 7; // shift + load const + add 3122 return num_instrs * BytesPerInstWord; 3123 } 3124 3125 void MacroAssembler::decode_klass_not_null(Register dst, Register src) { 3126 assert(dst != R0, "Dst reg may not be R0, as R0 is used here."); 3127 if (src == noreg) src = dst; 3128 Register shifted_src = src; 3129 if (Universe::narrow_klass_shift() != 0 || 3130 Universe::narrow_klass_base() == 0 && src != dst) { // Move required. 3131 shifted_src = dst; 3132 sldi(shifted_src, src, Universe::narrow_klass_shift()); 3133 } 3134 if (Universe::narrow_klass_base() != 0) { 3135 add_const_optimized(dst, shifted_src, Universe::narrow_klass_base(), R0); 3136 } 3137 } 3138 3139 void MacroAssembler::load_klass(Register dst, Register src) { 3140 if (UseCompressedClassPointers) { 3141 lwz(dst, oopDesc::klass_offset_in_bytes(), src); 3142 // Attention: no null check here! 3143 decode_klass_not_null(dst, dst); 3144 } else { 3145 ld(dst, oopDesc::klass_offset_in_bytes(), src); 3146 } 3147 } 3148 3149 void MacroAssembler::load_mirror(Register mirror, Register method) { 3150 const int mirror_offset = in_bytes(Klass::java_mirror_offset()); 3151 ld(mirror, in_bytes(Method::const_offset()), method); 3152 ld(mirror, in_bytes(ConstMethod::constants_offset()), mirror); 3153 ld(mirror, ConstantPool::pool_holder_offset_in_bytes(), mirror); 3154 ld(mirror, mirror_offset, mirror); 3155 } 3156 3157 // Clear Array 3158 // Kills both input registers. tmp == R0 is allowed. 3159 void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp) { 3160 // Procedure for large arrays (uses data cache block zero instruction). 3161 Label startloop, fast, fastloop, small_rest, restloop, done; 3162 const int cl_size = VM_Version::L1_data_cache_line_size(), 3163 cl_dwords = cl_size>>3, 3164 cl_dw_addr_bits = exact_log2(cl_dwords), 3165 dcbz_min = 1; // Min count of dcbz executions, needs to be >0. 3166 3167 //2: 3168 cmpdi(CCR1, cnt_dwords, ((dcbz_min+1)<<cl_dw_addr_bits)-1); // Big enough? (ensure >=dcbz_min lines included). 3169 blt(CCR1, small_rest); // Too small. 3170 rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits); // Extract dword offset within first cache line. 3171 beq(CCR0, fast); // Already 128byte aligned. 3172 3173 subfic(tmp, tmp, cl_dwords); 3174 mtctr(tmp); // Set ctr to hit 128byte boundary (0<ctr<cl_dwords). 3175 subf(cnt_dwords, tmp, cnt_dwords); // rest. 
3176 li(tmp, 0); 3177 //10: 3178 bind(startloop); // Clear at the beginning to reach 128byte boundary. 3179 std(tmp, 0, base_ptr); // Clear 8byte aligned block. 3180 addi(base_ptr, base_ptr, 8); 3181 bdnz(startloop); 3182 //13: 3183 bind(fast); // Clear 128byte blocks. 3184 srdi(tmp, cnt_dwords, cl_dw_addr_bits); // Loop count for 128byte loop (>0). 3185 andi(cnt_dwords, cnt_dwords, cl_dwords-1); // Rest in dwords. 3186 mtctr(tmp); // Load counter. 3187 //16: 3188 bind(fastloop); 3189 dcbz(base_ptr); // Clear 128byte aligned block. 3190 addi(base_ptr, base_ptr, cl_size); 3191 bdnz(fastloop); 3192 if (InsertEndGroupPPC64) { endgroup(); } else { nop(); } 3193 //20: 3194 bind(small_rest); 3195 cmpdi(CCR0, cnt_dwords, 0); // size 0? 3196 beq(CCR0, done); // rest == 0 3197 li(tmp, 0); 3198 mtctr(cnt_dwords); // Load counter. 3199 //24: 3200 bind(restloop); // Clear rest. 3201 std(tmp, 0, base_ptr); // Clear 8byte aligned block. 3202 addi(base_ptr, base_ptr, 8); 3203 bdnz(restloop); 3204 //27: 3205 bind(done); 3206 } 3207 3208 /////////////////////////////////////////// String intrinsics //////////////////////////////////////////// 3209 3210 #ifdef COMPILER2 3211 // Intrinsics for CompactStrings 3212 3213 // Compress char[] to byte[] by compressing 16 bytes at once. 3214 void MacroAssembler::string_compress_16(Register src, Register dst, Register cnt, 3215 Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, 3216 Label& Lfailure) { 3217 3218 const Register tmp0 = R0; 3219 assert_different_registers(src, dst, cnt, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5); 3220 Label Lloop, Lslow; 3221 3222 // Check if cnt >= 8 (= 16 bytes) 3223 lis(tmp1, 0xFF); // tmp1 = 0x00FF00FF00FF00FF 3224 srwi_(tmp2, cnt, 3); 3225 beq(CCR0, Lslow); 3226 ori(tmp1, tmp1, 0xFF); 3227 rldimi(tmp1, tmp1, 32, 0); 3228 mtctr(tmp2); 3229 3230 // 2x unrolled loop 3231 bind(Lloop); 3232 ld(tmp2, 0, src); // _0_1_2_3 (Big Endian) 3233 ld(tmp4, 8, src); // _4_5_6_7 3234 3235 orr(tmp0, tmp2, tmp4); 3236 rldicl(tmp3, tmp2, 6*8, 64-24); // _____1_2 3237 rldimi(tmp2, tmp2, 2*8, 2*8); // _0_2_3_3 3238 rldicl(tmp5, tmp4, 6*8, 64-24); // _____5_6 3239 rldimi(tmp4, tmp4, 2*8, 2*8); // _4_6_7_7 3240 3241 andc_(tmp0, tmp0, tmp1); 3242 bne(CCR0, Lfailure); // Not latin1. 3243 addi(src, src, 16); 3244 3245 rlwimi(tmp3, tmp2, 0*8, 24, 31);// _____1_3 3246 srdi(tmp2, tmp2, 3*8); // ____0_2_ 3247 rlwimi(tmp5, tmp4, 0*8, 24, 31);// _____5_7 3248 srdi(tmp4, tmp4, 3*8); // ____4_6_ 3249 3250 orr(tmp2, tmp2, tmp3); // ____0123 3251 orr(tmp4, tmp4, tmp5); // ____4567 3252 3253 stw(tmp2, 0, dst); 3254 stw(tmp4, 4, dst); 3255 addi(dst, dst, 8); 3256 bdnz(Lloop); 3257 3258 bind(Lslow); // Fallback to slow version 3259 } 3260 3261 // Compress char[] to byte[]. cnt must be positive int. 3262 void MacroAssembler::string_compress(Register src, Register dst, Register cnt, Register tmp, Label& Lfailure) { 3263 Label Lloop; 3264 mtctr(cnt); 3265 3266 bind(Lloop); 3267 lhz(tmp, 0, src); 3268 cmplwi(CCR0, tmp, 0xff); 3269 bgt(CCR0, Lfailure); // Not latin1. 3270 addi(src, src, 2); 3271 stb(tmp, 0, dst); 3272 addi(dst, dst, 1); 3273 bdnz(Lloop); 3274 } 3275 3276 // Inflate byte[] to char[] by inflating 16 bytes at once. 
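
// As a scalar reference for the unrolled code below (illustrative C only,
// assuming a byte-addressable src and jchar-sized dst):
//   for (int i = 0; i < cnt; i++) dst[i] = (jchar)(src[i] & 0xFF);
// The fast path handles 8 source bytes (16 result bytes) per iteration.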
void MacroAssembler::string_inflate_16(Register src, Register dst, Register cnt,
                                       Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) {
  const Register tmp0 = R0;
  assert_different_registers(src, dst, cnt, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5);
  Label Lloop, Lslow;

  // Check if cnt >= 8
  srwi_(tmp2, cnt, 3);
  beq(CCR0, Lslow);
  lis(tmp1, 0xFF); // tmp1 = 0x00FF00FF
  ori(tmp1, tmp1, 0xFF);
  mtctr(tmp2);

  // 2x unrolled loop
  bind(Lloop);
  lwz(tmp2, 0, src); // ____0123 (Big Endian)
  lwz(tmp4, 4, src); // ____4567
  addi(src, src, 8);

  rldicl(tmp3, tmp2, 7*8, 64-8);  // _______2
  rlwimi(tmp2, tmp2, 3*8, 16, 23);// ____0113
  rldicl(tmp5, tmp4, 7*8, 64-8);  // _______6
  rlwimi(tmp4, tmp4, 3*8, 16, 23);// ____4557

  andc(tmp0, tmp2, tmp1);         // ____0_1_
  rlwimi(tmp2, tmp3, 2*8, 0, 23); // _____2_3
  andc(tmp3, tmp4, tmp1);         // ____4_5_
  rlwimi(tmp4, tmp5, 2*8, 0, 23); // _____6_7

  rldimi(tmp2, tmp0, 3*8, 0*8);   // _0_1_2_3
  rldimi(tmp4, tmp3, 3*8, 0*8);   // _4_5_6_7

  std(tmp2, 0, dst);
  std(tmp4, 8, dst);
  addi(dst, dst, 16);
  bdnz(Lloop);

  bind(Lslow); // Fallback to slow version
}

// Inflate byte[] to char[]. cnt must be positive int.
void MacroAssembler::string_inflate(Register src, Register dst, Register cnt, Register tmp) {
  Label Lloop;
  mtctr(cnt);

  bind(Lloop);
  lbz(tmp, 0, src);
  addi(src, src, 1);
  sth(tmp, 0, dst);
  addi(dst, dst, 2);
  bdnz(Lloop);
}

void MacroAssembler::string_compare(Register str1, Register str2,
                                    Register cnt1, Register cnt2,
                                    Register tmp1, Register result, int ae) {
  const Register tmp0 = R0,
                 diff = tmp1;

  assert_different_registers(str1, str2, cnt1, cnt2, tmp0, tmp1, result);
  Label Ldone, Lslow, Lloop, Lreturn_diff;

  // Note: Making use of the fact that compareTo(a, b) == -compareTo(b, a),
  // we interchange str1 and str2 in the UL case and negate the result.
  // This way, str1 is always latin1 encoded, except for the UU case.
  // In addition, the counts need to be zero-extended (sign extension would
  // do as well, since the sign is 0).

  if (ae == StrIntrinsicNode::UU) {
    srwi(cnt1, cnt1, 1);
  } else {
    clrldi(cnt1, cnt1, 32);
  }

  if (ae != StrIntrinsicNode::LL) {
    srwi(cnt2, cnt2, 1);
  } else {
    clrldi(cnt2, cnt2, 32);
  }

  // See if the lengths are different, and calculate min in cnt1.
  // Save diff in case we need it for a tie-breaker.
  subf_(diff, cnt2, cnt1); // diff = cnt1 - cnt2
  // if (diff > 0) { cnt1 = cnt2; }
  if (VM_Version::has_isel()) {
    isel(cnt1, CCR0, Assembler::greater, /*invert*/ false, cnt2);
  } else {
    Label Lskip;
    blt(CCR0, Lskip);
    mr(cnt1, cnt2);
    bind(Lskip);
  }

  // Rename registers
  Register chr1 = result;
  Register chr2 = tmp0;

  // Compare multiple characters in fast loop (only implemented for same encoding).
  int stride1 = 8, stride2 = 8;
  if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
    int log2_chars_per_iter = (ae == StrIntrinsicNode::LL) ? 3 : 2;
    Label Lfastloop, Lskipfast;

    srwi_(tmp0, cnt1, log2_chars_per_iter);
    beq(CCR0, Lskipfast);
    rldicl(cnt2, cnt1, 0, 64 - log2_chars_per_iter); // Remaining characters.
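    // Equivalent C (illustrative): cnt2 = cnt1 & ((1 << log2_chars_per_iter) - 1).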
3382 li(cnt1, 1 << log2_chars_per_iter); // Initialize for failure case: Rescan characters from current iteration. 3383 mtctr(tmp0); 3384 3385 bind(Lfastloop); 3386 ld(chr1, 0, str1); 3387 ld(chr2, 0, str2); 3388 cmpd(CCR0, chr1, chr2); 3389 bne(CCR0, Lslow); 3390 addi(str1, str1, stride1); 3391 addi(str2, str2, stride2); 3392 bdnz(Lfastloop); 3393 mr(cnt1, cnt2); // Remaining characters. 3394 bind(Lskipfast); 3395 } 3396 3397 // Loop which searches the first difference character by character. 3398 cmpwi(CCR0, cnt1, 0); 3399 beq(CCR0, Lreturn_diff); 3400 bind(Lslow); 3401 mtctr(cnt1); 3402 3403 switch (ae) { 3404 case StrIntrinsicNode::LL: stride1 = 1; stride2 = 1; break; 3405 case StrIntrinsicNode::UL: // fallthru (see comment above) 3406 case StrIntrinsicNode::LU: stride1 = 1; stride2 = 2; break; 3407 case StrIntrinsicNode::UU: stride1 = 2; stride2 = 2; break; 3408 default: ShouldNotReachHere(); break; 3409 } 3410 3411 bind(Lloop); 3412 if (stride1 == 1) { lbz(chr1, 0, str1); } else { lhz(chr1, 0, str1); } 3413 if (stride2 == 1) { lbz(chr2, 0, str2); } else { lhz(chr2, 0, str2); } 3414 subf_(result, chr2, chr1); // result = chr1 - chr2 3415 bne(CCR0, Ldone); 3416 addi(str1, str1, stride1); 3417 addi(str2, str2, stride2); 3418 bdnz(Lloop); 3419 3420 // If strings are equal up to min length, return the length difference. 3421 bind(Lreturn_diff); 3422 mr(result, diff); 3423 3424 // Otherwise, return the difference between the first mismatched chars. 3425 bind(Ldone); 3426 if (ae == StrIntrinsicNode::UL) { 3427 neg(result, result); // Negate result (see note above). 3428 } 3429 } 3430 3431 void MacroAssembler::array_equals(bool is_array_equ, Register ary1, Register ary2, 3432 Register limit, Register tmp1, Register result, bool is_byte) { 3433 const Register tmp0 = R0; 3434 assert_different_registers(ary1, ary2, limit, tmp0, tmp1, result); 3435 Label Ldone, Lskiploop, Lloop, Lfastloop, Lskipfast; 3436 bool limit_needs_shift = false; 3437 3438 if (is_array_equ) { 3439 const int length_offset = arrayOopDesc::length_offset_in_bytes(); 3440 const int base_offset = arrayOopDesc::base_offset_in_bytes(is_byte ? T_BYTE : T_CHAR); 3441 3442 // Return true if the same array. 3443 cmpd(CCR0, ary1, ary2); 3444 beq(CCR0, Lskiploop); 3445 3446 // Return false if one of them is NULL. 3447 cmpdi(CCR0, ary1, 0); 3448 cmpdi(CCR1, ary2, 0); 3449 li(result, 0); 3450 cror(CCR0, Assembler::equal, CCR1, Assembler::equal); 3451 beq(CCR0, Ldone); 3452 3453 // Load the lengths of arrays. 3454 lwz(limit, length_offset, ary1); 3455 lwz(tmp0, length_offset, ary2); 3456 3457 // Return false if the two arrays are not equal length. 3458 cmpw(CCR0, limit, tmp0); 3459 bne(CCR0, Ldone); 3460 3461 // Load array addresses. 3462 addi(ary1, ary1, base_offset); 3463 addi(ary2, ary2, base_offset); 3464 } else { 3465 limit_needs_shift = !is_byte; 3466 li(result, 0); // Assume not equal. 3467 } 3468 3469 // Rename registers 3470 Register chr1 = tmp0; 3471 Register chr2 = tmp1; 3472 3473 // Compare 8 bytes per iteration in fast loop. 3474 const int log2_chars_per_iter = is_byte ? 3 : 2; 3475 3476 srwi_(tmp0, limit, log2_chars_per_iter + (limit_needs_shift ? 1 : 0)); 3477 beq(CCR0, Lskipfast); 3478 mtctr(tmp0); 3479 3480 bind(Lfastloop); 3481 ld(chr1, 0, ary1); 3482 ld(chr2, 0, ary2); 3483 addi(ary1, ary1, 8); 3484 addi(ary2, ary2, 8); 3485 cmpd(CCR0, chr1, chr2); 3486 bne(CCR0, Ldone); 3487 bdnz(Lfastloop); 3488 3489 bind(Lskipfast); 3490 rldicl_(limit, limit, limit_needs_shift ? 
64 - 1 : 0, 64 - log2_chars_per_iter); // Remaining characters. 3491 beq(CCR0, Lskiploop); 3492 mtctr(limit); 3493 3494 // Character by character. 3495 bind(Lloop); 3496 if (is_byte) { 3497 lbz(chr1, 0, ary1); 3498 lbz(chr2, 0, ary2); 3499 addi(ary1, ary1, 1); 3500 addi(ary2, ary2, 1); 3501 } else { 3502 lhz(chr1, 0, ary1); 3503 lhz(chr2, 0, ary2); 3504 addi(ary1, ary1, 2); 3505 addi(ary2, ary2, 2); 3506 } 3507 cmpw(CCR0, chr1, chr2); 3508 bne(CCR0, Ldone); 3509 bdnz(Lloop); 3510 3511 bind(Lskiploop); 3512 li(result, 1); // All characters are equal. 3513 bind(Ldone); 3514 } 3515 3516 void MacroAssembler::string_indexof(Register result, Register haystack, Register haycnt, 3517 Register needle, ciTypeArray* needle_values, Register needlecnt, int needlecntval, 3518 Register tmp1, Register tmp2, Register tmp3, Register tmp4, int ae) { 3519 3520 // Ensure 0<needlecnt<=haycnt in ideal graph as prerequisite! 3521 Label L_TooShort, L_Found, L_NotFound, L_End; 3522 Register last_addr = haycnt, // Kill haycnt at the beginning. 3523 addr = tmp1, 3524 n_start = tmp2, 3525 ch1 = tmp3, 3526 ch2 = R0; 3527 3528 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 3529 const int h_csize = (ae == StrIntrinsicNode::LL) ? 1 : 2; 3530 const int n_csize = (ae == StrIntrinsicNode::UU) ? 2 : 1; 3531 3532 // ************************************************************************************************** 3533 // Prepare for main loop: optimized for needle count >=2, bail out otherwise. 3534 // ************************************************************************************************** 3535 3536 // Compute last haystack addr to use if no match gets found. 3537 clrldi(haycnt, haycnt, 32); // Ensure positive int is valid as 64 bit value. 3538 addi(addr, haystack, -h_csize); // Accesses use pre-increment. 3539 if (needlecntval == 0) { // variable needlecnt 3540 cmpwi(CCR6, needlecnt, 2); 3541 clrldi(needlecnt, needlecnt, 32); // Ensure positive int is valid as 64 bit value. 3542 blt(CCR6, L_TooShort); // Variable needlecnt: handle short needle separately. 3543 } 3544 3545 if (n_csize == 2) { lwz(n_start, 0, needle); } else { lhz(n_start, 0, needle); } // Load first 2 characters of needle. 3546 3547 if (needlecntval == 0) { // variable needlecnt 3548 subf(ch1, needlecnt, haycnt); // Last character index to compare is haycnt-needlecnt. 3549 addi(needlecnt, needlecnt, -2); // Rest of needle. 3550 } else { // constant needlecnt 3551 guarantee(needlecntval != 1, "IndexOf with single-character needle must be handled separately"); 3552 assert((needlecntval & 0x7fff) == needlecntval, "wrong immediate"); 3553 addi(ch1, haycnt, -needlecntval); // Last character index to compare is haycnt-needlecnt. 3554 if (needlecntval > 3) { li(needlecnt, needlecntval - 2); } // Rest of needle. 3555 } 3556 3557 if (h_csize == 2) { slwi(ch1, ch1, 1); } // Scale to number of bytes. 3558 3559 if (ae ==StrIntrinsicNode::UL) { 3560 srwi(tmp4, n_start, 1*8); // ___0 3561 rlwimi(n_start, tmp4, 2*8, 0, 23); // _0_1 3562 } 3563 3564 add(last_addr, haystack, ch1); // Point to last address to compare (haystack+2*(haycnt-needlecnt)). 3565 3566 // Main Loop (now we have at least 2 characters). 3567 Label L_OuterLoop, L_InnerLoop, L_FinalCheck, L_Comp1, L_Comp2; 3568 bind(L_OuterLoop); // Search for 1st 2 characters. 3569 Register addr_diff = tmp4; 3570 subf(addr_diff, addr, last_addr); // Difference between already checked address and last address to check. 3571 addi(addr, addr, h_csize); // This is the new address we want to use for comparing. 
3572 srdi_(ch2, addr_diff, h_csize); 3573 beq(CCR0, L_FinalCheck); // 2 characters left? 3574 mtctr(ch2); // num of characters / 2 3575 bind(L_InnerLoop); // Main work horse (2x unrolled search loop) 3576 if (h_csize == 2) { // Load 2 characters of haystack (ignore alignment). 3577 lwz(ch1, 0, addr); 3578 lwz(ch2, 2, addr); 3579 } else { 3580 lhz(ch1, 0, addr); 3581 lhz(ch2, 1, addr); 3582 } 3583 cmpw(CCR0, ch1, n_start); // Compare 2 characters (1 would be sufficient but try to reduce branches to CompLoop). 3584 cmpw(CCR1, ch2, n_start); 3585 beq(CCR0, L_Comp1); // Did we find the needle start? 3586 beq(CCR1, L_Comp2); 3587 addi(addr, addr, 2 * h_csize); 3588 bdnz(L_InnerLoop); 3589 bind(L_FinalCheck); 3590 andi_(addr_diff, addr_diff, h_csize); // Remaining characters not covered by InnerLoop: (num of characters) & 1. 3591 beq(CCR0, L_NotFound); 3592 if (h_csize == 2) { lwz(ch1, 0, addr); } else { lhz(ch1, 0, addr); } // One position left at which we have to compare. 3593 cmpw(CCR1, ch1, n_start); 3594 beq(CCR1, L_Comp1); 3595 bind(L_NotFound); 3596 li(result, -1); // not found 3597 b(L_End); 3598 3599 // ************************************************************************************************** 3600 // Special Case: unfortunately, the variable needle case can be called with needlecnt<2 3601 // ************************************************************************************************** 3602 if (needlecntval == 0) { // We have to handle these cases separately. 3603 Label L_OneCharLoop; 3604 bind(L_TooShort); 3605 mtctr(haycnt); 3606 if (n_csize == 2) { lhz(n_start, 0, needle); } else { lbz(n_start, 0, needle); } // First character of needle 3607 bind(L_OneCharLoop); 3608 if (h_csize == 2) { lhzu(ch1, 2, addr); } else { lbzu(ch1, 1, addr); } 3609 cmpw(CCR1, ch1, n_start); 3610 beq(CCR1, L_Found); // Did we find the one character needle? 3611 bdnz(L_OneCharLoop); 3612 li(result, -1); // Not found. 3613 b(L_End); 3614 } 3615 3616 // ************************************************************************************************** 3617 // Regular Case Part II: compare rest of needle (first 2 characters have been compared already) 3618 // ************************************************************************************************** 3619 3620 // Compare the rest 3621 bind(L_Comp2); 3622 addi(addr, addr, h_csize); // First comparison has failed, 2nd one hit. 3623 bind(L_Comp1); // Addr points to possible needle start. 3624 if (needlecntval != 2) { // Const needlecnt==2? 3625 if (needlecntval != 3) { 3626 if (needlecntval == 0) { beq(CCR6, L_Found); } // Variable needlecnt==2? 3627 Register n_ind = tmp4, 3628 h_ind = n_ind; 3629 li(n_ind, 2 * n_csize); // First 2 characters are already compared, use index 2. 3630 mtctr(needlecnt); // Decremented by 2, still > 0. 3631 Label L_CompLoop; 3632 bind(L_CompLoop); 3633 if (ae ==StrIntrinsicNode::UL) { 3634 h_ind = ch1; 3635 sldi(h_ind, n_ind, 1); 3636 } 3637 if (n_csize == 2) { lhzx(ch2, needle, n_ind); } else { lbzx(ch2, needle, n_ind); } 3638 if (h_csize == 2) { lhzx(ch1, addr, h_ind); } else { lbzx(ch1, addr, h_ind); } 3639 cmpw(CCR1, ch1, ch2); 3640 bne(CCR1, L_OuterLoop); 3641 addi(n_ind, n_ind, n_csize); 3642 bdnz(L_CompLoop); 3643 } else { // No loop required if there's only one needle character left. 
      if (n_csize == 2) { lhz(ch2, 2 * 2, needle); } else { lbz(ch2, 2 * 1, needle); }
      if (h_csize == 2) { lhz(ch1, 2 * 2, addr); } else { lbz(ch1, 2 * 1, addr); }
      cmpw(CCR1, ch1, ch2);
      bne(CCR1, L_OuterLoop);
    }
  }
  // Return index ...
  bind(L_Found);
  subf(result, haystack, addr); // relative to haystack, ...
  if (h_csize == 2) { srdi(result, result, 1); } // in characters.
  bind(L_End);
} // string_indexof

void MacroAssembler::string_indexof_char(Register result, Register haystack, Register haycnt,
                                         Register needle, jchar needleChar, Register tmp1, Register tmp2, bool is_byte) {
  assert_different_registers(haystack, haycnt, needle, tmp1, tmp2);

  Label L_InnerLoop, L_FinalCheck, L_Found1, L_Found2, L_NotFound, L_End;
  Register addr = tmp1,
           ch1 = tmp2,
           ch2 = R0;

  const int h_csize = is_byte ? 1 : 2;

  //4:
  srwi_(tmp2, haycnt, 1); // Shift right by exact_log2(UNROLL_FACTOR).
  mr(addr, haystack);
  beq(CCR0, L_FinalCheck);
  mtctr(tmp2); // Move to count register.
  //8:
  bind(L_InnerLoop); // Main work horse (2x unrolled search loop).
  if (!is_byte) {
    lhz(ch1, 0, addr);
    lhz(ch2, 2, addr);
  } else {
    lbz(ch1, 0, addr);
    lbz(ch2, 1, addr);
  }
  (needle != R0) ? cmpw(CCR0, ch1, needle) : cmplwi(CCR0, ch1, (unsigned int)needleChar);
  (needle != R0) ? cmpw(CCR1, ch2, needle) : cmplwi(CCR1, ch2, (unsigned int)needleChar);
  beq(CCR0, L_Found1); // Did we find the needle?
  beq(CCR1, L_Found2);
  addi(addr, addr, 2 * h_csize);
  bdnz(L_InnerLoop);
  //16:
  bind(L_FinalCheck);
  andi_(R0, haycnt, 1);
  beq(CCR0, L_NotFound);
  if (!is_byte) { lhz(ch1, 0, addr); } else { lbz(ch1, 0, addr); } // One position left at which we have to compare.
  (needle != R0) ? cmpw(CCR1, ch1, needle) : cmplwi(CCR1, ch1, (unsigned int)needleChar);
  beq(CCR1, L_Found1);
  //21:
  bind(L_NotFound);
  li(result, -1); // Not found.
  b(L_End);

  bind(L_Found2);
  addi(addr, addr, h_csize);
  //24:
  bind(L_Found1); // Return index ...
  subf(result, haystack, addr); // relative to haystack, ...
  if (!is_byte) { srdi(result, result, 1); } // in characters.
  bind(L_End);
} // string_indexof_char


void MacroAssembler::has_negatives(Register src, Register cnt, Register result,
                                   Register tmp1, Register tmp2) {
  const Register tmp0 = R0;
  assert_different_registers(src, result, cnt, tmp0, tmp1, tmp2);
  Label Lfastloop, Lslow, Lloop, Lnoneg, Ldone;

  // Check if cnt >= 16 (the fast loop consumes 16 bytes = 2 doublewords per iteration).
  lis(tmp1, (int)(short)0x8080); // tmp1 = 0x8080808080808080
  srwi_(tmp2, cnt, 4);
  li(result, 1); // Assume there's a negative byte.
  beq(CCR0, Lslow);
  ori(tmp1, tmp1, 0x8080);
  rldimi(tmp1, tmp1, 32, 0);
  mtctr(tmp2);

  // 2x unrolled loop
  bind(Lfastloop);
  ld(tmp2, 0, src);
  ld(tmp0, 8, src);

  orr(tmp0, tmp2, tmp0);

  and_(tmp0, tmp0, tmp1);
  bne(CCR0, Ldone); // Found negative byte.
  addi(src, src, 16);

  bdnz(Lfastloop);

  bind(Lslow); // Fallback to slow version
  rldicl_(tmp0, cnt, 0, 64-4);
  beq(CCR0, Lnoneg);
  mtctr(tmp0);
  bind(Lloop);
  lbz(tmp0, 0, src);
  addi(src, src, 1);
  andi_(tmp0, tmp0, 0x80);
  bne(CCR0, Ldone); // Found negative byte.
  bdnz(Lloop);
  bind(Lnoneg);
  li(result, 0);

  bind(Ldone);
}


// Intrinsics for non-CompactStrings

// Search for a single jchar in a jchar[].
//
// Assumes that result differs from all other registers.
//
// 'haystack' is the address of a jchar-array.
// 'needle' is either the character to search for or R0.
// 'needleChar' is the character to search for if 'needle' == R0.
// 'haycnt' is the length of the haystack. We assume 'haycnt' >= 1.
//
// Preserves haystack, haycnt, needle and kills all other registers.
//
// If needle == R0, we search for the constant needleChar.
void MacroAssembler::string_indexof_1(Register result, Register haystack, Register haycnt,
                                      Register needle, jchar needleChar,
                                      Register tmp1, Register tmp2) {

  assert_different_registers(result, haystack, haycnt, needle, tmp1, tmp2);

  Label L_InnerLoop, L_FinalCheck, L_Found1, L_Found2, L_Found3, L_NotFound, L_End;
  Register addr = tmp1,
           ch1 = tmp2,
           ch2 = R0;

  //3:
  dcbtct(haystack, 0x00); // Indicate R/O access to haystack.

  srwi_(tmp2, haycnt, 1); // Shift right by exact_log2(UNROLL_FACTOR).
  mr(addr, haystack);
  beq(CCR0, L_FinalCheck);
  mtctr(tmp2); // Move to count register.
  //8:
  bind(L_InnerLoop); // Main work horse (2x unrolled search loop).
  lhz(ch1, 0, addr); // Load characters from haystack.
  lhz(ch2, 2, addr);
  (needle != R0) ? cmpw(CCR0, ch1, needle) : cmplwi(CCR0, ch1, needleChar);
  (needle != R0) ? cmpw(CCR1, ch2, needle) : cmplwi(CCR1, ch2, needleChar);
  beq(CCR0, L_Found1); // Did we find the needle?
  beq(CCR1, L_Found2);
  addi(addr, addr, 4);
  bdnz(L_InnerLoop);
  //16:
  bind(L_FinalCheck);
  andi_(R0, haycnt, 1);
  beq(CCR0, L_NotFound);
  lhz(ch1, 0, addr); // One position left at which we have to compare.
  (needle != R0) ? cmpw(CCR1, ch1, needle) : cmplwi(CCR1, ch1, needleChar);
  beq(CCR1, L_Found3);
  //21:
  bind(L_NotFound);
  li(result, -1); // Not found.
  b(L_End);

  bind(L_Found2);
  addi(addr, addr, 2);
  //24:
  bind(L_Found1);
  bind(L_Found3); // Return index ...
  subf(addr, haystack, addr); // relative to haystack,
  srdi(result, addr, 1);      // in characters.
  bind(L_End);
}


// Implementation of IndexOf for jchar arrays.
//
// The lengths of haystack and needle are not constant, i.e. they are passed in registers.
//
// Preserves registers haystack, needle.
// Kills registers haycnt, needlecnt.
// Assumes that result differs from all other registers.
// Haystack, needle are the addresses of jchar-arrays.
// Haycnt, needlecnt are the lengths of them, respectively.
//
// Needlecntval must be zero or a 15-bit unsigned immediate > 1.
void MacroAssembler::string_indexof(Register result, Register haystack, Register haycnt,
                                    Register needle, ciTypeArray* needle_values, Register needlecnt, int needlecntval,
                                    Register tmp1, Register tmp2, Register tmp3, Register tmp4) {

  // Ensure 0<needlecnt<=haycnt in ideal graph as prerequisite!
  Label L_TooShort, L_Found, L_NotFound, L_End;
  Register last_addr = haycnt, // Kill haycnt at the beginning.
           addr = tmp1,
           n_start = tmp2,
           ch1 = tmp3,
           ch2 = R0;

  // **************************************************************************************************
  // Prepare for main loop: optimized for needle count >=2, bail out otherwise.
  // **************************************************************************************************

  //1 (variable) or 3 (const):
  dcbtct(needle, 0x00);   // Indicate R/O access to needle.
  dcbtct(haystack, 0x00); // Indicate R/O access to haystack.

  // Compute last haystack addr to use if no match gets found.
  if (needlecntval == 0) { // variable needlecnt
    //3:
    subf(ch1, needlecnt, haycnt);   // Last character index to compare is haycnt-needlecnt.
    addi(addr, haystack, -2);       // Accesses use pre-increment.
    cmpwi(CCR6, needlecnt, 2);
    blt(CCR6, L_TooShort);          // Variable needlecnt: handle short needle separately.
    slwi(ch1, ch1, 1);              // Scale to number of bytes.
    lwz(n_start, 0, needle);        // Load first 2 characters of needle.
    add(last_addr, haystack, ch1);  // Point to last address to compare (haystack+2*(haycnt-needlecnt)).
    addi(needlecnt, needlecnt, -2); // Rest of needle.
  } else { // constant needlecnt
    guarantee(needlecntval != 1, "IndexOf with single-character needle must be handled separately");
    assert((needlecntval & 0x7fff) == needlecntval, "wrong immediate");
    //5:
    addi(ch1, haycnt, -needlecntval); // Last character index to compare is haycnt-needlecnt.
    lwz(n_start, 0, needle);          // Load first 2 characters of needle.
    addi(addr, haystack, -2);         // Accesses use pre-increment.
    slwi(ch1, ch1, 1);                // Scale to number of bytes.
    add(last_addr, haystack, ch1);    // Point to last address to compare (haystack+2*(haycnt-needlecnt)).
    li(needlecnt, needlecntval-2);    // Rest of needle.
  }

  // Main Loop (now we have at least 3 characters).
  //11:
  Label L_OuterLoop, L_InnerLoop, L_FinalCheck, L_Comp1, L_Comp2, L_Comp3;
  bind(L_OuterLoop); // Search for 1st 2 characters.
  Register addr_diff = tmp4;
  subf(addr_diff, addr, last_addr); // Difference between already checked address and last address to check.
  addi(addr, addr, 2);              // This is the new address we want to use for comparing.
  srdi_(ch2, addr_diff, 2);
  beq(CCR0, L_FinalCheck);          // 2 characters left?
  mtctr(ch2);                       // addr_diff/4
  //16:
  bind(L_InnerLoop);                // Main work horse (2x unrolled search loop)
  lwz(ch1, 0, addr);                // Load 2 characters of haystack (ignore alignment).
  lwz(ch2, 2, addr);
  cmpw(CCR0, ch1, n_start);         // Compare 2 characters (1 would be sufficient but try to reduce branches to CompLoop).
  cmpw(CCR1, ch2, n_start);
  beq(CCR0, L_Comp1);               // Did we find the needle start?
  beq(CCR1, L_Comp2);
  addi(addr, addr, 4);
  bdnz(L_InnerLoop);
  //24:
  bind(L_FinalCheck);
  rldicl_(addr_diff, addr_diff, 64-1, 63); // Remaining characters not covered by InnerLoop: (addr_diff>>1)&1.
  beq(CCR0, L_NotFound);
  lwz(ch1, 0, addr);                // One position left at which we have to compare.
  cmpw(CCR1, ch1, n_start);
  beq(CCR1, L_Comp3);
  //29:
  bind(L_NotFound);
  li(result, -1); // not found
  b(L_End);


  // **************************************************************************************************
  // Special Case: unfortunately, the variable needle case can be called with needlecnt<2
  // **************************************************************************************************
  //31:
  if ((needlecntval >> 1) != 1) { // Const needlecnt is 2 or 3? Reduce code size.
    int nopcnt = 5;
    if (needlecntval != 0) ++nopcnt; // Balance alignment (other case: see below).
    if (needlecntval == 0) { // We have to handle these cases separately.
      Label L_OneCharLoop;
      bind(L_TooShort);
      mtctr(haycnt);
      lhz(n_start, 0, needle); // First character of needle
      bind(L_OneCharLoop);
      lhzu(ch1, 2, addr);
      cmpw(CCR1, ch1, n_start);
      beq(CCR1, L_Found); // Did we find the one character needle?
      bdnz(L_OneCharLoop);
      li(result, -1);     // Not found.
      b(L_End);
    } // 8 instructions, so no impact on alignment.
    for (int x = 0; x < nopcnt; ++x) nop();
  }

  // **************************************************************************************************
  // Regular Case Part II: compare rest of needle (first 2 characters have been compared already)
  // **************************************************************************************************

  // Compare the rest
  //36 if needlecntval==0, else 37:
  bind(L_Comp2);
  addi(addr, addr, 2); // First comparison has failed, 2nd one hit.
  bind(L_Comp1);       // Addr points to possible needle start.
  bind(L_Comp3);       // Could have created a copy and used a different return address, but we save code size here.
  if (needlecntval != 2) { // Const needlecnt==2?
    if (needlecntval != 3) {
      if (needlecntval == 0) beq(CCR6, L_Found); // Variable needlecnt==2?
      Register ind_reg = tmp4;
      li(ind_reg, 2*2); // First 2 characters are already compared, use index 2.
      mtctr(needlecnt); // Decremented by 2, still > 0.
      //40:
      Label L_CompLoop;
      bind(L_CompLoop);
      lhzx(ch2, needle, ind_reg);
      lhzx(ch1, addr, ind_reg);
      cmpw(CCR1, ch1, ch2);
      bne(CCR1, L_OuterLoop);
      addi(ind_reg, ind_reg, 2);
      bdnz(L_CompLoop);
    } else { // No loop required if there's only one needle character left.
      lhz(ch2, 2*2, needle);
      lhz(ch1, 2*2, addr);
      cmpw(CCR1, ch1, ch2);
      bne(CCR1, L_OuterLoop);
    }
  }
  // Return index ...
  //46:
  bind(L_Found);
  subf(addr, haystack, addr); // relative to haystack, ...
  srdi(result, addr, 1);      // in characters.
  //48:
  bind(L_End);
}

// Implementation of Compare for jchar arrays.
//
// Kills the registers str1, str2, cnt1, cnt2.
// Kills cr0, ctr.
// Assumes that result differs from the input registers.
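//
// Implementation note (informal): str2 is never advanced directly. The code
// below computes addr_diff = str2 - str1 once and loads str2's characters
// with indexed loads off str1 (e.g. lhzx(chr2, str1, addr_diff)), so only
// str1 needs to be incremented inside the compare loops.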
3976 void MacroAssembler::string_compare(Register str1_reg, Register str2_reg, Register cnt1_reg, Register cnt2_reg, 3977 Register result_reg, Register tmp_reg) { 3978 assert_different_registers(result_reg, str1_reg, str2_reg, cnt1_reg, cnt2_reg, tmp_reg); 3979 3980 Label Ldone, Lslow_case, Lslow_loop, Lfast_loop; 3981 Register cnt_diff = R0, 3982 limit_reg = cnt1_reg, 3983 chr1_reg = result_reg, 3984 chr2_reg = cnt2_reg, 3985 addr_diff = str2_reg; 3986 3987 // 'cnt_reg' contains the number of characters in the string's character array for the 3988 // pre-CompactStrings strings implementation and the number of bytes in the string's 3989 // byte array for the CompactStrings strings implementation. 3990 const int HAS_COMPACT_STRING = java_lang_String::has_coder_field() ? 1 : 0; // '1' = byte array, '0' = char array 3991 3992 // Offset 0 should be 32 byte aligned. 3993 //-6: 3994 srawi(cnt1_reg, cnt1_reg, HAS_COMPACT_STRING); 3995 srawi(cnt2_reg, cnt2_reg, HAS_COMPACT_STRING); 3996 //-4: 3997 dcbtct(str1_reg, 0x00); // Indicate R/O access to str1. 3998 dcbtct(str2_reg, 0x00); // Indicate R/O access to str2. 3999 //-2: 4000 // Compute min(cnt1, cnt2) and check if 0 (bail out if we don't need to compare characters). 4001 subf(result_reg, cnt2_reg, cnt1_reg); // difference between cnt1/2 4002 subf_(addr_diff, str1_reg, str2_reg); // alias? 4003 beq(CCR0, Ldone); // return cnt difference if both ones are identical 4004 srawi(limit_reg, result_reg, 31); // generate signmask (cnt1/2 must be non-negative so cnt_diff can't overflow) 4005 mr(cnt_diff, result_reg); 4006 andr(limit_reg, result_reg, limit_reg); // difference or zero (negative): cnt1<cnt2 ? cnt1-cnt2 : 0 4007 add_(limit_reg, cnt2_reg, limit_reg); // min(cnt1, cnt2)==0? 4008 beq(CCR0, Ldone); // return cnt difference if one has 0 length 4009 4010 lhz(chr1_reg, 0, str1_reg); // optional: early out if first characters mismatch 4011 lhzx(chr2_reg, str1_reg, addr_diff); // optional: early out if first characters mismatch 4012 addi(tmp_reg, limit_reg, -1); // min(cnt1, cnt2)-1 4013 subf_(result_reg, chr2_reg, chr1_reg); // optional: early out if first characters mismatch 4014 bne(CCR0, Ldone); // optional: early out if first characters mismatch 4015 4016 // Set loop counter by scaling down tmp_reg 4017 srawi_(chr2_reg, tmp_reg, exact_log2(4)); // (min(cnt1, cnt2)-1)/4 4018 ble(CCR0, Lslow_case); // need >4 characters for fast loop 4019 andi(limit_reg, tmp_reg, 4-1); // remaining characters 4020 4021 // Adapt str1_reg str2_reg for the first loop iteration 4022 mtctr(chr2_reg); // (min(cnt1, cnt2)-1)/4 4023 addi(limit_reg, limit_reg, 4+1); // compare last 5-8 characters in slow_case if mismatch found in fast_loop 4024 //16: 4025 // Compare the rest of the characters 4026 bind(Lfast_loop); 4027 ld(chr1_reg, 0, str1_reg); 4028 ldx(chr2_reg, str1_reg, addr_diff); 4029 cmpd(CCR0, chr2_reg, chr1_reg); 4030 bne(CCR0, Lslow_case); // return chr1_reg 4031 addi(str1_reg, str1_reg, 4*2); 4032 bdnz(Lfast_loop); 4033 addi(limit_reg, limit_reg, -4); // no mismatch found in fast_loop, only 1-4 characters missing 4034 //23: 4035 bind(Lslow_case); 4036 mtctr(limit_reg); 4037 //24: 4038 bind(Lslow_loop); 4039 lhz(chr1_reg, 0, str1_reg); 4040 lhzx(chr2_reg, str1_reg, addr_diff); 4041 subf_(result_reg, chr2_reg, chr1_reg); 4042 bne(CCR0, Ldone); // return chr1_reg 4043 addi(str1_reg, str1_reg, 1*2); 4044 bdnz(Lslow_loop); 4045 //30: 4046 // If strings are equal up to min length, return the length difference. 
4047 mr(result_reg, cnt_diff); 4048 nop(); // alignment 4049 //32: 4050 // Otherwise, return the difference between the first mismatched chars. 4051 bind(Ldone); 4052 } 4053 4054 4055 // Compare char[] arrays. 4056 // 4057 // str1_reg USE only 4058 // str2_reg USE only 4059 // cnt_reg USE_DEF, due to tmp reg shortage 4060 // result_reg DEF only, might compromise USE only registers 4061 void MacroAssembler::char_arrays_equals(Register str1_reg, Register str2_reg, Register cnt_reg, Register result_reg, 4062 Register tmp1_reg, Register tmp2_reg, Register tmp3_reg, Register tmp4_reg, 4063 Register tmp5_reg) { 4064 4065 // Str1 may be the same register as str2 which can occur e.g. after scalar replacement. 4066 assert_different_registers(result_reg, str1_reg, cnt_reg, tmp1_reg, tmp2_reg, tmp3_reg, tmp4_reg, tmp5_reg); 4067 assert_different_registers(result_reg, str2_reg, cnt_reg, tmp1_reg, tmp2_reg, tmp3_reg, tmp4_reg, tmp5_reg); 4068 4069 // Offset 0 should be 32 byte aligned. 4070 Label Linit_cbc, Lcbc, Lloop, Ldone_true, Ldone_false; 4071 Register index_reg = tmp5_reg; 4072 Register cbc_iter = tmp4_reg; 4073 4074 // 'cnt_reg' contains the number of characters in the string's character array for the 4075 // pre-CompactStrings strings implementation and the number of bytes in the string's 4076 // byte array for the CompactStrings strings implementation. 4077 const int HAS_COMPACT_STRING = java_lang_String::has_coder_field() ? 1 : 0; // '1' = byte array, '0' = char array 4078 4079 //-1: 4080 dcbtct(str1_reg, 0x00); // Indicate R/O access to str1. 4081 dcbtct(str2_reg, 0x00); // Indicate R/O access to str2. 4082 //1: 4083 // cbc_iter: remaining characters after the '4 java characters per iteration' loop. 4084 rlwinm(cbc_iter, cnt_reg, 32 - HAS_COMPACT_STRING, 30, 31); // (cnt_reg % (HAS_COMPACT_STRING ? 8 : 4)) >> HAS_COMPACT_STRING 4085 li(index_reg, 0); // init 4086 li(result_reg, 0); // assume false 4087 // tmp2_reg: units of 4 java characters (i.e. 8 bytes) per iteration (main loop). 4088 srwi_(tmp2_reg, cnt_reg, exact_log2(4 << HAS_COMPACT_STRING)); // cnt_reg / (HAS_COMPACT_STRING ? 8 : 4) 4089 4090 cmpwi(CCR1, cbc_iter, 0); // CCR1 = (cbc_iter==0) 4091 beq(CCR0, Linit_cbc); // too short 4092 mtctr(tmp2_reg); 4093 //8: 4094 bind(Lloop); 4095 ldx(tmp1_reg, str1_reg, index_reg); 4096 ldx(tmp2_reg, str2_reg, index_reg); 4097 cmpd(CCR0, tmp1_reg, tmp2_reg); 4098 bne(CCR0, Ldone_false); // Unequal char pair found -> done. 4099 addi(index_reg, index_reg, 4*sizeof(jchar)); 4100 bdnz(Lloop); 4101 //14: 4102 bind(Linit_cbc); 4103 beq(CCR1, Ldone_true); 4104 mtctr(cbc_iter); 4105 //16: 4106 bind(Lcbc); 4107 lhzx(tmp1_reg, str1_reg, index_reg); 4108 lhzx(tmp2_reg, str2_reg, index_reg); 4109 cmpw(CCR0, tmp1_reg, tmp2_reg); 4110 bne(CCR0, Ldone_false); // Unequal char pair found -> done. 4111 addi(index_reg, index_reg, 1*sizeof(jchar)); 4112 bdnz(Lcbc); 4113 nop(); 4114 bind(Ldone_true); 4115 li(result_reg, 1); 4116 //24: 4117 bind(Ldone_false); 4118 } 4119 4120 4121 void MacroAssembler::char_arrays_equalsImm(Register str1_reg, Register str2_reg, int cntval, Register result_reg, 4122 Register tmp1_reg, Register tmp2_reg) { 4123 // Str1 may be the same register as str2 which can occur e.g. after scalar replacement. 
4124 assert_different_registers(result_reg, str1_reg, tmp1_reg, tmp2_reg); 4125 assert_different_registers(result_reg, str2_reg, tmp1_reg, tmp2_reg); 4126 assert(sizeof(jchar) == 2, "must be"); 4127 assert(cntval >= 0 && ((cntval & 0x7fff) == cntval), "wrong immediate"); 4128 4129 // 'cntval' contains the number of characters in the string's character array for the 4130 // pre-CompactStrings strings implementation and the number of bytes in the string's 4131 // byte array for the CompactStrings strings implementation. 4132 cntval >>= (java_lang_String::has_coder_field() ? 1 : 0); // '1' = byte array strings, '0' = char array strings 4133 4134 Label Ldone_false; 4135 4136 if (cntval < 16) { // short case 4137 if (cntval != 0) li(result_reg, 0); // assume false 4138 4139 const int num_bytes = cntval*sizeof(jchar); 4140 int index = 0; 4141 for (int next_index; (next_index = index + 8) <= num_bytes; index = next_index) { 4142 ld(tmp1_reg, index, str1_reg); 4143 ld(tmp2_reg, index, str2_reg); 4144 cmpd(CCR0, tmp1_reg, tmp2_reg); 4145 bne(CCR0, Ldone_false); 4146 } 4147 if (cntval & 2) { 4148 lwz(tmp1_reg, index, str1_reg); 4149 lwz(tmp2_reg, index, str2_reg); 4150 cmpw(CCR0, tmp1_reg, tmp2_reg); 4151 bne(CCR0, Ldone_false); 4152 index += 4; 4153 } 4154 if (cntval & 1) { 4155 lhz(tmp1_reg, index, str1_reg); 4156 lhz(tmp2_reg, index, str2_reg); 4157 cmpw(CCR0, tmp1_reg, tmp2_reg); 4158 bne(CCR0, Ldone_false); 4159 } 4160 // fallthrough: true 4161 } else { 4162 Label Lloop; 4163 Register index_reg = tmp1_reg; 4164 const int loopcnt = cntval/4; 4165 assert(loopcnt > 0, "must be"); 4166 // Offset 0 should be 32 byte aligned. 4167 //2: 4168 dcbtct(str1_reg, 0x00); // Indicate R/O access to str1. 4169 dcbtct(str2_reg, 0x00); // Indicate R/O access to str2. 4170 li(tmp2_reg, loopcnt); 4171 li(index_reg, 0); // init 4172 li(result_reg, 0); // assume false 4173 mtctr(tmp2_reg); 4174 //8: 4175 bind(Lloop); 4176 ldx(R0, str1_reg, index_reg); 4177 ldx(tmp2_reg, str2_reg, index_reg); 4178 cmpd(CCR0, R0, tmp2_reg); 4179 bne(CCR0, Ldone_false); // Unequal char pair found -> done. 4180 addi(index_reg, index_reg, 4*sizeof(jchar)); 4181 bdnz(Lloop); 4182 //14: 4183 if (cntval & 2) { 4184 lwzx(R0, str1_reg, index_reg); 4185 lwzx(tmp2_reg, str2_reg, index_reg); 4186 cmpw(CCR0, R0, tmp2_reg); 4187 bne(CCR0, Ldone_false); 4188 if (cntval & 1) addi(index_reg, index_reg, 2*sizeof(jchar)); 4189 } 4190 if (cntval & 1) { 4191 lhzx(R0, str1_reg, index_reg); 4192 lhzx(tmp2_reg, str2_reg, index_reg); 4193 cmpw(CCR0, R0, tmp2_reg); 4194 bne(CCR0, Ldone_false); 4195 } 4196 // fallthru: true 4197 } 4198 li(result_reg, 1); 4199 bind(Ldone_false); 4200 } 4201 4202 #endif // Compiler2 4203 4204 // Helpers for Intrinsic Emitters 4205 // 4206 // Revert the byte order of a 32bit value in a register 4207 // src: 0x44556677 4208 // dst: 0x77665544 4209 // Three steps to obtain the result: 4210 // 1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word 4211 // into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared. 4212 // This value initializes dst. 4213 // 2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost 4214 // byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go. 4215 // This value is mask inserted into dst with a [0..23] mask of 1s. 4216 // 3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position. 
4217 // This value is mask inserted into dst with a [8..15] mask of 1s. 4218 void MacroAssembler::load_reverse_32(Register dst, Register src) { 4219 assert_different_registers(dst, src); 4220 4221 rldicl(dst, src, (4+1)*8, 56); // Rotate byte 4 into position 7 (rightmost), clear all to the left. 4222 rlwimi(dst, src, 3*8, 0, 23); // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone. 4223 rlwimi(dst, src, 1*8, 8, 15); // Insert byte 6 into position 5, leave the rest alone. 4224 } 4225 4226 // Calculate the column addresses of the crc32 lookup table into distinct registers. 4227 // This loop-invariant calculation is moved out of the loop body, reducing the loop 4228 // body size from 20 to 16 instructions. 4229 // Returns the offset that was used to calculate the address of column tc3. 4230 // Due to register shortage, setting tc3 may overwrite table. With the return offset 4231 // at hand, the original table address can be easily reconstructed. 4232 int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) { 4233 4234 #ifdef VM_LITTLE_ENDIAN 4235 // This is what we implement (the DOLIT4 part): 4236 // ========================================================================= */ 4237 // #define DOLIT4 c ^= *buf4++; \ 4238 // c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^ \ 4239 // crc_table[1][(c >> 16) & 0xff] ^ crc_table[0][c >> 24] 4240 // #define DOLIT32 DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4 4241 // ========================================================================= */ 4242 const int ix0 = 3*(4*CRC32_COLUMN_SIZE); 4243 const int ix1 = 2*(4*CRC32_COLUMN_SIZE); 4244 const int ix2 = 1*(4*CRC32_COLUMN_SIZE); 4245 const int ix3 = 0*(4*CRC32_COLUMN_SIZE); 4246 #else 4247 // This is what we implement (the DOBIG4 part): 4248 // ========================================================================= 4249 // #define DOBIG4 c ^= *++buf4; \ 4250 // c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \ 4251 // crc_table[6][(c >> 16) & 0xff] ^ crc_table[7][c >> 24] 4252 // #define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4 4253 // ========================================================================= 4254 const int ix0 = 4*(4*CRC32_COLUMN_SIZE); 4255 const int ix1 = 5*(4*CRC32_COLUMN_SIZE); 4256 const int ix2 = 6*(4*CRC32_COLUMN_SIZE); 4257 const int ix3 = 7*(4*CRC32_COLUMN_SIZE); 4258 #endif 4259 assert_different_registers(table, tc0, tc1, tc2); 4260 assert(table == tc3, "must be!"); 4261 4262 addi(tc0, table, ix0); 4263 addi(tc1, table, ix1); 4264 addi(tc2, table, ix2); 4265 if (ix3 != 0) addi(tc3, table, ix3); 4266 4267 return ix3; 4268 } 4269 4270 /** 4271 * uint32_t crc; 4272 * timesXtoThe32[crc & 0xFF] ^ (crc >> 8); 4273 */ 4274 void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) { 4275 assert_different_registers(crc, table, tmp); 4276 assert_different_registers(val, table); 4277 4278 if (crc == val) { // Must rotate first to use the unmodified value. 4279 rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest. 4280 // As we use a word (4-byte) instruction, we have to adapt the mask bit positions. 4281 srwi(crc, crc, 8); // Unsigned shift, clear leftmost 8 bits. 4282 } else { 4283 srwi(crc, crc, 8); // Unsigned shift, clear leftmost 8 bits. 
    rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
  }
  lwzx(tmp, table, tmp);
  xorr(crc, crc, tmp);
}

/**
 * uint32_t crc;
 * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
 */
void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
  fold_byte_crc32(crc, crc, table, tmp);
}

/**
 * Emits code to update CRC-32 with a byte value according to constants in table.
 *
 * @param [in,out]crc Register containing the crc.
 * @param [in]val     Register containing the byte to fold into the CRC.
 * @param [in]table   Register containing the table of crc constants.
 *
 * uint32_t crc;
 * val = crc_table[(val ^ crc) & 0xFF];
 * crc = val ^ (crc >> 8);
 */
void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
  BLOCK_COMMENT("update_byte_crc32:");
  xorr(val, val, crc);
  fold_byte_crc32(crc, val, table, val);
}

/**
 * @param crc   register containing existing CRC (32-bit)
 * @param buf   register pointing to input byte buffer (byte*)
 * @param len   register containing number of bytes
 * @param table register pointing to CRC table
 */
void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table,
                                           Register data, bool loopAlignment, bool invertCRC) {
  assert_different_registers(crc, buf, len, table, data);

  Label L_mainLoop, L_done;
  const int mainLoop_stepping  = 1;
  const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4;

  // Process all bytes in a single-byte loop.
  clrldi_(len, len, 32); // Enforce 32 bit. Anything to do?
  beq(CCR0, L_done);

  if (invertCRC) {
    nand(crc, crc, crc); // ~c
  }

  mtctr(len);
  align(mainLoop_alignment);
  BIND(L_mainLoop);
  lbz(data, 0, buf);                 // Byte from buffer, zero-extended.
  addi(buf, buf, mainLoop_stepping); // Advance buffer position.
  update_byte_crc32(crc, data, table);
  bdnz(L_mainLoop);                  // Iterate.

  if (invertCRC) {
    nand(crc, crc, crc); // ~c
  }

  bind(L_done);
}

/**
 * Emits code to update CRC-32 with a 4-byte value according to constants in table.
 * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c.
 */
// A note on the lookup table address(es):
// The lookup table consists of two sets of four columns each.
// The columns {0..3} are used for little-endian machines.
// The columns {4..7} are used for big-endian machines.
// To save the effort of adding the column offset to the table address each time
// a table element is looked up, it is possible to pass the pre-calculated
// column addresses.
// Uses R9..R12 as work registers. They must be saved/restored by the caller, if necessary.
void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc,
                                        Register t0, Register t1, Register t2, Register t3,
                                        Register tc0, Register tc1, Register tc2, Register tc3) {
  assert_different_registers(crc, t3);

  // XOR crc with next four bytes of buffer.
  lwz(t3, bufDisp, buf);
  if (bufInc != 0) {
    addi(buf, buf, bufInc);
  }
  xorr(t3, t3, crc);

  // Chop crc into 4 single-byte pieces, shifted left 2 bits, to form the table indices.
  rlwinm(t0, t3, 2,         24-2, 31-2); // ((t3 >>  0) & 0xff) << 2
  rlwinm(t1, t3, 32+(2- 8), 24-2, 31-2); // ((t3 >>  8) & 0xff) << 2
  rlwinm(t2, t3, 32+(2-16), 24-2, 31-2); // ((t3 >> 16) & 0xff) << 2
  rlwinm(t3, t3, 32+(2-24), 24-2, 31-2); // ((t3 >> 24) & 0xff) << 2

  // Use the pre-calculated column addresses.
  // Load pre-calculated table values.
  lwzx(t0, tc0, t0);
  lwzx(t1, tc1, t1);
  lwzx(t2, tc2, t2);
  lwzx(t3, tc3, t3);

  // Calculate new crc from table values.
  xorr(t0, t0, t1);
  xorr(t2, t2, t3);
  xorr(crc, t0, t2); // Now crc contains the final checksum value.
}

/**
 * @param crc   register containing existing CRC (32-bit)
 * @param buf   register pointing to input byte buffer (byte*)
 * @param len   register containing number of bytes
 * @param table register pointing to CRC table
 *
 * Uses R9..R12 as work registers. They must be saved/restored by the caller!
 */
void MacroAssembler::kernel_crc32_2word(Register crc, Register buf, Register len, Register table,
                                        Register t0, Register t1, Register t2, Register t3,
                                        Register tc0, Register tc1, Register tc2, Register tc3) {
  assert_different_registers(crc, buf, len, table);

  Label L_mainLoop, L_tail;
  Register tmp  = t0;
  Register data = t0;
  Register tmp2 = t1;
  const int mainLoop_stepping  = 8;
  const int tailLoop_stepping  = 1;
  const int log_stepping       = exact_log2(mainLoop_stepping);
  const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
  const int complexThreshold   = 2*mainLoop_stepping;

  // Don't test for len <= 0 here. This pathological case should not occur anyway.
  // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles.
  // The situation itself is detected and handled correctly by the conditional branches
  // following the len adjustments (len -= stepping and len += stepping).
  assert(tailLoop_stepping == 1, "check tailLoop_stepping!");

  BLOCK_COMMENT("kernel_crc32_2word {");

  nand(crc, crc, crc); // ~c

  // Check for short (<mainLoop_stepping) buffer.
  cmpdi(CCR0, len, complexThreshold);
  blt(CCR0, L_tail);

  // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
  // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
  {
    // Align buf addr to mainLoop_stepping boundary.
    neg(tmp2, buf); // Calculate # preLoop iterations for alignment.
    rldicl(tmp2, tmp2, 0, 64-log_stepping); // Keep only the low log_stepping bits, i.e. tmp2 &= mainLoop_stepping-1.

    if (complexThreshold > mainLoop_stepping) {
      sub(len, len, tmp2); // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
    } else {
      sub(tmp, len, tmp2); // Remaining bytes for main loop.
      cmpdi(CCR0, tmp, mainLoop_stepping);
      blt(CCR0, L_tail);   // For less than one mainLoop_stepping left, do only tail processing.
      mr(len, tmp);        // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4446     }
4447     update_byteLoop_crc32(crc, buf, tmp2, table, data, false, false);
4448   }
4449 
4450   srdi(tmp2, len, log_stepping);           // #iterations for mainLoop
4451   andi(len, len, mainLoop_stepping-1);     // Remaining bytes for tailLoop.
4452   mtctr(tmp2);
4453 
4454 #ifdef VM_LITTLE_ENDIAN
4455   Register crc_rv = crc;
4456 #else
4457   Register crc_rv = tmp;                   // Load_reverse needs separate registers to work on.
4458                                            // Occupies tmp, but frees up crc.
4459   load_reverse_32(crc_rv, crc);            // Reverse byte order because we are dealing with big-endian data.
4460   tmp = crc;
4461 #endif
4462 
4463   int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
4464 
4465   align(mainLoop_alignment);               // Octoword-aligned loop address. Shows 2% improvement.
4466   BIND(L_mainLoop);
4467     update_1word_crc32(crc_rv, buf, table, 0, 0, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
4468     update_1word_crc32(crc_rv, buf, table, 4, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
4469     bdnz(L_mainLoop);
4470 
4471 #ifndef VM_LITTLE_ENDIAN
4472   load_reverse_32(crc, crc_rv);            // Reverse byte order because we are dealing with big-endian data.
4473   tmp = crc_rv;                            // tmp uses its original register again.
4474 #endif
4475 
4476   // Restore original table address for tailLoop.
4477   if (reconstructTableOffset != 0) {
4478     addi(table, table, -reconstructTableOffset);
4479   }
4480 
4481   // Process last few (<complexThreshold) bytes of buffer.
4482   BIND(L_tail);
4483   update_byteLoop_crc32(crc, buf, len, table, data, false, false);
4484 
4485   nand(crc, crc, crc);                     // ~c
4486   BLOCK_COMMENT("} kernel_crc32_2word");
4487 }
4488 
4489 /**
4490  * @param crc   register containing existing CRC (32-bit)
4491  * @param buf   register pointing to input byte buffer (byte*)
4492  * @param len   register containing number of bytes
4493  * @param table register pointing to CRC table
4494  *
4495  * Uses R9..R12 as work registers. Must be saved/restored by caller!
4496  */
4497 void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table,
4498                                         Register t0, Register t1, Register t2, Register t3,
4499                                         Register tc0, Register tc1, Register tc2, Register tc3) {
4500   assert_different_registers(crc, buf, len, table);
4501 
4502   Label L_mainLoop, L_tail;
4503   Register tmp  = t0;
4504   Register data = t0;
4505   Register tmp2 = t1;
4506   const int mainLoop_stepping  = 4;
4507   const int tailLoop_stepping  = 1;
4508   const int log_stepping       = exact_log2(mainLoop_stepping);
4509   const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
4510   const int complexThreshold   = 2*mainLoop_stepping;
4511 
4512   // Don't test for len <= 0 here. This pathological case should not occur anyway.
4513   // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles.
4514   // The situation itself is detected and handled correctly by the conditional branches
4515   // following the adjustments of len by -/+ stepping.
4516   assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
4517 
4518   BLOCK_COMMENT("kernel_crc32_1word {");
4519 
4520   nand(crc, crc, crc);                     // ~c
4521 
4522   // Check for short (<complexThreshold) buffer.
4523   cmpdi(CCR0, len, complexThreshold);
4524   blt(CCR0, L_tail);
4525 
4526   // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
4527   // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
4528   {
4529     // Align buf addr to mainLoop_stepping boundary.
4530     neg(tmp2, buf);                        // Calculate # preLoop iterations for alignment.
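    // Worked example (hypothetical address): for buf == 0x1003 and
    // mainLoop_stepping == 4, the neg above and the rldicl below give
    // tmp2 = (-0x1003) & 0x3 = 1, i.e. one pre-loop byte aligns buf to 0x1004.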
4531     rldicl(tmp2, tmp2, 0, 64-log_stepping); // Rotate by 0 bits, AND with a mask keeping only the low log_stepping bits (here 62..63): tmp2 = (-buf) mod mainLoop_stepping.
4532 
4533     if (complexThreshold > mainLoop_stepping) {
4534       sub(len, len, tmp2);                 // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4535     } else {
4536       sub(tmp, len, tmp2);                 // Remaining bytes for main loop.
4537       cmpdi(CCR0, tmp, mainLoop_stepping);
4538       blt(CCR0, L_tail);                   // For less than one mainLoop_stepping left, do only tail processing.
4539       mr(len, tmp);                        // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4540     }
4541     update_byteLoop_crc32(crc, buf, tmp2, table, data, false, false);
4542   }
4543 
4544   srdi(tmp2, len, log_stepping);           // #iterations for mainLoop
4545   andi(len, len, mainLoop_stepping-1);     // Remaining bytes for tailLoop.
4546   mtctr(tmp2);
4547 
4548 #ifdef VM_LITTLE_ENDIAN
4549   Register crc_rv = crc;
4550 #else
4551   Register crc_rv = tmp;                   // Load_reverse needs separate registers to work on.
4552                                            // Occupies tmp, but frees up crc.
4553   load_reverse_32(crc_rv, crc);            // Reverse byte order because we are dealing with big-endian data.
4554   tmp = crc;
4555 #endif
4556 
4557   int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
4558 
4559   align(mainLoop_alignment);               // Octoword-aligned loop address. Shows 2% improvement.
4560   BIND(L_mainLoop);
4561     update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
4562     bdnz(L_mainLoop);
4563 
4564 #ifndef VM_LITTLE_ENDIAN
4565   load_reverse_32(crc, crc_rv);            // Reverse byte order because we are dealing with big-endian data.
4566   tmp = crc_rv;                            // tmp uses its original register again.
4567 #endif
4568 
4569   // Restore original table address for tailLoop.
4570   if (reconstructTableOffset != 0) {
4571     addi(table, table, -reconstructTableOffset);
4572   }
4573 
4574   // Process last few (<complexThreshold) bytes of buffer.
4575   BIND(L_tail);
4576   update_byteLoop_crc32(crc, buf, len, table, data, false, false);
4577 
4578   nand(crc, crc, crc);                     // ~c
4579   BLOCK_COMMENT("} kernel_crc32_1word");
4580 }
4581 
4582 /**
4583  * @param crc   register containing existing CRC (32-bit)
4584  * @param buf   register pointing to input byte buffer (byte*)
4585  * @param len   register containing number of bytes
4586  * @param table register pointing to CRC table
4587  *
4588  * Uses R7_ARG5, R8_ARG6 as work registers.
4589  */
4590 void MacroAssembler::kernel_crc32_1byte(Register crc, Register buf, Register len, Register table,
4591                                         Register t0, Register t1, Register t2, Register t3) {
4592   assert_different_registers(crc, buf, len, table);
4593 
4594   Register data = t0;                      // Holds the current byte to be folded into crc.
4595 
4596   BLOCK_COMMENT("kernel_crc32_1byte {");
4597 
4598   // Process all bytes in a single-byte loop.
4599   update_byteLoop_crc32(crc, buf, len, table, data, true, true);
4600 
4601   BLOCK_COMMENT("} kernel_crc32_1byte");
4602 }
4603 
4604 void MacroAssembler::kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp) {
4605   assert_different_registers(crc, buf, /* len, not used!! */ table, tmp);
4606 
4607   BLOCK_COMMENT("kernel_crc32_singleByte:");
4608   nand(crc, crc, crc);                     // ~c
4609 
4610   lbz(tmp, 0, buf);                        // Byte from buffer, zero-extended.
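  // Reference C sketch of the single-byte update emitted below (zlib-style;
  // crc was already inverted above):
  //   crc = table[(crc ^ byte) & 0xff] ^ (crc >> 8);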
4611 update_byte_crc32(crc, tmp, table); 4612 4613 nand(crc, crc, crc); // ~c 4614 } 4615 4616 // dest_lo += src1 + src2 4617 // dest_hi += carry1 + carry2 4618 void MacroAssembler::add2_with_carry(Register dest_hi, 4619 Register dest_lo, 4620 Register src1, Register src2) { 4621 li(R0, 0); 4622 addc(dest_lo, dest_lo, src1); 4623 adde(dest_hi, dest_hi, R0); 4624 addc(dest_lo, dest_lo, src2); 4625 adde(dest_hi, dest_hi, R0); 4626 } 4627 4628 // Multiply 64 bit by 64 bit first loop. 4629 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, 4630 Register x_xstart, 4631 Register y, Register y_idx, 4632 Register z, 4633 Register carry, 4634 Register product_high, Register product, 4635 Register idx, Register kdx, 4636 Register tmp) { 4637 // jlong carry, x[], y[], z[]; 4638 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) { 4639 // huge_128 product = y[idx] * x[xstart] + carry; 4640 // z[kdx] = (jlong)product; 4641 // carry = (jlong)(product >>> 64); 4642 // } 4643 // z[xstart] = carry; 4644 4645 Label L_first_loop, L_first_loop_exit; 4646 Label L_one_x, L_one_y, L_multiply; 4647 4648 addic_(xstart, xstart, -1); 4649 blt(CCR0, L_one_x); // Special case: length of x is 1. 4650 4651 // Load next two integers of x. 4652 sldi(tmp, xstart, LogBytesPerInt); 4653 ldx(x_xstart, x, tmp); 4654 #ifdef VM_LITTLE_ENDIAN 4655 rldicl(x_xstart, x_xstart, 32, 0); 4656 #endif 4657 4658 align(32, 16); 4659 bind(L_first_loop); 4660 4661 cmpdi(CCR0, idx, 1); 4662 blt(CCR0, L_first_loop_exit); 4663 addi(idx, idx, -2); 4664 beq(CCR0, L_one_y); 4665 4666 // Load next two integers of y. 4667 sldi(tmp, idx, LogBytesPerInt); 4668 ldx(y_idx, y, tmp); 4669 #ifdef VM_LITTLE_ENDIAN 4670 rldicl(y_idx, y_idx, 32, 0); 4671 #endif 4672 4673 4674 bind(L_multiply); 4675 multiply64(product_high, product, x_xstart, y_idx); 4676 4677 li(tmp, 0); 4678 addc(product, product, carry); // Add carry to result. 4679 adde(product_high, product_high, tmp); // Add carry of the last addition. 4680 addi(kdx, kdx, -2); 4681 4682 // Store result. 4683 #ifdef VM_LITTLE_ENDIAN 4684 rldicl(product, product, 32, 0); 4685 #endif 4686 sldi(tmp, kdx, LogBytesPerInt); 4687 stdx(product, z, tmp); 4688 mr_if_needed(carry, product_high); 4689 b(L_first_loop); 4690 4691 4692 bind(L_one_y); // Load one 32 bit portion of y as (0,value). 4693 4694 lwz(y_idx, 0, y); 4695 b(L_multiply); 4696 4697 4698 bind(L_one_x); // Load one 32 bit portion of x as (0,value). 4699 4700 lwz(x_xstart, 0, x); 4701 b(L_first_loop); 4702 4703 bind(L_first_loop_exit); 4704 } 4705 4706 // Multiply 64 bit by 64 bit and add 128 bit. 
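// A hedged reference sketch of one call to the routine below, written with a
// compiler-provided 128-bit type (endian swizzling of the 32-bit halves
// omitted); the caller feeds product_high back in as the next carry:
//   unsigned __int128 acc = (unsigned __int128)x_xstart * y[idx] + z[idx] + carry;
//   z[idx] = (uint64_t)acc;           // 'product': low 64 bits, stored back
//   carry  = (uint64_t)(acc >> 64);   // 'product_high': high 64 bits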
4707 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y, 4708 Register z, Register yz_idx, 4709 Register idx, Register carry, 4710 Register product_high, Register product, 4711 Register tmp, int offset) { 4712 4713 // huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry; 4714 // z[kdx] = (jlong)product; 4715 4716 sldi(tmp, idx, LogBytesPerInt); 4717 if (offset) { 4718 addi(tmp, tmp, offset); 4719 } 4720 ldx(yz_idx, y, tmp); 4721 #ifdef VM_LITTLE_ENDIAN 4722 rldicl(yz_idx, yz_idx, 32, 0); 4723 #endif 4724 4725 multiply64(product_high, product, x_xstart, yz_idx); 4726 ldx(yz_idx, z, tmp); 4727 #ifdef VM_LITTLE_ENDIAN 4728 rldicl(yz_idx, yz_idx, 32, 0); 4729 #endif 4730 4731 add2_with_carry(product_high, product, carry, yz_idx); 4732 4733 sldi(tmp, idx, LogBytesPerInt); 4734 if (offset) { 4735 addi(tmp, tmp, offset); 4736 } 4737 #ifdef VM_LITTLE_ENDIAN 4738 rldicl(product, product, 32, 0); 4739 #endif 4740 stdx(product, z, tmp); 4741 } 4742 4743 // Multiply 128 bit by 128 bit. Unrolled inner loop. 4744 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart, 4745 Register y, Register z, 4746 Register yz_idx, Register idx, Register carry, 4747 Register product_high, Register product, 4748 Register carry2, Register tmp) { 4749 4750 // jlong carry, x[], y[], z[]; 4751 // int kdx = ystart+1; 4752 // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop 4753 // huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry; 4754 // z[kdx+idx+1] = (jlong)product; 4755 // jlong carry2 = (jlong)(product >>> 64); 4756 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry2; 4757 // z[kdx+idx] = (jlong)product; 4758 // carry = (jlong)(product >>> 64); 4759 // } 4760 // idx += 2; 4761 // if (idx > 0) { 4762 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry; 4763 // z[kdx+idx] = (jlong)product; 4764 // carry = (jlong)(product >>> 64); 4765 // } 4766 4767 Label L_third_loop, L_third_loop_exit, L_post_third_loop_done; 4768 const Register jdx = R0; 4769 4770 // Scale the index. 4771 srdi_(jdx, idx, 2); 4772 beq(CCR0, L_third_loop_exit); 4773 mtctr(jdx); 4774 4775 align(32, 16); 4776 bind(L_third_loop); 4777 4778 addi(idx, idx, -4); 4779 4780 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8); 4781 mr_if_needed(carry2, product_high); 4782 4783 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0); 4784 mr_if_needed(carry, product_high); 4785 bdnz(L_third_loop); 4786 4787 bind(L_third_loop_exit); // Handle any left-over operand parts. 
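  // At most three 32-bit words remain here (idx & 0x3). The code below first
  // folds a two-word (64-bit) chunk if at least two words are left, then a
  // final single 32-bit word if the remainder was odd.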
4788 
4789   andi_(idx, idx, 0x3);
4790   beq(CCR0, L_post_third_loop_done);
4791 
4792   Label L_check_1;
4793 
4794   addic_(idx, idx, -2);
4795   blt(CCR0, L_check_1);
4796 
4797   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0);
4798   mr_if_needed(carry, product_high);
4799 
4800   bind(L_check_1);
4801 
4802   addi(idx, idx, 0x2);
4803   andi_(idx, idx, 0x1);
4804   addic_(idx, idx, -1);
4805   blt(CCR0, L_post_third_loop_done);
4806 
4807   sldi(tmp, idx, LogBytesPerInt);
4808   lwzx(yz_idx, y, tmp);
4809   multiply64(product_high, product, x_xstart, yz_idx);
4810   lwzx(yz_idx, z, tmp);
4811 
4812   add2_with_carry(product_high, product, yz_idx, carry);
4813 
4814   sldi(tmp, idx, LogBytesPerInt);
4815   stwx(product, z, tmp);
4816   srdi(product, product, 32);
4817 
4818   sldi(product_high, product_high, 32);
4819   orr(product, product, product_high);
4820   mr_if_needed(carry, product);
4821 
4822   bind(L_post_third_loop_done);
4823 } // multiply_128_x_128_loop
4824 
4825 void MacroAssembler::multiply_to_len(Register x, Register xlen,
4826                                      Register y, Register ylen,
4827                                      Register z, Register zlen,
4828                                      Register tmp1, Register tmp2,
4829                                      Register tmp3, Register tmp4,
4830                                      Register tmp5, Register tmp6,
4831                                      Register tmp7, Register tmp8,
4832                                      Register tmp9, Register tmp10,
4833                                      Register tmp11, Register tmp12,
4834                                      Register tmp13) {
4835 
4836   ShortBranchVerifier sbv(this);
4837 
4838   assert_different_registers(x, xlen, y, ylen, z, zlen,
4839                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
4840   assert_different_registers(x, xlen, y, ylen, z, zlen,
4841                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp7);
4842   assert_different_registers(x, xlen, y, ylen, z, zlen,
4843                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp8);
4844 
4845   const Register idx = tmp1;
4846   const Register kdx = tmp2;
4847   const Register xstart = tmp3;
4848 
4849   const Register y_idx = tmp4;
4850   const Register carry = tmp5;
4851   const Register product = tmp6;
4852   const Register product_high = tmp7;
4853   const Register x_xstart = tmp8;
4854   const Register tmp = tmp9;
4855 
4856   // First Loop.
4857   //
4858   // final static long LONG_MASK = 0xffffffffL;
4859   // int xstart = xlen - 1;
4860   // int ystart = ylen - 1;
4861   // long carry = 0;
4862   // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
4863   //   long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
4864   //   z[kdx] = (int)product;
4865   //   carry = product >>> 32;
4866   // }
4867   // z[xstart] = (int)carry;
4868 
4869   mr_if_needed(idx, ylen);        // idx = ylen
4870   mr_if_needed(kdx, zlen);        // kdx = xlen + ylen
4871   li(carry, 0);                   // carry = 0
4872 
4873   Label L_done;
4874 
4875   addic_(xstart, xlen, -1);
4876   blt(CCR0, L_done);
4877 
4878   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z,
4879                         carry, product_high, product, idx, kdx, tmp);
4880 
4881   Label L_second_loop;
4882 
4883   cmpdi(CCR0, kdx, 0);
4884   beq(CCR0, L_second_loop);
4885 
4886   Label L_carry;
4887 
4888   addic_(kdx, kdx, -1);
4889   beq(CCR0, L_carry);
4890 
4891   // Store lower 32 bits of carry.
4892   sldi(tmp, kdx, LogBytesPerInt);
4893   stwx(carry, z, tmp);
4894   srdi(carry, carry, 32);
4895   addi(kdx, kdx, -1);
4896 
4897 
4898   bind(L_carry);
4899 
4900   // Store upper 32 bits of carry.
4901   sldi(tmp, kdx, LogBytesPerInt);
4902   stwx(carry, z, tmp);
4903 
4904   // Second and third (nested) loops.
4905   //
4906   // for (int i = xstart-1; i >= 0; i--) { // Second loop
4907   //   carry = 0;
4908   //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
4909   //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
4910   //                    (z[k] & LONG_MASK) + carry;
4911   //     z[k] = (int)product;
4912   //     carry = product >>> 32;
4913   //   }
4914   //   z[i] = (int)carry;
4915   // }
4916   //
4917   // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = x_xstart
4918 
4919   bind(L_second_loop);
4920 
4921   li(carry, 0);                   // carry = 0;
4922 
4923   addic_(xstart, xstart, -1);     // i = xstart-1;
4924   blt(CCR0, L_done);
4925 
4926   Register zsave = tmp10;
4927 
4928   mr(zsave, z);
4929 
4930 
4931   Label L_last_x;
4932 
4933   sldi(tmp, xstart, LogBytesPerInt);
4934   add(z, z, tmp);                 // z = z + k - j
4935   addi(z, z, 4);
4936   addic_(xstart, xstart, -1);     // i = xstart-1;
4937   blt(CCR0, L_last_x);
4938 
4939   sldi(tmp, xstart, LogBytesPerInt);
4940   ldx(x_xstart, x, tmp);
4941 #ifdef VM_LITTLE_ENDIAN
4942   rldicl(x_xstart, x_xstart, 32, 0);
4943 #endif
4944 
4945 
4946   Label L_third_loop_prologue;
4947 
4948   bind(L_third_loop_prologue);
4949 
4950   Register xsave = tmp11;
4951   Register xlensave = tmp12;
4952   Register ylensave = tmp13;
4953 
4954   mr(xsave, x);
4955   mr(xlensave, xstart);
4956   mr(ylensave, ylen);
4957 
4958 
4959   multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen,
4960                           carry, product_high, product, x, tmp);
4961 
4962   mr(z, zsave);
4963   mr(x, xsave);
4964   mr(xlen, xlensave);             // This is the decrement of the loop counter!
4965   mr(ylen, ylensave);
4966 
4967   addi(tmp3, xlen, 1);
4968   sldi(tmp, tmp3, LogBytesPerInt);
4969   stwx(carry, z, tmp);
4970   addic_(tmp3, tmp3, -1);
4971   blt(CCR0, L_done);
4972 
4973   srdi(carry, carry, 32);
4974   sldi(tmp, tmp3, LogBytesPerInt);
4975   stwx(carry, z, tmp);
4976   b(L_second_loop);
4977 
4978   // Infrequently executed code is moved out of the loops.
4979   bind(L_last_x);
4980 
4981   lwz(x_xstart, 0, x);
4982   b(L_third_loop_prologue);
4983 
4984   bind(L_done);
4985 } // multiply_to_len
4986 
4987 void MacroAssembler::asm_assert(bool check_equal, const char *msg, int id) {
4988 #ifdef ASSERT
4989   Label ok;
4990   if (check_equal) {
4991     beq(CCR0, ok);
4992   } else {
4993     bne(CCR0, ok);
4994   }
4995   stop(msg, id);
4996   bind(ok);
4997 #endif
4998 }
4999 
5000 void MacroAssembler::asm_assert_mems_zero(bool check_equal, int size, int mem_offset,
5001                                           Register mem_base, const char* msg, int id) {
5002 #ifdef ASSERT
5003   switch (size) {
5004     case 4:
5005       lwz(R0, mem_offset, mem_base);
5006       cmpwi(CCR0, R0, 0);
5007       break;
5008     case 8:
5009       ld(R0, mem_offset, mem_base);
5010       cmpdi(CCR0, R0, 0);
5011       break;
5012     default:
5013       ShouldNotReachHere();
5014   }
5015   asm_assert(check_equal, msg, id);
5016 #endif // ASSERT
5017 }
5018 
5019 void MacroAssembler::verify_thread() {
5020   if (VerifyThread) {
5021     unimplemented("'VerifyThread' currently not implemented on PPC");
5022   }
5023 }
5024 
5025 // Reads the oop; kills R0 and possibly the volatile floating-point registers.
5026 void MacroAssembler::verify_oop(Register oop, const char* msg) {
5027   if (!VerifyOops) {
5028     return;
5029   }
5030 
5031   address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
5032   const Register tmp = R11;       // Will be preserved.
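  // The volatile registers are spilled to the area just below SP; the frame
  // pushed below covers that save area so the C call cannot clobber it.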
5033   const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
5034   save_volatile_gprs(R1_SP, -nbytes_save); // except R0
5035 
5036   mr_if_needed(R4_ARG2, oop);
5037   save_LR_CR(tmp);                // save in old frame
5038   push_frame_reg_args(nbytes_save, tmp);
5039   // load FunctionDescriptor** / entry_address *
5040   load_const_optimized(tmp, fd, R0);
5041   // load FunctionDescriptor* / entry_address
5042   ld(tmp, 0, tmp);
5043   load_const_optimized(R3_ARG1, (address)msg, R0);
5044   // Call destination for its side effect.
5045   call_c(tmp);
5046 
5047   pop_frame();
5048   restore_LR_CR(tmp);
5049   restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
5050 }
5051 
5052 void MacroAssembler::verify_oop_addr(RegisterOrConstant offs, Register base, const char* msg) {
5053   if (!VerifyOops) {
5054     return;
5055   }
5056 
5057   address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
5058   const Register tmp = R11;       // Will be preserved.
5059   const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
5060   save_volatile_gprs(R1_SP, -nbytes_save); // except R0
5061 
5062   ld(R4_ARG2, offs, base);
5063   save_LR_CR(tmp);                // save in old frame
5064   push_frame_reg_args(nbytes_save, tmp);
5065   // load FunctionDescriptor** / entry_address *
5066   load_const_optimized(tmp, fd, R0);
5067   // load FunctionDescriptor* / entry_address
5068   ld(tmp, 0, tmp);
5069   load_const_optimized(R3_ARG1, (address)msg, R0);
5070   // Call destination for its side effect.
5071   call_c(tmp);
5072 
5073   pop_frame();
5074   restore_LR_CR(tmp);
5075   restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
5076 }
5077 
5078 const char* stop_types[] = {
5079   "stop",
5080   "untested",
5081   "unimplemented",
5082   "shouldnotreachhere"
5083 };
5084 
5085 static void stop_on_request(int tp, const char* msg) {
5086   tty->print("PPC assembly code requires stop: (%s) %s\n", stop_types[tp%/*stop_end*/4], msg);
5087   guarantee(false, "PPC assembly code requires stop: %s", msg);
5088 }
5089 
5090 // Call a C-function that prints output.
5091 void MacroAssembler::stop(int type, const char* msg, int id) {
5092 #ifndef PRODUCT
5093   block_comment(err_msg("stop: %s %s {", stop_types[type%stop_end], msg));
5094 #else
5095   block_comment("stop {");
5096 #endif
5097 
5098   // setup arguments
5099   load_const_optimized(R3_ARG1, type);
5100   load_const_optimized(R4_ARG2, (void *)msg, /*tmp=*/R0);
5101   call_VM_leaf(CAST_FROM_FN_PTR(address, stop_on_request), R3_ARG1, R4_ARG2);
5102   illtrap();
5103   emit_int32(id);
5104   block_comment("} stop;");
5105 }
5106 
5107 #ifndef PRODUCT
5108 // Write pattern 0x0101010101010101 in memory region [low - before*BytesPerWord, high + after*BytesPerWord].
5109 // val and addr are temp registers.
5110 // If low == addr, addr is killed.
5111 // high is preserved.
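// Hypothetical usage sketch: with before == after == 0,
//   zap_from_to(R3, 0, R4, 0, R11, R12);
// overwrites every word in [R3, R4] with the 0x01...01 pattern (only when
// ZapMemory is enabled).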
5112 void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) { 5113 if (!ZapMemory) return; 5114 5115 assert_different_registers(low, val); 5116 5117 BLOCK_COMMENT("zap memory region {"); 5118 load_const_optimized(val, 0x0101010101010101); 5119 int size = before + after; 5120 if (low == high && size < 5 && size > 0) { 5121 int offset = -before*BytesPerWord; 5122 for (int i = 0; i < size; ++i) { 5123 std(val, offset, low); 5124 offset += (1*BytesPerWord); 5125 } 5126 } else { 5127 addi(addr, low, -before*BytesPerWord); 5128 assert_different_registers(high, val); 5129 if (after) addi(high, high, after * BytesPerWord); 5130 Label loop; 5131 bind(loop); 5132 std(val, 0, addr); 5133 addi(addr, addr, 8); 5134 cmpd(CCR6, addr, high); 5135 ble(CCR6, loop); 5136 if (after) addi(high, high, -after * BytesPerWord); // Correct back to old value. 5137 } 5138 BLOCK_COMMENT("} zap memory region"); 5139 } 5140 5141 #endif // !PRODUCT 5142 5143 SkipIfEqualZero::SkipIfEqualZero(MacroAssembler* masm, Register temp, const bool* flag_addr) : _masm(masm), _label() { 5144 int simm16_offset = masm->load_const_optimized(temp, (address)flag_addr, R0, true); 5145 assert(sizeof(bool) == 1, "PowerPC ABI"); 5146 masm->lbz(temp, simm16_offset, temp); 5147 masm->cmpwi(CCR0, temp, 0); 5148 masm->beq(CCR0, _label); 5149 } 5150 5151 SkipIfEqualZero::~SkipIfEqualZero() { 5152 _masm->bind(_label); 5153 }
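// Hypothetical usage sketch (not from this file): code emitted inside the
// scope runs at run time only when the watched flag is non-zero; the
// destructor binds the skip target at the closing brace.
//   {
//     SkipIfEqualZero skip(masm, R11_scratch1, &SomeBoolFlag); // SomeBoolFlag: assumed bool
//     // ... conditionally executed code ...
//   }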