1 /*
   2  * Copyright (c) 1997, 2016, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2012, 2016 SAP SE. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.inline.hpp"
  28 #include "compiler/disassembler.hpp"
  29 #include "gc/shared/cardTableModRefBS.hpp"
  30 #include "gc/shared/collectedHeap.inline.hpp"
  31 #include "interpreter/interpreter.hpp"
  32 #include "memory/resourceArea.hpp"
  33 #include "nativeInst_ppc.hpp"
  34 #include "prims/methodHandles.hpp"
  35 #include "runtime/biasedLocking.hpp"
  36 #include "runtime/icache.hpp"
  37 #include "runtime/interfaceSupport.hpp"
  38 #include "runtime/objectMonitor.hpp"
  39 #include "runtime/os.hpp"
  40 #include "runtime/sharedRuntime.hpp"
  41 #include "runtime/stubRoutines.hpp"
  42 #include "utilities/macros.hpp"
  43 #if INCLUDE_ALL_GCS
  44 #include "gc/g1/g1CollectedHeap.inline.hpp"
  45 #include "gc/g1/g1SATBCardTableModRefBS.hpp"
  46 #include "gc/g1/heapRegion.hpp"
  47 #endif // INCLUDE_ALL_GCS
  48 #ifdef COMPILER2
  49 #include "opto/intrinsicnode.hpp"
  50 #endif
  51 
  52 #ifdef PRODUCT
  53 #define BLOCK_COMMENT(str) // nothing
  54 #else
  55 #define BLOCK_COMMENT(str) block_comment(str)
  56 #endif
  57 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  58 
  59 #ifdef ASSERT
  60 // On RISC, there's no benefit to verifying instruction boundaries.
  61 bool AbstractAssembler::pd_check_instruction_mark() { return false; }
  62 #endif
  63 
  64 void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) {
  65   assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range");
  66   if (Assembler::is_simm(si31, 16)) {
  67     ld(d, si31, a);
  68     if (emit_filler_nop) nop();
  69   } else {
  70     const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31);
  71     const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31);
  72     addis(d, a, hi);
  73     ld(d, lo, d);
  74   }
  75 }
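     // Sketch of the long form emitted above for offsets that do not fit
     // into a signed 16-bit displacement:
     //   addis d, a, hi16   // hi16 compensates for the sign of lo16
     //   ld    d, lo16(d)
     // where (hi16 << 16) + sign_extend(lo16) == si31. The optional filler
     // nop in the short form keeps both variants the same length when a
     // fixed-size sequence is needed.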
  76 
  77 void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) {
  78   assert_different_registers(d, a);
  79   ld_largeoffset_unchecked(d, si31, a, emit_filler_nop);
  80 }
  81 
  82 void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base,
  83                                       size_t size_in_bytes, bool is_signed) {
  84   switch (size_in_bytes) {
  85   case  8:              ld(dst, offs, base);                         break;
  86   case  4:  is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break;
  87   case  2:  is_signed ? lha(dst, offs, base) : lhz(dst, offs, base); break;
  88   case  1:  lbz(dst, offs, base); if (is_signed) extsb(dst, dst);    break; // lba doesn't exist :(
  89   default:  ShouldNotReachHere();
  90   }
  91 }
  92 
  93 void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base,
  94                                        size_t size_in_bytes) {
  95   switch (size_in_bytes) {
  96   case  8:  std(dst, offs, base); break;
  97   case  4:  stw(dst, offs, base); break;
  98   case  2:  sth(dst, offs, base); break;
  99   case  1:  stb(dst, offs, base); break;
 100   default:  ShouldNotReachHere();
 101   }
 102 }
 103 
 104 void MacroAssembler::align(int modulus, int max, int rem) {
 105   int padding = (rem + modulus - (offset() % modulus)) % modulus;
 106   if (padding > max) return;
 107   for (int c = (padding >> 2); c > 0; --c) { nop(); }
 108 }
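     // Example (assuming rem == 0 and a sufficiently large max): at
     // offset() == 20, align(8) computes padding == 4 and emits a single
     // nop, leaving the next instruction 8-byte aligned.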
 109 
 110 // Issue instructions that calculate given TOC from global TOC.
 111 void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16,
 112                                                        bool add_relocation, bool emit_dummy_addr) {
 113   int offset = -1;
 114   if (emit_dummy_addr) {
 115     offset = -128; // dummy address
 116   } else if (addr != (address)(intptr_t)-1) {
 117     offset = MacroAssembler::offset_to_global_toc(addr);
 118   }
 119 
 120   if (hi16) {
 121     addis(dst, R29_TOC, MacroAssembler::largeoffset_si16_si16_hi(offset));
 122   }
 123   if (lo16) {
 124     if (add_relocation) {
 125       // Relocate at the addi to avoid confusion with a load from the method's TOC.
 126       relocate(internal_word_Relocation::spec(addr));
 127     }
 128     addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset));
 129   }
 130 }
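     // Sketch of the full (hi16 && lo16) sequence emitted above:
     //   addis dst, R29_TOC, hi16(offset)
     //   addi  dst, dst,     lo16(offset)
     // which leaves dst == global_toc() + offset == addr. With
     // emit_dummy_addr the immediates encode the dummy offset -128 and are
     // fixed up later via patch_calculate_address_from_global_toc_at().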
 131 
 132 int MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) {
 133   const int offset = MacroAssembler::offset_to_global_toc(addr);
 134 
 135   const address inst2_addr = a;
 136   const int inst2 = *(int *)inst2_addr;
 137 
 138   // The relocation points to the second instruction, the addi,
 139   // and the addi reads and writes the same register dst.
 140   const int dst = inv_rt_field(inst2);
 141   assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");
 142 
 143   // Now, find the preceding addis which writes to dst.
 144   int inst1 = 0;
 145   address inst1_addr = inst2_addr - BytesPerInstWord;
 146   while (inst1_addr >= bound) {
 147     inst1 = *(int *) inst1_addr;
 148     if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
 149       // Stop, found the addis which writes dst.
 150       break;
 151     }
 152     inst1_addr -= BytesPerInstWord;
 153   }
 154 
 155   assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
 156   set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset));
 157   set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset));
 158   return (int)((intptr_t)addr - (intptr_t)inst1_addr);
 159 }
 160 
 161 address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) {
 162   const address inst2_addr = a;
 163   const int inst2 = *(int *)inst2_addr;
 164 
 165   // The relocation points to the second instruction, the addi,
 166   // and the addi reads and writes the same register dst.
 167   const int dst = inv_rt_field(inst2);
 168   assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");
 169 
 170   // Now, find the preceding addis which writes to dst.
 171   int inst1 = 0;
 172   address inst1_addr = inst2_addr - BytesPerInstWord;
 173   while (inst1_addr >= bound) {
 174     inst1 = *(int *) inst1_addr;
 175     if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
 176       // stop, found the addis which writes dst
 177       break;
 178     }
 179     inst1_addr -= BytesPerInstWord;
 180   }
 181 
 182   assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
 183 
 184   int offset = (get_imm(inst1_addr, 0) << 16) + get_imm(inst2_addr, 0);
 185   // -1 is a special case
 186   if (offset == -1) {
 187     return (address)(intptr_t)-1;
 188   } else {
 189     return global_toc() + offset;
 190   }
 191 }
 192 
 193 #ifdef _LP64
 194 // Patch compressed oops or klass constants.
 195 // Assembler sequence is
 196 // 1) compressed oops:
 197 //    lis  rx = const.hi
 198 //    ori rx = rx | const.lo
 199 // 2) compressed klass:
 200 //    lis  rx = const.hi
 201 //    clrldi rx = rx & 0xFFFFffff // clearMS32b, optional
 202 //    ori rx = rx | const.lo
 203 // The clrldi, if present, is skipped over by the patching code.
 204 int MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) {
 205   assert(UseCompressedOops, "Should only patch compressed oops");
 206 
 207   const address inst2_addr = a;
 208   const int inst2 = *(int *)inst2_addr;
 209 
 210   // The relocation points to the second instruction, the ori,
 211   // and the ori reads and writes the same register dst.
 212   const int dst = inv_rta_field(inst2);
 213   assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
 214   // Now, find the preceding addis which writes to dst.
 215   int inst1 = 0;
 216   address inst1_addr = inst2_addr - BytesPerInstWord;
 217   bool inst1_found = false;
 218   while (inst1_addr >= bound) {
 219     inst1 = *(int *)inst1_addr;
 220     if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
 221     inst1_addr -= BytesPerInstWord;
 222   }
 223   assert(inst1_found, "inst is not lis");
 224 
 225   int xc = (data >> 16) & 0xffff;
 226   int xd = (data >>  0) & 0xffff;
 227 
 228   set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo
 229   set_imm((int *)inst2_addr,        (xd)); // unsigned int
 230   return (int)((intptr_t)inst2_addr - (intptr_t)inst1_addr);
 231 }
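     // Sketch of the patched sequence (an optional clrldi in between is
     // left untouched):
     //   lis rx, xc       // high 16 bits of the narrow constant
     //   ori rx, rx, xd   // low 16 bits
     // set_imm() only rewrites the 16-bit immediate fields, so opcodes and
     // register operands of both instructions are preserved.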
 232 
 233 // Get compressed oop or klass constant.
 234 narrowOop MacroAssembler::get_narrow_oop(address a, address bound) {
 235   assert(UseCompressedOops, "Should only patch compressed oops");
 236 
 237   const address inst2_addr = a;
 238   const int inst2 = *(int *)inst2_addr;
 239 
 240   // The relocation points to the second instruction, the ori,
 241   // and the ori reads and writes the same register dst.
 242   const int dst = inv_rta_field(inst2);
 243   assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
 244   // Now, find the preceding lis which writes to dst.
 245   int inst1 = 0;
 246   address inst1_addr = inst2_addr - BytesPerInstWord;
 247   bool inst1_found = false;
 248 
 249   while (inst1_addr >= bound) {
 250     inst1 = *(int *) inst1_addr;
 251     if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break;}
 252     inst1_addr -= BytesPerInstWord;
 253   }
 254   assert(inst1_found, "inst is not lis");
 255 
 256   uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff));
 257   uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16);
 258 
 259   return (int) (xl | xh);
 260 }
 261 #endif // _LP64
 262 
 263 // Returns true if successful.
 264 bool MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a,
 265                                                 Register toc, bool fixed_size) {
 266   int toc_offset = 0;
 267   // Use RelocationHolder::none for the constant pool entry, otherwise
 268   // we will end up with a failing NativeCall::verify(x) where x is
 269   // the address of the constant pool entry.
 270   // FIXME: We should insert relocation information for oops at the constant
 271   // pool entries instead of inserting it at the loads; patching of a constant
 272   // pool entry should be less expensive.
 273   address const_address = address_constant((address)a.value(), RelocationHolder::none);
 274   if (const_address == NULL) { return false; } // allocation failure
 275   // Relocate at the pc of the load.
 276   relocate(a.rspec());
 277   toc_offset = (int)(const_address - code()->consts()->start());
 278   ld_largeoffset_unchecked(dst, toc_offset, toc, fixed_size);
 279   return true;
 280 }
 281 
 282 bool MacroAssembler::is_load_const_from_method_toc_at(address a) {
 283   const address inst1_addr = a;
 284   const int inst1 = *(int *)inst1_addr;
 285 
 286   // The relocation points to the ld or the addis.
 287   return (is_ld(inst1)) ||
 288          (is_addis(inst1) && inv_ra_field(inst1) != 0);
 289 }
 290 
 291 int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) {
 292   assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc");
 293 
 294   const address inst1_addr = a;
 295   const int inst1 = *(int *)inst1_addr;
 296 
 297   if (is_ld(inst1)) {
 298     return inv_d1_field(inst1);
 299   } else if (is_addis(inst1)) {
 300     const int dst = inv_rt_field(inst1);
 301 
 302     // Now, find the succeeding ld which reads and writes to dst.
 303     address inst2_addr = inst1_addr + BytesPerInstWord;
 304     int inst2 = 0;
 305     while (true) {
 306       inst2 = *(int *) inst2_addr;
 307       if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) {
 308         // Stop, found the ld which reads and writes dst.
 309         break;
 310       }
 311       inst2_addr += BytesPerInstWord;
 312     }
 313     return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2);
 314   }
 315   ShouldNotReachHere();
 316   return 0;
 317 }
 318 
 319 // Get the constant from a `load_const' sequence.
 320 long MacroAssembler::get_const(address a) {
 321   assert(is_load_const_at(a), "not a load of a constant");
 322   const int *p = (const int*) a;
 323   unsigned long x = (((unsigned long) (get_imm(a,0) & 0xffff)) << 48);
 324   if (is_ori(*(p+1))) {
 325     x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 32);
 326     x |= (((unsigned long) (get_imm(a,3) & 0xffff)) << 16);
 327     x |= (((unsigned long) (get_imm(a,4) & 0xffff)));
 328   } else if (is_lis(*(p+1))) {
 329     x |= (((unsigned long) (get_imm(a,2) & 0xffff)) << 32);
 330     x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 16);
 331     x |= (((unsigned long) (get_imm(a,3) & 0xffff)));
 332   } else {
 333     ShouldNotReachHere();
 334     return (long) 0;
 335   }
 336   return (long) x;
 337 }
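     // The decoding above assumes the two load_const() layouts: the
     // single-register form
     //   lis d, A; ori d, d, B; sldi d, d, 32; oris d, d, C; ori d, d, D
     // with immediate slots 0, 1, 3, 4 holding the 16-bit chunks A..D from
     // most to least significant, and the two-register form whose second
     // instruction is another lis, with slots 0, 2, 1, 3 holding the chunks
     // instead.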
 338 
 339 // Patch the 64 bit constant of a `load_const' sequence. This is a
 340 // low-level procedure: it neither flushes the instruction cache nor
 341 // is it MT-safe.
 342 void MacroAssembler::patch_const(address a, long x) {
 343   assert(is_load_const_at(a), "not a load of a constant");
 344   int *p = (int*) a;
 345   if (is_ori(*(p+1))) {
 346     set_imm(0 + p, (x >> 48) & 0xffff);
 347     set_imm(1 + p, (x >> 32) & 0xffff);
 348     set_imm(3 + p, (x >> 16) & 0xffff);
 349     set_imm(4 + p, x & 0xffff);
 350   } else if (is_lis(*(p+1))) {
 351     set_imm(0 + p, (x >> 48) & 0xffff);
 352     set_imm(2 + p, (x >> 32) & 0xffff);
 353     set_imm(1 + p, (x >> 16) & 0xffff);
 354     set_imm(3 + p, x & 0xffff);
 355   } else {
 356     ShouldNotReachHere();
 357   }
 358 }
 359 
 360 AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) {
 361   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
 362   int index = oop_recorder()->allocate_metadata_index(obj);
 363   RelocationHolder rspec = metadata_Relocation::spec(index);
 364   return AddressLiteral((address)obj, rspec);
 365 }
 366 
 367 AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) {
 368   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
 369   int index = oop_recorder()->find_index(obj);
 370   RelocationHolder rspec = metadata_Relocation::spec(index);
 371   return AddressLiteral((address)obj, rspec);
 372 }
 373 
 374 AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) {
 375   assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
 376   int oop_index = oop_recorder()->allocate_oop_index(obj);
 377   return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
 378 }
 379 
 380 AddressLiteral MacroAssembler::constant_oop_address(jobject obj) {
 381   assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
 382   int oop_index = oop_recorder()->find_index(obj);
 383   return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
 384 }
 385 
 386 RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
 387                                                       Register tmp, int offset) {
 388   intptr_t value = *delayed_value_addr;
 389   if (value != 0) {
 390     return RegisterOrConstant(value + offset);
 391   }
 392 
 393   // Load indirectly to solve generation ordering problem.
 394   // static address, no relocation
 395   int simm16_offset = load_const_optimized(tmp, delayed_value_addr, noreg, true);
 396   ld(tmp, simm16_offset, tmp); // must be aligned ((xa & 3) == 0)
 397 
 398   if (offset != 0) {
 399     addi(tmp, tmp, offset);
 400   }
 401 
 402   return RegisterOrConstant(tmp);
 403 }
 404 
 405 #ifndef PRODUCT
 406 void MacroAssembler::pd_print_patched_instruction(address branch) {
 407   Unimplemented(); // TODO: PPC port
 408 }
 409 #endif // ndef PRODUCT
 410 
 411 // Conditional far branch for destinations encodable in 24+2 bits.
 412 void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) {
 413 
 414   // If requested by flag optimize, relocate the bc_far as a
 415   // runtime_call and prepare for optimizing it when the code gets
 416   // relocated.
 417   if (optimize == bc_far_optimize_on_relocate) {
 418     relocate(relocInfo::runtime_call_type);
 419   }
 420 
 421   // variant 2:
 422   //
 423   //    b!cxx SKIP
 424   //    bxx   DEST
 425   //  SKIP:
 426   //
 427 
 428   const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
 429                                                 opposite_bcond(inv_boint_bcond(boint)));
 430 
 431   // We emit two branches.
 432   // First, a conditional branch which jumps around the far branch.
 433   const address not_taken_pc = pc() + 2 * BytesPerInstWord;
 434   const address bc_pc        = pc();
 435   bc(opposite_boint, biint, not_taken_pc);
 436 
 437   const int bc_instr = *(int*)bc_pc;
 438   assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition");
 439   assert(opposite_boint == inv_bo_field(bc_instr), "postcondition");
 440   assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))),
 441                                      opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))),
 442          "postcondition");
 443   assert(biint == inv_bi_field(bc_instr), "postcondition");
 444 
 445   // Second, an unconditional far branch which jumps to dest.
 446   // Note: target(dest) remembers the current pc (see CodeSection::target)
 447   //       and returns the current pc if the label is not bound yet; when
 448   //       the label gets bound, the unconditional far branch will be patched.
 449   const address target_pc = target(dest);
 450   const address b_pc  = pc();
 451   b(target_pc);
 452 
 453   assert(not_taken_pc == pc(),                     "postcondition");
 454   assert(dest.is_bound() || target_pc == b_pc, "postcondition");
 455 }
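     // Illustrative example of variant 2 for "branch to dest if CCR0 eq":
     //   bne CCR0, SKIP   // opposite condition, short conditional branch
     //   b   DEST         // unconditional branch, +/-32 MB reach
     // SKIP: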
 456 
 457 // 1 or 2 instructions
 458 void MacroAssembler::bc_far_optimized(int boint, int biint, Label& dest) {
 459   if (dest.is_bound() && is_within_range_of_bcxx(target(dest), pc())) {
 460     bc(boint, biint, dest);
 461   } else {
 462     bc_far(boint, biint, dest, MacroAssembler::bc_far_optimize_on_relocate);
 463   }
 464 }
 465 
 466 bool MacroAssembler::is_bc_far_at(address instruction_addr) {
 467   return is_bc_far_variant1_at(instruction_addr) ||
 468          is_bc_far_variant2_at(instruction_addr) ||
 469          is_bc_far_variant3_at(instruction_addr);
 470 }
 471 
 472 address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) {
 473   if (is_bc_far_variant1_at(instruction_addr)) {
 474     const address instruction_1_addr = instruction_addr;
 475     const int instruction_1 = *(int*)instruction_1_addr;
 476     return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr);
 477   } else if (is_bc_far_variant2_at(instruction_addr)) {
 478     const address instruction_2_addr = instruction_addr + 4;
 479     return bxx_destination(instruction_2_addr);
 480   } else if (is_bc_far_variant3_at(instruction_addr)) {
 481     return instruction_addr + 8;
 482   }
 483   // variant 4 ???
 484   ShouldNotReachHere();
 485   return NULL;
 486 }
 487 void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) {
 488 
 489   if (is_bc_far_variant3_at(instruction_addr)) {
 490     // variant 3, far cond branch to the next instruction, already patched to nops:
 491     //
 492     //    nop
 493     //    endgroup
 494     //  SKIP/DEST:
 495     //
 496     return;
 497   }
 498 
 499   // first, extract boint and biint from the current branch
 500   int boint = 0;
 501   int biint = 0;
 502 
 503   ResourceMark rm;
 504   const int code_size = 2 * BytesPerInstWord;
 505   CodeBuffer buf(instruction_addr, code_size);
 506   MacroAssembler masm(&buf);
 507   if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) {
 508     // Far branch to next instruction: Optimize it by patching nops (produce variant 3).
 509     masm.nop();
 510     masm.endgroup();
 511   } else {
 512     if (is_bc_far_variant1_at(instruction_addr)) {
 513       // variant 1, the 1st instruction contains the destination address:
 514       //
 515       //    bcxx  DEST
 516       //    nop
 517       //
 518       const int instruction_1 = *(int*)(instruction_addr);
 519       boint = inv_bo_field(instruction_1);
 520       biint = inv_bi_field(instruction_1);
 521     } else if (is_bc_far_variant2_at(instruction_addr)) {
 522       // variant 2, the 2nd instruction contains the destination address:
 523       //
 524       //    b!cxx SKIP
 525       //    bxx   DEST
 526       //  SKIP:
 527       //
 528       const int instruction_1 = *(int*)(instruction_addr);
 529       boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))),
 530           opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1))));
 531       biint = inv_bi_field(instruction_1);
 532     } else {
 533       // variant 4???
 534       ShouldNotReachHere();
 535     }
 536 
 537     // second, set the new branch destination and optimize the code
 538     if (dest != instruction_addr + 4 && // the bc_far is still unbound!
 539         masm.is_within_range_of_bcxx(dest, instruction_addr)) {
 540       // variant 1:
 541       //
 542       //    bcxx  DEST
 543       //    nop
 544       //
 545       masm.bc(boint, biint, dest);
 546       masm.nop();
 547     } else {
 548       // variant 2:
 549       //
 550       //    b!cxx SKIP
 551       //    bxx   DEST
 552       //  SKIP:
 553       //
 554       const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
 555                                                     opposite_bcond(inv_boint_bcond(boint)));
 556       const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord;
 557       masm.bc(opposite_boint, biint, not_taken_pc);
 558       masm.b(dest);
 559     }
 560   }
 561   ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
 562 }
 563 
 564 // Emit a NOT mt-safe patchable 64 bit absolute call/jump.
 565 void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) {
 566   // get current pc
 567   uint64_t start_pc = (uint64_t) pc();
 568 
 569   const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last
 570   const address pc_of_b  = (address) (start_pc + (0*BytesPerInstWord)); // b is first
 571 
 572   // relocate here
 573   if (rt != relocInfo::none) {
 574     relocate(rt);
 575   }
 576 
 577   if ( ReoptimizeCallSequences &&
 578        (( link && is_within_range_of_b(dest, pc_of_bl)) ||
 579         (!link && is_within_range_of_b(dest, pc_of_b)))) {
 580     // variant 2:
 581     // Emit an optimized, pc-relative call/jump.
 582 
 583     if (link) {
 584       // some padding
 585       nop();
 586       nop();
 587       nop();
 588       nop();
 589       nop();
 590       nop();
 591 
 592       // do the call
 593       assert(pc() == pc_of_bl, "just checking");
 594       bl(dest, relocInfo::none);
 595     } else {
 596       // do the jump
 597       assert(pc() == pc_of_b, "just checking");
 598       b(dest, relocInfo::none);
 599 
 600       // some padding
 601       nop();
 602       nop();
 603       nop();
 604       nop();
 605       nop();
 606       nop();
 607     }
 608 
 609     // Assert that we can identify the emitted call/jump.
 610     assert(is_bxx64_patchable_variant2_at((address)start_pc, link),
 611            "can't identify emitted call");
 612   } else {
 613     // variant 1:
 614     mr(R0, R11);  // spill R11 -> R0.
 615 
 616     // Load the destination address into CTR,
 617     // calculate destination relative to global toc.
 618     calculate_address_from_global_toc(R11, dest, true, true, false);
 619 
 620     mtctr(R11);
 621     mr(R11, R0);  // spill R11 <- R0.
 622     nop();
 623 
 624     // do the call/jump
 625     if (link) {
 626       bctrl();
 627     } else {
 628       bctr();
 629     }
 630     // Assert that we can identify the emitted call/jump.
 631     assert(is_bxx64_patchable_variant1b_at((address)start_pc, link),
 632            "can't identify emitted call");
 633   }
 634 
 635   // Assert that we can identify the emitted call/jump.
 636   assert(is_bxx64_patchable_at((address)start_pc, link),
 637          "can't identify emitted call");
 638   assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest,
 639          "wrong encoding of dest address");
 640 }
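     // Note: both variants occupy the same bxx64_patchable_size (seven
     // instruction words, see the instr[6] accesses below), so
     // set_dest_of_bxx64_patchable_at() can later overwrite one variant
     // with the other in place.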
 641 
 642 // Identify a bxx64_patchable instruction.
 643 bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) {
 644   return is_bxx64_patchable_variant1b_at(instruction_addr, link)
 645     //|| is_bxx64_patchable_variant1_at(instruction_addr, link)
 646       || is_bxx64_patchable_variant2_at(instruction_addr, link);
 647 }
 648 
 649 // Does the call64_patchable instruction use a pc-relative encoding of
 650 // the call destination?
 651 bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) {
 652   // variant 2 is pc-relative
 653   return is_bxx64_patchable_variant2_at(instruction_addr, link);
 654 }
 655 
 656 // Identify variant 1.
 657 bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) {
 658   unsigned int* instr = (unsigned int*) instruction_addr;
 659   return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
 660     && is_mtctr(instr[5]) // mtctr
 661     && is_load_const_at(instruction_addr);
 662 }
 663 
 664 // Identify variant 1b: load destination relative to global toc.
 665 bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) {
 666   unsigned int* instr = (unsigned int*) instruction_addr;
 667   return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
 668     && is_mtctr(instr[3]) // mtctr
 669     && is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr);
 670 }
 671 
 672 // Identify variant 2.
 673 bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) {
 674   unsigned int* instr = (unsigned int*) instruction_addr;
 675   if (link) {
 676     return is_bl (instr[6])  // bl dest is last
 677       && is_nop(instr[0])  // nop
 678       && is_nop(instr[1])  // nop
 679       && is_nop(instr[2])  // nop
 680       && is_nop(instr[3])  // nop
 681       && is_nop(instr[4])  // nop
 682       && is_nop(instr[5]); // nop
 683   } else {
 684     return is_b  (instr[0])  // b  dest is first
 685       && is_nop(instr[1])  // nop
 686       && is_nop(instr[2])  // nop
 687       && is_nop(instr[3])  // nop
 688       && is_nop(instr[4])  // nop
 689       && is_nop(instr[5])  // nop
 690       && is_nop(instr[6]); // nop
 691   }
 692 }
 693 
 694 // Set dest address of a bxx64_patchable instruction.
 695 void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) {
 696   ResourceMark rm;
 697   int code_size = MacroAssembler::bxx64_patchable_size;
 698   CodeBuffer buf(instruction_addr, code_size);
 699   MacroAssembler masm(&buf);
 700   masm.bxx64_patchable(dest, relocInfo::none, link);
 701   ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
 702 }
 703 
 704 // Get dest address of a bxx64_patchable instruction.
 705 address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) {
 706   if (is_bxx64_patchable_variant1_at(instruction_addr, link)) {
 707     return (address) (unsigned long) get_const(instruction_addr);
 708   } else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) {
 709     unsigned int* instr = (unsigned int*) instruction_addr;
 710     if (link) {
 711       const int instr_idx = 6; // bl is last
 712       int branchoffset = branch_destination(instr[instr_idx], 0);
 713       return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
 714     } else {
 715       const int instr_idx = 0; // b is first
 716       int branchoffset = branch_destination(instr[instr_idx], 0);
 717       return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
 718     }
 719   // Load dest relative to global toc.
 720   } else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) {
 721     return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord,
 722                                                                instruction_addr);
 723   } else {
 724     ShouldNotReachHere();
 725     return NULL;
 726   }
 727 }
 728 
 729 // Uses ordering which corresponds to ABI:
 730 //    _savegpr0_14:  std  r14,-144(r1)
 731 //    _savegpr0_15:  std  r15,-136(r1)
 732 //    _savegpr0_16:  std  r16,-128(r1)
 733 void MacroAssembler::save_nonvolatile_gprs(Register dst, int offset) {
 734   std(R14, offset, dst);   offset += 8;
 735   std(R15, offset, dst);   offset += 8;
 736   std(R16, offset, dst);   offset += 8;
 737   std(R17, offset, dst);   offset += 8;
 738   std(R18, offset, dst);   offset += 8;
 739   std(R19, offset, dst);   offset += 8;
 740   std(R20, offset, dst);   offset += 8;
 741   std(R21, offset, dst);   offset += 8;
 742   std(R22, offset, dst);   offset += 8;
 743   std(R23, offset, dst);   offset += 8;
 744   std(R24, offset, dst);   offset += 8;
 745   std(R25, offset, dst);   offset += 8;
 746   std(R26, offset, dst);   offset += 8;
 747   std(R27, offset, dst);   offset += 8;
 748   std(R28, offset, dst);   offset += 8;
 749   std(R29, offset, dst);   offset += 8;
 750   std(R30, offset, dst);   offset += 8;
 751   std(R31, offset, dst);   offset += 8;
 752 
 753   stfd(F14, offset, dst);   offset += 8;
 754   stfd(F15, offset, dst);   offset += 8;
 755   stfd(F16, offset, dst);   offset += 8;
 756   stfd(F17, offset, dst);   offset += 8;
 757   stfd(F18, offset, dst);   offset += 8;
 758   stfd(F19, offset, dst);   offset += 8;
 759   stfd(F20, offset, dst);   offset += 8;
 760   stfd(F21, offset, dst);   offset += 8;
 761   stfd(F22, offset, dst);   offset += 8;
 762   stfd(F23, offset, dst);   offset += 8;
 763   stfd(F24, offset, dst);   offset += 8;
 764   stfd(F25, offset, dst);   offset += 8;
 765   stfd(F26, offset, dst);   offset += 8;
 766   stfd(F27, offset, dst);   offset += 8;
 767   stfd(F28, offset, dst);   offset += 8;
 768   stfd(F29, offset, dst);   offset += 8;
 769   stfd(F30, offset, dst);   offset += 8;
 770   stfd(F31, offset, dst);
 771 }
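     // Layout: 18 GPRs (R14..R31) followed by 18 FPRs (F14..F31), one
     // 8-byte slot each, i.e. 288 bytes starting at `offset'. The matching
     // restore_nonvolatile_gprs() below reads the slots back in the same
     // order.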
 772 
 773 // Uses ordering which corresponds to ABI:
 774 //    _restgpr0_14:  ld   r14,-144(r1)
 775 //    _restgpr0_15:  ld   r15,-136(r1)
 776 //    _restgpr0_16:  ld   r16,-128(r1)
 777 void MacroAssembler::restore_nonvolatile_gprs(Register src, int offset) {
 778   ld(R14, offset, src);   offset += 8;
 779   ld(R15, offset, src);   offset += 8;
 780   ld(R16, offset, src);   offset += 8;
 781   ld(R17, offset, src);   offset += 8;
 782   ld(R18, offset, src);   offset += 8;
 783   ld(R19, offset, src);   offset += 8;
 784   ld(R20, offset, src);   offset += 8;
 785   ld(R21, offset, src);   offset += 8;
 786   ld(R22, offset, src);   offset += 8;
 787   ld(R23, offset, src);   offset += 8;
 788   ld(R24, offset, src);   offset += 8;
 789   ld(R25, offset, src);   offset += 8;
 790   ld(R26, offset, src);   offset += 8;
 791   ld(R27, offset, src);   offset += 8;
 792   ld(R28, offset, src);   offset += 8;
 793   ld(R29, offset, src);   offset += 8;
 794   ld(R30, offset, src);   offset += 8;
 795   ld(R31, offset, src);   offset += 8;
 796 
 797   // FP registers
 798   lfd(F14, offset, src);   offset += 8;
 799   lfd(F15, offset, src);   offset += 8;
 800   lfd(F16, offset, src);   offset += 8;
 801   lfd(F17, offset, src);   offset += 8;
 802   lfd(F18, offset, src);   offset += 8;
 803   lfd(F19, offset, src);   offset += 8;
 804   lfd(F20, offset, src);   offset += 8;
 805   lfd(F21, offset, src);   offset += 8;
 806   lfd(F22, offset, src);   offset += 8;
 807   lfd(F23, offset, src);   offset += 8;
 808   lfd(F24, offset, src);   offset += 8;
 809   lfd(F25, offset, src);   offset += 8;
 810   lfd(F26, offset, src);   offset += 8;
 811   lfd(F27, offset, src);   offset += 8;
 812   lfd(F28, offset, src);   offset += 8;
 813   lfd(F29, offset, src);   offset += 8;
 814   lfd(F30, offset, src);   offset += 8;
 815   lfd(F31, offset, src);
 816 }
 817 
 818 // For verify_oops.
 819 void MacroAssembler::save_volatile_gprs(Register dst, int offset) {
 820   std(R2,  offset, dst);   offset += 8;
 821   std(R3,  offset, dst);   offset += 8;
 822   std(R4,  offset, dst);   offset += 8;
 823   std(R5,  offset, dst);   offset += 8;
 824   std(R6,  offset, dst);   offset += 8;
 825   std(R7,  offset, dst);   offset += 8;
 826   std(R8,  offset, dst);   offset += 8;
 827   std(R9,  offset, dst);   offset += 8;
 828   std(R10, offset, dst);   offset += 8;
 829   std(R11, offset, dst);   offset += 8;
 830   std(R12, offset, dst);   offset += 8;
 831 
 832   stfd(F0, offset, dst);   offset += 8;
 833   stfd(F1, offset, dst);   offset += 8;
 834   stfd(F2, offset, dst);   offset += 8;
 835   stfd(F3, offset, dst);   offset += 8;
 836   stfd(F4, offset, dst);   offset += 8;
 837   stfd(F5, offset, dst);   offset += 8;
 838   stfd(F6, offset, dst);   offset += 8;
 839   stfd(F7, offset, dst);   offset += 8;
 840   stfd(F8, offset, dst);   offset += 8;
 841   stfd(F9, offset, dst);   offset += 8;
 842   stfd(F10, offset, dst);  offset += 8;
 843   stfd(F11, offset, dst);  offset += 8;
 844   stfd(F12, offset, dst);  offset += 8;
 845   stfd(F13, offset, dst);
 846 }
 847 
 848 // For verify_oops.
 849 void MacroAssembler::restore_volatile_gprs(Register src, int offset) {
 850   ld(R2,  offset, src);   offset += 8;
 851   ld(R3,  offset, src);   offset += 8;
 852   ld(R4,  offset, src);   offset += 8;
 853   ld(R5,  offset, src);   offset += 8;
 854   ld(R6,  offset, src);   offset += 8;
 855   ld(R7,  offset, src);   offset += 8;
 856   ld(R8,  offset, src);   offset += 8;
 857   ld(R9,  offset, src);   offset += 8;
 858   ld(R10, offset, src);   offset += 8;
 859   ld(R11, offset, src);   offset += 8;
 860   ld(R12, offset, src);   offset += 8;
 861 
 862   lfd(F0, offset, src);   offset += 8;
 863   lfd(F1, offset, src);   offset += 8;
 864   lfd(F2, offset, src);   offset += 8;
 865   lfd(F3, offset, src);   offset += 8;
 866   lfd(F4, offset, src);   offset += 8;
 867   lfd(F5, offset, src);   offset += 8;
 868   lfd(F6, offset, src);   offset += 8;
 869   lfd(F7, offset, src);   offset += 8;
 870   lfd(F8, offset, src);   offset += 8;
 871   lfd(F9, offset, src);   offset += 8;
 872   lfd(F10, offset, src);  offset += 8;
 873   lfd(F11, offset, src);  offset += 8;
 874   lfd(F12, offset, src);  offset += 8;
 875   lfd(F13, offset, src);
 876 }
 877 
 878 void MacroAssembler::save_LR_CR(Register tmp) {
 879   mfcr(tmp);
 880   std(tmp, _abi(cr), R1_SP);
 881   mflr(tmp);
 882   std(tmp, _abi(lr), R1_SP);
 883   // Tmp must contain lr on exit! (see return_addr and prolog in ppc64.ad)
 884 }
 885 
 886 void MacroAssembler::restore_LR_CR(Register tmp) {
 887   assert(tmp != R1_SP, "must be distinct");
 888   ld(tmp, _abi(lr), R1_SP);
 889   mtlr(tmp);
 890   ld(tmp, _abi(cr), R1_SP);
 891   mtcr(tmp);
 892 }
 893 
 894 address MacroAssembler::get_PC_trash_LR(Register result) {
 895   Label L;
 896   bl(L);
 897   bind(L);
 898   address lr_pc = pc();
 899   mflr(result);
 900   return lr_pc;
 901 }
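     // The bl above targets the immediately following instruction, so LR
     // receives the address of that instruction (lr_pc); mflr then copies
     // it into `result'. As the name says, the previous LR value is lost.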
 902 
 903 void MacroAssembler::resize_frame(Register offset, Register tmp) {
 904 #ifdef ASSERT
 905   assert_different_registers(offset, tmp, R1_SP);
 906   andi_(tmp, offset, frame::alignment_in_bytes-1);
 907   asm_assert_eq("resize_frame: unaligned", 0x204);
 908 #endif
 909 
 910   // tmp <- *(SP)
 911   ld(tmp, _abi(callers_sp), R1_SP);
 912   // addr <- SP + offset;
 913   // *(addr) <- tmp;
 914   // SP <- addr
 915   stdux(tmp, R1_SP, offset);
 916 }
 917 
 918 void MacroAssembler::resize_frame(int offset, Register tmp) {
 919   assert(is_simm(offset, 16), "too big an offset");
 920   assert_different_registers(tmp, R1_SP);
 921   assert((offset & (frame::alignment_in_bytes-1))==0, "resize_frame: unaligned");
 922   // tmp <- *(SP)
 923   ld(tmp, _abi(callers_sp), R1_SP);
 924   // addr <- SP + offset;
 925   // *(addr) <- tmp;
 926   // SP <- addr
 927   stdu(tmp, offset, R1_SP);
 928 }
 929 
 930 void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) {
 931   // (addr == tmp1) || (addr == tmp2) is allowed here!
 932   assert(tmp1 != tmp2, "must be distinct");
 933 
 934   // compute offset w.r.t. current stack pointer
 935   // tmp1 <- addr - SP (!)
 936   subf(tmp1, R1_SP, addr);
 937 
 938   // atomically update SP keeping back link.
 939   resize_frame(tmp1/* offset */, tmp2/* tmp */);
 940 }
 941 
 942 void MacroAssembler::push_frame(Register bytes, Register tmp) {
 943 #ifdef ASSERT
 944   assert(bytes != R0, "r0 not allowed here");
 945   andi_(R0, bytes, frame::alignment_in_bytes-1);
 946   asm_assert_eq("push_frame(Reg, Reg): unaligned", 0x203);
 947 #endif
 948   neg(tmp, bytes);
 949   stdux(R1_SP, R1_SP, tmp);
 950 }
 951 
 952 // Push a frame of size `bytes'.
 953 void MacroAssembler::push_frame(unsigned int bytes, Register tmp) {
 954   long offset = align_addr(bytes, frame::alignment_in_bytes);
 955   if (is_simm(-offset, 16)) {
 956     stdu(R1_SP, -offset, R1_SP);
 957   } else {
 958     load_const_optimized(tmp, -offset);
 959     stdux(R1_SP, R1_SP, tmp);
 960   }
 961 }
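     // Hypothetical usage sketch: push_frame(112, tmp) rounds 112 up to the
     // frame alignment and, since -112 fits into a signed 16-bit
     // displacement, emits a single "stdu R1_SP, -112(R1_SP)"; larger
     // frames fall back to materializing the negated size in tmp and using
     // stdux.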
 962 
 963 // Push a frame of size `bytes' plus abi_reg_args on top.
 964 void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) {
 965   push_frame(bytes + frame::abi_reg_args_size, tmp);
 966 }
 967 
 968 // Set up a new C frame with a spill area for non-volatile GPRs and
 969 // additional space for local variables.
 970 void MacroAssembler::push_frame_reg_args_nonvolatiles(unsigned int bytes,
 971                                                       Register tmp) {
 972   push_frame(bytes + frame::abi_reg_args_size + frame::spill_nonvolatiles_size, tmp);
 973 }
 974 
 975 // Pop current C frame.
 976 void MacroAssembler::pop_frame() {
 977   ld(R1_SP, _abi(callers_sp), R1_SP);
 978 }
 979 
 980 #if defined(ABI_ELFv2)
 981 address MacroAssembler::branch_to(Register r_function_entry, bool and_link) {
 982   // TODO(asmundak): make sure the caller uses R12 as function descriptor
 983   // most of the time.
 984   if (R12 != r_function_entry) {
 985     mr(R12, r_function_entry);
 986   }
 987   mtctr(R12);
 988   // Do a call or a branch.
 989   if (and_link) {
 990     bctrl();
 991   } else {
 992     bctr();
 993   }
 994   _last_calls_return_pc = pc();
 995 
 996   return _last_calls_return_pc;
 997 }
 998 
 999 // Call a C function via a function descriptor and use full C
1000 // calling conventions. Updates and returns _last_calls_return_pc.
1001 address MacroAssembler::call_c(Register r_function_entry) {
1002   return branch_to(r_function_entry, /*and_link=*/true);
1003 }
1004 
1005 // For tail calls: only branch, don't link, so callee returns to caller of this function.
1006 address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) {
1007   return branch_to(r_function_entry, /*and_link=*/false);
1008 }
1009 
1010 address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) {
1011   load_const(R12, function_entry, R0);
1012   return branch_to(R12,  /*and_link=*/true);
1013 }
1014 
1015 #else
1016 // Generic version of a call to C function via a function descriptor
1017 // with variable support for C calling conventions (TOC, ENV, etc.).
1018 // Updates and returns _last_calls_return_pc.
1019 address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call,
1020                                   bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) {
1021   // we emit standard ptrgl glue code here
1022   assert((function_descriptor != R0), "function_descriptor cannot be R0");
1023 
1024   // retrieve necessary entries from the function descriptor
1025   ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor);
1026   mtctr(R0);
1027 
1028   if (load_toc_of_callee) {
1029     ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor);
1030   }
1031   if (load_env_of_callee) {
1032     ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor);
1033   } else if (load_toc_of_callee) {
1034     li(R11, 0);
1035   }
1036 
1037   // do a call or a branch
1038   if (and_link) {
1039     bctrl();
1040   } else {
1041     bctr();
1042   }
1043   _last_calls_return_pc = pc();
1044 
1045   return _last_calls_return_pc;
1046 }
1047 
1048 // Call a C function via a function descriptor and use full C calling
1049 // conventions.
1050 // We don't use the TOC in generated code, so there is no need to save
1051 // and restore its value.
1052 address MacroAssembler::call_c(Register fd) {
1053   return branch_to(fd, /*and_link=*/true,
1054                        /*save toc=*/false,
1055                        /*restore toc=*/false,
1056                        /*load toc=*/true,
1057                        /*load env=*/true);
1058 }
1059 
1060 address MacroAssembler::call_c_and_return_to_caller(Register fd) {
1061   return branch_to(fd, /*and_link=*/false,
1062                        /*save toc=*/false,
1063                        /*restore toc=*/false,
1064                        /*load toc=*/true,
1065                        /*load env=*/true);
1066 }
1067 
1068 address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) {
1069   if (rt != relocInfo::none) {
1070     // this call needs to be relocatable
1071     if (!ReoptimizeCallSequences
1072         || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
1073         || fd == NULL   // support code-size estimation
1074         || !fd->is_friend_function()
1075         || fd->entry() == NULL) {
1076       // it's not a friend function as defined by class FunctionDescriptor,
1077       // so do a full call-c here.
1078       load_const(R11, (address)fd, R0);
1079 
1080       bool has_env = (fd != NULL && fd->env() != NULL);
1081       return branch_to(R11, /*and_link=*/true,
1082                             /*save toc=*/false,
1083                             /*restore toc=*/false,
1084                             /*load toc=*/true,
1085                             /*load env=*/has_env);
1086     } else {
1087       // It's a friend function. Load the entry point and don't care about
1088       // toc and env. Use an optimizable call instruction, but ensure the
1089       // same code-size as in the case of a non-friend function.
1090       nop();
1091       nop();
1092       nop();
1093       bl64_patchable(fd->entry(), rt);
1094       _last_calls_return_pc = pc();
1095       return _last_calls_return_pc;
1096     }
1097   } else {
1098     // This call does not need to be relocatable, do more aggressive
1099     // optimizations.
1100     if (!ReoptimizeCallSequences
1101       || !fd->is_friend_function()) {
1102       // It's not a friend function as defined by class FunctionDescriptor,
1103       // so do a full call-c here.
1104       load_const(R11, (address)fd, R0);
1105       return branch_to(R11, /*and_link=*/true,
1106                             /*save toc=*/false,
1107                             /*restore toc=*/false,
1108                             /*load toc=*/true,
1109                             /*load env=*/true);
1110     } else {
1111       // it's a friend function, load the entry point and don't care about
1112       // toc and env.
1113       address dest = fd->entry();
1114       if (is_within_range_of_b(dest, pc())) {
1115         bl(dest);
1116       } else {
1117         bl64_patchable(dest, rt);
1118       }
1119       _last_calls_return_pc = pc();
1120       return _last_calls_return_pc;
1121     }
1122   }
1123 }
1124 
1125 // Call a C function.  All constants needed reside in TOC.
1126 //
1127 // Read the address to call from the TOC.
1128 // Read env from TOC, if fd specifies an env.
1129 // Read new TOC from TOC.
1130 address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd,
1131                                          relocInfo::relocType rt, Register toc) {
1132   if (!ReoptimizeCallSequences
1133     || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
1134     || !fd->is_friend_function()) {
1135     // It's not a friend function as defined by class FunctionDescriptor,
1136     // so do a full call-c here.
1137     assert(fd->entry() != NULL, "function must be linked");
1138 
1139     AddressLiteral fd_entry(fd->entry());
1140     bool success = load_const_from_method_toc(R11, fd_entry, toc, /*fixed_size*/ true);
1141     mtctr(R11);
1142     if (fd->env() == NULL) {
1143       li(R11, 0);
1144       nop();
1145     } else {
1146       AddressLiteral fd_env(fd->env());
1147       success = success && load_const_from_method_toc(R11, fd_env, toc, /*fixed_size*/ true);
1148     }
1149     AddressLiteral fd_toc(fd->toc());
1150     // Set R2_TOC (load from toc)
1151     success = success && load_const_from_method_toc(R2_TOC, fd_toc, toc, /*fixed_size*/ true);
1152     bctrl();
1153     _last_calls_return_pc = pc();
1154     if (!success) { return NULL; }
1155   } else {
1156     // It's a friend function, load the entry point and don't care about
1157     // toc and env. Use an optimizable call instruction, but ensure the
1158     // same code-size as in the case of a non-friend function.
1159     nop();
1160     bl64_patchable(fd->entry(), rt);
1161     _last_calls_return_pc = pc();
1162   }
1163   return _last_calls_return_pc;
1164 }
1165 #endif // ABI_ELFv2
1166 
1167 void MacroAssembler::call_VM_base(Register oop_result,
1168                                   Register last_java_sp,
1169                                   address  entry_point,
1170                                   bool     check_exceptions) {
1171   BLOCK_COMMENT("call_VM {");
1172   // Determine last_java_sp register.
1173   if (!last_java_sp->is_valid()) {
1174     last_java_sp = R1_SP;
1175   }
1176   set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1);
1177 
1178   // ARG1 must hold thread address.
1179   mr(R3_ARG1, R16_thread);
1180 #if defined(ABI_ELFv2)
1181   address return_pc = call_c(entry_point, relocInfo::none);
1182 #else
1183   address return_pc = call_c((FunctionDescriptor*)entry_point, relocInfo::none);
1184 #endif
1185 
1186   reset_last_Java_frame();
1187 
1188   // Check for pending exceptions.
1189   if (check_exceptions) {
1190     // We don't check for exceptions here.
1191     ShouldNotReachHere();
1192   }
1193 
1194   // Get oop result if there is one and reset the value in the thread.
1195   if (oop_result->is_valid()) {
1196     get_vm_result(oop_result);
1197   }
1198 
1199   _last_calls_return_pc = return_pc;
1200   BLOCK_COMMENT("} call_VM");
1201 }
1202 
1203 void MacroAssembler::call_VM_leaf_base(address entry_point) {
1204   BLOCK_COMMENT("call_VM_leaf {");
1205 #if defined(ABI_ELFv2)
1206   call_c(entry_point, relocInfo::none);
1207 #else
1208   call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, entry_point), relocInfo::none);
1209 #endif
1210   BLOCK_COMMENT("} call_VM_leaf");
1211 }
1212 
1213 void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions) {
1214   call_VM_base(oop_result, noreg, entry_point, check_exceptions);
1215 }
1216 
1217 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1,
1218                              bool check_exceptions) {
1219   // R3_ARG1 is reserved for the thread.
1220   mr_if_needed(R4_ARG2, arg_1);
1221   call_VM(oop_result, entry_point, check_exceptions);
1222 }
1223 
1224 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2,
1225                              bool check_exceptions) {
1226   // R3_ARG1 is reserved for the thread
1227   mr_if_needed(R4_ARG2, arg_1);
1228   assert(arg_2 != R4_ARG2, "smashed argument");
1229   mr_if_needed(R5_ARG3, arg_2);
1230   call_VM(oop_result, entry_point, check_exceptions);
1231 }
1232 
1233 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3,
1234                              bool check_exceptions) {
1235   // R3_ARG1 is reserved for the thread
1236   mr_if_needed(R4_ARG2, arg_1);
1237   assert(arg_2 != R4_ARG2, "smashed argument");
1238   mr_if_needed(R5_ARG3, arg_2);
1239   mr_if_needed(R6_ARG4, arg_3);
1240   call_VM(oop_result, entry_point, check_exceptions);
1241 }
1242 
1243 void MacroAssembler::call_VM_leaf(address entry_point) {
1244   call_VM_leaf_base(entry_point);
1245 }
1246 
1247 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) {
1248   mr_if_needed(R3_ARG1, arg_1);
1249   call_VM_leaf(entry_point);
1250 }
1251 
1252 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) {
1253   mr_if_needed(R3_ARG1, arg_1);
1254   assert(arg_2 != R3_ARG1, "smashed argument");
1255   mr_if_needed(R4_ARG2, arg_2);
1256   call_VM_leaf(entry_point);
1257 }
1258 
1259 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) {
1260   mr_if_needed(R3_ARG1, arg_1);
1261   assert(arg_2 != R3_ARG1, "smashed argument");
1262   mr_if_needed(R4_ARG2, arg_2);
1263   assert(arg_3 != R3_ARG1 && arg_3 != R4_ARG2, "smashed argument");
1264   mr_if_needed(R5_ARG3, arg_3);
1265   call_VM_leaf(entry_point);
1266 }
1267 
1268 // Check whether instruction is a read access to the polling page
1269 // which was emitted by load_from_polling_page(..).
1270 bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext,
1271                                                address* polling_address_ptr) {
1272   if (!is_ld(instruction))
1273     return false; // It's not a ld. Fail.
1274 
1275   int rt = inv_rt_field(instruction);
1276   int ra = inv_ra_field(instruction);
1277   int ds = inv_ds_field(instruction);
1278   if (!(ds == 0 && ra != 0 && rt == 0)) {
1279     return false; // It's not a ld(r0, X, ra). Fail.
1280   }
1281 
1282   if (!ucontext) {
1283     // Set polling address.
1284     if (polling_address_ptr != NULL) {
1285       *polling_address_ptr = NULL;
1286     }
1287     return true; // No ucontext given. Can't check value of ra. Assume true.
1288   }
1289 
1290 #ifdef LINUX
1291   // Ucontext given. Check that register ra contains the address of
1292   // the safepoint polling page.
1293   ucontext_t* uc = (ucontext_t*) ucontext;
1294   // Set polling address.
1295   address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds;
1296   if (polling_address_ptr != NULL) {
1297     *polling_address_ptr = addr;
1298   }
1299   return os::is_poll_address(addr);
1300 #else
1301   // Not on Linux, ucontext must be NULL.
1302   ShouldNotReachHere();
1303   return false;
1304 #endif
1305 }
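     // The probe recognized above is an "ld R0, 0(ra)" with ra != 0, i.e.
     // the access emitted by load_from_polling_page(..) against the
     // safepoint polling page.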
1306 
1307 bool MacroAssembler::is_memory_serialization(int instruction, JavaThread* thread, void* ucontext) {
1308 #ifdef LINUX
1309   ucontext_t* uc = (ucontext_t*) ucontext;
1310 
1311   if (is_stwx(instruction) || is_stwux(instruction)) {
1312     int ra = inv_ra_field(instruction);
1313     int rb = inv_rb_field(instruction);
1314 
1315     // look up content of ra and rb in ucontext
1316     address ra_val=(address)uc->uc_mcontext.regs->gpr[ra];
1317     long rb_val=(long)uc->uc_mcontext.regs->gpr[rb];
1318     return os::is_memory_serialize_page(thread, ra_val+rb_val);
1319   } else if (is_stw(instruction) || is_stwu(instruction)) {
1320     int ra = inv_ra_field(instruction);
1321     int d1 = inv_d1_field(instruction);
1322 
1323     // look up content of ra in ucontext
1324     address ra_val=(address)uc->uc_mcontext.regs->gpr[ra];
1325     return os::is_memory_serialize_page(thread, ra_val+d1);
1326   } else {
1327     return false;
1328   }
1329 #else
1330   // workaround not needed on !LINUX :-)
1331   ShouldNotCallThis();
1332   return false;
1333 #endif
1334 }
1335 
1336 void MacroAssembler::bang_stack_with_offset(int offset) {
1337   // When increasing the stack, the old stack pointer will be written
1338   // to the new top of stack according to the PPC64 ABI.
1339   // Therefore, stack banging is not necessary when increasing
1340   // the stack by <= os::vm_page_size() bytes.
1341   // When increasing the stack by a larger amount, this method is
1342   // called repeatedly to bang the intermediate pages.
1343 
1344   // Stack grows down, caller passes positive offset.
1345   assert(offset > 0, "must bang with positive offset");
1346 
1347   long stdoffset = -offset;
1348 
1349   if (is_simm(stdoffset, 16)) {
1350     // Signed 16 bit offset, a simple std is ok.
1351     if (UseLoadInstructionsForStackBangingPPC64) {
1352       ld(R0, (int)(signed short)stdoffset, R1_SP);
1353     } else {
1354       std(R0,(int)(signed short)stdoffset, R1_SP);
1355     }
1356   } else if (is_simm(stdoffset, 31)) {
1357     const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset);
1358     const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset);
1359 
1360     Register tmp = R11;
1361     addis(tmp, R1_SP, hi);
1362     if (UseLoadInstructionsForStackBangingPPC64) {
1363       ld(R0,  lo, tmp);
1364     } else {
1365       std(R0, lo, tmp);
1366     }
1367   } else {
1368     ShouldNotReachHere();
1369   }
1370 }
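     // Sketch of the large-offset case (beyond a signed 16-bit
     // displacement):
     //   addis R11, R1_SP, hi16(-offset)
     //   std   R0, lo16(-offset)(R11)   // ld instead of std if
     //                                  // UseLoadInstructionsForStackBangingPPC64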
1371 
1372 // If instruction is a stack bang of the form
1373 //    std    R0,    x(Ry),       (see bang_stack_with_offset())
1374 //    stdu   R1_SP, x(R1_SP),    (see push_frame(), resize_frame())
1375 // or stdux  R1_SP, Rx, R1_SP    (see push_frame(), resize_frame())
1376 // return the banged address. Otherwise, return 0.
1377 address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) {
1378 #ifdef LINUX
1379   ucontext_t* uc = (ucontext_t*) ucontext;
1380   int rs = inv_rs_field(instruction);
1381   int ra = inv_ra_field(instruction);
1382   if (   (is_ld(instruction)   && rs == 0 &&  UseLoadInstructionsForStackBangingPPC64)
1383       || (is_std(instruction)  && rs == 0 && !UseLoadInstructionsForStackBangingPPC64)
1384       || (is_stdu(instruction) && rs == 1)) {
1385     int ds = inv_ds_field(instruction);
1386     // return banged address
1387     return ds+(address)uc->uc_mcontext.regs->gpr[ra];
1388   } else if (is_stdux(instruction) && rs == 1) {
1389     int rb = inv_rb_field(instruction);
1390     address sp = (address)uc->uc_mcontext.regs->gpr[1];
1391     long rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
1392     return ra != 1 || rb_val >= 0 ? NULL         // not a stack bang
1393                                   : sp + rb_val; // banged address
1394   }
1395   return NULL; // not a stack bang
1396 #else
1397   // workaround not needed on !LINUX :-)
1398   ShouldNotCallThis();
1399   return NULL;
1400 #endif
1401 }
1402 
1403 void MacroAssembler::reserved_stack_check(Register return_pc) {
1404   // Test if reserved zone needs to be enabled.
1405   Label no_reserved_zone_enabling;
1406 
1407   ld_ptr(R0, JavaThread::reserved_stack_activation_offset(), R16_thread);
1408   cmpld(CCR0, R1_SP, R0);
1409   blt_predict_taken(CCR0, no_reserved_zone_enabling);
1410 
1411   // Enable reserved zone again, throw stack overflow exception.
1412   push_frame_reg_args(0, R0);
1413   call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), R16_thread);
1414   pop_frame();
1415   mtlr(return_pc);
1416   load_const_optimized(R0, StubRoutines::throw_delayed_StackOverflowError_entry());
1417   mtctr(R0);
1418   bctr();
1419 
1420   should_not_reach_here();
1421 
1422   bind(no_reserved_zone_enabling);
1423 }
1424 
1425 // CmpxchgX sets condition register to cmpX(current, compare).
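     // For reference, a rough sketch of the sequence emitted below for a strong
     // cmpxchgw without contention hint (registers shown symbolically):
     //   (release or fence, if MemBarRel was requested)
     //   retry:
     //     lwarx   dest_current_value, addr_base       // reserve
     //     cmpw    flag, dest_current_value, compare_value
     //     bne     flag, failed
     //     stwcx.  exchange_value, addr_base           // conditional store, sets CCR0
     //     bne     CCR0, retry
     //   (isync or fence afterwards, per MemBarAcq / MemBarFenceAfter)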
1426 void MacroAssembler::cmpxchgw(ConditionRegister flag, Register dest_current_value,
1427                               Register compare_value, Register exchange_value,
1428                               Register addr_base, int semantics, bool cmpxchgx_hint,
1429                               Register int_flag_success, bool contention_hint, bool weak) {
1430   Label retry;
1431   Label failed;
1432   Label done;
1433 
1434   // Save one branch if result is returned via register and
1435   // result register is different from the other ones.
1436   bool use_result_reg    = (int_flag_success != noreg);
1437   bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value &&
1438                             int_flag_success != exchange_value && int_flag_success != addr_base);
1439   assert(!weak || flag == CCR0, "weak only supported with CCR0");
1440 
1441   if (use_result_reg && preset_result_reg) {
1442     li(int_flag_success, 0); // preset (assume cas failed)
1443   }
1444 
1445   // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1446   if (contention_hint) { // Don't try to reserve if cmp fails.
1447     lwz(dest_current_value, 0, addr_base);
1448     cmpw(flag, dest_current_value, compare_value);
1449     bne(flag, failed);
1450   }
1451 
1452   // release/fence semantics
1453   if (semantics & MemBarRel) {
1454     release();
1455   }
1456 
1457   // atomic emulation loop
1458   bind(retry);
1459 
1460   lwarx(dest_current_value, addr_base, cmpxchgx_hint);
1461   cmpw(flag, dest_current_value, compare_value);
1462   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1463     bne_predict_not_taken(flag, failed);
1464   } else {
1465     bne(                  flag, failed);
1466   }
1467   // branch to failed => (flag == ne), (dest_current_value != compare_value)
1468   // fall through    => (flag == eq), (dest_current_value == compare_value)
1469 
1470   stwcx_(exchange_value, addr_base);
1471   if (!weak || use_result_reg) {
1472     if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1473       bne_predict_not_taken(CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
1474     } else {
1475       bne(                  CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
1476     }
1477   }
1478   // fall through    => (flag == eq), (dest_current_value == compare_value), (swapped)
1479 
1480   // Result in register (must do this at the end because int_flag_success can be the
1481   // same register as one above).
1482   if (use_result_reg) {
1483     li(int_flag_success, 1);
1484   }
1485 
1486   if (semantics & MemBarFenceAfter) {
1487     fence();
1488   } else if (semantics & MemBarAcq) {
1489     isync();
1490   }
1491 
1492   if (use_result_reg && !preset_result_reg) {
1493     b(done);
1494   }
1495 
1496   bind(failed);
1497   if (use_result_reg && !preset_result_reg) {
1498     li(int_flag_success, 0);
1499   }
1500 
1501   bind(done);
1502   // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1503   // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1504 }
1505 
1506 // Performs an atomic compare-exchange:
1507 //   if (compare_value == *addr_base)
1508 //     *addr_base = exchange_value
1509 //     int_flag_success = 1;
1510 //   else
1511 //     int_flag_success = 0;
1512 //
1513 // ConditionRegister flag       = cmp(compare_value, *addr_base)
1514 // Register dest_current_value  = *addr_base
1515 // Register compare_value       Used to compare with value in memory
1516 // Register exchange_value      Written to memory if compare_value == *addr_base
1517 // Register addr_base           The memory location to compareXChange
1518 // Register int_flag_success    Set to 1 if exchange_value was written to *addr_base
1519 //
1520 // To avoid the costly compare-exchange, the value can be tested beforehand
1521 // (contention_hint). Several special cases exist to avoid generating unnecessary code.
1522 //
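     // A typical call (cf. biased_locking_enter below), CASing the mark word and
     // bailing out to a slow case on failure:
     //   cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
     //            /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
     //            /*where=*/obj_reg, MacroAssembler::MemBarAcq,
     //            MacroAssembler::cmpxchgx_hint_acquire_lock(),
     //            noreg, slow_case_int); // bail out if failed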
1523 void MacroAssembler::cmpxchgd(ConditionRegister flag,
1524                               Register dest_current_value, RegisterOrConstant compare_value, Register exchange_value,
1525                               Register addr_base, int semantics, bool cmpxchgx_hint,
1526                               Register int_flag_success, Label* failed_ext, bool contention_hint, bool weak) {
1527   Label retry;
1528   Label failed_int;
1529   Label& failed = (failed_ext != NULL) ? *failed_ext : failed_int;
1530   Label done;
1531 
1532   // Save one branch if result is returned via register and result register is different from the other ones.
1533   bool use_result_reg    = (int_flag_success != noreg);
1534   bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value.register_or_noreg() &&
1535                             int_flag_success != exchange_value && int_flag_success != addr_base);
1536   assert(!weak || flag == CCR0, "weak only supported with CCR0");
1537   assert(int_flag_success == noreg || failed_ext == NULL, "cannot have both");
1538 
1539   if (use_result_reg && preset_result_reg) {
1540     li(int_flag_success, 0); // preset (assume cas failed)
1541   }
1542 
1543   // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1544   if (contention_hint) { // Don't try to reserve if cmp fails.
1545     ld(dest_current_value, 0, addr_base);
1546     cmpd(flag, compare_value, dest_current_value);
1547     bne(flag, failed);
1548   }
1549 
1550   // release/fence semantics
1551   if (semantics & MemBarRel) {
1552     release();
1553   }
1554 
1555   // atomic emulation loop
1556   bind(retry);
1557 
1558   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1559   cmpd(flag, compare_value, dest_current_value);
1560   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1561     bne_predict_not_taken(flag, failed);
1562   } else {
1563     bne(                  flag, failed);
1564   }
1565 
1566   stdcx_(exchange_value, addr_base);
1567   if (!weak || use_result_reg || failed_ext) {
1568     if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1569       bne_predict_not_taken(CCR0, weak ? failed : retry); // stXcx_ sets CCR0
1570     } else {
1571       bne(                  CCR0, weak ? failed : retry); // stXcx_ sets CCR0
1572     }
1573   }
1574 
1575   // result in register (must do this at the end because int_flag_success can be the same register as one above)
1576   if (use_result_reg) {
1577     li(int_flag_success, 1);
1578   }
1579 
1580   if (semantics & MemBarFenceAfter) {
1581     fence();
1582   } else if (semantics & MemBarAcq) {
1583     isync();
1584   }
1585 
1586   if (use_result_reg && !preset_result_reg) {
1587     b(done);
1588   }
1589 
1590   bind(failed_int);
1591   if (use_result_reg && !preset_result_reg) {
1592     li(int_flag_success, 0);
1593   }
1594 
1595   bind(done);
1596   // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1597   // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1598 }
1599 
1600 // Look up the method for a megamorphic invokeinterface call.
1601 // The target method is determined by <intf_klass, itable_index>.
1602 // The receiver klass is in recv_klass.
1603 // On success, the result will be in method_result, and execution falls through.
1604 // On failure, execution transfers to the given label.
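     // scan_temp walks the itable; sethi_temp is only needed when itable_index is a
     // constant. Note that recv_klass is destroyed (adjusted by the scaled itable index).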
1605 void MacroAssembler::lookup_interface_method(Register recv_klass,
1606                                              Register intf_klass,
1607                                              RegisterOrConstant itable_index,
1608                                              Register method_result,
1609                                              Register scan_temp,
1610                                              Register sethi_temp,
1611                                              Label& L_no_such_interface) {
1612   assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);
1613   assert(itable_index.is_constant() || itable_index.as_register() == method_result,
1614          "caller must use same register for non-constant itable index as for method");
1615 
1616   // Compute start of first itableOffsetEntry (which is at the end of the vtable).
1617   int vtable_base = in_bytes(Klass::vtable_start_offset());
1618   int itentry_off = itableMethodEntry::method_offset_in_bytes();
1619   int logMEsize   = exact_log2(itableMethodEntry::size() * wordSize);
1620   int scan_step   = itableOffsetEntry::size() * wordSize;
1621   int log_vte_size= exact_log2(vtableEntry::size_in_bytes());
1622 
1623   lwz(scan_temp, in_bytes(Klass::vtable_length_offset()), recv_klass);
1624   // %%% We should store the aligned, prescaled offset in the klassoop.
1625   // Then the next several instructions would fold away.
1626 
1627   sldi(scan_temp, scan_temp, log_vte_size);
1628   addi(scan_temp, scan_temp, vtable_base);
1629   add(scan_temp, recv_klass, scan_temp);
1630 
1631   // Adjust recv_klass by scaled itable_index, so we can free itable_index.
1632   if (itable_index.is_register()) {
1633     Register itable_offset = itable_index.as_register();
1634     sldi(itable_offset, itable_offset, logMEsize);
1635     if (itentry_off) addi(itable_offset, itable_offset, itentry_off);
1636     add(recv_klass, itable_offset, recv_klass);
1637   } else {
1638     long itable_offset = (long)itable_index.as_constant();
1639     load_const_optimized(sethi_temp, (itable_offset<<logMEsize)+itentry_off); // static address, no relocation
1640     add(recv_klass, sethi_temp, recv_klass);
1641   }
1642 
1643   // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
1644   //   if (scan->interface() == intf) {
1645   //     result = (klass + scan->offset() + itable_index);
1646   //   }
1647   // }
1648   Label search, found_method;
1649 
1650   for (int peel = 1; peel >= 0; peel--) {
1651     // %%%% Could load both offset and interface in one ldx, if they were
1652     // in the opposite order. This would save a load.
1653     ld(method_result, itableOffsetEntry::interface_offset_in_bytes(), scan_temp);
1654 
1655     // Check that this entry is non-null. A null entry means that
1656     // the receiver class doesn't implement the interface, and wasn't the
1657     // same as when the caller was compiled.
1658     cmpd(CCR0, method_result, intf_klass);
1659 
1660     if (peel) {
1661       beq(CCR0, found_method);
1662     } else {
1663       bne(CCR0, search);
1664       // (invert the test to fall through to found_method...)
1665     }
1666 
1667     if (!peel) break;
1668 
1669     bind(search);
1670 
1671     cmpdi(CCR0, method_result, 0);
1672     beq(CCR0, L_no_such_interface);
1673     addi(scan_temp, scan_temp, scan_step);
1674   }
1675 
1676   bind(found_method);
1677 
1678   // Got a hit.
1679   int ito_offset = itableOffsetEntry::offset_offset_in_bytes();
1680   lwz(scan_temp, ito_offset, scan_temp);
1681   ldx(method_result, scan_temp, recv_klass);
1682 }
1683 
1684 // virtual method calling
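     // Loads the Method* into R19_method from
     //   recv_klass + Klass::vtable_start_offset()
     //     + vtable_index * wordSize + vtableEntry::method_offset_in_bytes().
     // recv_klass (and vtable_index, if passed as a register) are clobbered.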
1685 void MacroAssembler::lookup_virtual_method(Register recv_klass,
1686                                            RegisterOrConstant vtable_index,
1687                                            Register method_result) {
1688 
1689   assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg());
1690 
1691   const int base = in_bytes(Klass::vtable_start_offset());
1692   assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
1693 
1694   if (vtable_index.is_register()) {
1695     sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord);
1696     add(recv_klass, vtable_index.as_register(), recv_klass);
1697   } else {
1698     addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord);
1699   }
1700   ld(R19_method, base + vtableEntry::method_offset_in_bytes(), recv_klass);
1701 }
1702 
1703 /////////////////////////////////////////// subtype checking ////////////////////////////////////////////
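     // The fast path first compares sub_klass and super_klass directly, then consults
     // the supertype display / secondary-super-cache slot at super_check_offset. If
     // that is inconclusive it branches to (or falls through at) L_slow_path; the slow
     // path below scans the secondary_supers array and updates the cache on a hit.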
1704 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
1705                                                    Register super_klass,
1706                                                    Register temp1_reg,
1707                                                    Register temp2_reg,
1708                                                    Label* L_success,
1709                                                    Label* L_failure,
1710                                                    Label* L_slow_path,
1711                                                    RegisterOrConstant super_check_offset) {
1712 
1713   const Register check_cache_offset = temp1_reg;
1714   const Register cached_super       = temp2_reg;
1715 
1716   assert_different_registers(sub_klass, super_klass, check_cache_offset, cached_super);
1717 
1718   int sco_offset = in_bytes(Klass::super_check_offset_offset());
1719   int sc_offset  = in_bytes(Klass::secondary_super_cache_offset());
1720 
1721   bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
1722   bool need_slow_path = (must_load_sco || super_check_offset.constant_or_zero() == sco_offset);
1723 
1724   Label L_fallthrough;
1725   int label_nulls = 0;
1726   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
1727   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
1728   if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
1729   assert(label_nulls <= 1 ||
1730          (L_slow_path == &L_fallthrough && label_nulls <= 2 && !need_slow_path),
1731          "at most one NULL in the batch, usually");
1732 
1733   // If the pointers are equal, we are done (e.g., String[] elements).
1734   // This self-check enables sharing of secondary supertype arrays among
1735   // non-primary types such as array-of-interface. Otherwise, each such
1736   // type would need its own customized SSA.
1737   // We move this check to the front of the fast path because many
1738   // type checks are in fact trivially successful in this manner,
1739   // so we get a nicely predicted branch right at the start of the check.
1740   cmpd(CCR0, sub_klass, super_klass);
1741   beq(CCR0, *L_success);
1742 
1743   // Check the supertype display:
1744   if (must_load_sco) {
1745     // The super check offset is always positive...
1746     lwz(check_cache_offset, sco_offset, super_klass);
1747     super_check_offset = RegisterOrConstant(check_cache_offset);
1748     // super_check_offset is register.
1749     assert_different_registers(sub_klass, super_klass, cached_super, super_check_offset.as_register());
1750   }
1751   // The loaded value is the offset from KlassOopDesc.
1752 
1753   ld(cached_super, super_check_offset, sub_klass);
1754   cmpd(CCR0, cached_super, super_klass);
1755 
1756   // This check has worked decisively for primary supers.
1757   // Secondary supers are sought in the super_cache ('super_cache_addr').
1758   // (Secondary supers are interfaces and very deeply nested subtypes.)
1759   // This works in the same check above because of a tricky aliasing
1760   // between the super_cache and the primary super display elements.
1761   // (The 'super_check_addr' can address either, as the case requires.)
1762   // Note that the cache is updated below if it does not help us find
1763   // what we need immediately.
1764   // So if it was a primary super, we can just fail immediately.
1765   // Otherwise, it's the slow path for us (no success at this point).
1766 
1767 #define FINAL_JUMP(label) if (&(label) != &L_fallthrough) { b(label); }
1768 
1769   if (super_check_offset.is_register()) {
1770     beq(CCR0, *L_success);
1771     cmpwi(CCR0, super_check_offset.as_register(), sc_offset);
1772     if (L_failure == &L_fallthrough) {
1773       beq(CCR0, *L_slow_path);
1774     } else {
1775       bne(CCR0, *L_failure);
1776       FINAL_JUMP(*L_slow_path);
1777     }
1778   } else {
1779     if (super_check_offset.as_constant() == sc_offset) {
1780       // Need a slow path; fast failure is impossible.
1781       if (L_slow_path == &L_fallthrough) {
1782         beq(CCR0, *L_success);
1783       } else {
1784         bne(CCR0, *L_slow_path);
1785         FINAL_JUMP(*L_success);
1786       }
1787     } else {
1788       // No slow path; it's a fast decision.
1789       if (L_failure == &L_fallthrough) {
1790         beq(CCR0, *L_success);
1791       } else {
1792         bne(CCR0, *L_failure);
1793         FINAL_JUMP(*L_success);
1794       }
1795     }
1796   }
1797 
1798   bind(L_fallthrough);
1799 #undef FINAL_JUMP
1800 }
1801 
1802 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
1803                                                    Register super_klass,
1804                                                    Register temp1_reg,
1805                                                    Register temp2_reg,
1806                                                    Label* L_success,
1807                                                    Register result_reg) {
1808   const Register array_ptr = temp1_reg; // current value from cache array
1809   const Register temp      = temp2_reg;
1810 
1811   assert_different_registers(sub_klass, super_klass, array_ptr, temp);
1812 
1813   int source_offset = in_bytes(Klass::secondary_supers_offset());
1814   int target_offset = in_bytes(Klass::secondary_super_cache_offset());
1815 
1816   int length_offset = Array<Klass*>::length_offset_in_bytes();
1817   int base_offset   = Array<Klass*>::base_offset_in_bytes();
1818 
1819   Label hit, loop, failure, fallthru;
1820 
1821   ld(array_ptr, source_offset, sub_klass);
1822 
1823   // TODO: PPC port: assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated.");
1824   lwz(temp, length_offset, array_ptr);
1825   cmpwi(CCR0, temp, 0);
1826   beq(CCR0, result_reg != noreg ? failure : fallthru); // length 0
1827 
1828   mtctr(temp); // load ctr
1829 
1830   bind(loop);
1831   // Oops in the table are no longer compressed.
1832   ld(temp, base_offset, array_ptr);
1833   cmpd(CCR0, temp, super_klass);
1834   beq(CCR0, hit);
1835   addi(array_ptr, array_ptr, BytesPerWord);
1836   bdnz(loop);
1837 
1838   bind(failure);
1839   if (result_reg != noreg) { li(result_reg, 1); } // load non-zero result (indicates a miss)
1840   b(fallthru);
1841 
1842   bind(hit);
1843   std(super_klass, target_offset, sub_klass); // save result to cache
1844   if (result_reg != noreg) { li(result_reg, 0); } // load zero result (indicates a hit)
1845   if (L_success != NULL) { b(*L_success); }
1846   else if (result_reg == noreg) { blr(); } // return with CR0.eq if neither label nor result reg provided
1847 
1848   bind(fallthru);
1849 }
1850 
1851 // Try fast path, then go to slow one if not successful
1852 void MacroAssembler::check_klass_subtype(Register sub_klass,
1853                          Register super_klass,
1854                          Register temp1_reg,
1855                          Register temp2_reg,
1856                          Label& L_success) {
1857   Label L_failure;
1858   check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success, &L_failure);
1859   check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success);
1860   bind(L_failure); // Fallthru if not successful.
1861 }
1862 
1863 void MacroAssembler::check_method_handle_type(Register mtype_reg, Register mh_reg,
1864                                               Register temp_reg,
1865                                               Label& wrong_method_type) {
1866   assert_different_registers(mtype_reg, mh_reg, temp_reg);
1867   // Compare method type against that of the receiver.
1868   load_heap_oop_not_null(temp_reg, delayed_value(java_lang_invoke_MethodHandle::type_offset_in_bytes, temp_reg), mh_reg);
1869   cmpd(CCR0, temp_reg, mtype_reg);
1870   bne(CCR0, wrong_method_type);
1871 }
1872 
1873 RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot,
1874                                                    Register temp_reg,
1875                                                    int extra_slot_offset) {
1876   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
1877   int stackElementSize = Interpreter::stackElementSize;
1878   int offset = extra_slot_offset * stackElementSize;
1879   if (arg_slot.is_constant()) {
1880     offset += arg_slot.as_constant() * stackElementSize;
1881     return offset;
1882   } else {
1883     assert(temp_reg != noreg, "must specify");
1884     sldi(temp_reg, arg_slot.as_register(), exact_log2(stackElementSize));
1885     if (offset != 0)
1886       addi(temp_reg, temp_reg, offset);
1887     return temp_reg;
1888   }
1889 }
1890 
1891 // Supports temp2_reg = R0.
1892 void MacroAssembler::biased_locking_enter(ConditionRegister cr_reg, Register obj_reg,
1893                                           Register mark_reg, Register temp_reg,
1894                                           Register temp2_reg, Label& done, Label* slow_case) {
1895   assert(UseBiasedLocking, "why call this otherwise?");
1896 
1897 #ifdef ASSERT
1898   assert_different_registers(obj_reg, mark_reg, temp_reg, temp2_reg);
1899 #endif
1900 
1901   Label cas_label;
1902 
1903   // Branch to done if fast path fails and no slow_case provided.
1904   Label *slow_case_int = (slow_case != NULL) ? slow_case : &done;
1905 
1906   // Biased locking
1907   // See whether the lock is currently biased toward our thread and
1908   // whether the epoch is still valid
1909   // Note that the runtime guarantees sufficient alignment of JavaThread
1910   // pointers to allow age to be placed into low bits
1911   assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits,
1912          "biased locking makes assumptions about bit layout");
1913 
1914   if (PrintBiasedLockingStatistics) {
1915     load_const(temp2_reg, (address) BiasedLocking::total_entry_count_addr(), temp_reg);
1916     lwzx(temp_reg, temp2_reg);
1917     addi(temp_reg, temp_reg, 1);
1918     stwx(temp_reg, temp2_reg);
1919   }
1920 
1921   andi(temp_reg, mark_reg, markOopDesc::biased_lock_mask_in_place);
1922   cmpwi(cr_reg, temp_reg, markOopDesc::biased_lock_pattern);
1923   bne(cr_reg, cas_label);
1924 
1925   load_klass(temp_reg, obj_reg);
1926 
1927   load_const_optimized(temp2_reg, ~((int) markOopDesc::age_mask_in_place));
1928   ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
1929   orr(temp_reg, R16_thread, temp_reg);
1930   xorr(temp_reg, mark_reg, temp_reg);
1931   andr(temp_reg, temp_reg, temp2_reg);
1932   cmpdi(cr_reg, temp_reg, 0);
1933   if (PrintBiasedLockingStatistics) {
1934     Label l;
1935     bne(cr_reg, l);
1936     load_const(temp2_reg, (address) BiasedLocking::biased_lock_entry_count_addr());
1937     lwzx(mark_reg, temp2_reg);
1938     addi(mark_reg, mark_reg, 1);
1939     stwx(mark_reg, temp2_reg);
1940     // restore mark_reg
1941     ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg);
1942     bind(l);
1943   }
1944   beq(cr_reg, done);
1945 
1946   Label try_revoke_bias;
1947   Label try_rebias;
1948 
1949   // At this point we know that the header has the bias pattern and
1950   // that we are not the bias owner in the current epoch. We need to
1951   // figure out more details about the state of the header in order to
1952   // know what operations can be legally performed on the object's
1953   // header.
1954 
1955   // If the low three bits in the xor result aren't clear, that means
1956   // the prototype header is no longer biased and we have to revoke
1957   // the bias on this object.
1958   andi(temp2_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
1959   cmpwi(cr_reg, temp2_reg, 0);
1960   bne(cr_reg, try_revoke_bias);
1961 
1962   // Biasing is still enabled for this data type. See whether the
1963   // epoch of the current bias is still valid, meaning that the epoch
1964   // bits of the mark word are equal to the epoch bits of the
1965   // prototype header. (Note that the prototype header's epoch bits
1966   // only change at a safepoint.) If not, attempt to rebias the object
1967   // toward the current thread. Note that we must be absolutely sure
1968   // that the current epoch is invalid in order to do this because
1969   // otherwise the manipulations it performs on the mark word are
1970   // illegal.
1971 
1972   int shift_amount = 64 - markOopDesc::epoch_shift;
1973   // rotate epoch bits to right (little) end and set other bits to 0
1974   // [ big part | epoch | little part ] -> [ 0..0 | epoch ]
1975   rldicl_(temp2_reg, temp_reg, shift_amount, 64 - markOopDesc::epoch_bits);
1976   // branch if epoch bits are != 0, i.e. they differ, because the epoch has been incremented
1977   bne(CCR0, try_rebias);
1978 
1979   // The epoch of the current bias is still valid but we know nothing
1980   // about the owner; it might be set or it might be clear. Try to
1981   // acquire the bias of the object using an atomic operation. If this
1982   // fails we will go in to the runtime to revoke the object's bias.
1983   // Note that we first construct the presumed unbiased header so we
1984   // don't accidentally blow away another thread's valid bias.
1985   andi(mark_reg, mark_reg, (markOopDesc::biased_lock_mask_in_place |
1986                                 markOopDesc::age_mask_in_place |
1987                                 markOopDesc::epoch_mask_in_place));
1988   orr(temp_reg, R16_thread, mark_reg);
1989 
1990   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
1991 
1992   // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg).
1993   cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
1994            /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
1995            /*where=*/obj_reg,
1996            MacroAssembler::MemBarAcq,
1997            MacroAssembler::cmpxchgx_hint_acquire_lock(),
1998            noreg, slow_case_int); // bail out if failed
1999 
2000   // If the biasing toward our thread failed, this means that
2001   // another thread succeeded in biasing it toward itself and we
2002   // need to revoke that bias. The revocation will occur in the
2003   // interpreter runtime in the slow case.
2004   if (PrintBiasedLockingStatistics) {
2005     load_const(temp2_reg, (address) BiasedLocking::anonymously_biased_lock_entry_count_addr(), temp_reg);
2006     lwzx(temp_reg, temp2_reg);
2007     addi(temp_reg, temp_reg, 1);
2008     stwx(temp_reg, temp2_reg);
2009   }
2010   b(done);
2011 
2012   bind(try_rebias);
2013   // At this point we know the epoch has expired, meaning that the
2014   // current "bias owner", if any, is actually invalid. Under these
2015   // circumstances _only_, we are allowed to use the current header's
2016   // value as the comparison value when doing the cas to acquire the
2017   // bias in the current epoch. In other words, we allow transfer of
2018   // the bias from one thread to another directly in this situation.
2019   load_klass(temp_reg, obj_reg);
2020   andi(temp2_reg, mark_reg, markOopDesc::age_mask_in_place);
2021   orr(temp2_reg, R16_thread, temp2_reg);
2022   ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
2023   orr(temp_reg, temp2_reg, temp_reg);
2024 
2025   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2026 
2027   cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
2028                  /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
2029                  /*where=*/obj_reg,
2030                  MacroAssembler::MemBarAcq,
2031                  MacroAssembler::cmpxchgx_hint_acquire_lock(),
2032                  noreg, slow_case_int); // bail out if failed
2033 
2034   // If the biasing toward our thread failed, this means that
2035   // another thread succeeded in biasing it toward itself and we
2036   // need to revoke that bias. The revocation will occur in the
2037   // interpreter runtime in the slow case.
2038   if (PrintBiasedLockingStatistics) {
2039     load_const(temp2_reg, (address) BiasedLocking::rebiased_lock_entry_count_addr(), temp_reg);
2040     lwzx(temp_reg, temp2_reg);
2041     addi(temp_reg, temp_reg, 1);
2042     stwx(temp_reg, temp2_reg);
2043   }
2044   b(done);
2045 
2046   bind(try_revoke_bias);
2047   // The prototype mark in the klass doesn't have the bias bit set any
2048   // more, indicating that objects of this data type are not supposed
2049   // to be biased any more. We are going to try to reset the mark of
2050   // this object to the prototype value and fall through to the
2051   // CAS-based locking scheme. Note that if our CAS fails, it means
2052   // that another thread raced us for the privilege of revoking the
2053   // bias of this particular object, so it's okay to continue in the
2054   // normal locking code.
2055   load_klass(temp_reg, obj_reg);
2056   ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
2057   andi(temp2_reg, mark_reg, markOopDesc::age_mask_in_place);
2058   orr(temp_reg, temp_reg, temp2_reg);
2059 
2060   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2061 
2062   // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg).
2063   cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
2064                  /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
2065                  /*where=*/obj_reg,
2066                  MacroAssembler::MemBarAcq,
2067                  MacroAssembler::cmpxchgx_hint_acquire_lock());
2068 
2069   // reload markOop in mark_reg before continuing with lightweight locking
2070   ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg);
2071 
2072   // Fall through to the normal CAS-based lock, because no matter what
2073   // the result of the above CAS, some thread must have succeeded in
2074   // removing the bias bit from the object's header.
2075   if (PrintBiasedLockingStatistics) {
2076     Label l;
2077     bne(cr_reg, l);
2078     load_const(temp2_reg, (address) BiasedLocking::revoked_lock_entry_count_addr(), temp_reg);
2079     lwzx(temp_reg, temp2_reg);
2080     addi(temp_reg, temp_reg, 1);
2081     stwx(temp_reg, temp2_reg);
2082     bind(l);
2083   }
2084 
2085   bind(cas_label);
2086 }
2087 
2088 void MacroAssembler::biased_locking_exit(ConditionRegister cr_reg, Register mark_addr, Register temp_reg, Label& done) {
2089   // Check for biased locking unlock case, which is a no-op
2090   // Note: we do not have to check the thread ID for two reasons.
2091   // First, the interpreter checks for IllegalMonitorStateException at
2092   // a higher level. Second, if the bias was revoked while we held the
2093   // lock, the object could not be rebiased toward another thread, so
2094   // the bias bit would be clear.
2095 
2096   ld(temp_reg, 0, mark_addr);
2097   andi(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
2098 
2099   cmpwi(cr_reg, temp_reg, markOopDesc::biased_lock_pattern);
2100   beq(cr_reg, done);
2101 }
2102 
2103 // allocation (for C1)
2104 void MacroAssembler::eden_allocate(
2105   Register obj,                      // result: pointer to object after successful allocation
2106   Register var_size_in_bytes,        // object size in bytes if unknown at compile time; invalid otherwise
2107   int      con_size_in_bytes,        // object size in bytes if   known at compile time
2108   Register t1,                       // temp register
2109   Register t2,                       // temp register
2110   Label&   slow_case                 // continuation point if fast allocation fails
2111 ) {
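       // No inline eden allocation fast path on PPC64; always take the slow case.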
2112   b(slow_case);
2113 }
2114 
2115 void MacroAssembler::tlab_allocate(
2116   Register obj,                      // result: pointer to object after successful allocation
2117   Register var_size_in_bytes,        // object size in bytes if unknown at compile time; invalid otherwise
2118   int      con_size_in_bytes,        // object size in bytes if   known at compile time
2119   Register t1,                       // temp register
2120   Label&   slow_case                 // continuation point if fast allocation fails
2121 ) {
2122   // make sure arguments make sense
2123   assert_different_registers(obj, var_size_in_bytes, t1);
2124   assert(0 <= con_size_in_bytes && is_simm13(con_size_in_bytes), "illegal object size");
2125   assert((con_size_in_bytes & MinObjAlignmentInBytesMask) == 0, "object size is not multiple of alignment");
2126 
2127   const Register new_top = t1;
2128   //verify_tlab(); not implemented
2129 
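       // Load the current TLAB top (the prospective object address) and the TLAB end (the limit).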
2130   ld(obj, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
2131   ld(R0, in_bytes(JavaThread::tlab_end_offset()), R16_thread);
2132   if (var_size_in_bytes == noreg) {
2133     addi(new_top, obj, con_size_in_bytes);
2134   } else {
2135     add(new_top, obj, var_size_in_bytes);
2136   }
2137   cmpld(CCR0, new_top, R0);
2138   bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_case);
2139 
2140 #ifdef ASSERT
2141   // make sure new free pointer is properly aligned
2142   {
2143     Label L;
2144     andi_(R0, new_top, MinObjAlignmentInBytesMask);
2145     beq(CCR0, L);
2146     stop("updated TLAB free is not properly aligned", 0x934);
2147     bind(L);
2148   }
2149 #endif // ASSERT
2150 
2151   // update the tlab top pointer
2152   std(new_top, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
2153   //verify_tlab(); not implemented
2154 }
2155 void MacroAssembler::tlab_refill(Label& retry_tlab, Label& try_eden, Label& slow_case) {
2156   unimplemented("tlab_refill");
2157 }
2158 void MacroAssembler::incr_allocated_bytes(RegisterOrConstant size_in_bytes, Register t1, Register t2) {
2159   unimplemented("incr_allocated_bytes");
2160 }
2161 
2162 address MacroAssembler::emit_trampoline_stub(int destination_toc_offset,
2163                                              int insts_call_instruction_offset, Register Rtoc) {
2164   // Start the stub.
2165   address stub = start_a_stub(64);
2166   if (stub == NULL) { return NULL; } // CodeCache full: bail out
2167 
2168   // Create a trampoline stub relocation which relates this trampoline stub
2169   // with the call instruction at insts_call_instruction_offset in the
2170   // instructions code-section.
2171   relocate(trampoline_stub_Relocation::spec(code()->insts()->start() + insts_call_instruction_offset));
2172   const int stub_start_offset = offset();
2173 
2174   // For java_to_interp stubs we use R11_scratch1 as scratch register
2175   // and in call trampoline stubs we use R12_scratch2. This way we
2176   // can distinguish them (see is_NativeCallTrampolineStub_at()).
2177   Register reg_scratch = R12_scratch2;
2178 
2179   // Now, create the trampoline stub's code:
2180   // - load the TOC
2181   // - load the call target from the constant pool
2182   // - call
2183   if (Rtoc == noreg) {
2184     calculate_address_from_global_toc(reg_scratch, method_toc());
2185     Rtoc = reg_scratch;
2186   }
2187 
2188   ld_largeoffset_unchecked(reg_scratch, destination_toc_offset, Rtoc, false);
2189   mtctr(reg_scratch);
2190   bctr();
2191 
2192   const address stub_start_addr = addr_at(stub_start_offset);
2193 
2194   // Assert that the encoded destination_toc_offset can be identified and that it is correct.
2195   assert(destination_toc_offset == NativeCallTrampolineStub_at(stub_start_addr)->destination_toc_offset(),
2196          "encoded offset into the constant pool must match");
2197   // Trampoline_stub_size should be good.
2198   assert((uint)(offset() - stub_start_offset) <= trampoline_stub_size, "should be good size");
2199   assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");
2200 
2201   // End the stub.
2202   end_a_stub();
2203   return stub;
2204 }
2205 
2206 // TM on PPC64.
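     // Atomically add simm16 to the 64-bit value at 'addr' via a ldarx/stdcx. retry
     // loop. 'result' returns the new value; CCR0 is clobbered.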
2207 void MacroAssembler::atomic_inc_ptr(Register addr, Register result, int simm16) {
2208   Label retry;
2209   bind(retry);
2210   ldarx(result, addr, /*hint*/ false);
2211   addi(result, result, simm16);
2212   stdcx_(result, addr);
2213   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
2214     bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
2215   } else {
2216     bne(                  CCR0, retry); // stXcx_ sets CCR0
2217   }
2218 }
2219 
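     // Atomically OR uimm16 into the 32-bit value at 'addr' via a lwarx/stwcx. retry
     // loop. 'result' returns the new value; CCR0 is clobbered.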
2220 void MacroAssembler::atomic_ori_int(Register addr, Register result, int uimm16) {
2221   Label retry;
2222   bind(retry);
2223   lwarx(result, addr, /*hint*/ false);
2224   ori(result, result, uimm16);
2225   stwcx_(result, addr);
2226   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
2227     bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
2228   } else {
2229     bne(                  CCR0, retry); // stXcx_ sets CCR0
2230   }
2231 }
2232 
2233 #if INCLUDE_RTM_OPT
2234 
2235 // Update rtm_counters based on abort status
2236 // input: abort_status
2237 //        rtm_counters (RTMLockingCounters*)
2238 void MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters_Reg) {
2239   // Mapping to keep PreciseRTMLockingStatistics similar to x86.
2240   // x86 ppc (! means inverted, ? means not the same)
2241   //  0   31  Set if abort caused by XABORT instruction.
2242   //  1  ! 7  If set, the transaction may succeed on a retry. This bit is always clear if bit 0 is set.
2243   //  2   13  Set if another logical processor conflicted with a memory address that was part of the transaction that aborted.
2244   //  3   10  Set if an internal buffer overflowed.
2245   //  4  ?12  Set if a debug breakpoint was hit.
2246   //  5  ?32  Set if an abort occurred during execution of a nested transaction.
2247   const  int tm_failure_bit[] = {Assembler::tm_tabort, // Note: Seems like signal handler sets this, too.
2248                                  Assembler::tm_failure_persistent, // inverted: transient
2249                                  Assembler::tm_trans_cf,
2250                                  Assembler::tm_footprint_of,
2251                                  Assembler::tm_non_trans_cf,
2252                                  Assembler::tm_suspended};
2253   const bool tm_failure_inv[] = {false, true, false, false, false, false};
2254   assert(sizeof(tm_failure_bit)/sizeof(int) == RTMLockingCounters::ABORT_STATUS_LIMIT, "adapt mapping!");
2255 
2256   const Register addr_Reg = R0;
2257   // Keep track of offset to where rtm_counters_Reg had pointed to.
2258   int counters_offs = RTMLockingCounters::abort_count_offset();
2259   addi(addr_Reg, rtm_counters_Reg, counters_offs);
2260   const Register temp_Reg = rtm_counters_Reg;
2261 
2262   //atomic_inc_ptr(addr_Reg, temp_Reg); We don't increment atomically
2263   ldx(temp_Reg, addr_Reg);
2264   addi(temp_Reg, temp_Reg, 1);
2265   stdx(temp_Reg, addr_Reg);
2266 
2267   if (PrintPreciseRTMLockingStatistics) {
2268     int counters_offs_delta = RTMLockingCounters::abortX_count_offset() - counters_offs;
2269 
2270     //mftexasr(abort_status); done by caller
2271     for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
2272       counters_offs += counters_offs_delta;
2273       li(temp_Reg, counters_offs_delta); // can't use addi with R0
2274       add(addr_Reg, addr_Reg, temp_Reg); // point to next counter
2275       counters_offs_delta = sizeof(uintx);
2276 
2277       Label check_abort;
2278       rldicr_(temp_Reg, abort_status, tm_failure_bit[i], 0);
2279       if (tm_failure_inv[i]) {
2280         bne(CCR0, check_abort);
2281       } else {
2282         beq(CCR0, check_abort);
2283       }
2284       //atomic_inc_ptr(addr_Reg, temp_Reg); We don't increment atomically
2285       ldx(temp_Reg, addr_Reg);
2286       addi(temp_Reg, temp_Reg, 1);
2287       stdx(temp_Reg, addr_Reg);
2288       bind(check_abort);
2289     }
2290   }
2291   li(temp_Reg, -counters_offs); // can't use addi with R0
2292   add(rtm_counters_Reg, addr_Reg, temp_Reg); // restore
2293 }
2294 
2295 // Branch if (random & (count-1) != 0), count is 2^n
2296 // tmp and CR0 are killed
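     // The time base register (read via mftb) serves as a cheap pseudo-random source.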
2297 void MacroAssembler::branch_on_random_using_tb(Register tmp, int count, Label& brLabel) {
2298   mftb(tmp);
2299   andi_(tmp, tmp, count-1);
2300   bne(CCR0, brLabel);
2301 }
2302 
2303 // Perform abort ratio calculation, set no_rtm bit if high ratio.
2304 // input:  rtm_counters_Reg (RTMLockingCounters* address) - KILLED
2305 void MacroAssembler::rtm_abort_ratio_calculation(Register rtm_counters_Reg,
2306                                                  RTMLockingCounters* rtm_counters,
2307                                                  Metadata* method_data) {
2308   Label L_done, L_check_always_rtm1, L_check_always_rtm2;
2309 
2310   if (RTMLockingCalculationDelay > 0) {
2311     // Delay calculation.
2312     ld(rtm_counters_Reg, (RegisterOrConstant)(intptr_t)RTMLockingCounters::rtm_calculation_flag_addr());
2313     cmpdi(CCR0, rtm_counters_Reg, 0);
2314     beq(CCR0, L_done);
2315     load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload
2316   }
2317   // Abort ratio calculation only if abort_count > RTMAbortThreshold.
2318   //   Aborted transactions = abort_count * 100
2319   //   All transactions = total_count *  RTMTotalCountIncrRate
2320   //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
2321   ld(R0, RTMLockingCounters::abort_count_offset(), rtm_counters_Reg);
2322   cmpdi(CCR0, R0, RTMAbortThreshold);
2323   blt(CCR0, L_check_always_rtm2);
2324   mulli(R0, R0, 100);
2325 
2326   const Register tmpReg = rtm_counters_Reg;
2327   ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg);
2328   mulli(tmpReg, tmpReg, RTMTotalCountIncrRate);
2329   mulli(tmpReg, tmpReg, RTMAbortRatio);
2330   cmpd(CCR0, R0, tmpReg);
2331   blt(CCR0, L_check_always_rtm1); // jump to reload
2332   if (method_data != NULL) {
2333     // Set rtm_state to "no rtm" in MDO.
2334     // Not using a metadata relocation. Method and Class Loader are kept alive anyway.
2335     // (See nmethod::metadata_do and CodeBuffer::finalize_oop_references.)
2336     load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg);
2337     atomic_ori_int(R0, tmpReg, NoRTM);
2338   }
2339   b(L_done);
2340 
2341   bind(L_check_always_rtm1);
2342   load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload
2343   bind(L_check_always_rtm2);
2344   ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg);
2345   cmpdi(CCR0, tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
2346   blt(CCR0, L_done);
2347   if (method_data != NULL) {
2348     // Set rtm_state to "always rtm" in MDO.
2349     // Not using a metadata relocation. See above.
2350     load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg);
2351     atomic_ori_int(R0, tmpReg, UseRTM);
2352   }
2353   bind(L_done);
2354 }
2355 
2356 // Update counters and perform abort ratio calculation.
2357 // input: abort_status_Reg
2358 void MacroAssembler::rtm_profiling(Register abort_status_Reg, Register temp_Reg,
2359                                    RTMLockingCounters* rtm_counters,
2360                                    Metadata* method_data,
2361                                    bool profile_rtm) {
2362 
2363   assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2364   // Update rtm counters based on state at abort.
2365   // Reads abort_status_Reg, updates flags.
2366   assert_different_registers(abort_status_Reg, temp_Reg);
2367   load_const_optimized(temp_Reg, (address)rtm_counters, R0);
2368   rtm_counters_update(abort_status_Reg, temp_Reg);
2369   if (profile_rtm) {
2370     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2371     rtm_abort_ratio_calculation(temp_Reg, rtm_counters, method_data);
2372   }
2373 }
2374 
2375 // Retry on abort if abort's status indicates non-persistent failure.
2376 // inputs: retry_count_Reg
2377 //       : abort_status_Reg
2378 // output: retry_count_Reg decremented by 1
2379 void MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg,
2380                                              Label& retryLabel, Label* checkRetry) {
2381   Label doneRetry;
2382   rldicr_(R0, abort_status_Reg, tm_failure_persistent, 0);
2383   bne(CCR0, doneRetry);
2384   if (checkRetry) { bind(*checkRetry); }
2385   addic_(retry_count_Reg, retry_count_Reg, -1);
2386   blt(CCR0, doneRetry);
2387   smt_yield(); // Can't use wait(). No permission (SIGILL).
2388   b(retryLabel);
2389   bind(doneRetry);
2390 }
2391 
2392 // Spin and retry if lock is busy.
2393 // inputs: box_Reg (monitor address)
2394 //       : retry_count_Reg
2395 // output: retry_count_Reg decremented by 1
2396 // CTR is killed
2397 void MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register owner_addr_Reg, Label& retryLabel) {
2398   Label SpinLoop, doneRetry;
2399   addic_(retry_count_Reg, retry_count_Reg, -1);
2400   blt(CCR0, doneRetry);
2401   li(R0, RTMSpinLoopCount);
2402   mtctr(R0);
2403 
2404   bind(SpinLoop);
2405   smt_yield(); // Can't use waitrsv(). No permission (SIGILL).
2406   bdz(retryLabel);
2407   ld(R0, 0, owner_addr_Reg);
2408   cmpdi(CCR0, R0, 0);
2409   bne(CCR0, SpinLoop);
2410   b(retryLabel);
2411 
2412   bind(doneRetry);
2413 }
2414 
2415 // Use RTM for normal stack locks.
2416 // Input: objReg (object to lock)
2417 void MacroAssembler::rtm_stack_locking(ConditionRegister flag,
2418                                        Register obj, Register mark_word, Register tmp,
2419                                        Register retry_on_abort_count_Reg,
2420                                        RTMLockingCounters* stack_rtm_counters,
2421                                        Metadata* method_data, bool profile_rtm,
2422                                        Label& DONE_LABEL, Label& IsInflated) {
2423   assert(UseRTMForStackLocks, "why call this otherwise?");
2424   assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
2425   Label L_rtm_retry, L_decrement_retry, L_on_abort;
2426 
2427   if (RTMRetryCount > 0) {
2428     load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
2429     bind(L_rtm_retry);
2430   }
2431   andi_(R0, mark_word, markOopDesc::monitor_value);  // inflated vs stack-locked|neutral|biased
2432   bne(CCR0, IsInflated);
2433 
2434   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2435     Label L_noincrement;
2436     if (RTMTotalCountIncrRate > 1) {
2437       branch_on_random_using_tb(tmp, (int)RTMTotalCountIncrRate, L_noincrement);
2438     }
2439     assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
2440     load_const_optimized(tmp, (address)stack_rtm_counters->total_count_addr(), R0);
2441     //atomic_inc_ptr(tmp, /*temp, will be reloaded*/mark_word); We don't increment atomically
2442     ldx(mark_word, tmp);
2443     addi(mark_word, mark_word, 1);
2444     stdx(mark_word, tmp);
2445     bind(L_noincrement);
2446   }
2447   tbegin_();
2448   beq(CCR0, L_on_abort);
2449   ld(mark_word, oopDesc::mark_offset_in_bytes(), obj);         // Reload in transaction, conflicts need to be tracked.
2450   andi(R0, mark_word, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
2451   cmpwi(flag, R0, markOopDesc::unlocked_value);                // bits = 001 unlocked
2452   beq(flag, DONE_LABEL);                                       // all done if unlocked
2453 
2454   if (UseRTMXendForLockBusy) {
2455     tend_();
2456     b(L_decrement_retry);
2457   } else {
2458     tabort_();
2459   }
2460   bind(L_on_abort);
2461   const Register abort_status_Reg = tmp;
2462   mftexasr(abort_status_Reg);
2463   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2464     rtm_profiling(abort_status_Reg, /*temp*/mark_word, stack_rtm_counters, method_data, profile_rtm);
2465   }
2466   ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // reload
2467   if (RTMRetryCount > 0) {
2468     // Retry on lock abort if abort status is not permanent.
2469     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry, &L_decrement_retry);
2470   } else {
2471     bind(L_decrement_retry);
2472   }
2473 }
2474 
2475 // Use RTM for inflating locks
2476 // inputs: obj       (object to lock)
2477 //         mark_word (current header - KILLED)
2478 //         boxReg    (on-stack box address (displaced header location) - KILLED)
2479 void MacroAssembler::rtm_inflated_locking(ConditionRegister flag,
2480                                           Register obj, Register mark_word, Register boxReg,
2481                                           Register retry_on_busy_count_Reg, Register retry_on_abort_count_Reg,
2482                                           RTMLockingCounters* rtm_counters,
2483                                           Metadata* method_data, bool profile_rtm,
2484                                           Label& DONE_LABEL) {
2485   assert(UseRTMLocking, "why call this otherwise?");
2486   Label L_rtm_retry, L_decrement_retry, L_on_abort;
2487   // Clean monitor_value bit to get valid pointer.
2488   int owner_offset = ObjectMonitor::owner_offset_in_bytes() - markOopDesc::monitor_value;
2489 
2490   // Store non-null, using boxReg instead of (intptr_t)markOopDesc::unused_mark().
2491   std(boxReg, BasicLock::displaced_header_offset_in_bytes(), boxReg);
2492   const Register tmpReg = boxReg;
2493   const Register owner_addr_Reg = mark_word;
2494   addi(owner_addr_Reg, mark_word, owner_offset);
2495 
2496   if (RTMRetryCount > 0) {
2497     load_const_optimized(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy.
2498     load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort.
2499     bind(L_rtm_retry);
2500   }
2501   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2502     Label L_noincrement;
2503     if (RTMTotalCountIncrRate > 1) {
2504       branch_on_random_using_tb(R0, (int)RTMTotalCountIncrRate, L_noincrement);
2505     }
2506     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2507     load_const(R0, (address)rtm_counters->total_count_addr(), tmpReg);
2508     //atomic_inc_ptr(R0, tmpReg); We don't increment atomically
2509     ldx(tmpReg, R0);
2510     addi(tmpReg, tmpReg, 1);
2511     stdx(tmpReg, R0);
2512     bind(L_noincrement);
2513   }
2514   tbegin_();
2515   beq(CCR0, L_on_abort);
2516   // We don't reload mark word. Will only be reset at safepoint.
2517   ld(R0, 0, owner_addr_Reg); // Load in transaction, conflicts need to be tracked.
2518   cmpdi(flag, R0, 0);
2519   beq(flag, DONE_LABEL);
2520 
2521   if (UseRTMXendForLockBusy) {
2522     tend_();
2523     b(L_decrement_retry);
2524   } else {
2525     tabort_();
2526   }
2527   bind(L_on_abort);
2528   const Register abort_status_Reg = tmpReg;
2529   mftexasr(abort_status_Reg);
2530   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2531     rtm_profiling(abort_status_Reg, /*temp*/ owner_addr_Reg, rtm_counters, method_data, profile_rtm);
2532     // Restore owner_addr_Reg
2533     ld(mark_word, oopDesc::mark_offset_in_bytes(), obj);
2534 #ifdef ASSERT
2535     andi_(R0, mark_word, markOopDesc::monitor_value);
2536     asm_assert_ne("must be inflated", 0xa754); // Deflating only allowed at safepoint.
2537 #endif
2538     addi(owner_addr_Reg, mark_word, owner_offset);
2539   }
2540   if (RTMRetryCount > 0) {
2541     // Retry on lock abort if abort status is not permanent.
2542     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
2543   }
2544 
2545   // Appears unlocked - try to swing _owner from null to non-null.
2546   cmpxchgd(flag, /*current val*/ R0, (intptr_t)0, /*new val*/ R16_thread, owner_addr_Reg,
2547            MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2548            MacroAssembler::cmpxchgx_hint_acquire_lock(), noreg, &L_decrement_retry, true);
2549 
2550   if (RTMRetryCount > 0) {
2551     // On success we are done; otherwise (via L_decrement_retry) spin and retry below.
2552     b(DONE_LABEL);
2553     bind(L_decrement_retry);
2554     // Spin and retry if lock is busy.
2555     rtm_retry_lock_on_busy(retry_on_busy_count_Reg, owner_addr_Reg, L_rtm_retry);
2556   } else {
2557     bind(L_decrement_retry);
2558   }
2559 }
2560 
2561 #endif //  INCLUDE_RTM_OPT
2562 
2563 // "The box" is the space on the stack where we copy the object mark.
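     // Inputs: oop (object to lock), box (on-stack BasicLock), and three temps.
     // 'flag' must not be CCR0. On exit, flag == EQ indicates the lock was acquired,
     // flag == NE indicates that the slow path must be taken (see label 'cont').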
2564 void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register oop, Register box,
2565                                                Register temp, Register displaced_header, Register current_header,
2566                                                bool try_bias,
2567                                                RTMLockingCounters* rtm_counters,
2568                                                RTMLockingCounters* stack_rtm_counters,
2569                                                Metadata* method_data,
2570                                                bool use_rtm, bool profile_rtm) {
2571   assert_different_registers(oop, box, temp, displaced_header, current_header);
2572   assert(flag != CCR0, "bad condition register");
2573   Label cont;
2574   Label object_has_monitor;
2575   Label cas_failed;
2576 
2577   // Load markOop from object into displaced_header.
2578   ld(displaced_header, oopDesc::mark_offset_in_bytes(), oop);
2579 
2580 
2581   // Always do locking in runtime.
2582   if (EmitSync & 0x01) {
2583     cmpdi(flag, oop, 0); // Oop can't be 0 here => always false.
2584     return;
2585   }
2586 
2587   if (try_bias) {
2588     biased_locking_enter(flag, oop, displaced_header, temp, current_header, cont);
2589   }
2590 
2591 #if INCLUDE_RTM_OPT
2592   if (UseRTMForStackLocks && use_rtm) {
2593     rtm_stack_locking(flag, oop, displaced_header, temp, /*temp*/ current_header,
2594                       stack_rtm_counters, method_data, profile_rtm,
2595                       cont, object_has_monitor);
2596   }
2597 #endif // INCLUDE_RTM_OPT
2598 
2599   // Handle existing monitor.
2600   if ((EmitSync & 0x02) == 0) {
2601     // The object has an existing monitor iff (mark & monitor_value) != 0.
2602     andi_(temp, displaced_header, markOopDesc::monitor_value);
2603     bne(CCR0, object_has_monitor);
2604   }
2605 
2606   // Set displaced_header to be (markOop of object | UNLOCK_VALUE).
2607   ori(displaced_header, displaced_header, markOopDesc::unlocked_value);
2608 
2609   // Load Compare Value application register.
2610 
2611   // Initialize the box. (Must happen before we update the object mark!)
2612   std(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2613 
2614   // Must fence; otherwise preceding store(s) may float below the cmpxchg.
2615   // Compare object markOop with mark and if equal exchange scratch1 with object markOop.
2616   cmpxchgd(/*flag=*/flag,
2617            /*current_value=*/current_header,
2618            /*compare_value=*/displaced_header,
2619            /*exchange_value=*/box,
2620            /*where=*/oop,
2621            MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2622            MacroAssembler::cmpxchgx_hint_acquire_lock(),
2623            noreg,
2624            &cas_failed,
2625            /*check without membar and ldarx first*/true);
2626   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2627 
2628   // If the compare-and-exchange succeeded, then we found an unlocked
2629   // object and we have now locked it.
2630   b(cont);
2631 
2632   bind(cas_failed);
2633   // We did not see an unlocked object so try the fast recursive case.
2634 
2635   // Check if the owner is self by comparing the value in the markOop of object
2636   // (current_header) with the stack pointer.
2637   sub(current_header, current_header, R1_SP);
2638   load_const_optimized(temp, ~(os::vm_page_size()-1) | markOopDesc::lock_mask_in_place);
2639 
2640   and_(R0/*==0?*/, current_header, temp);
2641   // If the result is zero this is a recursive lock by the current frame: store 0 as the
2642   // displaced header in the box; mcrf copies CCR0 to flag so flag signals success.
2643   mcrf(flag, CCR0);
2644   std(R0/*==0, perhaps*/, BasicLock::displaced_header_offset_in_bytes(), box);
2645 
2646   // Handle existing monitor.
2647   if ((EmitSync & 0x02) == 0) {
2648     b(cont);
2649 
2650     bind(object_has_monitor);
2651     // The object's monitor m is unlocked iff m->owner == NULL,
2652     // otherwise m->owner may contain a thread or a stack address.
2653 
2654 #if INCLUDE_RTM_OPT
2655     // Use the same RTM locking code in 32- and 64-bit VM.
2656     if (use_rtm) {
2657       rtm_inflated_locking(flag, oop, displaced_header, box, temp, /*temp*/ current_header,
2658                            rtm_counters, method_data, profile_rtm, cont);
2659     } else {
2660 #endif // INCLUDE_RTM_OPT
2661 
2662     // Try to CAS m->owner from NULL to current thread.
2663     addi(temp, displaced_header, ObjectMonitor::owner_offset_in_bytes()-markOopDesc::monitor_value);
2664     cmpxchgd(/*flag=*/flag,
2665              /*current_value=*/current_header,
2666              /*compare_value=*/(intptr_t)0,
2667              /*exchange_value=*/R16_thread,
2668              /*where=*/temp,
2669              MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2670              MacroAssembler::cmpxchgx_hint_acquire_lock());
2671 
2672     // Store a non-null value into the box.
2673     std(box, BasicLock::displaced_header_offset_in_bytes(), box);
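         // Any non-NULL value works here; it only needs to be distinguishable from 0,
         // which the unlock path interprets as a recursive stack lock.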
2674 
2675 #   ifdef ASSERT
2676     bne(flag, cont);
2677     // We have acquired the monitor, check some invariants.
2678     addi(/*monitor=*/temp, temp, -ObjectMonitor::owner_offset_in_bytes());
2679     // Invariant 1: _recursions should be 0.
2680     //assert(ObjectMonitor::recursions_size_in_bytes() == 8, "unexpected size");
2681     asm_assert_mem8_is_zero(ObjectMonitor::recursions_offset_in_bytes(), temp,
2682                             "monitor->_recursions should be 0", -1);
2683     // Invariant 2: OwnerIsThread shouldn't be 0.
2684     //assert(ObjectMonitor::OwnerIsThread_size_in_bytes() == 4, "unexpected size");
2685     //asm_assert_mem4_isnot_zero(ObjectMonitor::OwnerIsThread_offset_in_bytes(), temp,
2686     //                           "monitor->OwnerIsThread shouldn't be 0", -1);
2687 #   endif
2688 
2689 #if INCLUDE_RTM_OPT
2690     } // use_rtm()
2691 #endif
2692   }
2693 
2694   bind(cont);
2695   // flag == EQ indicates success
2696   // flag == NE indicates failure
2697 }
2698 
2699 void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register oop, Register box,
2700                                                  Register temp, Register displaced_header, Register current_header,
2701                                                  bool try_bias, bool use_rtm) {
2702   assert_different_registers(oop, box, temp, displaced_header, current_header);
2703   assert(flag != CCR0, "bad condition register");
2704   Label cont;
2705   Label object_has_monitor;
2706 
2707   // Always do locking in runtime.
2708   if (EmitSync & 0x01) {
2709     cmpdi(flag, oop, 0); // Oop can't be 0 here => always false.
2710     return;
2711   }
2712 
2713   if (try_bias) {
2714     biased_locking_exit(flag, oop, current_header, cont);
2715   }
2716 
2717 #if INCLUDE_RTM_OPT
2718   if (UseRTMForStackLocks && use_rtm) {
2719     assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
2720     Label L_regular_unlock;
2721     ld(current_header, oopDesc::mark_offset_in_bytes(), oop);         // fetch markword
2722     andi(R0, current_header, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
2723     cmpwi(flag, R0, markOopDesc::unlocked_value);                     // bits = 001 unlocked
2724     bne(flag, L_regular_unlock);                                      // else RegularLock
2725     tend_();                                                          // otherwise end...
2726     b(cont);                                                          // ... and we're done
2727     bind(L_regular_unlock);
2728   }
2729 #endif
2730 
2731   // Find the lock address and load the displaced header from the stack.
2732   ld(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2733 
2734   // If the displaced header is 0, we have a recursive unlock.
2735   cmpdi(flag, displaced_header, 0);
2736   beq(flag, cont);
2737 
2738   // Handle existing monitor.
2739   if ((EmitSync & 0x02) == 0) {
2740     // The object has an existing monitor iff (mark & monitor_value) != 0.
2741     RTM_OPT_ONLY( if (!(UseRTMForStackLocks && use_rtm)) ) // skip load if already done
2742     ld(current_header, oopDesc::mark_offset_in_bytes(), oop);
2743     andi_(R0, current_header, markOopDesc::monitor_value);
2744     bne(CCR0, object_has_monitor);
2745   }
2746 
2747   // Check if it is still a lightweight lock; this is true if we see
2748   // the stack address of the basicLock in the markOop of the object.
2749   // Cmpxchg sets flag to cmpd(current_header, box).
2750   cmpxchgd(/*flag=*/flag,
2751            /*current_value=*/current_header,
2752            /*compare_value=*/box,
2753            /*exchange_value=*/displaced_header,
2754            /*where=*/oop,
2755            MacroAssembler::MemBarRel,
2756            MacroAssembler::cmpxchgx_hint_release_lock(),
2757            noreg,
2758            &cont);
2759 
2760   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2761 
2762   // Handle existing monitor.
2763   if ((EmitSync & 0x02) == 0) {
2764     b(cont);
2765 
2766     bind(object_has_monitor);
2767     addi(current_header, current_header, -markOopDesc::monitor_value); // monitor
2768     ld(temp,             ObjectMonitor::owner_offset_in_bytes(), current_header);
2769 
2770     // It's inflated.
2771 #if INCLUDE_RTM_OPT
2772     if (use_rtm) {
2773       Label L_regular_inflated_unlock;
2774       // An unowned monitor means the lock was elided with RTM: just end the transaction.
2775       cmpdi(flag, temp, 0);
2776       bne(flag, L_regular_inflated_unlock);
2777       tend_();
2778       b(cont);
2779       bind(L_regular_inflated_unlock);
2780     }
2781 #endif
2782 
2783     ld(displaced_header, ObjectMonitor::recursions_offset_in_bytes(), current_header);
2784     xorr(temp, R16_thread, temp);      // Will be 0 if we are the owner.
2785     orr(temp, temp, displaced_header); // Will be 0 if there are 0 recursions.
2786     cmpdi(flag, temp, 0);
2787     bne(flag, cont);
2788 
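         // Check for waiters: if both EntryList and cxq are NULL nobody is queued and
         // we can release the monitor by storing NULL into owner below; otherwise the
         // flag stays NE and the slow path does the unlock.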
2789     ld(temp,             ObjectMonitor::EntryList_offset_in_bytes(), current_header);
2790     ld(displaced_header, ObjectMonitor::cxq_offset_in_bytes(), current_header);
2791     orr(temp, temp, displaced_header); // Will be 0 if both are 0.
2792     cmpdi(flag, temp, 0);
2793     bne(flag, cont);
2794     release();
2795     std(temp, ObjectMonitor::owner_offset_in_bytes(), current_header);
2796   }
2797 
2798   bind(cont);
2799   // flag == EQ indicates success
2800   // flag == NE indicates failure
2801 }
2802 
2803 // Write serialization page so VM thread can do a pseudo remote membar.
2804 // We use the current thread pointer to calculate a thread-specific
2805 // offset to write to within the page. This minimizes bus traffic
2806 // due to cache line collisions.
2807 void MacroAssembler::serialize_memory(Register thread, Register tmp1, Register tmp2) {
2808   srdi(tmp2, thread, os::get_serialize_page_shift_count());
2809 
2810   int mask = os::vm_page_size() - sizeof(int);
2811   if (Assembler::is_simm(mask, 16)) {
2812     andi(tmp2, tmp2, mask);
2813   } else {
2814     lis(tmp1, (int)((signed short) (mask >> 16)));
2815     ori(tmp1, tmp1, mask & 0x0000ffff);
2816     andr(tmp2, tmp2, tmp1);
2817   }
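       // tmp2 = (thread >> shift) & (page_size - sizeof(int)): a thread-specific,
       // int-aligned offset into the serialization page, so different threads tend
       // to hit different cache lines.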
2818 
2819   load_const(tmp1, (long) os::get_memory_serialize_page());
2820   release();
2821   stwx(R0, tmp1, tmp2);
2822 }
2823 
2824 
2825 // GC barrier helper macros
2826 
2827 // Write the card table byte if needed.
2828 void MacroAssembler::card_write_barrier_post(Register Rstore_addr, Register Rnew_val, Register Rtmp) {
2829   CardTableModRefBS* bs =
2830     barrier_set_cast<CardTableModRefBS>(Universe::heap()->barrier_set());
2831   assert(bs->kind() == BarrierSet::CardTableForRS ||
2832          bs->kind() == BarrierSet::CardTableExtension, "wrong barrier");
2833 #ifdef ASSERT
2834   cmpdi(CCR0, Rnew_val, 0);
2835   asm_assert_ne("null oop not allowed", 0x321);
2836 #endif
2837   card_table_write(bs->byte_map_base, Rtmp, Rstore_addr);
2838 }
2839 
2840 // Write the card table byte.
2841 void MacroAssembler::card_table_write(jbyte* byte_map_base, Register Rtmp, Register Robj) {
2842   assert_different_registers(Robj, Rtmp, R0);
2843   load_const_optimized(Rtmp, (address)byte_map_base, R0);
2844   srdi(Robj, Robj, CardTableModRefBS::card_shift);
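       // Robj now holds the card index (object address >> card_shift); the store
       // below marks that card as dirty.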
2845   li(R0, 0); // dirty
2846   if (UseConcMarkSweepGC) membar(Assembler::StoreStore);
2847   stbx(R0, Rtmp, Robj);
2848 }
2849 
2850 #if INCLUDE_ALL_GCS
2851 // General G1 pre-barrier generator.
2852 // Goal: record the previous value if it is not null.
2853 void MacroAssembler::g1_write_barrier_pre(Register Robj, RegisterOrConstant offset, Register Rpre_val,
2854                                           Register Rtmp1, Register Rtmp2, bool needs_frame) {
2855   Label runtime, filtered;
2856 
2857   // Is marking active?
2858   if (in_bytes(SATBMarkQueue::byte_width_of_active()) == 4) {
2859     lwz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_active()), R16_thread);
2860   } else {
2861     guarantee(in_bytes(SATBMarkQueue::byte_width_of_active()) == 1, "Assumption");
2862     lbz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_active()), R16_thread);
2863   }
2864   cmpdi(CCR0, Rtmp1, 0);
2865   beq(CCR0, filtered);
2866 
2867   // Do we need to load the previous value?
2868   if (Robj != noreg) {
2869     // Load the previous value...
2870     if (UseCompressedOops) {
2871       lwz(Rpre_val, offset, Robj);
2872     } else {
2873       ld(Rpre_val, offset, Robj);
2874     }
2875     // Previous value has been loaded into Rpre_val.
2876   }
2877   assert(Rpre_val != noreg, "must have a real register");
2878 
2879   // Is the previous value null?
2880   cmpdi(CCR0, Rpre_val, 0);
2881   beq(CCR0, filtered);
2882 
2883   if (Robj != noreg && UseCompressedOops) {
2884     decode_heap_oop_not_null(Rpre_val);
2885   }
2886 
2887   // OK, it's not filtered, so we'll need to enqueue the previous value.
2888   // We first try to store it into the thread-local SATB buffer and only
2889   // call into the runtime if that buffer is full; the buffer/index checks
2890   // follow below.
2891 
2892   // Can we store original value in the thread's buffer?
2893   // Is index == 0?
2894   // (The index field is typed as size_t.)
2895   const Register Rbuffer = Rtmp1, Rindex = Rtmp2;
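       // The SATB buffer is filled from the end towards index 0: index == 0 means the
       // buffer is full and must be handed to the runtime; otherwise we decrement the
       // index and store the previous value at buffer + index.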
2896 
2897   ld(Rindex, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_index()), R16_thread);
2898   cmpdi(CCR0, Rindex, 0);
2899   beq(CCR0, runtime); // If index == 0, goto runtime.
2900   ld(Rbuffer, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_buf()), R16_thread);
2901 
2902   addi(Rindex, Rindex, -wordSize); // Decrement index.
2903   std(Rindex, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_index()), R16_thread);
2904 
2905   // Record the previous value.
2906   stdx(Rpre_val, Rbuffer, Rindex);
2907   b(filtered);
2908 
2909   bind(runtime);
2910 
2911   // The runtime call needs a frame; save LR/CR and push one if the caller has not.
2912   if (needs_frame) {
2913     save_LR_CR(Rtmp1);
2914     push_frame_reg_args(0, Rtmp2);
2915   }
2916 
2917   if (Rpre_val->is_volatile() && Robj == noreg) mr(R31, Rpre_val); // Save pre_val across C call if it was preloaded.
2918   call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), Rpre_val, R16_thread);
2919   if (Rpre_val->is_volatile() && Robj == noreg) mr(Rpre_val, R31); // restore
2920 
2921   if (needs_frame) {
2922     pop_frame();
2923     restore_LR_CR(Rtmp1);
2924   }
2925 
2926   bind(filtered);
2927 }
2928 
2929 // General G1 post-barrier generator
2930 // Store cross-region card.
2931 void MacroAssembler::g1_write_barrier_post(Register Rstore_addr, Register Rnew_val, Register Rtmp1, Register Rtmp2, Register Rtmp3, Label *filtered_ext) {
2932   Label runtime, filtered_int;
2933   Label& filtered = (filtered_ext != NULL) ? *filtered_ext : filtered_int;
2934   assert_different_registers(Rstore_addr, Rnew_val, Rtmp1, Rtmp2);
2935 
2936   G1SATBCardTableLoggingModRefBS* bs =
2937     barrier_set_cast<G1SATBCardTableLoggingModRefBS>(Universe::heap()->barrier_set());
2938 
2939   // Does store cross heap regions?
2940   if (G1RSBarrierRegionFilter) {
2941     xorr(Rtmp1, Rstore_addr, Rnew_val);
2942     srdi_(Rtmp1, Rtmp1, HeapRegion::LogOfHRGrainBytes);
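         // The shifted xor is non-zero iff store address and new value lie in
         // different heap regions; same-region stores need no remembered set update.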
2943     beq(CCR0, filtered);
2944   }
2945 
2946   // Crosses regions, storing NULL?
2947 #ifdef ASSERT
2948   cmpdi(CCR0, Rnew_val, 0);
2949   asm_assert_ne("null oop not allowed (G1)", 0x322); // Checked by caller on PPC64, so following branch is obsolete:
2950   //beq(CCR0, filtered);
2951 #endif
2952 
2953   // Storing region crossing non-NULL, is card already dirty?
2954   assert(sizeof(*bs->byte_map_base) == sizeof(jbyte), "adjust this code");
2955   const Register Rcard_addr = Rtmp1;
2956   Register Rbase = Rtmp2;
2957   load_const_optimized(Rbase, (address)bs->byte_map_base, /*temp*/ Rtmp3);
2958 
2959   srdi(Rcard_addr, Rstore_addr, CardTableModRefBS::card_shift);
2960 
2961   // Get the address of the card.
2962   lbzx(/*card value*/ Rtmp3, Rbase, Rcard_addr);
2963   cmpwi(CCR0, Rtmp3, (int)G1SATBCardTableModRefBS::g1_young_card_val());
2964   beq(CCR0, filtered);
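       // Cards for young regions are pre-marked with g1_young_card_val and never need
       // to be enqueued, so they are filtered out before the StoreLoad barrier below.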
2965 
2966   membar(Assembler::StoreLoad);
2967   lbzx(/*card value*/ Rtmp3, Rbase, Rcard_addr);  // Reload after membar.
2968   cmpwi(CCR0, Rtmp3 /* card value */, CardTableModRefBS::dirty_card_val());
2969   beq(CCR0, filtered);
2970 
2971   // Storing a region crossing, non-NULL oop, card is clean.
2972   // Dirty card and log.
2973   li(Rtmp3, CardTableModRefBS::dirty_card_val());
2974   //release(); // G1: oops are allowed to get visible after dirty marking.
2975   stbx(Rtmp3, Rbase, Rcard_addr);
2976 
2977   add(Rcard_addr, Rbase, Rcard_addr); // This is the address which needs to get enqueued.
2978   Rbase = noreg; // end of lifetime
2979 
2980   const Register Rqueue_index = Rtmp2,
2981                  Rqueue_buf   = Rtmp3;
2982   ld(Rqueue_index, in_bytes(JavaThread::dirty_card_queue_offset() + DirtyCardQueue::byte_offset_of_index()), R16_thread);
2983   cmpdi(CCR0, Rqueue_index, 0);
2984   beq(CCR0, runtime); // index == 0 then jump to runtime
2985   ld(Rqueue_buf, in_bytes(JavaThread::dirty_card_queue_offset() + DirtyCardQueue::byte_offset_of_buf()), R16_thread);
2986 
2987   addi(Rqueue_index, Rqueue_index, -wordSize); // decrement index
2988   std(Rqueue_index, in_bytes(JavaThread::dirty_card_queue_offset() + DirtyCardQueue::byte_offset_of_index()), R16_thread);
2989 
2990   stdx(Rcard_addr, Rqueue_buf, Rqueue_index); // store card
2991   b(filtered);
2992 
2993   bind(runtime);
2994 
2995   // Save the live input values.
2996   call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), Rcard_addr, R16_thread);
2997 
2998   bind(filtered_int);
2999 }
3000 #endif // INCLUDE_ALL_GCS
3001 
3002 // Values for last_Java_pc, and last_Java_sp must comply to the rules
3003 // in frame_ppc.hpp.
3004 void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) {
3005   // Always set last_Java_pc and flags first because once last_Java_sp
3006   // is visible, has_last_Java_frame is true and users will look at the
3007   // rest of the fields. (Note: flags should always be zero before we
3008   // get here, so they don't need to be set.)
3009 
3010   // Verify that last_Java_pc was zeroed on return to Java
3011   asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread,
3012                           "last_Java_pc not zeroed before leaving Java", 0x200);
3013 
3014   // When returning from a call out of Java mode, the frame anchor's
3015   // last_Java_pc will always have been reset to NULL. It is set here so
3016   // that, if we are doing a call to native code (not the VM), we capture
3017   // the known pc and don't have to rely on the native call having a
3018   // standard frame linkage from which we could find the pc.
3019   if (last_Java_pc != noreg)
3020     std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
3021 
3022   // Set last_Java_sp last.
3023   std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
3024 }
3025 
3026 void MacroAssembler::reset_last_Java_frame(void) {
3027   asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()),
3028                              R16_thread, "SP was not set, still zero", 0x202);
3029 
3030   BLOCK_COMMENT("reset_last_Java_frame {");
3031   li(R0, 0);
3032 
3033   // _last_Java_sp = 0
3034   std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
3035 
3036   // _last_Java_pc = 0
3037   std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
3038   BLOCK_COMMENT("} reset_last_Java_frame");
3039 }
3040 
3041 void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1) {
3042   assert_different_registers(sp, tmp1);
3043 
3044   // sp points to a TOP_IJAVA_FRAME, retrieve frame's PC via
3045   // TOP_IJAVA_FRAME_ABI.
3046   // FIXME: assert that we really have a TOP_IJAVA_FRAME here!
3047   address entry = pc();
3048   load_const_optimized(tmp1, entry);
3049 
3050   set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1);
3051 }
3052 
3053 void MacroAssembler::get_vm_result(Register oop_result) {
3054   // Read:
3055   //   R16_thread
3056   //   R16_thread->in_bytes(JavaThread::vm_result_offset())
3057   //
3058   // Updated:
3059   //   oop_result
3060   //   R16_thread->in_bytes(JavaThread::vm_result_offset())
3061 
3062   verify_thread();
3063 
3064   ld(oop_result, in_bytes(JavaThread::vm_result_offset()), R16_thread);
3065   li(R0, 0);
3066   std(R0, in_bytes(JavaThread::vm_result_offset()), R16_thread);
3067 
3068   verify_oop(oop_result);
3069 }
3070 
3071 void MacroAssembler::get_vm_result_2(Register metadata_result) {
3072   // Read:
3073   //   R16_thread
3074   //   R16_thread->in_bytes(JavaThread::vm_result_2_offset())
3075   //
3076   // Updated:
3077   //   metadata_result
3078   //   R16_thread->in_bytes(JavaThread::vm_result_2_offset())
3079 
3080   ld(metadata_result, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
3081   li(R0, 0);
3082   std(R0, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
3083 }
3084 
3085 Register MacroAssembler::encode_klass_not_null(Register dst, Register src) {
3086   Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided.
3087   if (Universe::narrow_klass_base() != 0) {
3088     // Use dst as temp if it is free.
3089     sub_const_optimized(dst, current, Universe::narrow_klass_base(), R0);
3090     current = dst;
3091   }
3092   if (Universe::narrow_klass_shift() != 0) {
3093     srdi(dst, current, Universe::narrow_klass_shift());
3094     current = dst;
3095   }
3096   return current;
3097 }
3098 
3099 void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) {
3100   if (UseCompressedClassPointers) {
3101     Register compressedKlass = encode_klass_not_null(ck, klass);
3102     stw(compressedKlass, oopDesc::klass_offset_in_bytes(), dst_oop);
3103   } else {
3104     std(klass, oopDesc::klass_offset_in_bytes(), dst_oop);
3105   }
3106 }
3107 
3108 void MacroAssembler::store_klass_gap(Register dst_oop, Register val) {
3109   if (UseCompressedClassPointers) {
3110     if (val == noreg) {
3111       val = R0;
3112       li(val, 0);
3113     }
3114     stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop); // klass gap if compressed
3115   }
3116 }
3117 
3118 int MacroAssembler::instr_size_for_decode_klass_not_null() {
3119   if (!UseCompressedClassPointers) return 0;
3120   int num_instrs = 1;  // shift or move
3121   if (Universe::narrow_klass_base() != 0) num_instrs = 7;  // shift + load const + add
3122   return num_instrs * BytesPerInstWord;
3123 }
3124 
3125 void MacroAssembler::decode_klass_not_null(Register dst, Register src) {
3126   assert(dst != R0, "Dst reg may not be R0, as R0 is used here.");
3127   if (src == noreg) src = dst;
3128   Register shifted_src = src;
3129   if (Universe::narrow_klass_shift() != 0 ||
3130       (Universe::narrow_klass_base() == 0 && src != dst)) {  // Move required.
3131     shifted_src = dst;
3132     sldi(shifted_src, src, Universe::narrow_klass_shift());
3133   }
3134   if (Universe::narrow_klass_base() != 0) {
3135     add_const_optimized(dst, shifted_src, Universe::narrow_klass_base(), R0);
3136   }
3137 }
3138 
3139 void MacroAssembler::load_klass(Register dst, Register src) {
3140   if (UseCompressedClassPointers) {
3141     lwz(dst, oopDesc::klass_offset_in_bytes(), src);
3142     // Attention: no null check here!
3143     decode_klass_not_null(dst, dst);
3144   } else {
3145     ld(dst, oopDesc::klass_offset_in_bytes(), src);
3146   }
3147 }
3148 
3149 void MacroAssembler::load_mirror(Register mirror, Register method) {
3150   const int mirror_offset = in_bytes(Klass::java_mirror_offset());
3151   ld(mirror, in_bytes(Method::const_offset()), method);
3152   ld(mirror, in_bytes(ConstMethod::constants_offset()), mirror);
3153   ld(mirror, ConstantPool::pool_holder_offset_in_bytes(), mirror);
3154   ld(mirror, mirror_offset, mirror);
3155 }
3156 
3157 // Clear Array
3158 // Kills both input registers. tmp == R0 is allowed.
3159 void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp) {
3160   // Procedure for large arrays (uses data cache block zero instruction).
3161     Label startloop, fast, fastloop, small_rest, restloop, done;
3162     const int cl_size         = VM_Version::L1_data_cache_line_size(),
3163               cl_dwords       = cl_size>>3,
3164               cl_dw_addr_bits = exact_log2(cl_dwords),
3165               dcbz_min        = 1;                     // Min count of dcbz executions, needs to be >0.
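         // Strategy: clear double words until base_ptr is cache-line aligned, clear
         // whole cache lines with dcbz, then clear the remaining double words.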
3166 
3167 //2:
3168     cmpdi(CCR1, cnt_dwords, ((dcbz_min+1)<<cl_dw_addr_bits)-1); // Big enough? (ensure >=dcbz_min lines included).
3169     blt(CCR1, small_rest);                                      // Too small.
3170     rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits);           // Extract dword offset within first cache line.
3171     beq(CCR0, fast);                                            // Already 128byte aligned.
3172 
3173     subfic(tmp, tmp, cl_dwords);
3174     mtctr(tmp);                        // Set ctr to hit 128byte boundary (0<ctr<cl_dwords).
3175     subf(cnt_dwords, tmp, cnt_dwords); // rest.
3176     li(tmp, 0);
3177 //10:
3178   bind(startloop);                     // Clear at the beginning to reach 128byte boundary.
3179     std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
3180     addi(base_ptr, base_ptr, 8);
3181     bdnz(startloop);
3182 //13:
3183   bind(fast);                                  // Clear 128byte blocks.
3184     srdi(tmp, cnt_dwords, cl_dw_addr_bits);    // Loop count for 128byte loop (>0).
3185     andi(cnt_dwords, cnt_dwords, cl_dwords-1); // Rest in dwords.
3186     mtctr(tmp);                                // Load counter.
3187 //16:
3188   bind(fastloop);
3189     dcbz(base_ptr);                    // Clear 128byte aligned block.
3190     addi(base_ptr, base_ptr, cl_size);
3191     bdnz(fastloop);
3192     if (InsertEndGroupPPC64) { endgroup(); } else { nop(); }
3193 //20:
3194   bind(small_rest);
3195     cmpdi(CCR0, cnt_dwords, 0);        // size 0?
3196     beq(CCR0, done);                   // rest == 0
3197     li(tmp, 0);
3198     mtctr(cnt_dwords);                 // Load counter.
3199 //24:
3200   bind(restloop);                      // Clear rest.
3201     std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
3202     addi(base_ptr, base_ptr, 8);
3203     bdnz(restloop);
3204 //27:
3205   bind(done);
3206 }
3207 
3208 /////////////////////////////////////////// String intrinsics ////////////////////////////////////////////
3209 
3210 #ifdef COMPILER2
3211 // Intrinsics for CompactStrings
3212 
3213 // Compress char[] to byte[] by compressing 16 bytes at once.
3214 void MacroAssembler::string_compress_16(Register src, Register dst, Register cnt,
3215                                         Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5,
3216                                         Label& Lfailure) {
3217 
3218   const Register tmp0 = R0;
3219   assert_different_registers(src, dst, cnt, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5);
3220   Label Lloop, Lslow;
3221 
3222   // Check if cnt >= 8 (= 16 bytes)
3223   lis(tmp1, 0xFF);                // tmp1 = 0x00FF00FF00FF00FF
3224   srwi_(tmp2, cnt, 3);
3225   beq(CCR0, Lslow);
3226   ori(tmp1, tmp1, 0xFF);
3227   rldimi(tmp1, tmp1, 32, 0);
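       // tmp1 masks the low byte of each big-endian char; andc below keeps only the
       // high bytes, so a non-zero result means some char is not latin1.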
3228   mtctr(tmp2);
3229 
3230   // 2x unrolled loop
3231   bind(Lloop);
3232   ld(tmp2, 0, src);               // _0_1_2_3 (Big Endian)
3233   ld(tmp4, 8, src);               // _4_5_6_7
3234 
3235   orr(tmp0, tmp2, tmp4);
3236   rldicl(tmp3, tmp2, 6*8, 64-24); // _____1_2
3237   rldimi(tmp2, tmp2, 2*8, 2*8);   // _0_2_3_3
3238   rldicl(tmp5, tmp4, 6*8, 64-24); // _____5_6
3239   rldimi(tmp4, tmp4, 2*8, 2*8);   // _4_6_7_7
3240 
3241   andc_(tmp0, tmp0, tmp1);
3242   bne(CCR0, Lfailure);            // Not latin1.
3243   addi(src, src, 16);
3244 
3245   rlwimi(tmp3, tmp2, 0*8, 24, 31);// _____1_3
3246   srdi(tmp2, tmp2, 3*8);          // ____0_2_
3247   rlwimi(tmp5, tmp4, 0*8, 24, 31);// _____5_7
3248   srdi(tmp4, tmp4, 3*8);          // ____4_6_
3249 
3250   orr(tmp2, tmp2, tmp3);          // ____0123
3251   orr(tmp4, tmp4, tmp5);          // ____4567
3252 
3253   stw(tmp2, 0, dst);
3254   stw(tmp4, 4, dst);
3255   addi(dst, dst, 8);
3256   bdnz(Lloop);
3257 
3258   bind(Lslow);                    // Fallback to slow version
3259 }
3260 
3261 // Compress char[] to byte[]. cnt must be positive int.
3262 void MacroAssembler::string_compress(Register src, Register dst, Register cnt, Register tmp, Label& Lfailure) {
3263   Label Lloop;
3264   mtctr(cnt);
3265 
3266   bind(Lloop);
3267   lhz(tmp, 0, src);
3268   cmplwi(CCR0, tmp, 0xff);
3269   bgt(CCR0, Lfailure);            // Not latin1.
3270   addi(src, src, 2);
3271   stb(tmp, 0, dst);
3272   addi(dst, dst, 1);
3273   bdnz(Lloop);
3274 }
3275 
3276 // Inflate byte[] to char[] by inflating 16 bytes at once.
3277 void MacroAssembler::string_inflate_16(Register src, Register dst, Register cnt,
3278                                        Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) {
3279   const Register tmp0 = R0;
3280   assert_different_registers(src, dst, cnt, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5);
3281   Label Lloop, Lslow;
3282 
3283   // Check if cnt >= 8
3284   srwi_(tmp2, cnt, 3);
3285   beq(CCR0, Lslow);
3286   lis(tmp1, 0xFF);                // tmp1 = 0x00FF00FF
3287   ori(tmp1, tmp1, 0xFF);
3288   mtctr(tmp2);
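       // Each iteration inflates 8 bytes into 8 chars (16 bytes) with rotate-and-insert
       // operations; the comments below track the byte positions within the registers.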
3289 
3290   // 2x unrolled loop
3291   bind(Lloop);
3292   lwz(tmp2, 0, src);              // ____0123 (Big Endian)
3293   lwz(tmp4, 4, src);              // ____4567
3294   addi(src, src, 8);
3295 
3296   rldicl(tmp3, tmp2, 7*8, 64-8);  // _______2
3297   rlwimi(tmp2, tmp2, 3*8, 16, 23);// ____0113
3298   rldicl(tmp5, tmp4, 7*8, 64-8);  // _______6
3299   rlwimi(tmp4, tmp4, 3*8, 16, 23);// ____4557
3300 
3301   andc(tmp0, tmp2, tmp1);         // ____0_1_
3302   rlwimi(tmp2, tmp3, 2*8, 0, 23); // _____2_3
3303   andc(tmp3, tmp4, tmp1);         // ____4_5_
3304   rlwimi(tmp4, tmp5, 2*8, 0, 23); // _____6_7
3305 
3306   rldimi(tmp2, tmp0, 3*8, 0*8);   // _0_1_2_3
3307   rldimi(tmp4, tmp3, 3*8, 0*8);   // _4_5_6_7
3308 
3309   std(tmp2, 0, dst);
3310   std(tmp4, 8, dst);
3311   addi(dst, dst, 16);
3312   bdnz(Lloop);
3313 
3314   bind(Lslow);                    // Fallback to slow version
3315 }
3316 
3317 // Inflate byte[] to char[]. cnt must be positive int.
3318 void MacroAssembler::string_inflate(Register src, Register dst, Register cnt, Register tmp) {
3319   Label Lloop;
3320   mtctr(cnt);
3321 
3322   bind(Lloop);
3323   lbz(tmp, 0, src);
3324   addi(src, src, 1);
3325   sth(tmp, 0, dst);
3326   addi(dst, dst, 2);
3327   bdnz(Lloop);
3328 }
3329 
3330 void MacroAssembler::string_compare(Register str1, Register str2,
3331                                     Register cnt1, Register cnt2,
3332                                     Register tmp1, Register result, int ae) {
3333   const Register tmp0 = R0,
3334                  diff = tmp1;
3335 
3336   assert_different_registers(str1, str2, cnt1, cnt2, tmp0, tmp1, result);
3337   Label Ldone, Lslow, Lloop, Lreturn_diff;
3338 
3339   // Note: Making use of the fact that compareTo(a, b) == -compareTo(b, a),
3340   // we interchange str1 and str2 in the UL case and negate the result.
3341   // This way, str1 is always latin1 encoded, except for the UU case.
3342   // In addition, the counts need to be zero-extended to 64 bit (they are non-negative).
3343 
3344   if (ae == StrIntrinsicNode::UU) {
3345     srwi(cnt1, cnt1, 1);
3346   } else {
3347     clrldi(cnt1, cnt1, 32);
3348   }
3349 
3350   if (ae != StrIntrinsicNode::LL) {
3351     srwi(cnt2, cnt2, 1);
3352   } else {
3353     clrldi(cnt2, cnt2, 32);
3354   }
3355 
3356   // See if the lengths are different, and calculate min in cnt1.
3357   // Save diff in case we need it for a tie-breaker.
3358   subf_(diff, cnt2, cnt1); // diff = cnt1 - cnt2
3359   // if (diff > 0) { cnt1 = cnt2; }
3360   if (VM_Version::has_isel()) {
3361     isel(cnt1, CCR0, Assembler::greater, /*invert*/ false, cnt2);
3362   } else {
3363     Label Lskip;
3364     blt(CCR0, Lskip);
3365     mr(cnt1, cnt2);
3366     bind(Lskip);
3367   }
3368 
3369   // Rename registers
3370   Register chr1 = result;
3371   Register chr2 = tmp0;
3372 
3373   // Compare multiple characters in fast loop (only implemented for same encoding).
3374   int stride1 = 8, stride2 = 8;
3375   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3376     int log2_chars_per_iter = (ae == StrIntrinsicNode::LL) ? 3 : 2;
3377     Label Lfastloop, Lskipfast;
3378 
3379     srwi_(tmp0, cnt1, log2_chars_per_iter);
3380     beq(CCR0, Lskipfast);
3381     rldicl(cnt2, cnt1, 0, 64 - log2_chars_per_iter); // Remaining characters.
3382     li(cnt1, 1 << log2_chars_per_iter); // Initialize for failure case: Rescan characters from current iteration.
3383     mtctr(tmp0);
3384 
3385     bind(Lfastloop);
3386     ld(chr1, 0, str1);
3387     ld(chr2, 0, str2);
3388     cmpd(CCR0, chr1, chr2);
3389     bne(CCR0, Lslow);
3390     addi(str1, str1, stride1);
3391     addi(str2, str2, stride2);
3392     bdnz(Lfastloop);
3393     mr(cnt1, cnt2); // Remaining characters.
3394     bind(Lskipfast);
3395   }
3396 
3397   // Loop which searches the first difference character by character.
3398   cmpwi(CCR0, cnt1, 0);
3399   beq(CCR0, Lreturn_diff);
3400   bind(Lslow);
3401   mtctr(cnt1);
3402 
3403   switch (ae) {
3404     case StrIntrinsicNode::LL: stride1 = 1; stride2 = 1; break;
3405     case StrIntrinsicNode::UL: // fallthru (see comment above)
3406     case StrIntrinsicNode::LU: stride1 = 1; stride2 = 2; break;
3407     case StrIntrinsicNode::UU: stride1 = 2; stride2 = 2; break;
3408     default: ShouldNotReachHere(); break;
3409   }
3410 
3411   bind(Lloop);
3412   if (stride1 == 1) { lbz(chr1, 0, str1); } else { lhz(chr1, 0, str1); }
3413   if (stride2 == 1) { lbz(chr2, 0, str2); } else { lhz(chr2, 0, str2); }
3414   subf_(result, chr2, chr1); // result = chr1 - chr2
3415   bne(CCR0, Ldone);
3416   addi(str1, str1, stride1);
3417   addi(str2, str2, stride2);
3418   bdnz(Lloop);
3419 
3420   // If strings are equal up to min length, return the length difference.
3421   bind(Lreturn_diff);
3422   mr(result, diff);
3423 
3424   // Otherwise, return the difference between the first mismatched chars.
3425   bind(Ldone);
3426   if (ae == StrIntrinsicNode::UL) {
3427     neg(result, result); // Negate result (see note above).
3428   }
3429 }
3430 
3431 void MacroAssembler::array_equals(bool is_array_equ, Register ary1, Register ary2,
3432                                   Register limit, Register tmp1, Register result, bool is_byte) {
3433   const Register tmp0 = R0;
3434   assert_different_registers(ary1, ary2, limit, tmp0, tmp1, result);
3435   Label Ldone, Lskiploop, Lloop, Lfastloop, Lskipfast;
3436   bool limit_needs_shift = false;
3437 
3438   if (is_array_equ) {
3439     const int length_offset = arrayOopDesc::length_offset_in_bytes();
3440     const int base_offset   = arrayOopDesc::base_offset_in_bytes(is_byte ? T_BYTE : T_CHAR);
3441 
3442     // Return true if the same array.
3443     cmpd(CCR0, ary1, ary2);
3444     beq(CCR0, Lskiploop);
3445 
3446     // Return false if one of them is NULL.
3447     cmpdi(CCR0, ary1, 0);
3448     cmpdi(CCR1, ary2, 0);
3449     li(result, 0);
3450     cror(CCR0, Assembler::equal, CCR1, Assembler::equal);
3451     beq(CCR0, Ldone);
3452 
3453     // Load the lengths of arrays.
3454     lwz(limit, length_offset, ary1);
3455     lwz(tmp0, length_offset, ary2);
3456 
3457     // Return false if the two arrays are not equal length.
3458     cmpw(CCR0, limit, tmp0);
3459     bne(CCR0, Ldone);
3460 
3461     // Load array addresses.
3462     addi(ary1, ary1, base_offset);
3463     addi(ary2, ary2, base_offset);
3464   } else {
3465     limit_needs_shift = !is_byte;
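         // In the non-array (string equals) case the limit is passed as a byte count,
         // hence the extra shift right by one for char data where limit_needs_shift is used.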
3466     li(result, 0); // Assume not equal.
3467   }
3468 
3469   // Rename registers
3470   Register chr1 = tmp0;
3471   Register chr2 = tmp1;
3472 
3473   // Compare 8 bytes per iteration in fast loop.
3474   const int log2_chars_per_iter = is_byte ? 3 : 2;
3475 
3476   srwi_(tmp0, limit, log2_chars_per_iter + (limit_needs_shift ? 1 : 0));
3477   beq(CCR0, Lskipfast);
3478   mtctr(tmp0);
3479 
3480   bind(Lfastloop);
3481   ld(chr1, 0, ary1);
3482   ld(chr2, 0, ary2);
3483   addi(ary1, ary1, 8);
3484   addi(ary2, ary2, 8);
3485   cmpd(CCR0, chr1, chr2);
3486   bne(CCR0, Ldone);
3487   bdnz(Lfastloop);
3488 
3489   bind(Lskipfast);
3490   rldicl_(limit, limit, limit_needs_shift ? 64 - 1 : 0, 64 - log2_chars_per_iter); // Remaining characters.
3491   beq(CCR0, Lskiploop);
3492   mtctr(limit);
3493 
3494   // Character by character.
3495   bind(Lloop);
3496   if (is_byte) {
3497     lbz(chr1, 0, ary1);
3498     lbz(chr2, 0, ary2);
3499     addi(ary1, ary1, 1);
3500     addi(ary2, ary2, 1);
3501   } else {
3502     lhz(chr1, 0, ary1);
3503     lhz(chr2, 0, ary2);
3504     addi(ary1, ary1, 2);
3505     addi(ary2, ary2, 2);
3506   }
3507   cmpw(CCR0, chr1, chr2);
3508   bne(CCR0, Ldone);
3509   bdnz(Lloop);
3510 
3511   bind(Lskiploop);
3512   li(result, 1); // All characters are equal.
3513   bind(Ldone);
3514 }
3515 
3516 void MacroAssembler::string_indexof(Register result, Register haystack, Register haycnt,
3517                                     Register needle, ciTypeArray* needle_values, Register needlecnt, int needlecntval,
3518                                     Register tmp1, Register tmp2, Register tmp3, Register tmp4, int ae) {
3519 
3520   // Ensure 0<needlecnt<=haycnt in ideal graph as prerequisite!
3521   Label L_TooShort, L_Found, L_NotFound, L_End;
3522   Register last_addr = haycnt, // Kill haycnt at the beginning.
3523   addr      = tmp1,
3524   n_start   = tmp2,
3525   ch1       = tmp3,
3526   ch2       = R0;
3527 
3528   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
3529   const int h_csize = (ae == StrIntrinsicNode::LL) ? 1 : 2;
3530   const int n_csize = (ae == StrIntrinsicNode::UU) ? 2 : 1;
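       // h_csize/n_csize are the element sizes in bytes of haystack and needle. In the
       // UL case the latin1 needle characters are widened when loaded/compared below.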
3531 
3532   // **************************************************************************************************
3533   // Prepare for main loop: optimized for needle count >=2, bail out otherwise.
3534   // **************************************************************************************************
3535 
3536   // Compute last haystack addr to use if no match gets found.
3537   clrldi(haycnt, haycnt, 32);         // Ensure positive int is valid as 64 bit value.
3538   addi(addr, haystack, -h_csize);     // Accesses use pre-increment.
3539   if (needlecntval == 0) { // variable needlecnt
3540    cmpwi(CCR6, needlecnt, 2);
3541    clrldi(needlecnt, needlecnt, 32);  // Ensure positive int is valid as 64 bit value.
3542    blt(CCR6, L_TooShort);             // Variable needlecnt: handle short needle separately.
3543   }
3544 
3545   if (n_csize == 2) { lwz(n_start, 0, needle); } else { lhz(n_start, 0, needle); } // Load first 2 characters of needle.
3546 
3547   if (needlecntval == 0) { // variable needlecnt
3548    subf(ch1, needlecnt, haycnt);      // Last character index to compare is haycnt-needlecnt.
3549    addi(needlecnt, needlecnt, -2);    // Rest of needle.
3550   } else { // constant needlecnt
3551    guarantee(needlecntval != 1, "IndexOf with single-character needle must be handled separately");
3552    assert((needlecntval & 0x7fff) == needlecntval, "wrong immediate");
3553    addi(ch1, haycnt, -needlecntval);  // Last character index to compare is haycnt-needlecnt.
3554    if (needlecntval > 3) { li(needlecnt, needlecntval - 2); } // Rest of needle.
3555   }
3556 
3557   if (h_csize == 2) { slwi(ch1, ch1, 1); } // Scale to number of bytes.
3558 
3559   if (ae == StrIntrinsicNode::UL) {
3560    srwi(tmp4, n_start, 1*8);          // ___0
3561    rlwimi(n_start, tmp4, 2*8, 0, 23); // _0_1
3562   }
3563 
3564   add(last_addr, haystack, ch1);      // Point to last address to compare (haystack+2*(haycnt-needlecnt)).
3565 
3566   // Main Loop (now we have at least 2 characters).
3567   Label L_OuterLoop, L_InnerLoop, L_FinalCheck, L_Comp1, L_Comp2;
3568   bind(L_OuterLoop); // Search for 1st 2 characters.
3569   Register addr_diff = tmp4;
3570    subf(addr_diff, addr, last_addr);  // Difference between already checked address and last address to check.
3571    addi(addr, addr, h_csize);         // This is the new address we want to use for comparing.
3572    srdi_(ch2, addr_diff, h_csize);
3573    beq(CCR0, L_FinalCheck);           // 2 characters left?
3574    mtctr(ch2);                        // num of characters / 2
3575   bind(L_InnerLoop);                  // Main work horse (2x unrolled search loop)
3576    if (h_csize == 2) {                // Load 2 characters of haystack (ignore alignment).
3577     lwz(ch1, 0, addr);
3578     lwz(ch2, 2, addr);
3579    } else {
3580     lhz(ch1, 0, addr);
3581     lhz(ch2, 1, addr);
3582    }
3583    cmpw(CCR0, ch1, n_start);          // Compare 2 characters (1 would be sufficient but try to reduce branches to CompLoop).
3584    cmpw(CCR1, ch2, n_start);
3585    beq(CCR0, L_Comp1);                // Did we find the needle start?
3586    beq(CCR1, L_Comp2);
3587    addi(addr, addr, 2 * h_csize);
3588    bdnz(L_InnerLoop);
3589   bind(L_FinalCheck);
3590    andi_(addr_diff, addr_diff, h_csize); // Remaining characters not covered by InnerLoop: (num of characters) & 1.
3591    beq(CCR0, L_NotFound);
3592    if (h_csize == 2) { lwz(ch1, 0, addr); } else { lhz(ch1, 0, addr); } // One position left at which we have to compare.
3593    cmpw(CCR1, ch1, n_start);
3594    beq(CCR1, L_Comp1);
3595   bind(L_NotFound);
3596    li(result, -1);                    // not found
3597    b(L_End);
3598 
3599    // **************************************************************************************************
3600    // Special Case: unfortunately, the variable needle case can be called with needlecnt<2
3601    // **************************************************************************************************
3602   if (needlecntval == 0) {           // We have to handle these cases separately.
3603   Label L_OneCharLoop;
3604   bind(L_TooShort);
3605    mtctr(haycnt);
3606    if (n_csize == 2) { lhz(n_start, 0, needle); } else { lbz(n_start, 0, needle); } // First character of needle
3607   bind(L_OneCharLoop);
3608    if (h_csize == 2) { lhzu(ch1, 2, addr); } else { lbzu(ch1, 1, addr); }
3609    cmpw(CCR1, ch1, n_start);
3610    beq(CCR1, L_Found);               // Did we find the one character needle?
3611    bdnz(L_OneCharLoop);
3612    li(result, -1);                   // Not found.
3613    b(L_End);
3614   }
3615 
3616   // **************************************************************************************************
3617   // Regular Case Part II: compare rest of needle (first 2 characters have been compared already)
3618   // **************************************************************************************************
3619 
3620   // Compare the rest
3621   bind(L_Comp2);
3622    addi(addr, addr, h_csize);        // First comparison has failed, 2nd one hit.
3623   bind(L_Comp1);                     // Addr points to possible needle start.
3624   if (needlecntval != 2) {           // Const needlecnt==2?
3625    if (needlecntval != 3) {
3626     if (needlecntval == 0) { beq(CCR6, L_Found); } // Variable needlecnt==2?
3627     Register n_ind = tmp4,
3628              h_ind = n_ind;
3629     li(n_ind, 2 * n_csize);          // First 2 characters are already compared, use index 2.
3630     mtctr(needlecnt);                // Decremented by 2, still > 0.
3631    Label L_CompLoop;
3632    bind(L_CompLoop);
3633     if (ae == StrIntrinsicNode::UL) {
3634       h_ind = ch1;
3635       sldi(h_ind, n_ind, 1);
3636     }
3637     if (n_csize == 2) { lhzx(ch2, needle, n_ind); } else { lbzx(ch2, needle, n_ind); }
3638     if (h_csize == 2) { lhzx(ch1, addr, h_ind); } else { lbzx(ch1, addr, h_ind); }
3639     cmpw(CCR1, ch1, ch2);
3640     bne(CCR1, L_OuterLoop);
3641     addi(n_ind, n_ind, n_csize);
3642     bdnz(L_CompLoop);
3643    } else { // No loop required if there's only one needle character left.
3644     if (n_csize == 2) { lhz(ch2, 2 * 2, needle); } else { lbz(ch2, 2 * 1, needle); }
3645     if (h_csize == 2) { lhz(ch1, 2 * 2, addr); } else { lbz(ch1, 2 * 1, addr); }
3646     cmpw(CCR1, ch1, ch2);
3647     bne(CCR1, L_OuterLoop);
3648    }
3649   }
3650   // Return index ...
3651   bind(L_Found);
3652    subf(result, haystack, addr);     // relative to haystack, ...
3653    if (h_csize == 2) { srdi(result, result, 1); } // in characters.
3654   bind(L_End);
3655 } // string_indexof
3656 
3657 void MacroAssembler::string_indexof_char(Register result, Register haystack, Register haycnt,
3658                                          Register needle, jchar needleChar, Register tmp1, Register tmp2, bool is_byte) {
3659   assert_different_registers(haystack, haycnt, needle, tmp1, tmp2);
3660 
3661   Label L_InnerLoop, L_FinalCheck, L_Found1, L_Found2, L_NotFound, L_End;
3662   Register addr = tmp1,
3663            ch1 = tmp2,
3664            ch2 = R0;
3665 
3666   const int h_csize = is_byte ? 1 : 2;
3667 
3668 //4:
3669    srwi_(tmp2, haycnt, 1);   // Shift right by exact_log2(UNROLL_FACTOR).
3670    mr(addr, haystack);
3671    beq(CCR0, L_FinalCheck);
3672    mtctr(tmp2);              // Move to count register.
3673 //8:
3674   bind(L_InnerLoop);         // Main work horse (2x unrolled search loop).
3675    if (!is_byte) {
3676     lhz(ch1, 0, addr);
3677     lhz(ch2, 2, addr);
3678    } else {
3679     lbz(ch1, 0, addr);
3680     lbz(ch2, 1, addr);
3681    }
3682    (needle != R0) ? cmpw(CCR0, ch1, needle) : cmplwi(CCR0, ch1, (unsigned int)needleChar);
3683    (needle != R0) ? cmpw(CCR1, ch2, needle) : cmplwi(CCR1, ch2, (unsigned int)needleChar);
3684    beq(CCR0, L_Found1);      // Did we find the needle?
3685    beq(CCR1, L_Found2);
3686    addi(addr, addr, 2 * h_csize);
3687    bdnz(L_InnerLoop);
3688 //16:
3689   bind(L_FinalCheck);
3690    andi_(R0, haycnt, 1);
3691    beq(CCR0, L_NotFound);
3692    if (!is_byte) { lhz(ch1, 0, addr); } else { lbz(ch1, 0, addr); } // One position left at which we have to compare.
3693    (needle != R0) ? cmpw(CCR1, ch1, needle) : cmplwi(CCR1, ch1, (unsigned int)needleChar);
3694    beq(CCR1, L_Found1);
3695 //21:
3696   bind(L_NotFound);
3697    li(result, -1);           // Not found.
3698    b(L_End);
3699 
3700   bind(L_Found2);
3701    addi(addr, addr, h_csize);
3702 //24:
3703   bind(L_Found1);            // Return index ...
3704    subf(result, haystack, addr); // relative to haystack, ...
3705    if (!is_byte) { srdi(result, result, 1); } // in characters.
3706   bind(L_End);
3707 } // string_indexof_char
3708 
3709 
3710 void MacroAssembler::has_negatives(Register src, Register cnt, Register result,
3711                                    Register tmp1, Register tmp2) {
3712   const Register tmp0 = R0;
3713   assert_different_registers(src, result, cnt, tmp0, tmp1, tmp2);
3714   Label Lfastloop, Lslow, Lloop, Lnoneg, Ldone;
3715 
3716   // Check if cnt >= 8 (= 16 bytes)
3717   lis(tmp1, (int)(short)0x8080);  // tmp1 = 0x8080808080808080
3718   srwi_(tmp2, cnt, 4);
3719   li(result, 1);                  // Assume there's a negative byte.
3720   beq(CCR0, Lslow);
3721   ori(tmp1, tmp1, 0x8080);
3722   rldimi(tmp1, tmp1, 32, 0);
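       // tmp1 selects the sign bit of every byte; the and below is non-zero iff one of
       // the 16 bytes loaded per iteration has its sign bit set (i.e. is negative).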
3723   mtctr(tmp2);
3724 
3725   // 2x unrolled loop
3726   bind(Lfastloop);
3727   ld(tmp2, 0, src);
3728   ld(tmp0, 8, src);
3729 
3730   orr(tmp0, tmp2, tmp0);
3731 
3732   and_(tmp0, tmp0, tmp1);
3733   bne(CCR0, Ldone);               // Found negative byte.
3734   addi(src, src, 16);
3735 
3736   bdnz(Lfastloop);
3737 
3738   bind(Lslow);                    // Fallback to slow version
3739   rldicl_(tmp0, cnt, 0, 64-4);
3740   beq(CCR0, Lnoneg);
3741   mtctr(tmp0);
3742   bind(Lloop);
3743   lbz(tmp0, 0, src);
3744   addi(src, src, 1);
3745   andi_(tmp0, tmp0, 0x80);
3746   bne(CCR0, Ldone);               // Found negative byte.
3747   bdnz(Lloop);
3748   bind(Lnoneg);
3749   li(result, 0);
3750 
3751   bind(Ldone);
3752 }
3753 
3754 
3755 // Intrinsics for non-CompactStrings
3756 
3757 // Search for a single jchar in an jchar[].
3758 //
3759 // Assumes that result differs from all other registers.
3760 //
3761 // 'haystack' is the addresses of a jchar-array.
3762 // 'needle' is either the character to search for or R0.
3763 // 'needleChar' is the character to search for if 'needle' == R0.
3764 // 'haycnt' is the length of the haystack. We assume 'haycnt' >=1.
3765 //
3766 // Preserves haystack, haycnt, needle and kills all other registers.
3767 //
3768 // If needle == R0, we search for the constant needleChar.
3769 void MacroAssembler::string_indexof_1(Register result, Register haystack, Register haycnt,
3770                                       Register needle, jchar needleChar,
3771                                       Register tmp1, Register tmp2) {
3772 
3773   assert_different_registers(result, haystack, haycnt, needle, tmp1, tmp2);
3774 
3775   Label L_InnerLoop, L_FinalCheck, L_Found1, L_Found2, L_Found3, L_NotFound, L_End;
3776   Register addr = tmp1,
3777            ch1 = tmp2,
3778            ch2 = R0;
3779 
3780 //3:
3781    dcbtct(haystack, 0x00);                        // Indicate R/O access to haystack.
3782 
3783    srwi_(tmp2, haycnt, 1);   // Shift right by exact_log2(UNROLL_FACTOR).
3784    mr(addr, haystack);
3785    beq(CCR0, L_FinalCheck);
3786    mtctr(tmp2);              // Move to count register.
3787 //8:
3788   bind(L_InnerLoop);             // Main work horse (2x unrolled search loop).
3789    lhz(ch1, 0, addr);        // Load characters from haystack.
3790    lhz(ch2, 2, addr);
3791    (needle != R0) ? cmpw(CCR0, ch1, needle) : cmplwi(CCR0, ch1, needleChar);
3792    (needle != R0) ? cmpw(CCR1, ch2, needle) : cmplwi(CCR1, ch2, needleChar);
3793    beq(CCR0, L_Found1);   // Did we find the needle?
3794    beq(CCR1, L_Found2);
3795    addi(addr, addr, 4);
3796    bdnz(L_InnerLoop);
3797 //16:
3798   bind(L_FinalCheck);
3799    andi_(R0, haycnt, 1);
3800    beq(CCR0, L_NotFound);
3801    lhz(ch1, 0, addr);        // One position left at which we have to compare.
3802    (needle != R0) ? cmpw(CCR1, ch1, needle) : cmplwi(CCR1, ch1, needleChar);
3803    beq(CCR1, L_Found3);
3804 //21:
3805   bind(L_NotFound);
3806    li(result, -1);           // Not found.
3807    b(L_End);
3808 
3809   bind(L_Found2);
3810    addi(addr, addr, 2);
3811 //24:
3812   bind(L_Found1);
3813   bind(L_Found3);                  // Return index ...
3814    subf(addr, haystack, addr); // relative to haystack,
3815    srdi(result, addr, 1);      // in characters.
3816   bind(L_End);
3817 }
3818 
3819 
3820 // Implementation of IndexOf for jchar arrays.
3821 //
3822 // The length of haystack and needle are not constant, i.e. passed in a register.
3823 //
3824 // Preserves registers haystack, needle.
3825 // Kills registers haycnt, needlecnt.
3826 // Assumes that result differs from all other registers.
3827 // Haystack, needle are the addresses of jchar-arrays.
3828 // Haycnt, needlecnt are the lengths of them, respectively.
3829 //
3830 // Needlecntval must be zero or 15-bit unsigned immediate and > 1.
3831 void MacroAssembler::string_indexof(Register result, Register haystack, Register haycnt,
3832                                     Register needle, ciTypeArray* needle_values, Register needlecnt, int needlecntval,
3833                                     Register tmp1, Register tmp2, Register tmp3, Register tmp4) {
3834 
3835   // Ensure 0<needlecnt<=haycnt in ideal graph as prerequisite!
3836   Label L_TooShort, L_Found, L_NotFound, L_End;
3837   Register last_addr = haycnt, // Kill haycnt at the beginning.
3838            addr      = tmp1,
3839            n_start   = tmp2,
3840            ch1       = tmp3,
3841            ch2       = R0;
3842 
3843   // **************************************************************************************************
3844   // Prepare for main loop: optimized for needle count >=2, bail out otherwise.
3845   // **************************************************************************************************
3846 
3847 //1 (variable) or 3 (const):
3848    dcbtct(needle, 0x00);    // Indicate R/O access to str1.
3849    dcbtct(haystack, 0x00);  // Indicate R/O access to str2.
3850 
3851   // Compute last haystack addr to use if no match gets found.
3852   if (needlecntval == 0) { // variable needlecnt
3853 //3:
3854    subf(ch1, needlecnt, haycnt);      // Last character index to compare is haycnt-needlecnt.
3855    addi(addr, haystack, -2);          // Accesses use pre-increment.
3856    cmpwi(CCR6, needlecnt, 2);
3857    blt(CCR6, L_TooShort);          // Variable needlecnt: handle short needle separately.
3858    slwi(ch1, ch1, 1);                 // Scale to number of bytes.
3859    lwz(n_start, 0, needle);           // Load first 2 characters of needle.
3860    add(last_addr, haystack, ch1);     // Point to last address to compare (haystack+2*(haycnt-needlecnt)).
3861    addi(needlecnt, needlecnt, -2);    // Rest of needle.
3862   } else { // constant needlecnt
3863    guarantee(needlecntval != 1, "IndexOf with single-character needle must be handled separately");
3864    assert((needlecntval & 0x7fff) == needlecntval, "wrong immediate");
3865 //5:
3866    addi(ch1, haycnt, -needlecntval);  // Last character index to compare is haycnt-needlecnt.
3867    lwz(n_start, 0, needle);           // Load first 2 characters of needle.
3868    addi(addr, haystack, -2);          // Accesses use pre-increment.
3869    slwi(ch1, ch1, 1);                 // Scale to number of bytes.
3870    add(last_addr, haystack, ch1);     // Point to last address to compare (haystack+2*(haycnt-needlecnt)).
3871    li(needlecnt, needlecntval-2);     // Rest of needle.
3872   }
3873 
3874   // Main Loop (now we have at least 3 characters).
3875 //11:
3876   Label L_OuterLoop, L_InnerLoop, L_FinalCheck, L_Comp1, L_Comp2, L_Comp3;
3877   bind(L_OuterLoop); // Search for 1st 2 characters.
3878   Register addr_diff = tmp4;
3879    subf(addr_diff, addr, last_addr); // Difference between already checked address and last address to check.
3880    addi(addr, addr, 2);              // This is the new address we want to use for comparing.
3881    srdi_(ch2, addr_diff, 2);
3882    beq(CCR0, L_FinalCheck);       // 2 characters left?
3883    mtctr(ch2);                       // addr_diff/4
3884 //16:
3885   bind(L_InnerLoop);                // Main work horse (2x unrolled search loop)
3886    lwz(ch1, 0, addr);           // Load 2 characters of haystack (ignore alignment).
3887    lwz(ch2, 2, addr);
3888    cmpw(CCR0, ch1, n_start); // Compare 2 characters (1 would be sufficient but try to reduce branches to CompLoop).
3889    cmpw(CCR1, ch2, n_start);
3890    beq(CCR0, L_Comp1);       // Did we find the needle start?
3891    beq(CCR1, L_Comp2);
3892    addi(addr, addr, 4);
3893    bdnz(L_InnerLoop);
3894 //24:
3895   bind(L_FinalCheck);
3896    rldicl_(addr_diff, addr_diff, 64-1, 63); // Remaining characters not covered by InnerLoop: (addr_diff>>1)&1.
3897    beq(CCR0, L_NotFound);
3898    lwz(ch1, 0, addr);                       // One position left at which we have to compare.
3899    cmpw(CCR1, ch1, n_start);
3900    beq(CCR1, L_Comp3);
3901 //29:
3902   bind(L_NotFound);
3903    li(result, -1); // not found
3904    b(L_End);
3905 
3906 
3907    // **************************************************************************************************
3908    // Special Case: unfortunately, the variable needle case can be called with needlecnt<2
3909    // **************************************************************************************************
3910 //31:
3911  if ((needlecntval >> 1) != 1) { // Const needlecnt is 2 or 3? Reduce code size.
3912   int nopcnt = 5;
3913   if (needlecntval != 0) ++nopcnt; // Balance alignment (other case: see below).
3914   if (needlecntval == 0) {         // We have to handle these cases separately.
3915   Label L_OneCharLoop;
3916   bind(L_TooShort);
3917    mtctr(haycnt);
3918    lhz(n_start, 0, needle);    // First character of needle
3919   bind(L_OneCharLoop);
3920    lhzu(ch1, 2, addr);
3921    cmpw(CCR1, ch1, n_start);
3922    beq(CCR1, L_Found);      // Did we find the one character needle?
3923    bdnz(L_OneCharLoop);
3924    li(result, -1);             // Not found.
3925    b(L_End);
3926   } // 8 instructions, so no impact on alignment.
3927   for (int x = 0; x < nopcnt; ++x) nop();
3928  }
3929 
3930   // **************************************************************************************************
3931   // Regular Case Part II: compare rest of needle (first 2 characters have been compared already)
3932   // **************************************************************************************************
3933 
3934   // Compare the rest
3935 //36 if needlecntval==0, else 37:
3936   bind(L_Comp2);
3937    addi(addr, addr, 2); // First comparison has failed, 2nd one hit.
3938   bind(L_Comp1);            // Addr points to possible needle start.
3939   bind(L_Comp3);            // Could have created a copy and use a different return address but saving code size here.
3940   if (needlecntval != 2) {  // Const needlecnt==2?
3941    if (needlecntval != 3) {
3942     if (needlecntval == 0) beq(CCR6, L_Found); // Variable needlecnt==2?
3943     Register ind_reg = tmp4;
3944     li(ind_reg, 2*2);   // First 2 characters are already compared, use index 2.
3945     mtctr(needlecnt);   // Decremented by 2, still > 0.
3946 //40:
3947    Label L_CompLoop;
3948    bind(L_CompLoop);
3949     lhzx(ch2, needle, ind_reg);
3950     lhzx(ch1, addr, ind_reg);
3951     cmpw(CCR1, ch1, ch2);
3952     bne(CCR1, L_OuterLoop);
3953     addi(ind_reg, ind_reg, 2);
3954     bdnz(L_CompLoop);
3955    } else { // No loop required if there's only one needle character left.
3956     lhz(ch2, 2*2, needle);
3957     lhz(ch1, 2*2, addr);
3958     cmpw(CCR1, ch1, ch2);
3959     bne(CCR1, L_OuterLoop);
3960    }
3961   }
3962   // Return index ...
3963 //46:
3964   bind(L_Found);
3965    subf(addr, haystack, addr); // relative to haystack, ...
3966    srdi(result, addr, 1);      // in characters.
3967 //48:
3968   bind(L_End);
3969 }
3970 
3971 // Implementation of Compare for jchar arrays.
3972 //
3973 // Kills the registers str1, str2, cnt1, cnt2.
3974 // Kills cr0, ctr.
3975 // Assumes that result differs from the input registers.
3976 void MacroAssembler::string_compare(Register str1_reg, Register str2_reg, Register cnt1_reg, Register cnt2_reg,
3977                                     Register result_reg, Register tmp_reg) {
3978    assert_different_registers(result_reg, str1_reg, str2_reg, cnt1_reg, cnt2_reg, tmp_reg);
3979 
3980    Label Ldone, Lslow_case, Lslow_loop, Lfast_loop;
3981    Register cnt_diff = R0,
3982             limit_reg = cnt1_reg,
3983             chr1_reg = result_reg,
3984             chr2_reg = cnt2_reg,
3985             addr_diff = str2_reg;
3986 
3987    // 'cnt1_reg'/'cnt2_reg' contain the number of characters in the string's character array
3988    // for the pre-CompactStrings strings implementation and the number of bytes in the string's
3989    // byte array for the CompactStrings strings implementation.
3990    const int HAS_COMPACT_STRING = java_lang_String::has_coder_field() ? 1 : 0; // '1' = byte array, '0' = char array
3991 
3992    // Offset 0 should be 32 byte aligned.
3993 //-6:
3994     srawi(cnt1_reg, cnt1_reg, HAS_COMPACT_STRING);
3995     srawi(cnt2_reg, cnt2_reg, HAS_COMPACT_STRING);
3996 //-4:
3997     dcbtct(str1_reg, 0x00);  // Indicate R/O access to str1.
3998     dcbtct(str2_reg, 0x00);  // Indicate R/O access to str2.
3999 //-2:
4000    // Compute min(cnt1, cnt2) and check if 0 (bail out if we don't need to compare characters).
4001     subf(result_reg, cnt2_reg, cnt1_reg);  // difference between cnt1/2
4002     subf_(addr_diff, str1_reg, str2_reg);  // alias?
4003     beq(CCR0, Ldone);                   // return cnt difference if both strings share the same address
4004     srawi(limit_reg, result_reg, 31);      // generate signmask (cnt1/2 must be non-negative so cnt_diff can't overflow)
4005     mr(cnt_diff, result_reg);
4006     andr(limit_reg, result_reg, limit_reg); // difference or zero (negative): cnt1<cnt2 ? cnt1-cnt2 : 0
4007     add_(limit_reg, cnt2_reg, limit_reg);  // min(cnt1, cnt2)==0?
4008     beq(CCR0, Ldone);                   // return cnt difference if one has 0 length
4009 
4010     lhz(chr1_reg, 0, str1_reg);            // optional: early out if first characters mismatch
4011     lhzx(chr2_reg, str1_reg, addr_diff);   // optional: early out if first characters mismatch
4012     addi(tmp_reg, limit_reg, -1);          // min(cnt1, cnt2)-1
4013     subf_(result_reg, chr2_reg, chr1_reg); // optional: early out if first characters mismatch
4014     bne(CCR0, Ldone);                   // optional: early out if first characters mismatch
4015 
4016    // Set loop counter by scaling down tmp_reg
4017     srawi_(chr2_reg, tmp_reg, exact_log2(4)); // (min(cnt1, cnt2)-1)/4
4018     ble(CCR0, Lslow_case);                 // need >4 characters for fast loop
4019     andi(limit_reg, tmp_reg, 4-1);            // remaining characters
4020 
4021    // Adapt str1_reg str2_reg for the first loop iteration
4022     mtctr(chr2_reg);                 // (min(cnt1, cnt2)-1)/4
4023     addi(limit_reg, limit_reg, 4+1); // compare last 5-8 characters in slow_case if mismatch found in fast_loop
4024 //16:
4025    // Compare the rest of the characters
4026    bind(Lfast_loop);
4027     ld(chr1_reg, 0, str1_reg);
4028     ldx(chr2_reg, str1_reg, addr_diff);
4029     cmpd(CCR0, chr2_reg, chr1_reg);
4030     bne(CCR0, Lslow_case); // return chr1_reg
4031     addi(str1_reg, str1_reg, 4*2);
4032     bdnz(Lfast_loop);
4033     addi(limit_reg, limit_reg, -4); // no mismatch found in fast_loop, only 1-4 characters missing
4034 //23:
4035    bind(Lslow_case);
4036     mtctr(limit_reg);
4037 //24:
4038    bind(Lslow_loop);
4039     lhz(chr1_reg, 0, str1_reg);
4040     lhzx(chr2_reg, str1_reg, addr_diff);
4041     subf_(result_reg, chr2_reg, chr1_reg);
4042     bne(CCR0, Ldone); // return chr1_reg
4043     addi(str1_reg, str1_reg, 1*2);
4044     bdnz(Lslow_loop);
4045 //30:
4046    // If strings are equal up to min length, return the length difference.
4047     mr(result_reg, cnt_diff);
4048     nop(); // alignment
4049 //32:
4050    // Otherwise, return the difference between the first mismatched chars.
4051    bind(Ldone);
4052 }
4053 
4054 
4055 // Compare char[] arrays.
4056 //
4057 // str1_reg   USE only
4058 // str2_reg   USE only
4059 // cnt_reg    USE_DEF, due to tmp reg shortage
4060 // result_reg DEF only, might compromise USE only registers
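//
// In rough C terms the emitted code computes (illustrative sketch only):
//   result = 0;                        // assume false
//   for (int i = 0; i < cnt; i++) {
//     if (str1[i] != str2[i]) goto done;
//   }
//   result = 1;
//  done: ;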
4061 void MacroAssembler::char_arrays_equals(Register str1_reg, Register str2_reg, Register cnt_reg, Register result_reg,
4062                                         Register tmp1_reg, Register tmp2_reg, Register tmp3_reg, Register tmp4_reg,
4063                                         Register tmp5_reg) {
4064 
4065   // Str1 may be the same register as str2 which can occur e.g. after scalar replacement.
4066   assert_different_registers(result_reg, str1_reg, cnt_reg, tmp1_reg, tmp2_reg, tmp3_reg, tmp4_reg, tmp5_reg);
4067   assert_different_registers(result_reg, str2_reg, cnt_reg, tmp1_reg, tmp2_reg, tmp3_reg, tmp4_reg, tmp5_reg);
4068 
4069   // Offset 0 should be 32 byte aligned.
4070   Label Linit_cbc, Lcbc, Lloop, Ldone_true, Ldone_false;
4071   Register index_reg = tmp5_reg;
4072   Register cbc_iter  = tmp4_reg;
4073 
4074   // 'cnt_reg' contains the number of characters in the string's character array for the
4075   // pre-CompactStrings strings implementation and the number of bytes in the string's
4076   // byte array for the CompactStrings strings implementation.
4077   const int HAS_COMPACT_STRING = java_lang_String::has_coder_field() ? 1 : 0; // '1' = byte array, '0' = char array
4078 
4079 //-1:
4080   dcbtct(str1_reg, 0x00);  // Indicate R/O access to str1.
4081   dcbtct(str2_reg, 0x00);  // Indicate R/O access to str2.
4082 //1:
4083   // cbc_iter: remaining characters after the '4 java characters per iteration' loop.
4084   rlwinm(cbc_iter, cnt_reg, 32 - HAS_COMPACT_STRING, 30, 31); // (cnt_reg % (HAS_COMPACT_STRING ? 8 : 4)) >> HAS_COMPACT_STRING
4085   li(index_reg, 0); // init
4086   li(result_reg, 0); // assume false
4087   // tmp2_reg: units of 4 java characters (i.e. 8 bytes) per iteration (main loop).
4088   srwi_(tmp2_reg, cnt_reg, exact_log2(4 << HAS_COMPACT_STRING)); // cnt_reg / (HAS_COMPACT_STRING ? 8 : 4)
4089 
4090   cmpwi(CCR1, cbc_iter, 0);             // CCR1 = (cbc_iter==0)
4091   beq(CCR0, Linit_cbc);                 // too short
4092     mtctr(tmp2_reg);
4093 //8:
4094     bind(Lloop);
4095       ldx(tmp1_reg, str1_reg, index_reg);
4096       ldx(tmp2_reg, str2_reg, index_reg);
4097       cmpd(CCR0, tmp1_reg, tmp2_reg);
4098       bne(CCR0, Ldone_false);  // Unequal char pair found -> done.
4099       addi(index_reg, index_reg, 4*sizeof(jchar));
4100       bdnz(Lloop);
4101 //14:
4102   bind(Linit_cbc);
4103   beq(CCR1, Ldone_true);
4104     mtctr(cbc_iter);
4105 //16:
4106     bind(Lcbc);
4107       lhzx(tmp1_reg, str1_reg, index_reg);
4108       lhzx(tmp2_reg, str2_reg, index_reg);
4109       cmpw(CCR0, tmp1_reg, tmp2_reg);
4110       bne(CCR0, Ldone_false);  // Unequal char pair found -> done.
4111       addi(index_reg, index_reg, 1*sizeof(jchar));
4112       bdnz(Lcbc);
4113     nop();
4114   bind(Ldone_true);
4115   li(result_reg, 1);
4116 //24:
4117   bind(Ldone_false);
4118 }
4119 
4120 
4121 void MacroAssembler::char_arrays_equalsImm(Register str1_reg, Register str2_reg, int cntval, Register result_reg,
4122                                            Register tmp1_reg, Register tmp2_reg) {
4123   // Str1 may be the same register as str2 which can occur e.g. after scalar replacement.
4124   assert_different_registers(result_reg, str1_reg, tmp1_reg, tmp2_reg);
4125   assert_different_registers(result_reg, str2_reg, tmp1_reg, tmp2_reg);
4126   assert(sizeof(jchar) == 2, "must be");
4127   assert(cntval >= 0 && ((cntval & 0x7fff) == cntval), "wrong immediate");
4128 
4129   // 'cntval' contains the number of characters in the string's character array for the
4130   // pre-CompactStrings strings implementation and the number of bytes in the string's
4131   // byte array for the CompactStrings strings implementation.
4132   cntval >>= (java_lang_String::has_coder_field() ? 1 : 0); // '1' = byte array strings, '0' = char array strings
4133 
4134   Label Ldone_false;
4135 
4136   if (cntval < 16) { // short case
4137     if (cntval != 0) li(result_reg, 0); // assume false
4138 
4139     const int num_bytes = cntval*sizeof(jchar);
4140     int index = 0;
4141     for (int next_index; (next_index = index + 8) <= num_bytes; index = next_index) {
4142       ld(tmp1_reg, index, str1_reg);
4143       ld(tmp2_reg, index, str2_reg);
4144       cmpd(CCR0, tmp1_reg, tmp2_reg);
4145       bne(CCR0, Ldone_false);
4146     }
4147     if (cntval & 2) {
4148       lwz(tmp1_reg, index, str1_reg);
4149       lwz(tmp2_reg, index, str2_reg);
4150       cmpw(CCR0, tmp1_reg, tmp2_reg);
4151       bne(CCR0, Ldone_false);
4152       index += 4;
4153     }
4154     if (cntval & 1) {
4155       lhz(tmp1_reg, index, str1_reg);
4156       lhz(tmp2_reg, index, str2_reg);
4157       cmpw(CCR0, tmp1_reg, tmp2_reg);
4158       bne(CCR0, Ldone_false);
4159     }
4160     // fallthrough: true
4161   } else {
4162     Label Lloop;
4163     Register index_reg = tmp1_reg;
4164     const int loopcnt = cntval/4;
4165     assert(loopcnt > 0, "must be");
4166     // Offset 0 should be 32 byte aligned.
4167     //2:
4168     dcbtct(str1_reg, 0x00);  // Indicate R/O access to str1.
4169     dcbtct(str2_reg, 0x00);  // Indicate R/O access to str2.
4170     li(tmp2_reg, loopcnt);
4171     li(index_reg, 0); // init
4172     li(result_reg, 0); // assume false
4173     mtctr(tmp2_reg);
4174     //8:
4175     bind(Lloop);
4176     ldx(R0, str1_reg, index_reg);
4177     ldx(tmp2_reg, str2_reg, index_reg);
4178     cmpd(CCR0, R0, tmp2_reg);
4179     bne(CCR0, Ldone_false);  // Unequal char pair found -> done.
4180     addi(index_reg, index_reg, 4*sizeof(jchar));
4181     bdnz(Lloop);
4182     //14:
4183     if (cntval & 2) {
4184       lwzx(R0, str1_reg, index_reg);
4185       lwzx(tmp2_reg, str2_reg, index_reg);
4186       cmpw(CCR0, R0, tmp2_reg);
4187       bne(CCR0, Ldone_false);
4188       if (cntval & 1) addi(index_reg, index_reg, 2*sizeof(jchar));
4189     }
4190     if (cntval & 1) {
4191       lhzx(R0, str1_reg, index_reg);
4192       lhzx(tmp2_reg, str2_reg, index_reg);
4193       cmpw(CCR0, R0, tmp2_reg);
4194       bne(CCR0, Ldone_false);
4195     }
4196     // fallthru: true
4197   }
4198   li(result_reg, 1);
4199   bind(Ldone_false);
4200 }
4201 
4202 #endif // COMPILER2
4203 
4204 // Helpers for Intrinsic Emitters
4205 //
4206 // Revert the byte order of a 32bit value in a register
4207 //   src: 0x44556677
4208 //   dst: 0x77665544
4209 // Three steps to obtain the result:
4210 //  1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word
4211 //     into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared.
4212 //     This value initializes dst.
4213 //  2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost
4214 //     byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go.
4215 //     This value is mask inserted into dst with a [0..23] mask of 1s.
4216 //  3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position.
4217 //     This value is mask inserted into dst with a [8..15] mask of 1s.
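// In plain C terms this is an ordinary 32-bit byte swap (sketch):
//   dst = ((src & 0x000000ff) << 24) | ((src & 0x0000ff00) <<  8) |
//         ((src & 0x00ff0000) >>  8) | ((src & 0xff000000) >> 24);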
4218 void MacroAssembler::load_reverse_32(Register dst, Register src) {
4219   assert_different_registers(dst, src);
4220 
4221   rldicl(dst, src, (4+1)*8, 56);       // Rotate byte 4 into position 7 (rightmost), clear all to the left.
4222   rlwimi(dst, src,     3*8,  0, 23);   // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone.
4223   rlwimi(dst, src,     1*8,  8, 15);   // Insert byte 6 into position 5, leave the rest alone.
4224 }
4225 
4226 // Calculate the column addresses of the crc32 lookup table into distinct registers.
4227 // This loop-invariant calculation is moved out of the loop body, reducing the loop
4228 // body size from 20 to 16 instructions.
4229 // Returns the offset that was used to calculate the address of column tc3.
4230 // Due to register shortage, setting tc3 may overwrite table. With the return offset
4231 // at hand, the original table address can be easily reconstructed.
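// With the column base addresses in tc0..tc3 (viewed as uint32_t* pointers), each
// DOLIT4/DOBIG4 step (see below) reduces to four indexed loads and three XORs
// (illustrative sketch):
//   c = tc0[c & 0xff] ^ tc1[(c >> 8) & 0xff] ^ tc2[(c >> 16) & 0xff] ^ tc3[c >> 24];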
4232 int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) {
4233 
4234 #ifdef VM_LITTLE_ENDIAN
4235   // This is what we implement (the DOLIT4 part):
4236   // ========================================================================= */
4237   // #define DOLIT4 c ^= *buf4++; \
4238   //         c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^ \
4239   //             crc_table[1][(c >> 16) & 0xff] ^ crc_table[0][c >> 24]
4240   // #define DOLIT32 DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4
4241   // ========================================================================= */
4242   const int ix0 = 3*(4*CRC32_COLUMN_SIZE);
4243   const int ix1 = 2*(4*CRC32_COLUMN_SIZE);
4244   const int ix2 = 1*(4*CRC32_COLUMN_SIZE);
4245   const int ix3 = 0*(4*CRC32_COLUMN_SIZE);
4246 #else
4247   // This is what we implement (the DOBIG4 part):
4248   // =========================================================================
4249   // #define DOBIG4 c ^= *++buf4; \
4250   //         c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \
4251   //             crc_table[6][(c >> 16) & 0xff] ^ crc_table[7][c >> 24]
4252   // #define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4
4253   // =========================================================================
4254   const int ix0 = 4*(4*CRC32_COLUMN_SIZE);
4255   const int ix1 = 5*(4*CRC32_COLUMN_SIZE);
4256   const int ix2 = 6*(4*CRC32_COLUMN_SIZE);
4257   const int ix3 = 7*(4*CRC32_COLUMN_SIZE);
4258 #endif
4259   assert_different_registers(table, tc0, tc1, tc2);
4260   assert(table == tc3, "must be!");
4261 
4262   addi(tc0, table, ix0);
4263   addi(tc1, table, ix1);
4264   addi(tc2, table, ix2);
4265   if (ix3 != 0) addi(tc3, table, ix3);
4266 
4267   return ix3;
4268 }
4269 
4270 /**
4271  * uint32_t crc;
4272  * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
4273  */
4274 void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) {
4275   assert_different_registers(crc, table, tmp);
4276   assert_different_registers(val, table);
4277 
4278   if (crc == val) {                   // Must rotate first to use the unmodified value.
4279     rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
4280                                       // As we use a word (4-byte) instruction, we have to adapt the mask bit positions.
4281     srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
4282   } else {
4283     srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
4284     rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
4285   }
4286   lwzx(tmp, table, tmp);
4287   xorr(crc, crc, tmp);
4288 }
4289 
4290 /**
4291  * uint32_t crc;
4292  * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
4293  */
4294 void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
4295   fold_byte_crc32(crc, crc, table, tmp);
4296 }
4297 
4298 /**
4299  * Emits code to update CRC-32 with a byte value according to constants in table.
4300  *
4301  * @param [in,out]crc   Register containing the crc.
4302  * @param [in]val       Register containing the byte to fold into the CRC.
4303  * @param [in]table     Register containing the table of crc constants.
4304  *
4305  * uint32_t crc;
4306  * val = crc_table[(val ^ crc) & 0xFF];
4307  * crc = val ^ (crc >> 8);
4308  */
4309 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
4310   BLOCK_COMMENT("update_byte_crc32:");
4311   xorr(val, val, crc);
4312   fold_byte_crc32(crc, val, table, val);
4313 }
4314 
4315 /**
4316  * @param crc   register containing existing CRC (32-bit)
4317  * @param buf   register pointing to input byte buffer (byte*)
4318  * @param len   register containing number of bytes
4319  * @param table register pointing to CRC table
4320  */
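// In C terms the emitted loop is the classic byte-at-a-time CRC update
// (sketch; with invertCRC the crc is additionally complemented before and after):
//   while (len-- > 0) {
//     crc = table[(crc ^ *buf++) & 0xff] ^ (crc >> 8);
//   }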
4321 void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table,
4322                                            Register data, bool loopAlignment, bool invertCRC) {
4323   assert_different_registers(crc, buf, len, table, data);
4324 
4325   Label L_mainLoop, L_done;
4326   const int mainLoop_stepping  = 1;
4327   const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4;
4328 
4329   // Process all bytes in a single-byte loop.
4330   clrldi_(len, len, 32);                         // Enforce 32 bit. Anything to do?
4331   beq(CCR0, L_done);
4332 
4333   if (invertCRC) {
4334     nand(crc, crc, crc);                         // ~c
4335   }
4336 
4337   mtctr(len);
4338   align(mainLoop_alignment);
4339   BIND(L_mainLoop);
4340     lbz(data, 0, buf);                           // Byte from buffer, zero-extended.
4341     addi(buf, buf, mainLoop_stepping);           // Advance buffer position.
4342     update_byte_crc32(crc, data, table);
4343     bdnz(L_mainLoop);                            // Iterate.
4344 
4345   if (invertCRC) {
4346     nand(crc, crc, crc);                         // ~c
4347   }
4348 
4349   bind(L_done);
4350 }
4351 
4352 /**
4353  * Emits code to update CRC-32 with a 4-byte value according to constants in table
4354  * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c
4355  */
4356 // A note on the lookup table address(es):
4357 // The lookup table consists of two sets of four columns each.
4358 // The columns {0..3} are used for little-endian machines.
4359 // The columns {4..7} are used for big-endian machines.
4360 // To save the effort of adding the column offset to the table address each time
4361 // a table element is looked up, it is possible to pass the pre-calculated
4362 // column addresses.
4363 // Uses R9..R12 as work registers. Must be saved/restored by caller, if necessary.
4364 void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc,
4365                                         Register t0,  Register t1,  Register t2,  Register t3,
4366                                         Register tc0, Register tc1, Register tc2, Register tc3) {
4367   assert_different_registers(crc, t3);
4368 
4369   // XOR crc with next four bytes of buffer.
4370   lwz(t3, bufDisp, buf);
4371   if (bufInc != 0) {
4372     addi(buf, buf, bufInc);
4373   }
4374   xorr(t3, t3, crc);
4375 
4376   // Chop crc into 4 single-byte pieces, shifted left 2 bits, to form the table indices.
4377   rlwinm(t0, t3,  2,         24-2, 31-2);  // ((t3 >>  0) & 0xff) << 2
4378   rlwinm(t1, t3,  32+(2- 8), 24-2, 31-2);  // ((t3 >>  8) & 0xff) << 2
4379   rlwinm(t2, t3,  32+(2-16), 24-2, 31-2);  // ((t3 >> 16) & 0xff) << 2
4380   rlwinm(t3, t3,  32+(2-24), 24-2, 31-2);  // ((t3 >> 24) & 0xff) << 2
4381 
4382   // Use the pre-calculated column addresses.
4383   // Load pre-calculated table values.
4384   lwzx(t0, tc0, t0);
4385   lwzx(t1, tc1, t1);
4386   lwzx(t2, tc2, t2);
4387   lwzx(t3, tc3, t3);
4388 
4389   // Calculate new crc from table values.
4390   xorr(t0,  t0, t1);
4391   xorr(t2,  t2, t3);
4392   xorr(crc, t0, t2);  // Now crc contains the final checksum value.
4393 }
4394 
4395 /**
4396  * @param crc   register containing existing CRC (32-bit)
4397  * @param buf   register pointing to input byte buffer (byte*)
4398  * @param len   register containing number of bytes
4399  * @param table register pointing to CRC table
4400  *
4401  * Uses R9..R12 as work registers. Must be saved/restored by caller!
4402  */
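// Overall structure of the emitted code (sketch):
//   crc = ~crc;
//   byte loop: process bytes until buf is aligned to mainLoop_stepping;
//   main loop: process mainLoop_stepping bytes per iteration (two DOLIT4/DOBIG4 steps);
//   tail loop: process the remaining bytes one at a time;
//   crc = ~crc;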
4403 void MacroAssembler::kernel_crc32_2word(Register crc, Register buf, Register len, Register table,
4404                                         Register t0,  Register t1,  Register t2,  Register t3,
4405                                         Register tc0, Register tc1, Register tc2, Register tc3) {
4406   assert_different_registers(crc, buf, len, table);
4407 
4408   Label L_mainLoop, L_tail;
4409   Register  tmp  = t0;
4410   Register  data = t0;
4411   Register  tmp2 = t1;
4412   const int mainLoop_stepping  = 8;
4413   const int tailLoop_stepping  = 1;
4414   const int log_stepping       = exact_log2(mainLoop_stepping);
4415   const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
4416   const int complexThreshold   = 2*mainLoop_stepping;
4417 
4418   // Don't test for len <= 0 here. This pathological case should not occur anyway.
4419   // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles.
4420   // The situation itself is detected and handled correctly by the conditional branches
4421   // following the length adjustments by +/- stepping.
4422   assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
4423 
4424   BLOCK_COMMENT("kernel_crc32_2word {");
4425 
4426   nand(crc, crc, crc);                           // ~c
4427 
4428   // Check for short (<mainLoop_stepping) buffer.
4429   cmpdi(CCR0, len, complexThreshold);
4430   blt(CCR0, L_tail);
4431 
4432   // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
4433   // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
4434   {
4435     // Align buf addr to mainLoop_stepping boundary.
4436     neg(tmp2, buf);                           // Calculate # preLoop iterations for alignment.
4437     rldicl(tmp2, tmp2, 0, 64-log_stepping);   // Rotate by 0 bits, then AND with a mask of 1s in bits 61..63 (tmp2 % mainLoop_stepping).
4438 
4439     if (complexThreshold > mainLoop_stepping) {
4440       sub(len, len, tmp2);                       // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4441     } else {
4442       sub(tmp, len, tmp2);                       // Remaining bytes for main loop.
4443       cmpdi(CCR0, tmp, mainLoop_stepping);
4444       blt(CCR0, L_tail);                         // For less than one mainloop_stepping left, do only tail processing
4445       mr(len, tmp);                              // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4446     }
4447     update_byteLoop_crc32(crc, buf, tmp2, table, data, false, false);
4448   }
4449 
4450   srdi(tmp2, len, log_stepping);                 // #iterations for mainLoop
4451   andi(len, len, mainLoop_stepping-1);           // remaining bytes for tailLoop
4452   mtctr(tmp2);
4453 
4454 #ifdef VM_LITTLE_ENDIAN
4455   Register crc_rv = crc;
4456 #else
4457   Register crc_rv = tmp;                         // Load_reverse needs separate registers to work on.
4458                                                  // Occupies tmp, but frees up crc.
4459   load_reverse_32(crc_rv, crc);                  // Revert byte order because we are dealing with big-endian data.
4460   tmp = crc;
4461 #endif
4462 
4463   int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
4464 
4465   align(mainLoop_alignment);                     // Octoword-aligned loop address. Shows 2% improvement.
4466   BIND(L_mainLoop);
4467     update_1word_crc32(crc_rv, buf, table, 0, 0, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
4468     update_1word_crc32(crc_rv, buf, table, 4, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
4469     bdnz(L_mainLoop);
4470 
4471 #ifndef VM_LITTLE_ENDIAN
4472   load_reverse_32(crc, crc_rv);                  // Revert byte order because we are dealing with big-endian data.
4473   tmp = crc_rv;                                  // Tmp uses its original register again.
4474 #endif
4475 
4476   // Restore original table address for tailLoop.
4477   if (reconstructTableOffset != 0) {
4478     addi(table, table, -reconstructTableOffset);
4479   }
4480 
4481   // Process last few (<complexThreshold) bytes of buffer.
4482   BIND(L_tail);
4483   update_byteLoop_crc32(crc, buf, len, table, data, false, false);
4484 
4485   nand(crc, crc, crc);                           // ~c
4486   BLOCK_COMMENT("} kernel_crc32_2word");
4487 }
4488 
4489 /**
4490  * @param crc   register containing existing CRC (32-bit)
4491  * @param buf   register pointing to input byte buffer (byte*)
4492  * @param len   register containing number of bytes
4493  * @param table register pointing to CRC table
4494  *
4495  * Uses R9..R12 as work registers. Must be saved/restored by caller!
4496  */
4497 void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table,
4498                                         Register t0,  Register t1,  Register t2,  Register t3,
4499                                         Register tc0, Register tc1, Register tc2, Register tc3) {
4500   assert_different_registers(crc, buf, len, table);
4501 
4502   Label L_mainLoop, L_tail;
4503   Register  tmp          = t0;
4504   Register  data         = t0;
4505   Register  tmp2         = t1;
4506   const int mainLoop_stepping  = 4;
4507   const int tailLoop_stepping  = 1;
4508   const int log_stepping       = exact_log2(mainLoop_stepping);
4509   const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
4510   const int complexThreshold   = 2*mainLoop_stepping;
4511 
4512   // Don't test for len <= 0 here. This pathological case should not occur anyway.
4513   // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles.
4514   // The situation itself is detected and handled correctly by the conditional branches
4515   // following the length adjustments by +/- stepping.
4516   assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
4517 
4518   BLOCK_COMMENT("kernel_crc32_1word {");
4519 
4520   nand(crc, crc, crc);                           // ~c
4521 
4522   // Check for short (<mainLoop_stepping) buffer.
4523   cmpdi(CCR0, len, complexThreshold);
4524   blt(CCR0, L_tail);
4525 
4526   // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
4527   // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
4528   {
4529     // Align buf addr to mainLoop_stepping boundary.
4530     neg(tmp2, buf);                              // Calculate # preLoop iterations for alignment.
4531     rldicl(tmp2, tmp2, 0, 64-log_stepping);      // Rotate by 0 bits, then AND with a mask of 1s in bits 62..63 (tmp2 % mainLoop_stepping).
4532 
4533     if (complexThreshold > mainLoop_stepping) {
4534       sub(len, len, tmp2);                       // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4535     } else {
4536       sub(tmp, len, tmp2);                       // Remaining bytes for main loop.
4537       cmpdi(CCR0, tmp, mainLoop_stepping);
4538       blt(CCR0, L_tail);                         // For less than one mainloop_stepping left, do only tail processing
4539       mr(len, tmp);                              // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4540     }
4541     update_byteLoop_crc32(crc, buf, tmp2, table, data, false, false);
4542   }
4543 
4544   srdi(tmp2, len, log_stepping);                 // #iterations for mainLoop
4545   andi(len, len, mainLoop_stepping-1);           // remaining bytes for tailLoop
4546   mtctr(tmp2);
4547 
4548 #ifdef VM_LITTLE_ENDIAN
4549   Register crc_rv = crc;
4550 #else
4551   Register crc_rv = tmp;                         // Load_reverse needs separate registers to work on.
4552                                                  // Occupies tmp, but frees up crc.
4553   load_reverse_32(crc_rv, crc);                  // Revert byte order because we are dealing with big-endian data.
4554   tmp = crc;
4555 #endif
4556 
4557   int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
4558 
4559   align(mainLoop_alignment);                     // Octoword-aligned loop address. Shows 2% improvement.
4560   BIND(L_mainLoop);
4561     update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
4562     bdnz(L_mainLoop);
4563 
4564 #ifndef VM_LITTLE_ENDIAN
4565   load_reverse_32(crc, crc_rv);                  // Revert byte order because we are dealing with big-endian data.
4566   tmp = crc_rv;                                  // Tmp uses its original register again.
4567 #endif
4568 
4569   // Restore original table address for tailLoop.
4570   if (reconstructTableOffset != 0) {
4571     addi(table, table, -reconstructTableOffset);
4572   }
4573 
4574   // Process last few (<complexThreshold) bytes of buffer.
4575   BIND(L_tail);
4576   update_byteLoop_crc32(crc, buf, len, table, data, false, false);
4577 
4578   nand(crc, crc, crc);                           // ~c
4579   BLOCK_COMMENT("} kernel_crc32_1word");
4580 }
4581 
4582 /**
4583  * @param crc   register containing existing CRC (32-bit)
4584  * @param buf   register pointing to input byte buffer (byte*)
4585  * @param len   register containing number of bytes
4586  * @param table register pointing to CRC table
4587  *
4588  * Uses R7_ARG5, R8_ARG6 as work registers.
4589  */
4590 void MacroAssembler::kernel_crc32_1byte(Register crc, Register buf, Register len, Register table,
4591                                         Register t0,  Register t1,  Register t2,  Register t3) {
4592   assert_different_registers(crc, buf, len, table);
4593 
4594   Register  data = t0;                   // Holds the current byte to be folded into crc.
4595 
4596   BLOCK_COMMENT("kernel_crc32_1byte {");
4597 
4598   // Process all bytes in a single-byte loop.
4599   update_byteLoop_crc32(crc, buf, len, table, data, true, true);
4600 
4601   BLOCK_COMMENT("} kernel_crc32_1byte");
4602 }
4603 
4604 void MacroAssembler::kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp) {
4605   assert_different_registers(crc, buf, /* len,  not used!! */ table, tmp);
4606 
4607   BLOCK_COMMENT("kernel_crc32_singleByte:");
4608   nand(crc, crc, crc);       // ~c
4609 
4610   lbz(tmp, 0, buf);          // Byte from buffer, zero-extended.
4611   update_byte_crc32(crc, tmp, table);
4612 
4613   nand(crc, crc, crc);       // ~c
4614 }
4615 
4616 // dest_lo += src1 + src2
4617 // dest_hi += carry1 + carry2
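// Viewed as one 128-bit accumulator (illustrative sketch):
//   unsigned __int128 acc = ((unsigned __int128)dest_hi << 64) | dest_lo;
//   acc += src1;
//   acc += src2;
//   dest_lo = (uint64_t)acc;  dest_hi = (uint64_t)(acc >> 64);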
4618 void MacroAssembler::add2_with_carry(Register dest_hi,
4619                                      Register dest_lo,
4620                                      Register src1, Register src2) {
4621   li(R0, 0);
4622   addc(dest_lo, dest_lo, src1);
4623   adde(dest_hi, dest_hi, R0);
4624   addc(dest_lo, dest_lo, src2);
4625   adde(dest_hi, dest_hi, R0);
4626 }
4627 
4628 // Multiply 64 bit by 64 bit first loop.
4629 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart,
4630                                            Register x_xstart,
4631                                            Register y, Register y_idx,
4632                                            Register z,
4633                                            Register carry,
4634                                            Register product_high, Register product,
4635                                            Register idx, Register kdx,
4636                                            Register tmp) {
4637   //  jlong carry, x[], y[], z[];
4638   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
4639   //    huge_128 product = y[idx] * x[xstart] + carry;
4640   //    z[kdx] = (jlong)product;
4641   //    carry  = (jlong)(product >>> 64);
4642   //  }
4643   //  z[xstart] = carry;
4644 
4645   Label L_first_loop, L_first_loop_exit;
4646   Label L_one_x, L_one_y, L_multiply;
4647 
4648   addic_(xstart, xstart, -1);
4649   blt(CCR0, L_one_x);   // Special case: length of x is 1.
4650 
4651   // Load next two integers of x.
4652   sldi(tmp, xstart, LogBytesPerInt);
4653   ldx(x_xstart, x, tmp);
4654 #ifdef VM_LITTLE_ENDIAN
4655   rldicl(x_xstart, x_xstart, 32, 0);
4656 #endif
4657 
4658   align(32, 16);
4659   bind(L_first_loop);
4660 
4661   cmpdi(CCR0, idx, 1);
4662   blt(CCR0, L_first_loop_exit);
4663   addi(idx, idx, -2);
4664   beq(CCR0, L_one_y);
4665 
4666   // Load next two integers of y.
4667   sldi(tmp, idx, LogBytesPerInt);
4668   ldx(y_idx, y, tmp);
4669 #ifdef VM_LITTLE_ENDIAN
4670   rldicl(y_idx, y_idx, 32, 0);
4671 #endif
4672 
4673 
4674   bind(L_multiply);
4675   multiply64(product_high, product, x_xstart, y_idx);
4676 
4677   li(tmp, 0);
4678   addc(product, product, carry);         // Add carry to result.
4679   adde(product_high, product_high, tmp); // Add carry of the last addition.
4680   addi(kdx, kdx, -2);
4681 
4682   // Store result.
4683 #ifdef VM_LITTLE_ENDIAN
4684   rldicl(product, product, 32, 0);
4685 #endif
4686   sldi(tmp, kdx, LogBytesPerInt);
4687   stdx(product, z, tmp);
4688   mr_if_needed(carry, product_high);
4689   b(L_first_loop);
4690 
4691 
4692   bind(L_one_y); // Load one 32 bit portion of y as (0,value).
4693 
4694   lwz(y_idx, 0, y);
4695   b(L_multiply);
4696 
4697 
4698   bind(L_one_x); // Load one 32 bit portion of x as (0,value).
4699 
4700   lwz(x_xstart, 0, x);
4701   b(L_first_loop);
4702 
4703   bind(L_first_loop_exit);
4704 }
4705 
4706 // Multiply 64 bit by 64 bit and add 128 bit.
4707 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y,
4708                                             Register z, Register yz_idx,
4709                                             Register idx, Register carry,
4710                                             Register product_high, Register product,
4711                                             Register tmp, int offset) {
4712 
4713   //  huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
4714   //  z[kdx] = (jlong)product;
4715 
4716   sldi(tmp, idx, LogBytesPerInt);
4717   if (offset) {
4718     addi(tmp, tmp, offset);
4719   }
4720   ldx(yz_idx, y, tmp);
4721 #ifdef VM_LITTLE_ENDIAN
4722   rldicl(yz_idx, yz_idx, 32, 0);
4723 #endif
4724 
4725   multiply64(product_high, product, x_xstart, yz_idx);
4726   ldx(yz_idx, z, tmp);
4727 #ifdef VM_LITTLE_ENDIAN
4728   rldicl(yz_idx, yz_idx, 32, 0);
4729 #endif
4730 
4731   add2_with_carry(product_high, product, carry, yz_idx);
4732 
4733   sldi(tmp, idx, LogBytesPerInt);
4734   if (offset) {
4735     addi(tmp, tmp, offset);
4736   }
4737 #ifdef VM_LITTLE_ENDIAN
4738   rldicl(product, product, 32, 0);
4739 #endif
4740   stdx(product, z, tmp);
4741 }
4742 
4743 // Multiply 128 bit by 128 bit. Unrolled inner loop.
4744 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart,
4745                                              Register y, Register z,
4746                                              Register yz_idx, Register idx, Register carry,
4747                                              Register product_high, Register product,
4748                                              Register carry2, Register tmp) {
4749 
4750   //  jlong carry, x[], y[], z[];
4751   //  int kdx = ystart+1;
4752   //  for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
4753   //    huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
4754   //    z[kdx+idx+1] = (jlong)product;
4755   //    jlong carry2 = (jlong)(product >>> 64);
4756   //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
4757   //    z[kdx+idx] = (jlong)product;
4758   //    carry = (jlong)(product >>> 64);
4759   //  }
4760   //  idx += 2;
4761   //  if (idx > 0) {
4762   //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
4763   //    z[kdx+idx] = (jlong)product;
4764   //    carry = (jlong)(product >>> 64);
4765   //  }
4766 
4767   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
4768   const Register jdx = R0;
4769 
4770   // Scale the index.
4771   srdi_(jdx, idx, 2);
4772   beq(CCR0, L_third_loop_exit);
4773   mtctr(jdx);
4774 
4775   align(32, 16);
4776   bind(L_third_loop);
4777 
4778   addi(idx, idx, -4);
4779 
4780   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8);
4781   mr_if_needed(carry2, product_high);
4782 
4783   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0);
4784   mr_if_needed(carry, product_high);
4785   bdnz(L_third_loop);
4786 
4787   bind(L_third_loop_exit);  // Handle any left-over operand parts.
4788 
4789   andi_(idx, idx, 0x3);
4790   beq(CCR0, L_post_third_loop_done);
4791 
4792   Label L_check_1;
4793 
4794   addic_(idx, idx, -2);
4795   blt(CCR0, L_check_1);
4796 
4797   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0);
4798   mr_if_needed(carry, product_high);
4799 
4800   bind(L_check_1);
4801 
4802   addi(idx, idx, 0x2);
4803   andi_(idx, idx, 0x1);
4804   addic_(idx, idx, -1);
4805   blt(CCR0, L_post_third_loop_done);
4806 
4807   sldi(tmp, idx, LogBytesPerInt);
4808   lwzx(yz_idx, y, tmp);
4809   multiply64(product_high, product, x_xstart, yz_idx);
4810   lwzx(yz_idx, z, tmp);
4811 
4812   add2_with_carry(product_high, product, yz_idx, carry);
4813 
4814   sldi(tmp, idx, LogBytesPerInt);
4815   stwx(product, z, tmp);
4816   srdi(product, product, 32);
4817 
4818   sldi(product_high, product_high, 32);
4819   orr(product, product, product_high);
4820   mr_if_needed(carry, product);
4821 
4822   bind(L_post_third_loop_done);
4823 }   // multiply_128_x_128_loop
4824 
4825 void MacroAssembler::multiply_to_len(Register x, Register xlen,
4826                                      Register y, Register ylen,
4827                                      Register z, Register zlen,
4828                                      Register tmp1, Register tmp2,
4829                                      Register tmp3, Register tmp4,
4830                                      Register tmp5, Register tmp6,
4831                                      Register tmp7, Register tmp8,
4832                                      Register tmp9, Register tmp10,
4833                                      Register tmp11, Register tmp12,
4834                                      Register tmp13) {
4835 
4836   ShortBranchVerifier sbv(this);
4837 
4838   assert_different_registers(x, xlen, y, ylen, z, zlen,
4839                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
4840   assert_different_registers(x, xlen, y, ylen, z, zlen,
4841                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp7);
4842   assert_different_registers(x, xlen, y, ylen, z, zlen,
4843                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp8);
4844 
4845   const Register idx = tmp1;
4846   const Register kdx = tmp2;
4847   const Register xstart = tmp3;
4848 
4849   const Register y_idx = tmp4;
4850   const Register carry = tmp5;
4851   const Register product = tmp6;
4852   const Register product_high = tmp7;
4853   const Register x_xstart = tmp8;
4854   const Register tmp = tmp9;
4855 
4856   // First Loop.
4857   //
4858   //  final static long LONG_MASK = 0xffffffffL;
4859   //  int xstart = xlen - 1;
4860   //  int ystart = ylen - 1;
4861   //  long carry = 0;
4862   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
4863   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
4864   //    z[kdx] = (int)product;
4865   //    carry = product >>> 32;
4866   //  }
4867   //  z[xstart] = (int)carry;
4868 
4869   mr_if_needed(idx, ylen);        // idx = ylen
4870   mr_if_needed(kdx, zlen);        // kdx = xlen + ylen
4871   li(carry, 0);                   // carry = 0
4872 
4873   Label L_done;
4874 
4875   addic_(xstart, xlen, -1);
4876   blt(CCR0, L_done);
4877 
4878   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z,
4879                         carry, product_high, product, idx, kdx, tmp);
4880 
4881   Label L_second_loop;
4882 
4883   cmpdi(CCR0, kdx, 0);
4884   beq(CCR0, L_second_loop);
4885 
4886   Label L_carry;
4887 
4888   addic_(kdx, kdx, -1);
4889   beq(CCR0, L_carry);
4890 
4891   // Store lower 32 bits of carry.
4892   sldi(tmp, kdx, LogBytesPerInt);
4893   stwx(carry, z, tmp);
4894   srdi(carry, carry, 32);
4895   addi(kdx, kdx, -1);
4896 
4897 
4898   bind(L_carry);
4899 
4900   // Store upper 32 bits of carry.
4901   sldi(tmp, kdx, LogBytesPerInt);
4902   stwx(carry, z, tmp);
4903 
4904   // Second and third (nested) loops.
4905   //
4906   //  for (int i = xstart-1; i >= 0; i--) { // Second loop
4907   //    carry = 0;
4908   //    for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
4909   //      long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
4910   //                     (z[k] & LONG_MASK) + carry;
4911   //      z[k] = (int)product;
4912   //      carry = product >>> 32;
4913   //    }
4914   //    z[i] = (int)carry;
4915   //  }
4916   //
4917   //  i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx
4918 
4919   bind(L_second_loop);
4920 
4921   li(carry, 0);                   // carry = 0;
4922 
4923   addic_(xstart, xstart, -1);     // i = xstart-1;
4924   blt(CCR0, L_done);
4925 
4926   Register zsave = tmp10;
4927 
4928   mr(zsave, z);
4929 
4930 
4931   Label L_last_x;
4932 
4933   sldi(tmp, xstart, LogBytesPerInt);
4934   add(z, z, tmp);                 // z = z + k - j
4935   addi(z, z, 4);
4936   addic_(xstart, xstart, -1);     // i = xstart-1;
4937   blt(CCR0, L_last_x);
4938 
4939   sldi(tmp, xstart, LogBytesPerInt);
4940   ldx(x_xstart, x, tmp);
4941 #ifdef VM_LITTLE_ENDIAN
4942   rldicl(x_xstart, x_xstart, 32, 0);
4943 #endif
4944 
4945 
4946   Label L_third_loop_prologue;
4947 
4948   bind(L_third_loop_prologue);
4949 
4950   Register xsave = tmp11;
4951   Register xlensave = tmp12;
4952   Register ylensave = tmp13;
4953 
4954   mr(xsave, x);
4955   mr(xlensave, xstart);
4956   mr(ylensave, ylen);
4957 
4958 
4959   multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen,
4960                           carry, product_high, product, x, tmp);
4961 
4962   mr(z, zsave);
4963   mr(x, xsave);
4964   mr(xlen, xlensave);   // This is the decrement of the loop counter!
4965   mr(ylen, ylensave);
4966 
4967   addi(tmp3, xlen, 1);
4968   sldi(tmp, tmp3, LogBytesPerInt);
4969   stwx(carry, z, tmp);
4970   addic_(tmp3, tmp3, -1);
4971   blt(CCR0, L_done);
4972 
4973   srdi(carry, carry, 32);
4974   sldi(tmp, tmp3, LogBytesPerInt);
4975   stwx(carry, z, tmp);
4976   b(L_second_loop);
4977 
4978   // Next infrequent code is moved outside loops.
4979   bind(L_last_x);
4980 
4981   lwz(x_xstart, 0, x);
4982   b(L_third_loop_prologue);
4983 
4984   bind(L_done);
4985 }   // multiply_to_len
4986 
4987 void MacroAssembler::asm_assert(bool check_equal, const char *msg, int id) {
4988 #ifdef ASSERT
4989   Label ok;
4990   if (check_equal) {
4991     beq(CCR0, ok);
4992   } else {
4993     bne(CCR0, ok);
4994   }
4995   stop(msg, id);
4996   bind(ok);
4997 #endif
4998 }
4999 
5000 void MacroAssembler::asm_assert_mems_zero(bool check_equal, int size, int mem_offset,
5001                                           Register mem_base, const char* msg, int id) {
5002 #ifdef ASSERT
5003   switch (size) {
5004     case 4:
5005       lwz(R0, mem_offset, mem_base);
5006       cmpwi(CCR0, R0, 0);
5007       break;
5008     case 8:
5009       ld(R0, mem_offset, mem_base);
5010       cmpdi(CCR0, R0, 0);
5011       break;
5012     default:
5013       ShouldNotReachHere();
5014   }
5015   asm_assert(check_equal, msg, id);
5016 #endif // ASSERT
5017 }
5018 
5019 void MacroAssembler::verify_thread() {
5020   if (VerifyThread) {
5021     unimplemented("'VerifyThread' currently not implemented on PPC");
5022   }
5023 }
5024 
5025 // READ: oop. KILL: R0. Volatile floats perhaps.
5026 void MacroAssembler::verify_oop(Register oop, const char* msg) {
5027   if (!VerifyOops) {
5028     return;
5029   }
5030 
5031   address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
5032   const Register tmp = R11; // Will be preserved.
5033   const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
5034   save_volatile_gprs(R1_SP, -nbytes_save); // except R0
5035 
5036   mr_if_needed(R4_ARG2, oop);
5037   save_LR_CR(tmp); // save in old frame
5038   push_frame_reg_args(nbytes_save, tmp);
5039   // load FunctionDescriptor** / entry_address *
5040   load_const_optimized(tmp, fd, R0);
5041   // load FunctionDescriptor* / entry_address
5042   ld(tmp, 0, tmp);
5043   load_const_optimized(R3_ARG1, (address)msg, R0);
5044   // Call destination for its side effect.
5045   call_c(tmp);
5046 
5047   pop_frame();
5048   restore_LR_CR(tmp);
5049   restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
5050 }
5051 
5052 void MacroAssembler::verify_oop_addr(RegisterOrConstant offs, Register base, const char* msg) {
5053   if (!VerifyOops) {
5054     return;
5055   }
5056 
5057   address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
5058   const Register tmp = R11; // Will be preserved.
5059   const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
5060   save_volatile_gprs(R1_SP, -nbytes_save); // except R0
5061 
5062   ld(R4_ARG2, offs, base);
5063   save_LR_CR(tmp); // save in old frame
5064   push_frame_reg_args(nbytes_save, tmp);
5065   // load FunctionDescriptor** / entry_address *
5066   load_const_optimized(tmp, fd, R0);
5067   // load FunctionDescriptor* / entry_address
5068   ld(tmp, 0, tmp);
5069   load_const_optimized(R3_ARG1, (address)msg, R0);
5070   // Call destination for its side effect.
5071   call_c(tmp);
5072 
5073   pop_frame();
5074   restore_LR_CR(tmp);
5075   restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
5076 }
5077 
5078 const char* stop_types[] = {
5079   "stop",
5080   "untested",
5081   "unimplemented",
5082   "shouldnotreachhere"
5083 };
5084 
5085 static void stop_on_request(int tp, const char* msg) {
5086   tty->print("PPC assembly code requires stop: (%s) %s\n", stop_types[tp%/*stop_end*/4], msg);
5087   guarantee(false, "PPC assembly code requires stop: %s", msg);
5088 }
5089 
5090 // Call a C-function that prints output.
5091 void MacroAssembler::stop(int type, const char* msg, int id) {
5092 #ifndef PRODUCT
5093   block_comment(err_msg("stop: %s %s {", stop_types[type%stop_end], msg));
5094 #else
5095   block_comment("stop {");
5096 #endif
5097 
5098   // setup arguments
5099   load_const_optimized(R3_ARG1, type);
5100   load_const_optimized(R4_ARG2, (void *)msg, /*tmp=*/R0);
5101   call_VM_leaf(CAST_FROM_FN_PTR(address, stop_on_request), R3_ARG1, R4_ARG2);
5102   illtrap();
5103   emit_int32(id);
5104   block_comment("} stop;");
5105 }
5106 
5107 #ifndef PRODUCT
5108 // Write pattern 0x0101010101010101 in memory region [low-before, high+after].
5109 // Val, addr are temp registers.
5110 // If low == addr, addr is killed.
5111 // High is preserved.
5112 void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) {
5113   if (!ZapMemory) return;
5114 
5115   assert_different_registers(low, val);
5116 
5117   BLOCK_COMMENT("zap memory region {");
5118   load_const_optimized(val, 0x0101010101010101);
5119   int size = before + after;
5120   if (low == high && size < 5 && size > 0) {
5121     int offset = -before*BytesPerWord;
5122     for (int i = 0; i < size; ++i) {
5123       std(val, offset, low);
5124       offset += (1*BytesPerWord);
5125     }
5126   } else {
5127     addi(addr, low, -before*BytesPerWord);
5128     assert_different_registers(high, val);
5129     if (after) addi(high, high, after * BytesPerWord);
5130     Label loop;
5131     bind(loop);
5132     std(val, 0, addr);
5133     addi(addr, addr, 8);
5134     cmpd(CCR6, addr, high);
5135     ble(CCR6, loop);
5136     if (after) addi(high, high, -after * BytesPerWord);  // Correct back to old value.
5137   }
5138   BLOCK_COMMENT("} zap memory region");
5139 }
5140 
5141 #endif // !PRODUCT
5142 
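// SkipIfEqualZero loads a bool flag, compares it against zero and emits a forward
// branch that is bound in the destructor, so the code emitted inside the C++ scope
// is skipped at runtime when the flag is zero. Assumed usage pattern (sketch;
// 'SomeBoolFlag' is only a placeholder name):
//   {
//     SkipIfEqualZero skip_if_zero(masm, temp_reg, &SomeBoolFlag);
//     // ... instructions emitted here execute only if SomeBoolFlag != 0 ...
//   }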
5143 SkipIfEqualZero::SkipIfEqualZero(MacroAssembler* masm, Register temp, const bool* flag_addr) : _masm(masm), _label() {
5144   int simm16_offset = masm->load_const_optimized(temp, (address)flag_addr, R0, true);
5145   assert(sizeof(bool) == 1, "PowerPC ABI");
5146   masm->lbz(temp, simm16_offset, temp);
5147   masm->cmpwi(CCR0, temp, 0);
5148   masm->beq(CCR0, _label);
5149 }
5150 
5151 SkipIfEqualZero::~SkipIfEqualZero() {
5152   _masm->bind(_label);
5153 }