1 /*
   2  * Copyright (c) 1997, 2016, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2012, 2016 SAP SE. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.inline.hpp"
  28 #include "compiler/disassembler.hpp"
  29 #include "gc/shared/cardTableModRefBS.hpp"
  30 #include "gc/shared/collectedHeap.inline.hpp"
  31 #include "interpreter/interpreter.hpp"
  32 #include "memory/resourceArea.hpp"
  33 #include "nativeInst_ppc.hpp"
  34 #include "prims/methodHandles.hpp"
  35 #include "runtime/biasedLocking.hpp"
  36 #include "runtime/icache.hpp"
  37 #include "runtime/interfaceSupport.hpp"
  38 #include "runtime/objectMonitor.hpp"
  39 #include "runtime/os.hpp"
  40 #include "runtime/sharedRuntime.hpp"
  41 #include "runtime/stubRoutines.hpp"
  42 #include "utilities/macros.hpp"
  43 #if INCLUDE_ALL_GCS
  44 #include "gc/g1/g1CollectedHeap.inline.hpp"
  45 #include "gc/g1/g1SATBCardTableModRefBS.hpp"
  46 #include "gc/g1/heapRegion.hpp"
  47 #endif // INCLUDE_ALL_GCS
  48 #ifdef COMPILER2
  49 #include "opto/intrinsicnode.hpp"
  50 #endif
  51 
  52 #ifdef PRODUCT
  53 #define BLOCK_COMMENT(str) // nothing
  54 #else
  55 #define BLOCK_COMMENT(str) block_comment(str)
  56 #endif
  57 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  58 
  59 #ifdef ASSERT
  60 // On RISC, there's no benefit to verifying instruction boundaries.
  61 bool AbstractAssembler::pd_check_instruction_mark() { return false; }
  62 #endif
  63 
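     // Load with a 31-bit offset: if the offset fits into 16 bits, a single ld
     // is emitted (plus an optional filler nop); otherwise an addis/ld pair is
     // used, roughly:
     //   addis d, a, si31@hi
     //   ld    d, si31@lo(d)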
  64 void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) {
  65   assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range");
  66   if (Assembler::is_simm(si31, 16)) {
  67     ld(d, si31, a);
  68     if (emit_filler_nop) nop();
  69   } else {
  70     const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31);
  71     const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31);
  72     addis(d, a, hi);
  73     ld(d, lo, d);
  74   }
  75 }
  76 
  77 void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) {
  78   assert_different_registers(d, a);
  79   ld_largeoffset_unchecked(d, si31, a, emit_filler_nop);
  80 }
  81 
  82 void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base,
  83                                       size_t size_in_bytes, bool is_signed) {
  84   switch (size_in_bytes) {
  85   case  8:              ld(dst, offs, base);                         break;
  86   case  4:  is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break;
  87   case  2:  is_signed ? lha(dst, offs, base) : lhz(dst, offs, base); break;
  88   case  1:  lbz(dst, offs, base); if (is_signed) extsb(dst, dst);    break; // lba doesn't exist :(
  89   default:  ShouldNotReachHere();
  90   }
  91 }
  92 
  93 void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base,
  94                                        size_t size_in_bytes) {
  95   switch (size_in_bytes) {
  96   case  8:  std(dst, offs, base); break;
  97   case  4:  stw(dst, offs, base); break;
  98   case  2:  sth(dst, offs, base); break;
  99   case  1:  stb(dst, offs, base); break;
 100   default:  ShouldNotReachHere();
 101   }
 102 }
 103 
 104 void MacroAssembler::align(int modulus, int max, int rem) {
 105   int padding = (rem + modulus - (offset() % modulus)) % modulus;
 106   if (padding > max) return;
 107   for (int c = (padding >> 2); c > 0; --c) { nop(); }
 108 }
 109 
 110 // Issue instructions that calculate the given address from the global TOC.
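     // The emitted sequence is roughly
     //   addis dst, R29_TOC, offset@hi   (if hi16)
     //   addi  dst, dst, offset@lo       (if lo16, optionally preceded by a relocation)
     // where offset is the distance of addr from the global TOC.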
 111 void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16,
 112                                                        bool add_relocation, bool emit_dummy_addr) {
 113   int offset = -1;
 114   if (emit_dummy_addr) {
 115     offset = -128; // dummy address
 116   } else if (addr != (address)(intptr_t)-1) {
 117     offset = MacroAssembler::offset_to_global_toc(addr);
 118   }
 119 
 120   if (hi16) {
 121     addis(dst, R29_TOC, MacroAssembler::largeoffset_si16_si16_hi(offset));
 122   }
 123   if (lo16) {
 124     if (add_relocation) {
 125       // Relocate at the addi to avoid confusion with a load from the method's TOC.
 126       relocate(internal_word_Relocation::spec(addr));
 127     }
 128     addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset));
 129   }
 130 }
 131 
 132 int MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) {
 133   const int offset = MacroAssembler::offset_to_global_toc(addr);
 134 
 135   const address inst2_addr = a;
 136   const int inst2 = *(int *)inst2_addr;
 137 
 138   // The relocation points to the second instruction, the addi,
 139   // and the addi reads and writes the same register dst.
 140   const int dst = inv_rt_field(inst2);
 141   assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");
 142 
 143   // Now, find the preceding addis which writes to dst.
 144   int inst1 = 0;
 145   address inst1_addr = inst2_addr - BytesPerInstWord;
 146   while (inst1_addr >= bound) {
 147     inst1 = *(int *) inst1_addr;
 148     if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
 149       // Stop, found the addis which writes dst.
 150       break;
 151     }
 152     inst1_addr -= BytesPerInstWord;
 153   }
 154 
 155   assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
 156   set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset));
 157   set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset));
 158   return (int)((intptr_t)addr - (intptr_t)inst1_addr);
 159 }
 160 
 161 address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) {
 162   const address inst2_addr = a;
 163   const int inst2 = *(int *)inst2_addr;
 164 
 165   // The relocation points to the second instruction, the addi,
 166   // and the addi reads and writes the same register dst.
 167   const int dst = inv_rt_field(inst2);
 168   assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");
 169 
 170   // Now, find the preceding addis which writes to dst.
 171   int inst1 = 0;
 172   address inst1_addr = inst2_addr - BytesPerInstWord;
 173   while (inst1_addr >= bound) {
 174     inst1 = *(int *) inst1_addr;
 175     if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
 176       // stop, found the addis which writes dst
 177       break;
 178     }
 179     inst1_addr -= BytesPerInstWord;
 180   }
 181 
 182   assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
 183 
 184   int offset = (get_imm(inst1_addr, 0) << 16) + get_imm(inst2_addr, 0);
 185   // -1 is a special case
 186   if (offset == -1) {
 187     return (address)(intptr_t)-1;
 188   } else {
 189     return global_toc() + offset;
 190   }
 191 }
 192 
 193 #ifdef _LP64
 194 // Patch compressed oops or klass constants.
 195 // Assembler sequence is
 196 // 1) compressed oops:
 197 //    lis  rx = const.hi
 198 //    ori rx = rx | const.lo
 199 // 2) compressed klass:
 200 //    lis  rx = const.hi
 201 //    clrldi rx = rx & 0xFFFFffff // clearMS32b, optional
 202 //    ori rx = rx | const.lo
 203 // A clrldi between them, if present, is simply skipped over when patching.
 204 int MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) {
 205   assert(UseCompressedOops, "Should only patch compressed oops");
 206 
 207   const address inst2_addr = a;
 208   const int inst2 = *(int *)inst2_addr;
 209 
 210   // The relocation points to the second instruction, the ori,
 211   // and the ori reads and writes the same register dst.
 212   const int dst = inv_rta_field(inst2);
 213   assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
 214   // Now, find the preceding addis which writes to dst.
 215   int inst1 = 0;
 216   address inst1_addr = inst2_addr - BytesPerInstWord;
 217   bool inst1_found = false;
 218   while (inst1_addr >= bound) {
 219     inst1 = *(int *)inst1_addr;
 220     if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
 221     inst1_addr -= BytesPerInstWord;
 222   }
 223   assert(inst1_found, "inst is not lis");
 224 
 225   int xc = (data >> 16) & 0xffff;
 226   int xd = (data >>  0) & 0xffff;
 227 
 228   set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo
 229   set_imm((int *)inst2_addr,        (xd)); // unsigned int
 230   return (int)((intptr_t)inst2_addr - (intptr_t)inst1_addr);
 231 }
 232 
 233 // Get compressed oop or klass constant.
 234 narrowOop MacroAssembler::get_narrow_oop(address a, address bound) {
 235   assert(UseCompressedOops, "Should only patch compressed oops");
 236 
 237   const address inst2_addr = a;
 238   const int inst2 = *(int *)inst2_addr;
 239 
 240   // The relocation points to the second instruction, the ori,
 241   // and the ori reads and writes the same register dst.
 242   const int dst = inv_rta_field(inst2);
 243   assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
 244   // Now, find the preceding lis which writes to dst.
 245   int inst1 = 0;
 246   address inst1_addr = inst2_addr - BytesPerInstWord;
 247   bool inst1_found = false;
 248 
 249   while (inst1_addr >= bound) {
 250     inst1 = *(int *) inst1_addr;
 251     if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break;}
 252     inst1_addr -= BytesPerInstWord;
 253   }
 254   assert(inst1_found, "inst is not lis");
 255 
 256   uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff));
 257   uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16);
 258 
 259   return (int) (xl | xh);
 260 }
 261 #endif // _LP64
 262 
 263 // Returns true if successful.
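     // Allocates a constant pool entry holding a.value() and emits a load of that
     // entry relative to 'toc' (one or two instructions, see ld_largeoffset_unchecked).
     // Returns false if the constant pool entry cannot be allocated.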
 264 bool MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a,
 265                                                 Register toc, bool fixed_size) {
 266   int toc_offset = 0;
 267   // Use RelocationHolder::none for the constant pool entry, otherwise
 268   // we will end up with a failing NativeCall::verify(x) where x is
 269   // the address of the constant pool entry.
 270   // FIXME: We should insert relocation information for oops at the constant
 271   // pool entries instead of inserting it at the loads; patching of a constant
 272   // pool entry should be less expensive.
 273   address const_address = address_constant((address)a.value(), RelocationHolder::none);
 274   if (const_address == NULL) { return false; } // allocation failure
 275   // Relocate at the pc of the load.
 276   relocate(a.rspec());
 277   toc_offset = (int)(const_address - code()->consts()->start());
 278   ld_largeoffset_unchecked(dst, toc_offset, toc, fixed_size);
 279   return true;
 280 }
 281 
 282 bool MacroAssembler::is_load_const_from_method_toc_at(address a) {
 283   const address inst1_addr = a;
 284   const int inst1 = *(int *)inst1_addr;
 285 
 286   // The relocation points to the ld or the addis.
 287   return (is_ld(inst1)) ||
 288          (is_addis(inst1) && inv_ra_field(inst1) != 0);
 289 }
 290 
 291 int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) {
 292   assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc");
 293 
 294   const address inst1_addr = a;
 295   const int inst1 = *(int *)inst1_addr;
 296 
 297   if (is_ld(inst1)) {
 298     return inv_d1_field(inst1);
 299   } else if (is_addis(inst1)) {
 300     const int dst = inv_rt_field(inst1);
 301 
 302     // Now, find the succeeding ld which reads and writes to dst.
 303     address inst2_addr = inst1_addr + BytesPerInstWord;
 304     int inst2 = 0;
 305     while (true) {
 306       inst2 = *(int *) inst2_addr;
 307       if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) {
 308         // Stop, found the ld which reads and writes dst.
 309         break;
 310       }
 311       inst2_addr += BytesPerInstWord;
 312     }
 313     return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2);
 314   }
 315   ShouldNotReachHere();
 316   return 0;
 317 }
 318 
 319 // Get the constant from a `load_const' sequence.
 320 long MacroAssembler::get_const(address a) {
 321   assert(is_load_const_at(a), "not a load of a constant");
 322   const int *p = (const int*) a;
 323   unsigned long x = (((unsigned long) (get_imm(a,0) & 0xffff)) << 48);
 324   if (is_ori(*(p+1))) {
 325     x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 32);
 326     x |= (((unsigned long) (get_imm(a,3) & 0xffff)) << 16);
 327     x |= (((unsigned long) (get_imm(a,4) & 0xffff)));
 328   } else if (is_lis(*(p+1))) {
 329     x |= (((unsigned long) (get_imm(a,2) & 0xffff)) << 32);
 330     x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 16);
 331     x |= (((unsigned long) (get_imm(a,3) & 0xffff)));
 332   } else {
 333     ShouldNotReachHere();
 334     return (long) 0;
 335   }
 336   return (long) x;
 337 }
 338 
 339 // Patch the 64-bit constant of a `load_const' sequence. This is a low-level
 340 // procedure. It does not flush the instruction cache, nor is it
 341 // MT-safe.
 342 void MacroAssembler::patch_const(address a, long x) {
 343   assert(is_load_const_at(a), "not a load of a constant");
 344   int *p = (int*) a;
 345   if (is_ori(*(p+1))) {
 346     set_imm(0 + p, (x >> 48) & 0xffff);
 347     set_imm(1 + p, (x >> 32) & 0xffff);
 348     set_imm(3 + p, (x >> 16) & 0xffff);
 349     set_imm(4 + p, x & 0xffff);
 350   } else if (is_lis(*(p+1))) {
 351     set_imm(0 + p, (x >> 48) & 0xffff);
 352     set_imm(2 + p, (x >> 32) & 0xffff);
 353     set_imm(1 + p, (x >> 16) & 0xffff);
 354     set_imm(3 + p, x & 0xffff);
 355   } else {
 356     ShouldNotReachHere();
 357   }
 358 }
 359 
 360 AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) {
 361   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
 362   int index = oop_recorder()->allocate_metadata_index(obj);
 363   RelocationHolder rspec = metadata_Relocation::spec(index);
 364   return AddressLiteral((address)obj, rspec);
 365 }
 366 
 367 AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) {
 368   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
 369   int index = oop_recorder()->find_index(obj);
 370   RelocationHolder rspec = metadata_Relocation::spec(index);
 371   return AddressLiteral((address)obj, rspec);
 372 }
 373 
 374 AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) {
 375   assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
 376   int oop_index = oop_recorder()->allocate_oop_index(obj);
 377   return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
 378 }
 379 
 380 AddressLiteral MacroAssembler::constant_oop_address(jobject obj) {
 381   assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
 382   int oop_index = oop_recorder()->find_index(obj);
 383   return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
 384 }
 385 
 386 RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
 387                                                       Register tmp, int offset) {
 388   intptr_t value = *delayed_value_addr;
 389   if (value != 0) {
 390     return RegisterOrConstant(value + offset);
 391   }
 392 
 393   // Load indirectly to solve generation ordering problem.
 394   // static address, no relocation
 395   int simm16_offset = load_const_optimized(tmp, delayed_value_addr, noreg, true);
 396   ld(tmp, simm16_offset, tmp); // must be aligned ((xa & 3) == 0)
 397 
 398   if (offset != 0) {
 399     addi(tmp, tmp, offset);
 400   }
 401 
 402   return RegisterOrConstant(tmp);
 403 }
 404 
 405 #ifndef PRODUCT
 406 void MacroAssembler::pd_print_patched_instruction(address branch) {
 407   Unimplemented(); // TODO: PPC port
 408 }
 409 #endif // ndef PRODUCT
 410 
 411 // Conditional far branch for destinations encodable in 24+2 bits.
 412 void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) {
 413 
 414   // If requested via the 'optimize' flag, relocate the bc_far as a
 415   // runtime_call and prepare for optimizing it when the code gets
 416   // relocated.
 417   if (optimize == bc_far_optimize_on_relocate) {
 418     relocate(relocInfo::runtime_call_type);
 419   }
 420 
 421   // variant 2:
 422   //
 423   //    b!cxx SKIP
 424   //    bxx   DEST
 425   //  SKIP:
 426   //
 427 
 428   const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
 429                                                 opposite_bcond(inv_boint_bcond(boint)));
 430 
 431   // We emit two branches.
 432   // First, a conditional branch which jumps around the far branch.
 433   const address not_taken_pc = pc() + 2 * BytesPerInstWord;
 434   const address bc_pc        = pc();
 435   bc(opposite_boint, biint, not_taken_pc);
 436 
 437   const int bc_instr = *(int*)bc_pc;
 438   assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition");
 439   assert(opposite_boint == inv_bo_field(bc_instr), "postcondition");
 440   assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))),
 441                                      opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))),
 442          "postcondition");
 443   assert(biint == inv_bi_field(bc_instr), "postcondition");
 444 
 445   // Second, an unconditional far branch which jumps to dest.
 446   // Note: target(dest) remembers the current pc (see CodeSection::target)
 447   //       and returns the current pc if the label is not bound yet; when
 448   //       the label gets bound, the unconditional far branch will be patched.
 449   const address target_pc = target(dest);
 450   const address b_pc  = pc();
 451   b(target_pc);
 452 
 453   assert(not_taken_pc == pc(),                     "postcondition");
 454   assert(dest.is_bound() || target_pc == b_pc, "postcondition");
 455 }
 456 
 457 // 1 or 2 instructions
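     // Emits a single bc if the bound destination is reachable by a conditional
     // branch; otherwise emits the two-instruction bc_far variant 2.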
 458 void MacroAssembler::bc_far_optimized(int boint, int biint, Label& dest) {
 459   if (dest.is_bound() && is_within_range_of_bcxx(target(dest), pc())) {
 460     bc(boint, biint, dest);
 461   } else {
 462     bc_far(boint, biint, dest, MacroAssembler::bc_far_optimize_on_relocate);
 463   }
 464 }
 465 
 466 bool MacroAssembler::is_bc_far_at(address instruction_addr) {
 467   return is_bc_far_variant1_at(instruction_addr) ||
 468          is_bc_far_variant2_at(instruction_addr) ||
 469          is_bc_far_variant3_at(instruction_addr);
 470 }
 471 
 472 address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) {
 473   if (is_bc_far_variant1_at(instruction_addr)) {
 474     const address instruction_1_addr = instruction_addr;
 475     const int instruction_1 = *(int*)instruction_1_addr;
 476     return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr);
 477   } else if (is_bc_far_variant2_at(instruction_addr)) {
 478     const address instruction_2_addr = instruction_addr + 4;
 479     return bxx_destination(instruction_2_addr);
 480   } else if (is_bc_far_variant3_at(instruction_addr)) {
 481     return instruction_addr + 8;
 482   }
 483   // variant 4 ???
 484   ShouldNotReachHere();
 485   return NULL;
 486 }
 487 
 488 void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) {
 489   if (is_bc_far_variant3_at(instruction_addr)) {
 490     // variant 3, far cond branch to the next instruction, already patched to nops:
 491     //
 492     //    nop
 493     //    endgroup
 494     //  SKIP/DEST:
 495     //
 496     return;
 497   }
 498 
 499   // first, extract boint and biint from the current branch
 500   int boint = 0;
 501   int biint = 0;
 502 
 503   ResourceMark rm;
 504   const int code_size = 2 * BytesPerInstWord;
 505   CodeBuffer buf(instruction_addr, code_size);
 506   MacroAssembler masm(&buf);
 507   if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) {
 508     // Far branch to next instruction: Optimize it by patching nops (produce variant 3).
 509     masm.nop();
 510     masm.endgroup();
 511   } else {
 512     if (is_bc_far_variant1_at(instruction_addr)) {
 513       // variant 1, the 1st instruction contains the destination address:
 514       //
 515       //    bcxx  DEST
 516       //    nop
 517       //
 518       const int instruction_1 = *(int*)(instruction_addr);
 519       boint = inv_bo_field(instruction_1);
 520       biint = inv_bi_field(instruction_1);
 521     } else if (is_bc_far_variant2_at(instruction_addr)) {
 522       // variant 2, the 2nd instruction contains the destination address:
 523       //
 524       //    b!cxx SKIP
 525       //    bxx   DEST
 526       //  SKIP:
 527       //
 528       const int instruction_1 = *(int*)(instruction_addr);
 529       boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))),
 530           opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1))));
 531       biint = inv_bi_field(instruction_1);
 532     } else {
 533       // variant 4???
 534       ShouldNotReachHere();
 535     }
 536 
 537     // second, set the new branch destination and optimize the code
 538     if (dest != instruction_addr + 4 && // the bc_far is still unbound!
 539         masm.is_within_range_of_bcxx(dest, instruction_addr)) {
 540       // variant 1:
 541       //
 542       //    bcxx  DEST
 543       //    nop
 544       //
 545       masm.bc(boint, biint, dest);
 546       masm.nop();
 547     } else {
 548       // variant 2:
 549       //
 550       //    b!cxx SKIP
 551       //    bxx   DEST
 552       //  SKIP:
 553       //
 554       const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
 555                                                     opposite_bcond(inv_boint_bcond(boint)));
 556       const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord;
 557       masm.bc(opposite_boint, biint, not_taken_pc);
 558       masm.b(dest);
 559     }
 560   }
 561   ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
 562 }
 563 
 564 // Emit a patchable (NOT MT-safe) 64-bit absolute call/jump.
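     // The emitted sequence is 7 instructions long; see the
     // is_bxx64_patchable_variant*_at() predicates below. Roughly:
     //   variant 1b: mr R0,R11; addis/addi R11,<toc-relative dest>; mtctr R11; mr R11,R0; nop; bctr[l]
     //   variant 2:  6 nops followed by "bl dest" (call), or "b dest" followed by 6 nops (jump)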
 565 void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) {
 566   // get current pc
 567   uint64_t start_pc = (uint64_t) pc();
 568 
 569   const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last
 570   const address pc_of_b  = (address) (start_pc + (0*BytesPerInstWord)); // b is first
 571 
 572   // relocate here
 573   if (rt != relocInfo::none) {
 574     relocate(rt);
 575   }
 576 
 577   if ( ReoptimizeCallSequences &&
 578        (( link && is_within_range_of_b(dest, pc_of_bl)) ||
 579         (!link && is_within_range_of_b(dest, pc_of_b)))) {
 580     // variant 2:
 581     // Emit an optimized, pc-relative call/jump.
 582 
 583     if (link) {
 584       // some padding
 585       nop();
 586       nop();
 587       nop();
 588       nop();
 589       nop();
 590       nop();
 591 
 592       // do the call
 593       assert(pc() == pc_of_bl, "just checking");
 594       bl(dest, relocInfo::none);
 595     } else {
 596       // do the jump
 597       assert(pc() == pc_of_b, "just checking");
 598       b(dest, relocInfo::none);
 599 
 600       // some padding
 601       nop();
 602       nop();
 603       nop();
 604       nop();
 605       nop();
 606       nop();
 607     }
 608 
 609     // Assert that we can identify the emitted call/jump.
 610     assert(is_bxx64_patchable_variant2_at((address)start_pc, link),
 611            "can't identify emitted call");
 612   } else {
 613     // variant 1:
 614     mr(R0, R11);  // spill R11 -> R0.
 615 
 616     // Load the destination address into CTR,
 617     // calculate destination relative to global toc.
 618     calculate_address_from_global_toc(R11, dest, true, true, false);
 619 
 620     mtctr(R11);
 621     mr(R11, R0);  // spill R11 <- R0.
 622     nop();
 623 
 624     // do the call/jump
 625     if (link) {
 626       bctrl();
 627     } else {
 628       bctr();
 629     }
 630     // Assert that we can identify the emitted call/jump.
 631     assert(is_bxx64_patchable_variant1b_at((address)start_pc, link),
 632            "can't identify emitted call");
 633   }
 634 
 635   // Assert that we can identify the emitted call/jump.
 636   assert(is_bxx64_patchable_at((address)start_pc, link),
 637          "can't identify emitted call");
 638   assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest,
 639          "wrong encoding of dest address");
 640 }
 641 
 642 // Identify a bxx64_patchable instruction.
 643 bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) {
 644   return is_bxx64_patchable_variant1b_at(instruction_addr, link)
 645     //|| is_bxx64_patchable_variant1_at(instruction_addr, link)
 646       || is_bxx64_patchable_variant2_at(instruction_addr, link);
 647 }
 648 
 649 // Does the call64_patchable instruction use a pc-relative encoding of
 650 // the call destination?
 651 bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) {
 652   // variant 2 is pc-relative
 653   return is_bxx64_patchable_variant2_at(instruction_addr, link);
 654 }
 655 
 656 // Identify variant 1.
 657 bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) {
 658   unsigned int* instr = (unsigned int*) instruction_addr;
 659   return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
 660     && is_mtctr(instr[5]) // mtctr
 661     && is_load_const_at(instruction_addr);
 662 }
 663 
 664 // Identify variant 1b: load destination relative to global toc.
 665 bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) {
 666   unsigned int* instr = (unsigned int*) instruction_addr;
 667   return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
 668     && is_mtctr(instr[3]) // mtctr
 669     && is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr);
 670 }
 671 
 672 // Identify variant 2.
 673 bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) {
 674   unsigned int* instr = (unsigned int*) instruction_addr;
 675   if (link) {
 676     return is_bl (instr[6])  // bl dest is last
 677       && is_nop(instr[0])  // nop
 678       && is_nop(instr[1])  // nop
 679       && is_nop(instr[2])  // nop
 680       && is_nop(instr[3])  // nop
 681       && is_nop(instr[4])  // nop
 682       && is_nop(instr[5]); // nop
 683   } else {
 684     return is_b  (instr[0])  // b  dest is first
 685       && is_nop(instr[1])  // nop
 686       && is_nop(instr[2])  // nop
 687       && is_nop(instr[3])  // nop
 688       && is_nop(instr[4])  // nop
 689       && is_nop(instr[5])  // nop
 690       && is_nop(instr[6]); // nop
 691   }
 692 }
 693 
 694 // Set dest address of a bxx64_patchable instruction.
 695 void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) {
 696   ResourceMark rm;
 697   int code_size = MacroAssembler::bxx64_patchable_size;
 698   CodeBuffer buf(instruction_addr, code_size);
 699   MacroAssembler masm(&buf);
 700   masm.bxx64_patchable(dest, relocInfo::none, link);
 701   ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
 702 }
 703 
 704 // Get dest address of a bxx64_patchable instruction.
 705 address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) {
 706   if (is_bxx64_patchable_variant1_at(instruction_addr, link)) {
 707     return (address) (unsigned long) get_const(instruction_addr);
 708   } else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) {
 709     unsigned int* instr = (unsigned int*) instruction_addr;
 710     if (link) {
 711       const int instr_idx = 6; // bl is last
 712       int branchoffset = branch_destination(instr[instr_idx], 0);
 713       return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
 714     } else {
 715       const int instr_idx = 0; // b is first
 716       int branchoffset = branch_destination(instr[instr_idx], 0);
 717       return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
 718     }
 719   // Load dest relative to global toc.
 720   } else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) {
 721     return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord,
 722                                                                instruction_addr);
 723   } else {
 724     ShouldNotReachHere();
 725     return NULL;
 726   }
 727 }
 728 
 729 // Uses ordering which corresponds to ABI:
 730 //    _savegpr0_14:  std  r14,-144(r1)
 731 //    _savegpr0_15:  std  r15,-136(r1)
 732 //    _savegpr0_16:  std  r16,-128(r1)
 733 void MacroAssembler::save_nonvolatile_gprs(Register dst, int offset) {
 734   std(R14, offset, dst);   offset += 8;
 735   std(R15, offset, dst);   offset += 8;
 736   std(R16, offset, dst);   offset += 8;
 737   std(R17, offset, dst);   offset += 8;
 738   std(R18, offset, dst);   offset += 8;
 739   std(R19, offset, dst);   offset += 8;
 740   std(R20, offset, dst);   offset += 8;
 741   std(R21, offset, dst);   offset += 8;
 742   std(R22, offset, dst);   offset += 8;
 743   std(R23, offset, dst);   offset += 8;
 744   std(R24, offset, dst);   offset += 8;
 745   std(R25, offset, dst);   offset += 8;
 746   std(R26, offset, dst);   offset += 8;
 747   std(R27, offset, dst);   offset += 8;
 748   std(R28, offset, dst);   offset += 8;
 749   std(R29, offset, dst);   offset += 8;
 750   std(R30, offset, dst);   offset += 8;
 751   std(R31, offset, dst);   offset += 8;
 752 
 753   stfd(F14, offset, dst);   offset += 8;
 754   stfd(F15, offset, dst);   offset += 8;
 755   stfd(F16, offset, dst);   offset += 8;
 756   stfd(F17, offset, dst);   offset += 8;
 757   stfd(F18, offset, dst);   offset += 8;
 758   stfd(F19, offset, dst);   offset += 8;
 759   stfd(F20, offset, dst);   offset += 8;
 760   stfd(F21, offset, dst);   offset += 8;
 761   stfd(F22, offset, dst);   offset += 8;
 762   stfd(F23, offset, dst);   offset += 8;
 763   stfd(F24, offset, dst);   offset += 8;
 764   stfd(F25, offset, dst);   offset += 8;
 765   stfd(F26, offset, dst);   offset += 8;
 766   stfd(F27, offset, dst);   offset += 8;
 767   stfd(F28, offset, dst);   offset += 8;
 768   stfd(F29, offset, dst);   offset += 8;
 769   stfd(F30, offset, dst);   offset += 8;
 770   stfd(F31, offset, dst);
 771 }
 772 
 773 // Uses ordering which corresponds to ABI:
 774 //    _restgpr0_14:  ld   r14,-144(r1)
 775 //    _restgpr0_15:  ld   r15,-136(r1)
 776 //    _restgpr0_16:  ld   r16,-128(r1)
 777 void MacroAssembler::restore_nonvolatile_gprs(Register src, int offset) {
 778   ld(R14, offset, src);   offset += 8;
 779   ld(R15, offset, src);   offset += 8;
 780   ld(R16, offset, src);   offset += 8;
 781   ld(R17, offset, src);   offset += 8;
 782   ld(R18, offset, src);   offset += 8;
 783   ld(R19, offset, src);   offset += 8;
 784   ld(R20, offset, src);   offset += 8;
 785   ld(R21, offset, src);   offset += 8;
 786   ld(R22, offset, src);   offset += 8;
 787   ld(R23, offset, src);   offset += 8;
 788   ld(R24, offset, src);   offset += 8;
 789   ld(R25, offset, src);   offset += 8;
 790   ld(R26, offset, src);   offset += 8;
 791   ld(R27, offset, src);   offset += 8;
 792   ld(R28, offset, src);   offset += 8;
 793   ld(R29, offset, src);   offset += 8;
 794   ld(R30, offset, src);   offset += 8;
 795   ld(R31, offset, src);   offset += 8;
 796 
 797   // FP registers
 798   lfd(F14, offset, src);   offset += 8;
 799   lfd(F15, offset, src);   offset += 8;
 800   lfd(F16, offset, src);   offset += 8;
 801   lfd(F17, offset, src);   offset += 8;
 802   lfd(F18, offset, src);   offset += 8;
 803   lfd(F19, offset, src);   offset += 8;
 804   lfd(F20, offset, src);   offset += 8;
 805   lfd(F21, offset, src);   offset += 8;
 806   lfd(F22, offset, src);   offset += 8;
 807   lfd(F23, offset, src);   offset += 8;
 808   lfd(F24, offset, src);   offset += 8;
 809   lfd(F25, offset, src);   offset += 8;
 810   lfd(F26, offset, src);   offset += 8;
 811   lfd(F27, offset, src);   offset += 8;
 812   lfd(F28, offset, src);   offset += 8;
 813   lfd(F29, offset, src);   offset += 8;
 814   lfd(F30, offset, src);   offset += 8;
 815   lfd(F31, offset, src);
 816 }
 817 
 818 // For verify_oops.
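     // Saves the volatile GPRs R2..R12 and FPRs F0..F13, 8 bytes apart, starting at dst+offset.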
 819 void MacroAssembler::save_volatile_gprs(Register dst, int offset) {
 820   std(R2,  offset, dst);   offset += 8;
 821   std(R3,  offset, dst);   offset += 8;
 822   std(R4,  offset, dst);   offset += 8;
 823   std(R5,  offset, dst);   offset += 8;
 824   std(R6,  offset, dst);   offset += 8;
 825   std(R7,  offset, dst);   offset += 8;
 826   std(R8,  offset, dst);   offset += 8;
 827   std(R9,  offset, dst);   offset += 8;
 828   std(R10, offset, dst);   offset += 8;
 829   std(R11, offset, dst);   offset += 8;
 830   std(R12, offset, dst);   offset += 8;
 831 
 832   stfd(F0, offset, dst);   offset += 8;
 833   stfd(F1, offset, dst);   offset += 8;
 834   stfd(F2, offset, dst);   offset += 8;
 835   stfd(F3, offset, dst);   offset += 8;
 836   stfd(F4, offset, dst);   offset += 8;
 837   stfd(F5, offset, dst);   offset += 8;
 838   stfd(F6, offset, dst);   offset += 8;
 839   stfd(F7, offset, dst);   offset += 8;
 840   stfd(F8, offset, dst);   offset += 8;
 841   stfd(F9, offset, dst);   offset += 8;
 842   stfd(F10, offset, dst);  offset += 8;
 843   stfd(F11, offset, dst);  offset += 8;
 844   stfd(F12, offset, dst);  offset += 8;
 845   stfd(F13, offset, dst);
 846 }
 847 
 848 // For verify_oops.
 849 void MacroAssembler::restore_volatile_gprs(Register src, int offset) {
 850   ld(R2,  offset, src);   offset += 8;
 851   ld(R3,  offset, src);   offset += 8;
 852   ld(R4,  offset, src);   offset += 8;
 853   ld(R5,  offset, src);   offset += 8;
 854   ld(R6,  offset, src);   offset += 8;
 855   ld(R7,  offset, src);   offset += 8;
 856   ld(R8,  offset, src);   offset += 8;
 857   ld(R9,  offset, src);   offset += 8;
 858   ld(R10, offset, src);   offset += 8;
 859   ld(R11, offset, src);   offset += 8;
 860   ld(R12, offset, src);   offset += 8;
 861 
 862   lfd(F0, offset, src);   offset += 8;
 863   lfd(F1, offset, src);   offset += 8;
 864   lfd(F2, offset, src);   offset += 8;
 865   lfd(F3, offset, src);   offset += 8;
 866   lfd(F4, offset, src);   offset += 8;
 867   lfd(F5, offset, src);   offset += 8;
 868   lfd(F6, offset, src);   offset += 8;
 869   lfd(F7, offset, src);   offset += 8;
 870   lfd(F8, offset, src);   offset += 8;
 871   lfd(F9, offset, src);   offset += 8;
 872   lfd(F10, offset, src);  offset += 8;
 873   lfd(F11, offset, src);  offset += 8;
 874   lfd(F12, offset, src);  offset += 8;
 875   lfd(F13, offset, src);
 876 }
 877 
 878 void MacroAssembler::save_LR_CR(Register tmp) {
 879   mfcr(tmp);
 880   std(tmp, _abi(cr), R1_SP);
 881   mflr(tmp);
 882   std(tmp, _abi(lr), R1_SP);
 883   // Tmp must contain lr on exit! (see return_addr and prolog in ppc64.ad)
 884 }
 885 
 886 void MacroAssembler::restore_LR_CR(Register tmp) {
 887   assert(tmp != R1_SP, "must be distinct");
 888   ld(tmp, _abi(lr), R1_SP);
 889   mtlr(tmp);
 890   ld(tmp, _abi(cr), R1_SP);
 891   mtcr(tmp);
 892 }
 893 
 894 address MacroAssembler::get_PC_trash_LR(Register result) {
 895   Label L;
 896   bl(L);
 897   bind(L);
 898   address lr_pc = pc();
 899   mflr(result);
 900   return lr_pc;
 901 }
 902 
 903 void MacroAssembler::resize_frame(Register offset, Register tmp) {
 904 #ifdef ASSERT
 905   assert_different_registers(offset, tmp, R1_SP);
 906   andi_(tmp, offset, frame::alignment_in_bytes-1);
 907   asm_assert_eq("resize_frame: unaligned", 0x204);
 908 #endif
 909 
 910   // tmp <- *(SP)
 911   ld(tmp, _abi(callers_sp), R1_SP);
 912   // addr <- SP + offset;
 913   // *(addr) <- tmp;
 914   // SP <- addr
 915   stdux(tmp, R1_SP, offset);
 916 }
 917 
 918 void MacroAssembler::resize_frame(int offset, Register tmp) {
 919   assert(is_simm(offset, 16), "too big an offset");
 920   assert_different_registers(tmp, R1_SP);
 921   assert((offset & (frame::alignment_in_bytes-1))==0, "resize_frame: unaligned");
 922   // tmp <- *(SP)
 923   ld(tmp, _abi(callers_sp), R1_SP);
 924   // addr <- SP + offset;
 925   // *(addr) <- tmp;
 926   // SP <- addr
 927   stdu(tmp, offset, R1_SP);
 928 }
 929 
 930 void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) {
 931   // (addr == tmp1) || (addr == tmp2) is allowed here!
 932   assert(tmp1 != tmp2, "must be distinct");
 933 
 934   // compute offset w.r.t. current stack pointer
 935   // tmp_1 <- addr - SP (!)
 936   subf(tmp1, R1_SP, addr);
 937 
 938   // atomically update SP keeping back link.
 939   resize_frame(tmp1/* offset */, tmp2/* tmp */);
 940 }
 941 
 942 void MacroAssembler::push_frame(Register bytes, Register tmp) {
 943 #ifdef ASSERT
 944   assert(bytes != R0, "r0 not allowed here");
 945   andi_(R0, bytes, frame::alignment_in_bytes-1);
 946   asm_assert_eq("push_frame(Reg, Reg): unaligned", 0x203);
 947 #endif
 948   neg(tmp, bytes);
 949   stdux(R1_SP, R1_SP, tmp);
 950 }
 951 
 952 // Push a frame of size `bytes'.
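     // The size is aligned up to frame::alignment_in_bytes. If the negated size does
     // not fit into 16 bits, it is materialized in 'tmp' and the frame is pushed with stdux.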
 953 void MacroAssembler::push_frame(unsigned int bytes, Register tmp) {
 954   long offset = align_addr(bytes, frame::alignment_in_bytes);
 955   if (is_simm(-offset, 16)) {
 956     stdu(R1_SP, -offset, R1_SP);
 957   } else {
 958     load_const_optimized(tmp, -offset);
 959     stdux(R1_SP, R1_SP, tmp);
 960   }
 961 }
 962 
 963 // Push a frame of size `bytes' plus abi_reg_args on top.
 964 void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) {
 965   push_frame(bytes + frame::abi_reg_args_size, tmp);
 966 }
 967 
 968 // Set up a new C frame with a spill area for non-volatile GPRs and
 969 // additional space for local variables.
 970 void MacroAssembler::push_frame_reg_args_nonvolatiles(unsigned int bytes,
 971                                                       Register tmp) {
 972   push_frame(bytes + frame::abi_reg_args_size + frame::spill_nonvolatiles_size, tmp);
 973 }
 974 
 975 // Pop current C frame.
 976 void MacroAssembler::pop_frame() {
 977   ld(R1_SP, _abi(callers_sp), R1_SP);
 978 }
 979 
 980 #if defined(ABI_ELFv2)
 981 address MacroAssembler::branch_to(Register r_function_entry, bool and_link) {
 982   // TODO(asmundak): make sure the caller uses R12 as function descriptor
 983   // most of the time.
 984   if (R12 != r_function_entry) {
 985     mr(R12, r_function_entry);
 986   }
 987   mtctr(R12);
 988   // Do a call or a branch.
 989   if (and_link) {
 990     bctrl();
 991   } else {
 992     bctr();
 993   }
 994   _last_calls_return_pc = pc();
 995 
 996   return _last_calls_return_pc;
 997 }
 998 
 999 // Call a C function via a function descriptor and use full C
1000 // calling conventions. Updates and returns _last_calls_return_pc.
1001 address MacroAssembler::call_c(Register r_function_entry) {
1002   return branch_to(r_function_entry, /*and_link=*/true);
1003 }
1004 
1005 // For tail calls: only branch, don't link, so callee returns to caller of this function.
1006 address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) {
1007   return branch_to(r_function_entry, /*and_link=*/false);
1008 }
1009 
1010 address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) {
1011   load_const(R12, function_entry, R0);
1012   return branch_to(R12,  /*and_link=*/true);
1013 }
1014 
1015 #else
1016 // Generic version of a call to C function via a function descriptor
1017 // with variable support for C calling conventions (TOC, ENV, etc.).
1018 // Updates and returns _last_calls_return_pc.
1019 address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call,
1020                                   bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) {
1021   // we emit standard ptrgl glue code here
1022   assert((function_descriptor != R0), "function_descriptor cannot be R0");
1023 
1024   // retrieve necessary entries from the function descriptor
1025   ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor);
1026   mtctr(R0);
1027 
1028   if (load_toc_of_callee) {
1029     ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor);
1030   }
1031   if (load_env_of_callee) {
1032     ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor);
1033   } else if (load_toc_of_callee) {
1034     li(R11, 0);
1035   }
1036 
1037   // do a call or a branch
1038   if (and_link) {
1039     bctrl();
1040   } else {
1041     bctr();
1042   }
1043   _last_calls_return_pc = pc();
1044 
1045   return _last_calls_return_pc;
1046 }
1047 
1048 // Call a C function via a function descriptor and use full C calling
1049 // conventions.
1050 // We don't use the TOC in generated code, so there is no need to save
1051 // and restore its value.
1052 address MacroAssembler::call_c(Register fd) {
1053   return branch_to(fd, /*and_link=*/true,
1054                        /*save toc=*/false,
1055                        /*restore toc=*/false,
1056                        /*load toc=*/true,
1057                        /*load env=*/true);
1058 }
1059 
1060 address MacroAssembler::call_c_and_return_to_caller(Register fd) {
1061   return branch_to(fd, /*and_link=*/false,
1062                        /*save toc=*/false,
1063                        /*restore toc=*/false,
1064                        /*load toc=*/true,
1065                        /*load env=*/true);
1066 }
1067 
1068 address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) {
1069   if (rt != relocInfo::none) {
1070     // this call needs to be relocatable
1071     if (!ReoptimizeCallSequences
1072         || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
1073         || fd == NULL   // support code-size estimation
1074         || !fd->is_friend_function()
1075         || fd->entry() == NULL) {
1076       // it's not a friend function as defined by class FunctionDescriptor,
1077       // so do a full call-c here.
1078       load_const(R11, (address)fd, R0);
1079 
1080       bool has_env = (fd != NULL && fd->env() != NULL);
1081       return branch_to(R11, /*and_link=*/true,
1082                             /*save toc=*/false,
1083                             /*restore toc=*/false,
1084                             /*load toc=*/true,
1085                             /*load env=*/has_env);
1086     } else {
1087       // It's a friend function. Load the entry point and don't care about
1088       // toc and env. Use an optimizable call instruction, but ensure the
1089       // same code-size as in the case of a non-friend function.
1090       nop();
1091       nop();
1092       nop();
1093       bl64_patchable(fd->entry(), rt);
1094       _last_calls_return_pc = pc();
1095       return _last_calls_return_pc;
1096     }
1097   } else {
1098     // This call does not need to be relocatable, do more aggressive
1099     // optimizations.
1100     if (!ReoptimizeCallSequences
1101       || !fd->is_friend_function()) {
1102       // It's not a friend function as defined by class FunctionDescriptor,
1103       // so do a full call-c here.
1104       load_const(R11, (address)fd, R0);
1105       return branch_to(R11, /*and_link=*/true,
1106                             /*save toc=*/false,
1107                             /*restore toc=*/false,
1108                             /*load toc=*/true,
1109                             /*load env=*/true);
1110     } else {
1111       // it's a friend function, load the entry point and don't care about
1112       // toc and env.
1113       address dest = fd->entry();
1114       if (is_within_range_of_b(dest, pc())) {
1115         bl(dest);
1116       } else {
1117         bl64_patchable(dest, rt);
1118       }
1119       _last_calls_return_pc = pc();
1120       return _last_calls_return_pc;
1121     }
1122   }
1123 }
1124 
1125 // Call a C function.  All constants needed reside in TOC.
1126 //
1127 // Read the address to call from the TOC.
1128 // Read env from TOC, if fd specifies an env.
1129 // Read new TOC from TOC.
1130 address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd,
1131                                          relocInfo::relocType rt, Register toc) {
1132   if (!ReoptimizeCallSequences
1133     || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
1134     || !fd->is_friend_function()) {
1135     // It's not a friend function as defined by class FunctionDescriptor,
1136     // so do a full call-c here.
1137     assert(fd->entry() != NULL, "function must be linked");
1138 
1139     AddressLiteral fd_entry(fd->entry());
1140     bool success = load_const_from_method_toc(R11, fd_entry, toc, /*fixed_size*/ true);
1141     mtctr(R11);
1142     if (fd->env() == NULL) {
1143       li(R11, 0);
1144       nop();
1145     } else {
1146       AddressLiteral fd_env(fd->env());
1147       success = success && load_const_from_method_toc(R11, fd_env, toc, /*fixed_size*/ true);
1148     }
1149     AddressLiteral fd_toc(fd->toc());
1150     // Set R2_TOC (load from toc)
1151     success = success && load_const_from_method_toc(R2_TOC, fd_toc, toc, /*fixed_size*/ true);
1152     bctrl();
1153     _last_calls_return_pc = pc();
1154     if (!success) { return NULL; }
1155   } else {
1156     // It's a friend function, load the entry point and don't care about
1157     // toc and env. Use an optimizable call instruction, but ensure the
1158     // same code-size as in the case of a non-friend function.
1159     nop();
1160     bl64_patchable(fd->entry(), rt);
1161     _last_calls_return_pc = pc();
1162   }
1163   return _last_calls_return_pc;
1164 }
1165 #endif // ABI_ELFv2
1166 
1167 void MacroAssembler::call_VM_base(Register oop_result,
1168                                   Register last_java_sp,
1169                                   address  entry_point,
1170                                   bool     check_exceptions) {
1171   BLOCK_COMMENT("call_VM {");
1172   // Determine last_java_sp register.
1173   if (!last_java_sp->is_valid()) {
1174     last_java_sp = R1_SP;
1175   }
1176   set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1);
1177 
1178   // ARG1 must hold thread address.
1179   mr(R3_ARG1, R16_thread);
1180 #if defined(ABI_ELFv2)
1181   address return_pc = call_c(entry_point, relocInfo::none);
1182 #else
1183   address return_pc = call_c((FunctionDescriptor*)entry_point, relocInfo::none);
1184 #endif
1185 
1186   reset_last_Java_frame();
1187 
1188   // Check for pending exceptions.
1189   if (check_exceptions) {
1190     // We don't check for exceptions here.
1191     ShouldNotReachHere();
1192   }
1193 
1194   // Get oop result if there is one and reset the value in the thread.
1195   if (oop_result->is_valid()) {
1196     get_vm_result(oop_result);
1197   }
1198 
1199   _last_calls_return_pc = return_pc;
1200   BLOCK_COMMENT("} call_VM");
1201 }
1202 
1203 void MacroAssembler::call_VM_leaf_base(address entry_point) {
1204   BLOCK_COMMENT("call_VM_leaf {");
1205 #if defined(ABI_ELFv2)
1206   call_c(entry_point, relocInfo::none);
1207 #else
1208   call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, entry_point), relocInfo::none);
1209 #endif
1210   BLOCK_COMMENT("} call_VM_leaf");
1211 }
1212 
1213 void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions) {
1214   call_VM_base(oop_result, noreg, entry_point, check_exceptions);
1215 }
1216 
1217 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1,
1218                              bool check_exceptions) {
1219   // R3_ARG1 is reserved for the thread.
1220   mr_if_needed(R4_ARG2, arg_1);
1221   call_VM(oop_result, entry_point, check_exceptions);
1222 }
1223 
1224 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2,
1225                              bool check_exceptions) {
1226   // R3_ARG1 is reserved for the thread
1227   mr_if_needed(R4_ARG2, arg_1);
1228   assert(arg_2 != R4_ARG2, "smashed argument");
1229   mr_if_needed(R5_ARG3, arg_2);
1230   call_VM(oop_result, entry_point, check_exceptions);
1231 }
1232 
1233 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3,
1234                              bool check_exceptions) {
1235   // R3_ARG1 is reserved for the thread
1236   mr_if_needed(R4_ARG2, arg_1);
1237   assert(arg_2 != R4_ARG2, "smashed argument");
1238   mr_if_needed(R5_ARG3, arg_2);
1239   mr_if_needed(R6_ARG4, arg_3);
1240   call_VM(oop_result, entry_point, check_exceptions);
1241 }
1242 
1243 void MacroAssembler::call_VM_leaf(address entry_point) {
1244   call_VM_leaf_base(entry_point);
1245 }
1246 
1247 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) {
1248   mr_if_needed(R3_ARG1, arg_1);
1249   call_VM_leaf(entry_point);
1250 }
1251 
1252 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) {
1253   mr_if_needed(R3_ARG1, arg_1);
1254   assert(arg_2 != R3_ARG1, "smashed argument");
1255   mr_if_needed(R4_ARG2, arg_2);
1256   call_VM_leaf(entry_point);
1257 }
1258 
1259 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) {
1260   mr_if_needed(R3_ARG1, arg_1);
1261   assert(arg_2 != R3_ARG1, "smashed argument");
1262   mr_if_needed(R4_ARG2, arg_2);
1263   assert(arg_3 != R3_ARG1 && arg_3 != R4_ARG2, "smashed argument");
1264   mr_if_needed(R5_ARG3, arg_3);
1265   call_VM_leaf(entry_point);
1266 }
1267 
1268 // Check whether instruction is a read access to the polling page
1269 // which was emitted by load_from_polling_page(..).
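     // The expected form is "ld R0, 0(ra)" with ra != R0. If a ucontext is given,
     // the value of ra is additionally checked to point into the safepoint polling page.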
1270 bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext,
1271                                                address* polling_address_ptr) {
1272   if (!is_ld(instruction))
1273     return false; // It's not a ld. Fail.
1274 
1275   int rt = inv_rt_field(instruction);
1276   int ra = inv_ra_field(instruction);
1277   int ds = inv_ds_field(instruction);
1278   if (!(ds == 0 && ra != 0 && rt == 0)) {
1279     return false; // It's not a ld(r0, X, ra). Fail.
1280   }
1281 
1282   if (!ucontext) {
1283     // Set polling address.
1284     if (polling_address_ptr != NULL) {
1285       *polling_address_ptr = NULL;
1286     }
1287     return true; // No ucontext given. Can't check value of ra. Assume true.
1288   }
1289 
1290 #ifdef LINUX
1291   // Ucontext given. Check that register ra contains the address of
1292   // the safepoint polling page.
1293   ucontext_t* uc = (ucontext_t*) ucontext;
1294   // Set polling address.
1295   address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds;
1296   if (polling_address_ptr != NULL) {
1297     *polling_address_ptr = addr;
1298   }
1299   return os::is_poll_address(addr);
1300 #else
1301   // Not on Linux, ucontext must be NULL.
1302   ShouldNotReachHere();
1303   return false;
1304 #endif
1305 }
1306 
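     // Check whether the instruction is a store (stw, stwu, stwx or stwux) to the
     // memory serialization page; the effective address is computed from the register
     // contents (plus displacement) found in the given ucontext.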
1307 bool MacroAssembler::is_memory_serialization(int instruction, JavaThread* thread, void* ucontext) {
1308 #ifdef LINUX
1309   ucontext_t* uc = (ucontext_t*) ucontext;
1310 
1311   if (is_stwx(instruction) || is_stwux(instruction)) {
1312     int ra = inv_ra_field(instruction);
1313     int rb = inv_rb_field(instruction);
1314 
1315     // look up content of ra and rb in ucontext
1316     address ra_val=(address)uc->uc_mcontext.regs->gpr[ra];
1317     long rb_val=(long)uc->uc_mcontext.regs->gpr[rb];
1318     return os::is_memory_serialize_page(thread, ra_val+rb_val);
1319   } else if (is_stw(instruction) || is_stwu(instruction)) {
1320     int ra = inv_ra_field(instruction);
1321     int d1 = inv_d1_field(instruction);
1322 
1323     // look up content of ra in ucontext
1324     address ra_val=(address)uc->uc_mcontext.regs->gpr[ra];
1325     return os::is_memory_serialize_page(thread, ra_val+d1);
1326   } else {
1327     return false;
1328   }
1329 #else
1330   // workaround not needed on !LINUX :-)
1331   ShouldNotCallThis();
1332   return false;
1333 #endif
1334 }
1335 
1336 void MacroAssembler::bang_stack_with_offset(int offset) {
1337   // When increasing the stack, the old stack pointer will be written
1338   // to the new top of stack according to the PPC64 abi.
1339   // Therefore, stack banging is not necessary when increasing
1340   // the stack by <= os::vm_page_size() bytes.
1341   // When increasing the stack by a larger amount, this method is
1342   // called repeatedly to bang the intermediate pages.
1343 
1344   // Stack grows down, caller passes positive offset.
1345   assert(offset > 0, "must bang with positive offset");
1346 
1347   long stdoffset = -offset;
1348 
1349   if (is_simm(stdoffset, 16)) {
1350     // Signed 16 bit offset, a simple std is ok.
1351     if (UseLoadInstructionsForStackBangingPPC64) {
1352       ld(R0, (int)(signed short)stdoffset, R1_SP);
1353     } else {
1354       std(R0,(int)(signed short)stdoffset, R1_SP);
1355     }
1356   } else if (is_simm(stdoffset, 31)) {
1357     const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset);
1358     const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset);
1359 
1360     Register tmp = R11;
1361     addis(tmp, R1_SP, hi);
1362     if (UseLoadInstructionsForStackBangingPPC64) {
1363       ld(R0,  lo, tmp);
1364     } else {
1365       std(R0, lo, tmp);
1366     }
1367   } else {
1368     ShouldNotReachHere();
1369   }
1370 }
1371 
1372 // If instruction is a stack bang of the form
1373 //    std    R0,    x(Ry),       (see bang_stack_with_offset())
1374 //    stdu   R1_SP, x(R1_SP),    (see push_frame(), resize_frame())
1375 // or stdux  R1_SP, Rx, R1_SP    (see push_frame(), resize_frame())
1376 // return the banged address. Otherwise, return 0.
1377 address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) {
1378 #ifdef LINUX
1379   ucontext_t* uc = (ucontext_t*) ucontext;
1380   int rs = inv_rs_field(instruction);
1381   int ra = inv_ra_field(instruction);
1382   if (   (is_ld(instruction)   && rs == 0 &&  UseLoadInstructionsForStackBangingPPC64)
1383       || (is_std(instruction)  && rs == 0 && !UseLoadInstructionsForStackBangingPPC64)
1384       || (is_stdu(instruction) && rs == 1)) {
1385     int ds = inv_ds_field(instruction);
1386     // return banged address
1387     return ds+(address)uc->uc_mcontext.regs->gpr[ra];
1388   } else if (is_stdux(instruction) && rs == 1) {
1389     int rb = inv_rb_field(instruction);
1390     address sp = (address)uc->uc_mcontext.regs->gpr[1];
1391     long rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
1392     return ra != 1 || rb_val >= 0 ? NULL         // not a stack bang
1393                                   : sp + rb_val; // banged address
1394   }
1395   return NULL; // not a stack bang
1396 #else
1397   // workaround not needed on !LINUX :-)
1398   ShouldNotCallThis();
1399   return NULL;
1400 #endif
1401 }
1402 
1403 // CmpxchgX sets condition register to cmpX(current, compare).
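     // Implemented as a lwarx/stwcx_ retry loop. If contention_hint is set, a plain
     // lwz/cmpw pre-check avoids taking the reservation when the compare would fail anyway.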
1404 void MacroAssembler::cmpxchgw(ConditionRegister flag, Register dest_current_value,
1405                               Register compare_value, Register exchange_value,
1406                               Register addr_base, int semantics, bool cmpxchgx_hint,
1407                               Register int_flag_success, bool contention_hint) {
1408   Label retry;
1409   Label failed;
1410   Label done;
1411 
1412   // Save one branch if result is returned via register and
1413   // result register is different from the other ones.
1414   bool use_result_reg    = (int_flag_success != noreg);
1415   bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value &&
1416                             int_flag_success != exchange_value && int_flag_success != addr_base);
1417 
1418   if (use_result_reg && preset_result_reg) {
1419     li(int_flag_success, 0); // preset (assume cas failed)
1420   }
1421 
1422   // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1423   if (contention_hint) { // Don't try to reserve if cmp fails.
1424     lwz(dest_current_value, 0, addr_base);
1425     cmpw(flag, dest_current_value, compare_value);
1426     bne(flag, failed);
1427   }
1428 
1429   // release/fence semantics
1430   if (semantics & MemBarRel) {
1431     release();
1432   }
1433 
1434   // atomic emulation loop
1435   bind(retry);
1436 
1437   lwarx(dest_current_value, addr_base, cmpxchgx_hint);
1438   cmpw(flag, dest_current_value, compare_value);
1439   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1440     bne_predict_not_taken(flag, failed);
1441   } else {
1442     bne(                  flag, failed);
1443   }
1444   // branch to failed => (flag == ne), (dest_current_value != compare_value)
1445   // fall through    => (flag == eq), (dest_current_value == compare_value)
1446 
1447   stwcx_(exchange_value, addr_base);
1448   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1449     bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1450   } else {
1451     bne(                  CCR0, retry); // StXcx_ sets CCR0.
1452   }
1453   // fall through    => (flag == eq), (dest_current_value == compare_value), (swapped)
1454 
1455   // Result in register (must do this at the end because int_flag_success can be the
1456   // same register as one above).
1457   if (use_result_reg) {
1458     li(int_flag_success, 1);
1459   }
1460 
1461   if (semantics & MemBarFenceAfter) {
1462     fence();
1463   } else if (semantics & MemBarAcq) {
1464     isync();
1465   }
1466 
1467   if (use_result_reg && !preset_result_reg) {
1468     b(done);
1469   }
1470 
1471   bind(failed);
1472   if (use_result_reg && !preset_result_reg) {
1473     li(int_flag_success, 0);
1474   }
1475 
1476   bind(done);
1477   // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1478   // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1479 }
1480 
1481 // Performs an atomic compare exchange:
1482 //   if (compare_value == *addr_base)
1483 //     *addr_base = exchange_value
1484 //     int_flag_success = 1;
1485 //   else
1486 //     int_flag_success = 0;
1487 //
1488 // ConditionRegister flag       = cmp(compare_value, *addr_base)
1489 // Register dest_current_value  = *addr_base
1490 // Register compare_value       Used to compare with value in memory
1491 // Register exchange_value      Written to memory if compare_value == *addr_base
1492 // Register addr_base           The memory location to compareXChange
1493 // Register int_flag_success    Set to 1 if exchange_value was written to *addr_base
1494 //
1495 // To avoid the costly compare exchange, the value can be tested beforehand (contention_hint).
1496 // Several special cases exist to avoid generating unnecessary instructions.
1497 //
1498 void MacroAssembler::cmpxchgd(ConditionRegister flag,
1499                               Register dest_current_value, RegisterOrConstant compare_value, Register exchange_value,
1500                               Register addr_base, int semantics, bool cmpxchgx_hint,
1501                               Register int_flag_success, Label* failed_ext, bool contention_hint) {
1502   Label retry;
1503   Label failed_int;
1504   Label& failed = (failed_ext != NULL) ? *failed_ext : failed_int;
1505   Label done;
1506 
1507   // Save one branch if result is returned via register and result register is different from the other ones.
1508   bool use_result_reg    = (int_flag_success != noreg);
1509   bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value.register_or_noreg() &&
1510                             int_flag_success != exchange_value && int_flag_success != addr_base);
1511   assert(int_flag_success == noreg || failed_ext == NULL, "cannot have both");
1512 
1513   if (use_result_reg && preset_result_reg) {
1514     li(int_flag_success, 0); // preset (assume cas failed)
1515   }
1516 
1517   // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1518   if (contention_hint) { // Don't try to reserve if cmp fails.
1519     ld(dest_current_value, 0, addr_base);
1520     cmpd(flag, compare_value, dest_current_value);
1521     bne(flag, failed);
1522   }
1523 
1524   // release/fence semantics
1525   if (semantics & MemBarRel) {
1526     release();
1527   }
1528 
1529   // atomic emulation loop
1530   bind(retry);
1531 
1532   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1533   cmpd(flag, compare_value, dest_current_value);
1534   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1535     bne_predict_not_taken(flag, failed);
1536   } else {
1537     bne(                  flag, failed);
1538   }
1539 
1540   stdcx_(exchange_value, addr_base);
1541   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1542     bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
1543   } else {
1544     bne(                  CCR0, retry); // stXcx_ sets CCR0
1545   }
1546 
1547   // result in register (must do this at the end because int_flag_success can be the same register as one above)
1548   if (use_result_reg) {
1549     li(int_flag_success, 1);
1550   }
1551 
1552   if (semantics & MemBarFenceAfter) {
1553     fence();
1554   } else if (semantics & MemBarAcq) {
1555     isync();
1556   }
1557 
1558   if (use_result_reg && !preset_result_reg) {
1559     b(done);
1560   }
1561 
1562   bind(failed_int);
1563   if (use_result_reg && !preset_result_reg) {
1564     li(int_flag_success, 0);
1565   }
1566 
1567   bind(done);
1568   // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1569   // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1570 }
1571 
1572 // Look up the method for a megamorphic invokeinterface call.
1573 // The target method is determined by <intf_klass, itable_index>.
1574 // The receiver klass is in recv_klass.
1575 // On success, the result will be in method_result, and execution falls through.
1576 // On failure, execution transfers to the given label.
1577 void MacroAssembler::lookup_interface_method(Register recv_klass,
1578                                              Register intf_klass,
1579                                              RegisterOrConstant itable_index,
1580                                              Register method_result,
1581                                              Register scan_temp,
1582                                              Register sethi_temp,
1583                                              Label& L_no_such_interface) {
1584   assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);
1585   assert(itable_index.is_constant() || itable_index.as_register() == method_result,
1586          "caller must use same register for non-constant itable index as for method");
1587 
1588   // Compute start of first itableOffsetEntry (which is at the end of the vtable).
1589   int vtable_base = in_bytes(Klass::vtable_start_offset());
1590   int itentry_off = itableMethodEntry::method_offset_in_bytes();
1591   int logMEsize   = exact_log2(itableMethodEntry::size() * wordSize);
1592   int scan_step   = itableOffsetEntry::size() * wordSize;
1593   int log_vte_size= exact_log2(vtableEntry::size_in_bytes());
1594 
1595   lwz(scan_temp, in_bytes(Klass::vtable_length_offset()), recv_klass);
1596   // %%% We should store the aligned, prescaled offset in the klassoop.
1597   // Then the next several instructions would fold away.
1598 
1599   sldi(scan_temp, scan_temp, log_vte_size);
1600   addi(scan_temp, scan_temp, vtable_base);
1601   add(scan_temp, recv_klass, scan_temp);
1602 
1603   // Adjust recv_klass by scaled itable_index, so we can free itable_index.
1604   if (itable_index.is_register()) {
1605     Register itable_offset = itable_index.as_register();
1606     sldi(itable_offset, itable_offset, logMEsize);
1607     if (itentry_off) addi(itable_offset, itable_offset, itentry_off);
1608     add(recv_klass, itable_offset, recv_klass);
1609   } else {
1610     long itable_offset = (long)itable_index.as_constant();
1611     load_const_optimized(sethi_temp, (itable_offset<<logMEsize)+itentry_off); // static address, no relocation
1612     add(recv_klass, sethi_temp, recv_klass);
1613   }
1614 
1615   // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
1616   //   if (scan->interface() == intf) {
1617   //     result = (klass + scan->offset() + itable_index);
1618   //   }
1619   // }
1620   Label search, found_method;
1621 
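       // The first iteration is peeled (peel == 1): a hit on the first itable entry
       // branches straight to found_method, while the loop body (peel == 0) keeps its
       // exit test at the bottom so that a hit simply falls through.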
1622   for (int peel = 1; peel >= 0; peel--) {
1623     // %%%% Could load both offset and interface in one ldx, if they were
1624     // in the opposite order. This would save a load.
1625     ld(method_result, itableOffsetEntry::interface_offset_in_bytes(), scan_temp);
1626 
1627     // Check that this entry is non-null. A null entry means that
1628     // the receiver class doesn't implement the interface, and wasn't the
1629     // same as when the caller was compiled.
1630     cmpd(CCR0, method_result, intf_klass);
1631 
1632     if (peel) {
1633       beq(CCR0, found_method);
1634     } else {
1635       bne(CCR0, search);
1636       // (invert the test to fall through to found_method...)
1637     }
1638 
1639     if (!peel) break;
1640 
1641     bind(search);
1642 
1643     cmpdi(CCR0, method_result, 0);
1644     beq(CCR0, L_no_such_interface);
1645     addi(scan_temp, scan_temp, scan_step);
1646   }
1647 
1648   bind(found_method);
1649 
1650   // Got a hit.
1651   int ito_offset = itableOffsetEntry::offset_offset_in_bytes();
1652   lwz(scan_temp, ito_offset, scan_temp);
1653   ldx(method_result, scan_temp, recv_klass);
1654 }
1655 
1656 // virtual method calling
1657 void MacroAssembler::lookup_virtual_method(Register recv_klass,
1658                                            RegisterOrConstant vtable_index,
1659                                            Register method_result) {
1660 
1661   assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg());
1662 
1663   const int base = in_bytes(Klass::vtable_start_offset());
1664   assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
1665 
1666   if (vtable_index.is_register()) {
1667     sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord);
1668     add(recv_klass, vtable_index.as_register(), recv_klass);
1669   } else {
1670     addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord);
1671   }
1672   ld(R19_method, base + vtableEntry::method_offset_in_bytes(), recv_klass);
1673 }
1674 
1675 /////////////////////////////////////////// subtype checking ////////////////////////////////////////////
1676 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
1677                                                    Register super_klass,
1678                                                    Register temp1_reg,
1679                                                    Register temp2_reg,
1680                                                    Label* L_success,
1681                                                    Label* L_failure,
1682                                                    Label* L_slow_path,
1683                                                    RegisterOrConstant super_check_offset) {
1684 
1685   const Register check_cache_offset = temp1_reg;
1686   const Register cached_super       = temp2_reg;
1687 
1688   assert_different_registers(sub_klass, super_klass, check_cache_offset, cached_super);
1689 
1690   int sco_offset = in_bytes(Klass::super_check_offset_offset());
1691   int sc_offset  = in_bytes(Klass::secondary_super_cache_offset());
1692 
1693   bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
1694   bool need_slow_path = (must_load_sco || super_check_offset.constant_or_zero() == sco_offset);
1695 
1696   Label L_fallthrough;
1697   int label_nulls = 0;
1698   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
1699   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
1700   if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
1701   assert(label_nulls <= 1 ||
1702          (L_slow_path == &L_fallthrough && label_nulls <= 2 && !need_slow_path),
1703          "at most one NULL in the batch, usually");
1704 
1705   // If the pointers are equal, we are done (e.g., String[] elements).
1706   // This self-check enables sharing of secondary supertype arrays among
1707   // non-primary types such as array-of-interface. Otherwise, each such
1708   // type would need its own customized SSA.
1709   // We move this check to the front of the fast path because many
1710   // type checks are in fact trivially successful in this manner,
1711   // so we get a nicely predicted branch right at the start of the check.
1712   cmpd(CCR0, sub_klass, super_klass);
1713   beq(CCR0, *L_success);
1714 
1715   // Check the supertype display:
1716   if (must_load_sco) {
1717     // The super check offset is always positive...
1718     lwz(check_cache_offset, sco_offset, super_klass);
1719     super_check_offset = RegisterOrConstant(check_cache_offset);
1720     // super_check_offset is register.
1721     assert_different_registers(sub_klass, super_klass, cached_super, super_check_offset.as_register());
1722   }
1723   // The loaded value is the offset from KlassOopDesc.
1724 
1725   ld(cached_super, super_check_offset, sub_klass);
1726   cmpd(CCR0, cached_super, super_klass);
1727 
1728   // This check has worked decisively for primary supers.
1729   // Secondary supers are sought in the super_cache ('super_cache_addr').
1730   // (Secondary supers are interfaces and very deeply nested subtypes.)
1731   // This works in the same check above because of a tricky aliasing
1732   // between the super_cache and the primary super display elements.
1733   // (The 'super_check_addr' can address either, as the case requires.)
1734   // Note that the cache is updated below if it does not help us find
1735   // what we need immediately.
1736   // So if it was a primary super, we can just fail immediately.
1737   // Otherwise, it's the slow path for us (no success at this point).
1738 
1739 #define FINAL_JUMP(label) if (&(label) != &L_fallthrough) { b(label); }
1740 
1741   if (super_check_offset.is_register()) {
1742     beq(CCR0, *L_success);
1743     cmpwi(CCR0, super_check_offset.as_register(), sc_offset);
1744     if (L_failure == &L_fallthrough) {
1745       beq(CCR0, *L_slow_path);
1746     } else {
1747       bne(CCR0, *L_failure);
1748       FINAL_JUMP(*L_slow_path);
1749     }
1750   } else {
1751     if (super_check_offset.as_constant() == sc_offset) {
1752       // Need a slow path; fast failure is impossible.
1753       if (L_slow_path == &L_fallthrough) {
1754         beq(CCR0, *L_success);
1755       } else {
1756         bne(CCR0, *L_slow_path);
1757         FINAL_JUMP(*L_success);
1758       }
1759     } else {
1760       // No slow path; it's a fast decision.
1761       if (L_failure == &L_fallthrough) {
1762         beq(CCR0, *L_success);
1763       } else {
1764         bne(CCR0, *L_failure);
1765         FINAL_JUMP(*L_success);
1766       }
1767     }
1768   }
1769 
1770   bind(L_fallthrough);
1771 #undef FINAL_JUMP
1772 }
1773 
1774 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
1775                                                    Register super_klass,
1776                                                    Register temp1_reg,
1777                                                    Register temp2_reg,
1778                                                    Label* L_success,
1779                                                    Register result_reg) {
1780   const Register array_ptr = temp1_reg; // current value from cache array
1781   const Register temp      = temp2_reg;
1782 
1783   assert_different_registers(sub_klass, super_klass, array_ptr, temp);
1784 
1785   int source_offset = in_bytes(Klass::secondary_supers_offset());
1786   int target_offset = in_bytes(Klass::secondary_super_cache_offset());
1787 
1788   int length_offset = Array<Klass*>::length_offset_in_bytes();
1789   int base_offset   = Array<Klass*>::base_offset_in_bytes();
1790 
1791   Label hit, loop, failure, fallthru;
1792 
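       // Linearly scan the secondary supers array for super_klass; on a hit the
       // result is also stored into the secondary super cache of sub_klass.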
1793   ld(array_ptr, source_offset, sub_klass);
1794 
1795   // TODO: PPC port: assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated.");
1796   lwz(temp, length_offset, array_ptr);
1797   cmpwi(CCR0, temp, 0);
1798   beq(CCR0, result_reg != noreg ? failure : fallthru); // length 0
1799 
1800   mtctr(temp); // load ctr
1801 
1802   bind(loop);
1803   // Oops in the table are no longer compressed.
1804   ld(temp, base_offset, array_ptr);
1805   cmpd(CCR0, temp, super_klass);
1806   beq(CCR0, hit);
1807   addi(array_ptr, array_ptr, BytesPerWord);
1808   bdnz(loop);
1809 
1810   bind(failure);
1811   if (result_reg != noreg) { li(result_reg, 1); } // load non-zero result (indicates a miss)
1812   b(fallthru);
1813 
1814   bind(hit);
1815   std(super_klass, target_offset, sub_klass); // save result to cache
1816   if (result_reg != noreg) { li(result_reg, 0); } // load zero result (indicates a hit)
1817   if (L_success != NULL) { b(*L_success); }
1818   else if (result_reg == noreg) { blr(); } // return with CR0.eq if neither label nor result reg provided
1819 
1820   bind(fallthru);
1821 }
1822 
1823 // Try the fast path first; fall back to the slow path if it is not conclusive.
1824 void MacroAssembler::check_klass_subtype(Register sub_klass,
1825                                          Register super_klass,
1826                                          Register temp1_reg,
1827                                          Register temp2_reg,
1828                                          Label& L_success) {
1829   Label L_failure;
1830   check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success, &L_failure);
1831   check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success);
1832   bind(L_failure); // Fallthru if not successful.
1833 }
1834 
1835 void MacroAssembler::check_method_handle_type(Register mtype_reg, Register mh_reg,
1836                                               Register temp_reg,
1837                                               Label& wrong_method_type) {
1838   assert_different_registers(mtype_reg, mh_reg, temp_reg);
1839   // Compare method type against that of the receiver.
1840   load_heap_oop_not_null(temp_reg, delayed_value(java_lang_invoke_MethodHandle::type_offset_in_bytes, temp_reg), mh_reg);
1841   cmpd(CCR0, temp_reg, mtype_reg);
1842   bne(CCR0, wrong_method_type);
1843 }
1844 
1845 RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot,
1846                                                    Register temp_reg,
1847                                                    int extra_slot_offset) {
1848   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
1849   int stackElementSize = Interpreter::stackElementSize;
1850   int offset = extra_slot_offset * stackElementSize;
1851   if (arg_slot.is_constant()) {
1852     offset += arg_slot.as_constant() * stackElementSize;
1853     return offset;
1854   } else {
1855     assert(temp_reg != noreg, "must specify");
1856     sldi(temp_reg, arg_slot.as_register(), exact_log2(stackElementSize));
1857     if (offset != 0)
1858       addi(temp_reg, temp_reg, offset);
1859     return temp_reg;
1860   }
1861 }
1862 
1863 // Supports temp2_reg = R0.
1864 void MacroAssembler::biased_locking_enter(ConditionRegister cr_reg, Register obj_reg,
1865                                           Register mark_reg, Register temp_reg,
1866                                           Register temp2_reg, Label& done, Label* slow_case) {
1867   assert(UseBiasedLocking, "why call this otherwise?");
1868 
1869 #ifdef ASSERT
1870   assert_different_registers(obj_reg, mark_reg, temp_reg, temp2_reg);
1871 #endif
1872 
1873   Label cas_label;
1874 
1875   // Branch to done if fast path fails and no slow_case provided.
1876   Label *slow_case_int = (slow_case != NULL) ? slow_case : &done;
1877 
1878   // Biased locking
1879   // See whether the lock is currently biased toward our thread and
1880   // whether the epoch is still valid
1881   // Note that the runtime guarantees sufficient alignment of JavaThread
1882   // pointers to allow age to be placed into low bits
1883   assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits,
1884          "biased locking makes assumptions about bit layout");
1885 
1886   if (PrintBiasedLockingStatistics) {
1887     load_const(temp2_reg, (address) BiasedLocking::total_entry_count_addr(), temp_reg);
1888     lwzx(temp_reg, temp2_reg);
1889     addi(temp_reg, temp_reg, 1);
1890     stwx(temp_reg, temp2_reg);
1891   }
1892 
1893   andi(temp_reg, mark_reg, markOopDesc::biased_lock_mask_in_place);
1894   cmpwi(cr_reg, temp_reg, markOopDesc::biased_lock_pattern);
1895   bne(cr_reg, cas_label);
1896 
1897   load_klass(temp_reg, obj_reg);
1898 
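       // After the sequence below, temp_reg == 0 iff the mark word equals
       // (thread | prototype header) in everything but the age bits, i.e. the
       // object is already biased toward this thread with the current epoch.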
1899   load_const_optimized(temp2_reg, ~((int) markOopDesc::age_mask_in_place));
1900   ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
1901   orr(temp_reg, R16_thread, temp_reg);
1902   xorr(temp_reg, mark_reg, temp_reg);
1903   andr(temp_reg, temp_reg, temp2_reg);
1904   cmpdi(cr_reg, temp_reg, 0);
1905   if (PrintBiasedLockingStatistics) {
1906     Label l;
1907     bne(cr_reg, l);
1908     load_const(temp2_reg, (address) BiasedLocking::biased_lock_entry_count_addr());
1909     lwzx(mark_reg, temp2_reg);
1910     addi(mark_reg, mark_reg, 1);
1911     stwx(mark_reg, temp2_reg);
1912     // restore mark_reg
1913     ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg);
1914     bind(l);
1915   }
1916   beq(cr_reg, done);
1917 
1918   Label try_revoke_bias;
1919   Label try_rebias;
1920 
1921   // At this point we know that the header has the bias pattern and
1922   // that we are not the bias owner in the current epoch. We need to
1923   // figure out more details about the state of the header in order to
1924   // know what operations can be legally performed on the object's
1925   // header.
1926 
1927   // If the low three bits in the xor result aren't clear, that means
1928   // the prototype header is no longer biased and we have to revoke
1929   // the bias on this object.
1930   andi(temp2_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
1931   cmpwi(cr_reg, temp2_reg, 0);
1932   bne(cr_reg, try_revoke_bias);
1933 
1934   // Biasing is still enabled for this data type. See whether the
1935   // epoch of the current bias is still valid, meaning that the epoch
1936   // bits of the mark word are equal to the epoch bits of the
1937   // prototype header. (Note that the prototype header's epoch bits
1938   // only change at a safepoint.) If not, attempt to rebias the object
1939   // toward the current thread. Note that we must be absolutely sure
1940   // that the current epoch is invalid in order to do this because
1941   // otherwise the manipulations it performs on the mark word are
1942   // illegal.
1943 
1944   int shift_amount = 64 - markOopDesc::epoch_shift;
1945   // rotate epoch bits to right (little) end and set other bits to 0
1946   // [ big part | epoch | little part ] -> [ 0..0 | epoch ]
1947   rldicl_(temp2_reg, temp_reg, shift_amount, 64 - markOopDesc::epoch_bits);
1948   // branch if epoch bits are != 0, i.e. they differ, because the epoch has been incremented
1949   bne(CCR0, try_rebias);
1950 
1951   // The epoch of the current bias is still valid but we know nothing
1952   // about the owner; it might be set or it might be clear. Try to
1953   // acquire the bias of the object using an atomic operation. If this
1954   // fails we will go in to the runtime to revoke the object's bias.
1955   // Note that we first construct the presumed unbiased header so we
1956   // don't accidentally blow away another thread's valid bias.
1957   andi(mark_reg, mark_reg, (markOopDesc::biased_lock_mask_in_place |
1958                                 markOopDesc::age_mask_in_place |
1959                                 markOopDesc::epoch_mask_in_place));
1960   orr(temp_reg, R16_thread, mark_reg);
1961 
1962   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
1963 
1964   // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg).
1965   cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
1966            /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
1967            /*where=*/obj_reg,
1968            MacroAssembler::MemBarAcq,
1969            MacroAssembler::cmpxchgx_hint_acquire_lock(),
1970            noreg, slow_case_int); // bail out if failed
1971 
1972   // If the biasing toward our thread failed, this means that
1973   // another thread succeeded in biasing it toward itself and we
1974   // need to revoke that bias. The revocation will occur in the
1975   // interpreter runtime in the slow case.
1976   if (PrintBiasedLockingStatistics) {
1977     load_const(temp2_reg, (address) BiasedLocking::anonymously_biased_lock_entry_count_addr(), temp_reg);
1978     lwzx(temp_reg, temp2_reg);
1979     addi(temp_reg, temp_reg, 1);
1980     stwx(temp_reg, temp2_reg);
1981   }
1982   b(done);
1983 
1984   bind(try_rebias);
1985   // At this point we know the epoch has expired, meaning that the
1986   // current "bias owner", if any, is actually invalid. Under these
1987   // circumstances _only_, we are allowed to use the current header's
1988   // value as the comparison value when doing the cas to acquire the
1989   // bias in the current epoch. In other words, we allow transfer of
1990   // the bias from one thread to another directly in this situation.
1991   load_klass(temp_reg, obj_reg);
1992   andi(temp2_reg, mark_reg, markOopDesc::age_mask_in_place);
1993   orr(temp2_reg, R16_thread, temp2_reg);
1994   ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
1995   orr(temp_reg, temp2_reg, temp_reg);
1996 
1997   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
1998 
1999   cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
2000                  /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
2001                  /*where=*/obj_reg,
2002                  MacroAssembler::MemBarAcq,
2003                  MacroAssembler::cmpxchgx_hint_acquire_lock(),
2004                  noreg, slow_case_int); // bail out if failed
2005 
2006   // If the biasing toward our thread failed, this means that
2007   // another thread succeeded in biasing it toward itself and we
2008   // need to revoke that bias. The revocation will occur in the
2009   // interpreter runtime in the slow case.
2010   if (PrintBiasedLockingStatistics) {
2011     load_const(temp2_reg, (address) BiasedLocking::rebiased_lock_entry_count_addr(), temp_reg);
2012     lwzx(temp_reg, temp2_reg);
2013     addi(temp_reg, temp_reg, 1);
2014     stwx(temp_reg, temp2_reg);
2015   }
2016   b(done);
2017 
2018   bind(try_revoke_bias);
2019   // The prototype mark in the klass doesn't have the bias bit set any
2020   // more, indicating that objects of this data type are not supposed
2021   // to be biased any more. We are going to try to reset the mark of
2022   // this object to the prototype value and fall through to the
2023   // CAS-based locking scheme. Note that if our CAS fails, it means
2024   // that another thread raced us for the privilege of revoking the
2025   // bias of this particular object, so it's okay to continue in the
2026   // normal locking code.
2027   load_klass(temp_reg, obj_reg);
2028   ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
2029   andi(temp2_reg, mark_reg, markOopDesc::age_mask_in_place);
2030   orr(temp_reg, temp_reg, temp2_reg);
2031 
2032   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2033 
2034   // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg).
2035   cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
2036                  /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
2037                  /*where=*/obj_reg,
2038                  MacroAssembler::MemBarAcq,
2039                  MacroAssembler::cmpxchgx_hint_acquire_lock());
2040 
2041   // reload markOop in mark_reg before continuing with lightweight locking
2042   ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg);
2043 
2044   // Fall through to the normal CAS-based lock, because no matter what
2045   // the result of the above CAS, some thread must have succeeded in
2046   // removing the bias bit from the object's header.
2047   if (PrintBiasedLockingStatistics) {
2048     Label l;
2049     bne(cr_reg, l);
2050     load_const(temp2_reg, (address) BiasedLocking::revoked_lock_entry_count_addr(), temp_reg);
2051     lwzx(temp_reg, temp2_reg);
2052     addi(temp_reg, temp_reg, 1);
2053     stwx(temp_reg, temp2_reg);
2054     bind(l);
2055   }
2056 
2057   bind(cas_label);
2058 }
2059 
2060 void MacroAssembler::biased_locking_exit (ConditionRegister cr_reg, Register mark_addr, Register temp_reg, Label& done) {
2061   // Check for biased locking unlock case, which is a no-op
2062   // Note: we do not have to check the thread ID for two reasons.
2063   // First, the interpreter checks for IllegalMonitorStateException at
2064   // a higher level. Second, if the bias was revoked while we held the
2065   // lock, the object could not be rebiased toward another thread, so
2066   // the bias bit would be clear.
2067 
2068   ld(temp_reg, 0, mark_addr);
2069   andi(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
2070 
2071   cmpwi(cr_reg, temp_reg, markOopDesc::biased_lock_pattern);
2072   beq(cr_reg, done);
2073 }
2074 
2075 // allocation (for C1)
2076 void MacroAssembler::eden_allocate(
2077   Register obj,                      // result: pointer to object after successful allocation
2078   Register var_size_in_bytes,        // object size in bytes if unknown at compile time; invalid otherwise
2079   int      con_size_in_bytes,        // object size in bytes if   known at compile time
2080   Register t1,                       // temp register
2081   Register t2,                       // temp register
2082   Label&   slow_case                 // continuation point if fast allocation fails
2083 ) {
2084   b(slow_case);
2085 }
2086 
2087 void MacroAssembler::tlab_allocate(
2088   Register obj,                      // result: pointer to object after successful allocation
2089   Register var_size_in_bytes,        // object size in bytes if unknown at compile time; invalid otherwise
2090   int      con_size_in_bytes,        // object size in bytes if   known at compile time
2091   Register t1,                       // temp register
2092   Label&   slow_case                 // continuation point if fast allocation fails
2093 ) {
2094   // make sure arguments make sense
2095   assert_different_registers(obj, var_size_in_bytes, t1);
2096   assert(0 <= con_size_in_bytes && is_simm13(con_size_in_bytes), "illegal object size");
2097   assert((con_size_in_bytes & MinObjAlignmentInBytesMask) == 0, "object size is not multiple of alignment");
2098 
2099   const Register new_top = t1;
2100   //verify_tlab(); not implemented
2101 
2102   ld(obj, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
2103   ld(R0, in_bytes(JavaThread::tlab_end_offset()), R16_thread);
2104   if (var_size_in_bytes == noreg) {
2105     addi(new_top, obj, con_size_in_bytes);
2106   } else {
2107     add(new_top, obj, var_size_in_bytes);
2108   }
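       // Take the slow path if the new top would exceed the TLAB end (loaded into R0 above).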
2109   cmpld(CCR0, new_top, R0);
2110   bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_case);
2111 
2112 #ifdef ASSERT
2113   // make sure new free pointer is properly aligned
2114   {
2115     Label L;
2116     andi_(R0, new_top, MinObjAlignmentInBytesMask);
2117     beq(CCR0, L);
2118     stop("updated TLAB free is not properly aligned", 0x934);
2119     bind(L);
2120   }
2121 #endif // ASSERT
2122 
2123   // update the tlab top pointer
2124   std(new_top, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
2125   //verify_tlab(); not implemented
2126 }

2127 void MacroAssembler::tlab_refill(Label& retry_tlab, Label& try_eden, Label& slow_case) {
2128   unimplemented("tlab_refill");
2129 }

2130 void MacroAssembler::incr_allocated_bytes(RegisterOrConstant size_in_bytes, Register t1, Register t2) {
2131   unimplemented("incr_allocated_bytes");
2132 }
2133 
2134 address MacroAssembler::emit_trampoline_stub(int destination_toc_offset,
2135                                              int insts_call_instruction_offset, Register Rtoc) {
2136   // Start the stub.
2137   address stub = start_a_stub(64);
2138   if (stub == NULL) { return NULL; } // CodeCache full: bail out
2139 
2140   // Create a trampoline stub relocation which relates this trampoline stub
2141   // with the call instruction at insts_call_instruction_offset in the
2142   // instructions code-section.
2143   relocate(trampoline_stub_Relocation::spec(code()->insts()->start() + insts_call_instruction_offset));
2144   const int stub_start_offset = offset();
2145 
2146   // For java_to_interp stubs we use R11_scratch1 as scratch register
2147   // and in call trampoline stubs we use R12_scratch2. This way we
2148   // can distinguish them (see is_NativeCallTrampolineStub_at()).
2149   Register reg_scratch = R12_scratch2;
2150 
2151   // Now, create the trampoline stub's code:
2152   // - load the TOC
2153   // - load the call target from the constant pool
2154   // - call
2155   if (Rtoc == noreg) {
2156     calculate_address_from_global_toc(reg_scratch, method_toc());
2157     Rtoc = reg_scratch;
2158   }
2159 
2160   ld_largeoffset_unchecked(reg_scratch, destination_toc_offset, Rtoc, false);
2161   mtctr(reg_scratch);
2162   bctr();
2163 
2164   const address stub_start_addr = addr_at(stub_start_offset);
2165 
2166   // Assert that the encoded destination_toc_offset can be identified and that it is correct.
2167   assert(destination_toc_offset == NativeCallTrampolineStub_at(stub_start_addr)->destination_toc_offset(),
2168          "encoded offset into the constant pool must match");
2169   // Trampoline_stub_size should be good.
2170   assert((uint)(offset() - stub_start_offset) <= trampoline_stub_size, "should be good size");
2171   assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");
2172 
2173   // End the stub.
2174   end_a_stub();
2175   return stub;
2176 }
2177 
2178 // TM on PPC64.
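     // Atomically adds simm16 to the 64-bit value at addr; result returns the new value.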
2179 void MacroAssembler::atomic_inc_ptr(Register addr, Register result, int simm16) {
2180   Label retry;
2181   bind(retry);
2182   ldarx(result, addr, /*hint*/ false);
2183   addi(result, result, simm16);
2184   stdcx_(result, addr);
2185   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
2186     bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
2187   } else {
2188     bne(                  CCR0, retry); // stXcx_ sets CCR0
2189   }
2190 }
2191 
2192 void MacroAssembler::atomic_ori_int(Register addr, Register result, int uimm16) {
2193   Label retry;
2194   bind(retry);
2195   lwarx(result, addr, /*hint*/ false);
2196   ori(result, result, uimm16);
2197   stwcx_(result, addr);
2198   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
2199     bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
2200   } else {
2201     bne(                  CCR0, retry); // stXcx_ sets CCR0
2202   }
2203 }
2204 
2205 #if INCLUDE_RTM_OPT
2206 
2207 // Update rtm_counters based on abort status
2208 // input: abort_status
2209 //        rtm_counters (RTMLockingCounters*)
2210 void MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters_Reg) {
2211   // Mapping to keep PreciseRTMLockingStatistics similar to x86.
2212   // x86 ppc (! means inverted, ? means not the same)
2213   //  0   31  Set if abort caused by XABORT instruction.
2214   //  1  ! 7  If set, the transaction may succeed on a retry. This bit is always clear if bit 0 is set.
2215   //  2   13  Set if another logical processor conflicted with a memory address that was part of the transaction that aborted.
2216   //  3   10  Set if an internal buffer overflowed.
2217   //  4  ?12  Set if a debug breakpoint was hit.
2218   //  5  ?32  Set if an abort occurred during execution of a nested transaction.
2219   const  int tm_failure_bit[] = {Assembler::tm_tabort, // Note: Seems like signal handler sets this, too.
2220                                  Assembler::tm_failure_persistent, // inverted: transient
2221                                  Assembler::tm_trans_cf,
2222                                  Assembler::tm_footprint_of,
2223                                  Assembler::tm_non_trans_cf,
2224                                  Assembler::tm_suspended};
2225   const bool tm_failure_inv[] = {false, true, false, false, false, false};
2226   assert(sizeof(tm_failure_bit)/sizeof(int) == RTMLockingCounters::ABORT_STATUS_LIMIT, "adapt mapping!");
2227 
2228   const Register addr_Reg = R0;
2229   // Keep track of offset to where rtm_counters_Reg had pointed to.
2230   int counters_offs = RTMLockingCounters::abort_count_offset();
2231   addi(addr_Reg, rtm_counters_Reg, counters_offs);
2232   const Register temp_Reg = rtm_counters_Reg;
2233 
2234   //atomic_inc_ptr(addr_Reg, temp_Reg); We don't increment atomically
2235   ldx(temp_Reg, addr_Reg);
2236   addi(temp_Reg, temp_Reg, 1);
2237   stdx(temp_Reg, addr_Reg);
2238 
2239   if (PrintPreciseRTMLockingStatistics) {
2240     int counters_offs_delta = RTMLockingCounters::abortX_count_offset() - counters_offs;
2241 
2242     //mftexasr(abort_status); done by caller
2243     for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
2244       counters_offs += counters_offs_delta;
2245       li(temp_Reg, counters_offs_delta); // can't use addi with R0
2246       add(addr_Reg, addr_Reg, temp_Reg); // point to next counter
2247       counters_offs_delta = sizeof(uintx);
2248 
2249       Label check_abort;
2250       rldicr_(temp_Reg, abort_status, tm_failure_bit[i], 0);
2251       if (tm_failure_inv[i]) {
2252         bne(CCR0, check_abort);
2253       } else {
2254         beq(CCR0, check_abort);
2255       }
2256       //atomic_inc_ptr(addr_Reg, temp_Reg); We don't increment atomically
2257       ldx(temp_Reg, addr_Reg);
2258       addi(temp_Reg, temp_Reg, 1);
2259       stdx(temp_Reg, addr_Reg);
2260       bind(check_abort);
2261     }
2262   }
2263   li(temp_Reg, -counters_offs); // can't use addi with R0
2264   add(rtm_counters_Reg, addr_Reg, temp_Reg); // restore
2265 }
2266 
2267 // Branch if ((random & (count-1)) != 0); count must be a power of 2.
2268 // tmp and CR0 are killed
2269 void MacroAssembler::branch_on_random_using_tb(Register tmp, int count, Label& brLabel) {
2270   mftb(tmp);
2271   andi_(tmp, tmp, count-1);
2272   bne(CCR0, brLabel);
2273 }
2274 
2275 // Perform abort ratio calculation, set no_rtm bit if high ratio.
2276 // input:  rtm_counters_Reg (RTMLockingCounters* address) - KILLED
2277 void MacroAssembler::rtm_abort_ratio_calculation(Register rtm_counters_Reg,
2278                                                  RTMLockingCounters* rtm_counters,
2279                                                  Metadata* method_data) {
2280   Label L_done, L_check_always_rtm1, L_check_always_rtm2;
2281 
2282   if (RTMLockingCalculationDelay > 0) {
2283     // Delay calculation.
2284     ld(rtm_counters_Reg, (RegisterOrConstant)(intptr_t)RTMLockingCounters::rtm_calculation_flag_addr());
2285     cmpdi(CCR0, rtm_counters_Reg, 0);
2286     beq(CCR0, L_done);
2287     load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload
2288   }
2289   // Abort ratio calculation only if abort_count > RTMAbortThreshold.
2290   //   Aborted transactions = abort_count * 100
2291   //   All transactions = total_count *  RTMTotalCountIncrRate
2292   //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
2293   ld(R0, RTMLockingCounters::abort_count_offset(), rtm_counters_Reg);
2294   cmpdi(CCR0, R0, RTMAbortThreshold);
2295   blt(CCR0, L_check_always_rtm2);
2296   mulli(R0, R0, 100);
2297 
2298   const Register tmpReg = rtm_counters_Reg;
2299   ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg);
2300   mulli(tmpReg, tmpReg, RTMTotalCountIncrRate);
2301   mulli(tmpReg, tmpReg, RTMAbortRatio);
2302   cmpd(CCR0, R0, tmpReg);
2303   blt(CCR0, L_check_always_rtm1); // jump to reload
2304   if (method_data != NULL) {
2305     // Set rtm_state to "no rtm" in MDO.
2306     // Not using a metadata relocation. Method and Class Loader are kept alive anyway.
2307     // (See nmethod::metadata_do and CodeBuffer::finalize_oop_references.)
2308     load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg);
2309     atomic_ori_int(R0, tmpReg, NoRTM);
2310   }
2311   b(L_done);
2312 
2313   bind(L_check_always_rtm1);
2314   load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload
2315   bind(L_check_always_rtm2);
2316   ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg);
2317   cmpdi(CCR0, tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
2318   blt(CCR0, L_done);
2319   if (method_data != NULL) {
2320     // Set rtm_state to "always rtm" in MDO.
2321     // Not using a metadata relocation. See above.
2322     load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg);
2323     atomic_ori_int(R0, tmpReg, UseRTM);
2324   }
2325   bind(L_done);
2326 }
2327 
2328 // Update counters and perform abort ratio calculation.
2329 // input: abort_status_Reg
2330 void MacroAssembler::rtm_profiling(Register abort_status_Reg, Register temp_Reg,
2331                                    RTMLockingCounters* rtm_counters,
2332                                    Metadata* method_data,
2333                                    bool profile_rtm) {
2334 
2335   assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2336   // Update rtm counters based on state at abort.
2337   // Reads abort_status_Reg, updates flags.
2338   assert_different_registers(abort_status_Reg, temp_Reg);
2339   load_const_optimized(temp_Reg, (address)rtm_counters, R0);
2340   rtm_counters_update(abort_status_Reg, temp_Reg);
2341   if (profile_rtm) {
2342     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2343     rtm_abort_ratio_calculation(temp_Reg, rtm_counters, method_data);
2344   }
2345 }
2346 
2347 // Retry on abort if abort's status indicates non-persistent failure.
2348 // inputs: retry_count_Reg
2349 //       : abort_status_Reg
2350 // output: retry_count_Reg decremented by 1
2351 void MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg,
2352                                              Label& retryLabel, Label* checkRetry) {
2353   Label doneRetry;
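       // Keep only the "failure persistent" bit of the abort status (rotated to the MSB);
       // if it is set the failure is permanent and we do not retry.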
2354   rldicr_(R0, abort_status_Reg, tm_failure_persistent, 0);
2355   bne(CCR0, doneRetry);
2356   if (checkRetry) { bind(*checkRetry); }
2357   addic_(retry_count_Reg, retry_count_Reg, -1);
2358   blt(CCR0, doneRetry);
2359   smt_yield(); // Can't use wait(). No permission (SIGILL).
2360   b(retryLabel);
2361   bind(doneRetry);
2362 }
2363 
2364 // Spin and retry if lock is busy.
2365 // inputs: box_Reg (monitor address)
2366 //       : retry_count_Reg
2367 // output: retry_count_Reg decremented by 1
2368 // CTR is killed
2369 void MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register owner_addr_Reg, Label& retryLabel) {
2370   Label SpinLoop, doneRetry;
2371   addic_(retry_count_Reg, retry_count_Reg, -1);
2372   blt(CCR0, doneRetry);
2373   li(R0, RTMSpinLoopCount);
2374   mtctr(R0);
2375 
2376   bind(SpinLoop);
2377   smt_yield(); // Can't use waitrsv(). No permission (SIGILL).
2378   bdz(retryLabel);
2379   ld(R0, 0, owner_addr_Reg);
2380   cmpdi(CCR0, R0, 0);
2381   bne(CCR0, SpinLoop);
2382   b(retryLabel);
2383 
2384   bind(doneRetry);
2385 }
2386 
2387 // Use RTM for normal stack locks.
2388 // Input: objReg (object to lock)
2389 void MacroAssembler::rtm_stack_locking(ConditionRegister flag,
2390                                        Register obj, Register mark_word, Register tmp,
2391                                        Register retry_on_abort_count_Reg,
2392                                        RTMLockingCounters* stack_rtm_counters,
2393                                        Metadata* method_data, bool profile_rtm,
2394                                        Label& DONE_LABEL, Label& IsInflated) {
2395   assert(UseRTMForStackLocks, "why call this otherwise?");
2396   assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
2397   Label L_rtm_retry, L_decrement_retry, L_on_abort;
2398 
2399   if (RTMRetryCount > 0) {
2400     load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
2401     bind(L_rtm_retry);
2402   }
2403   andi_(R0, mark_word, markOopDesc::monitor_value);  // inflated vs stack-locked|neutral|biased
2404   bne(CCR0, IsInflated);
2405 
2406   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2407     Label L_noincrement;
2408     if (RTMTotalCountIncrRate > 1) {
2409       branch_on_random_using_tb(tmp, (int)RTMTotalCountIncrRate, L_noincrement);
2410     }
2411     assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
2412     load_const_optimized(tmp, (address)stack_rtm_counters->total_count_addr(), R0);
2413     //atomic_inc_ptr(tmp, /*temp, will be reloaded*/mark_word); We don't increment atomically
2414     ldx(mark_word, tmp);
2415     addi(mark_word, mark_word, 1);
2416     stdx(mark_word, tmp);
2417     bind(L_noincrement);
2418   }
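       // Start the transaction. If it aborts (or cannot be started), execution resumes
       // after tbegin_ with CCR0.eq set, so the branch below takes the abort path.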
2419   tbegin_();
2420   beq(CCR0, L_on_abort);
2421   ld(mark_word, oopDesc::mark_offset_in_bytes(), obj);         // Reload in transaction, conflicts need to be tracked.
2422   andi(R0, mark_word, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
2423   cmpwi(flag, R0, markOopDesc::unlocked_value);                // bits = 001 unlocked
2424   beq(flag, DONE_LABEL);                                       // all done if unlocked
2425 
2426   if (UseRTMXendForLockBusy) {
2427     tend_();
2428     b(L_decrement_retry);
2429   } else {
2430     tabort_();
2431   }
2432   bind(L_on_abort);
2433   const Register abort_status_Reg = tmp;
2434   mftexasr(abort_status_Reg);
2435   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2436     rtm_profiling(abort_status_Reg, /*temp*/mark_word, stack_rtm_counters, method_data, profile_rtm);
2437   }
2438   ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // reload
2439   if (RTMRetryCount > 0) {
2440     // Retry on lock abort if abort status is not permanent.
2441     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry, &L_decrement_retry);
2442   } else {
2443     bind(L_decrement_retry);
2444   }
2445 }
2446 
2447 // Use RTM for inflating locks
2448 // inputs: obj       (object to lock)
2449 //         mark_word (current header - KILLED)
2450 //         boxReg    (on-stack box address (displaced header location) - KILLED)
2451 void MacroAssembler::rtm_inflated_locking(ConditionRegister flag,
2452                                           Register obj, Register mark_word, Register boxReg,
2453                                           Register retry_on_busy_count_Reg, Register retry_on_abort_count_Reg,
2454                                           RTMLockingCounters* rtm_counters,
2455                                           Metadata* method_data, bool profile_rtm,
2456                                           Label& DONE_LABEL) {
2457   assert(UseRTMLocking, "why call this otherwise?");
2458   Label L_rtm_retry, L_decrement_retry, L_on_abort;
2459   // Clean monitor_value bit to get valid pointer.
2460   int owner_offset = ObjectMonitor::owner_offset_in_bytes() - markOopDesc::monitor_value;
2461 
2462   // Store non-null, using boxReg instead of (intptr_t)markOopDesc::unused_mark().
2463   std(boxReg, BasicLock::displaced_header_offset_in_bytes(), boxReg);
2464   const Register tmpReg = boxReg;
2465   const Register owner_addr_Reg = mark_word;
2466   addi(owner_addr_Reg, mark_word, owner_offset);
2467 
2468   if (RTMRetryCount > 0) {
2469     load_const_optimized(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy.
2470     load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort.
2471     bind(L_rtm_retry);
2472   }
2473   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2474     Label L_noincrement;
2475     if (RTMTotalCountIncrRate > 1) {
2476       branch_on_random_using_tb(R0, (int)RTMTotalCountIncrRate, L_noincrement);
2477     }
2478     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2479     load_const(R0, (address)rtm_counters->total_count_addr(), tmpReg);
2480     //atomic_inc_ptr(R0, tmpReg); We don't increment atomically
2481     ldx(tmpReg, R0);
2482     addi(tmpReg, tmpReg, 1);
2483     stdx(tmpReg, R0);
2484     bind(L_noincrement);
2485   }
2486   tbegin_();
2487   beq(CCR0, L_on_abort);
2488   // We don't reload mark word. Will only be reset at safepoint.
2489   ld(R0, 0, owner_addr_Reg); // Load in transaction, conflicts need to be tracked.
2490   cmpdi(flag, R0, 0);
2491   beq(flag, DONE_LABEL);
2492 
2493   if (UseRTMXendForLockBusy) {
2494     tend_();
2495     b(L_decrement_retry);
2496   } else {
2497     tabort_();
2498   }
2499   bind(L_on_abort);
2500   const Register abort_status_Reg = tmpReg;
2501   mftexasr(abort_status_Reg);
2502   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2503     rtm_profiling(abort_status_Reg, /*temp*/ owner_addr_Reg, rtm_counters, method_data, profile_rtm);
2504     // Restore owner_addr_Reg
2505     ld(mark_word, oopDesc::mark_offset_in_bytes(), obj);
2506 #ifdef ASSERT
2507     andi_(R0, mark_word, markOopDesc::monitor_value);
2508     asm_assert_ne("must be inflated", 0xa754); // Deflating only allowed at safepoint.
2509 #endif
2510     addi(owner_addr_Reg, mark_word, owner_offset);
2511   }
2512   if (RTMRetryCount > 0) {
2513     // Retry on lock abort if abort status is not permanent.
2514     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
2515   }
2516 
2517   // Appears unlocked - try to swing _owner from null to non-null.
2518   cmpxchgd(flag, /*current val*/ R0, (intptr_t)0, /*new val*/ R16_thread, owner_addr_Reg,
2519            MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2520            MacroAssembler::cmpxchgx_hint_acquire_lock(), noreg, &L_decrement_retry, true);
2521 
2522   if (RTMRetryCount > 0) {
2523     // Success: we are done. On failure, spin and retry below.
2524     b(DONE_LABEL);
2525     bind(L_decrement_retry);
2526     // Spin and retry if lock is busy.
2527     rtm_retry_lock_on_busy(retry_on_busy_count_Reg, owner_addr_Reg, L_rtm_retry);
2528   } else {
2529     bind(L_decrement_retry);
2530   }
2531 }
2532 
2533 #endif //  INCLUDE_RTM_OPT
2534 
2535 // "The box" is the space on the stack where we copy the object mark.
2536 void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register oop, Register box,
2537                                                Register temp, Register displaced_header, Register current_header,
2538                                                bool try_bias,
2539                                                RTMLockingCounters* rtm_counters,
2540                                                RTMLockingCounters* stack_rtm_counters,
2541                                                Metadata* method_data,
2542                                                bool use_rtm, bool profile_rtm) {
2543   assert_different_registers(oop, box, temp, displaced_header, current_header);
2544   assert(flag != CCR0, "bad condition register");
2545   Label cont;
2546   Label object_has_monitor;
2547   Label cas_failed;
2548 
2549   // Load markOop from object into displaced_header.
2550   ld(displaced_header, oopDesc::mark_offset_in_bytes(), oop);
2551
2553   // Always do locking in runtime.
2554   if (EmitSync & 0x01) {
2555     cmpdi(flag, oop, 0); // Oop can't be 0 here => always false.
2556     return;
2557   }
2558 
2559   if (try_bias) {
2560     biased_locking_enter(flag, oop, displaced_header, temp, current_header, cont);
2561   }
2562 
2563 #if INCLUDE_RTM_OPT
2564   if (UseRTMForStackLocks && use_rtm) {
2565     rtm_stack_locking(flag, oop, displaced_header, temp, /*temp*/ current_header,
2566                       stack_rtm_counters, method_data, profile_rtm,
2567                       cont, object_has_monitor);
2568   }
2569 #endif // INCLUDE_RTM_OPT
2570 
2571   // Handle existing monitor.
2572   if ((EmitSync & 0x02) == 0) {
2573     // The object has an existing monitor iff (mark & monitor_value) != 0.
2574     andi_(temp, displaced_header, markOopDesc::monitor_value);
2575     bne(CCR0, object_has_monitor);
2576   }
2577 
2578   // Set displaced_header to be (markOop of object | UNLOCK_VALUE).
2579   ori(displaced_header, displaced_header, markOopDesc::unlocked_value);
2580 
2581   // Load Compare Value application register.
2582 
2583   // Initialize the box. (Must happen before we update the object mark!)
2584   std(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2585 
2586   // Must fence, otherwise, preceding store(s) may float below cmpxchg.
2587   // Compare object markOop with mark and if equal exchange scratch1 with object markOop.
2588   cmpxchgd(/*flag=*/flag,
2589            /*current_value=*/current_header,
2590            /*compare_value=*/displaced_header,
2591            /*exchange_value=*/box,
2592            /*where=*/oop,
2593            MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2594            MacroAssembler::cmpxchgx_hint_acquire_lock(),
2595            noreg,
2596            &cas_failed,
2597            /*check without membar and ldarx first*/true);
2598   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2599 
2600   // If the compare-and-exchange succeeded, then we found an unlocked
2601   // object and we have now locked it.
2602   b(cont);
2603 
2604   bind(cas_failed);
2605   // We did not see an unlocked object so try the fast recursive case.
2606 
2607   // Check if the owner is self by comparing the value in the markOop of object
2608   // (current_header) with the stack pointer.
2609   sub(current_header, current_header, R1_SP);
2610   load_const_optimized(temp, ~(os::vm_page_size()-1) | markOopDesc::lock_mask_in_place);
2611 
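       // R0 == 0 iff the mark is a stack address within one page above SP with the
       // lock bits clear, i.e. this thread already owns the lock (recursive case).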
2612   and_(R0/*==0?*/, current_header, temp);
2613   // If the condition is true we will take the cont path, and hence we can store 0 as the
2614   // displaced header in the box, which indicates that it is a recursive lock.
2615   mcrf(flag, CCR0);
2616   std(R0/*==0, perhaps*/, BasicLock::displaced_header_offset_in_bytes(), box);
2617 
2618   // Handle existing monitor.
2619   if ((EmitSync & 0x02) == 0) {
2620     b(cont);
2621 
2622     bind(object_has_monitor);
2623     // The object's monitor m is unlocked iff m->owner == NULL,
2624     // otherwise m->owner may contain a thread or a stack address.
2625 
2626 #if INCLUDE_RTM_OPT
2627     // Use the same RTM locking code in 32- and 64-bit VM.
2628     if (use_rtm) {
2629       rtm_inflated_locking(flag, oop, displaced_header, box, temp, /*temp*/ current_header,
2630                            rtm_counters, method_data, profile_rtm, cont);
2631     } else {
2632 #endif // INCLUDE_RTM_OPT
2633 
2634     // Try to CAS m->owner from NULL to current thread.
2635     addi(temp, displaced_header, ObjectMonitor::owner_offset_in_bytes()-markOopDesc::monitor_value);
2636     cmpxchgd(/*flag=*/flag,
2637              /*current_value=*/current_header,
2638              /*compare_value=*/(intptr_t)0,
2639              /*exchange_value=*/R16_thread,
2640              /*where=*/temp,
2641              MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2642              MacroAssembler::cmpxchgx_hint_acquire_lock());
2643 
2644     // Store a non-null value into the box.
2645     std(box, BasicLock::displaced_header_offset_in_bytes(), box);
2646 
2647 #   ifdef ASSERT
2648     bne(flag, cont);
2649     // We have acquired the monitor, check some invariants.
2650     addi(/*monitor=*/temp, temp, -ObjectMonitor::owner_offset_in_bytes());
2651     // Invariant 1: _recursions should be 0.
2652     //assert(ObjectMonitor::recursions_size_in_bytes() == 8, "unexpected size");
2653     asm_assert_mem8_is_zero(ObjectMonitor::recursions_offset_in_bytes(), temp,
2654                             "monitor->_recursions should be 0", -1);
2655     // Invariant 2: OwnerIsThread shouldn't be 0.
2656     //assert(ObjectMonitor::OwnerIsThread_size_in_bytes() == 4, "unexpected size");
2657     //asm_assert_mem4_isnot_zero(ObjectMonitor::OwnerIsThread_offset_in_bytes(), temp,
2658     //                           "monitor->OwnerIsThread shouldn't be 0", -1);
2659 #   endif
2660 
2661 #if INCLUDE_RTM_OPT
2662     } // use_rtm()
2663 #endif
2664   }
2665 
2666   bind(cont);
2667   // flag == EQ indicates success
2668   // flag == NE indicates failure
2669 }
2670 
2671 void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register oop, Register box,
2672                                                  Register temp, Register displaced_header, Register current_header,
2673                                                  bool try_bias, bool use_rtm) {
2674   assert_different_registers(oop, box, temp, displaced_header, current_header);
2675   assert(flag != CCR0, "bad condition register");
2676   Label cont;
2677   Label object_has_monitor;
2678 
2679   // Always do locking in runtime.
2680   if (EmitSync & 0x01) {
2681     cmpdi(flag, oop, 0); // Oop can't be 0 here => always false.
2682     return;
2683   }
2684 
2685   if (try_bias) {
2686     biased_locking_exit(flag, oop, current_header, cont);
2687   }
2688 
2689 #if INCLUDE_RTM_OPT
2690   if (UseRTMForStackLocks && use_rtm) {
2691     assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
2692     Label L_regular_unlock;
2693     ld(current_header, oopDesc::mark_offset_in_bytes(), oop);         // fetch markword
2694     andi(R0, current_header, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
2695     cmpwi(flag, R0, markOopDesc::unlocked_value);                     // bits = 001 unlocked
2696     bne(flag, L_regular_unlock);                                      // else RegularLock
2697     tend_();                                                          // otherwise end...
2698     b(cont);                                                          // ... and we're done
2699     bind(L_regular_unlock);
2700   }
2701 #endif
2702 
2703   // Find the lock address and load the displaced header from the stack.
2704   ld(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2705 
2706   // If the displaced header is 0, we have a recursive unlock.
2707   cmpdi(flag, displaced_header, 0);
2708   beq(flag, cont);
2709 
2710   // Handle existing monitor.
2711   if ((EmitSync & 0x02) == 0) {
2712     // The object has an existing monitor iff (mark & monitor_value) != 0.
2713     RTM_OPT_ONLY( if (!(UseRTMForStackLocks && use_rtm)) ) // skip load if already done
2714     ld(current_header, oopDesc::mark_offset_in_bytes(), oop);
2715     andi_(R0, current_header, markOopDesc::monitor_value);
2716     bne(CCR0, object_has_monitor);
2717   }
2718 
2719   // Check if it is still a lightweight lock; this is true if we see
2720   // the stack address of the basicLock in the markOop of the object.
2721   // Cmpxchg sets flag to cmpd(current_header, box).
2722   cmpxchgd(/*flag=*/flag,
2723            /*current_value=*/current_header,
2724            /*compare_value=*/box,
2725            /*exchange_value=*/displaced_header,
2726            /*where=*/oop,
2727            MacroAssembler::MemBarRel,
2728            MacroAssembler::cmpxchgx_hint_release_lock(),
2729            noreg,
2730            &cont);
2731 
2732   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2733 
2734   // Handle existing monitor.
2735   if ((EmitSync & 0x02) == 0) {
2736     b(cont);
2737 
2738     bind(object_has_monitor);
2739     addi(current_header, current_header, -markOopDesc::monitor_value); // monitor
2740     ld(temp,             ObjectMonitor::owner_offset_in_bytes(), current_header);
2741 
2742     // It's inflated.
2743 #if INCLUDE_RTM_OPT
2744     if (use_rtm) {
2745       Label L_regular_inflated_unlock;
2746       // If m->owner is NULL the lock was RTM-elided: just end the transaction.
2747       cmpdi(flag, temp, 0);
2748       bne(flag, L_regular_inflated_unlock);
2749       tend_();
2750       b(cont);
2751       bind(L_regular_inflated_unlock);
2752     }
2753 #endif
2754 
2755     ld(displaced_header, ObjectMonitor::recursions_offset_in_bytes(), current_header);
2756     xorr(temp, R16_thread, temp);      // Will be 0 if we are the owner.
2757     orr(temp, temp, displaced_header); // Will be 0 if there are 0 recursions.
2758     cmpdi(flag, temp, 0);
2759     bne(flag, cont);
2760 
2761     ld(temp,             ObjectMonitor::EntryList_offset_in_bytes(), current_header);
2762     ld(displaced_header, ObjectMonitor::cxq_offset_in_bytes(), current_header);
2763     orr(temp, temp, displaced_header); // Will be 0 if both are 0.
2764     cmpdi(flag, temp, 0);
2765     bne(flag, cont);
2766     release();
2767     std(temp, ObjectMonitor::owner_offset_in_bytes(), current_header);
2768   }
2769 
2770   bind(cont);
2771   // flag == EQ indicates success
2772   // flag == NE indicates failure
2773 }
2774 
2775 // Write serialization page so VM thread can do a pseudo remote membar.
2776 // We use the current thread pointer to calculate a thread specific
2777 // offset to write to within the page. This minimizes bus traffic
2778 // due to cache line collision.
2779 void MacroAssembler::serialize_memory(Register thread, Register tmp1, Register tmp2) {
2780   srdi(tmp2, thread, os::get_serialize_page_shift_count());
2781 
2782   int mask = os::vm_page_size() - sizeof(int);
2783   if (Assembler::is_simm(mask, 16)) {
2784     andi(tmp2, tmp2, mask);
2785   } else {
2786     lis(tmp1, (int)((signed short) (mask >> 16)));
2787     ori(tmp1, tmp1, mask & 0x0000ffff);
2788     andr(tmp2, tmp2, tmp1);
2789   }
2790 
2791   load_const(tmp1, (long) os::get_memory_serialize_page());
2792   release();
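       // If the VM thread has write-protected the serialization page, this store
       // faults and the thread gets serialized in the fault handler (pseudo membar).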
2793   stwx(R0, tmp1, tmp2);
2794 }
2795 
2796 
2797 // GC barrier helper macros
2798 
2799 // Write the card table byte if needed.
2800 void MacroAssembler::card_write_barrier_post(Register Rstore_addr, Register Rnew_val, Register Rtmp) {
2801   CardTableModRefBS* bs =
2802     barrier_set_cast<CardTableModRefBS>(Universe::heap()->barrier_set());
2803   assert(bs->kind() == BarrierSet::CardTableForRS ||
2804          bs->kind() == BarrierSet::CardTableExtension, "wrong barrier");
2805 #ifdef ASSERT
2806   cmpdi(CCR0, Rnew_val, 0);
2807   asm_assert_ne("null oop not allowed", 0x321);
2808 #endif
2809   card_table_write(bs->byte_map_base, Rtmp, Rstore_addr);
2810 }
2811 
2812 // Write the card table byte.
2813 void MacroAssembler::card_table_write(volatile jbyte* byte_map_base, Register Rtmp, Register Robj) {
2814   assert_different_registers(Robj, Rtmp, R0);
2815   load_const_optimized(Rtmp, (address)byte_map_base, R0);
2816   srdi(Robj, Robj, CardTableModRefBS::card_shift);
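       // Robj now holds the card index; the card byte lives at byte_map_base + index.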
2817   li(R0, 0); // dirty
2818   if (UseConcMarkSweepGC) membar(Assembler::StoreStore);
2819   stbx(R0, Rtmp, Robj);
2820 }
2821 
2822 #if INCLUDE_ALL_GCS
2823 // General G1 pre-barrier generator.
2824 // Goal: record the previous value if it is not null.
2825 void MacroAssembler::g1_write_barrier_pre(Register Robj, RegisterOrConstant offset, Register Rpre_val,
2826                                           Register Rtmp1, Register Rtmp2, bool needs_frame) {
2827   Label runtime, filtered;
2828 
2829   // Is marking active?
2830   if (in_bytes(SATBMarkQueue::byte_width_of_active()) == 4) {
2831     lwz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_active()), R16_thread);
2832   } else {
2833     guarantee(in_bytes(SATBMarkQueue::byte_width_of_active()) == 1, "Assumption");
2834     lbz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_active()), R16_thread);
2835   }
2836   cmpdi(CCR0, Rtmp1, 0);
2837   beq(CCR0, filtered);
2838 
2839   // Do we need to load the previous value?
2840   if (Robj != noreg) {
2841     // Load the previous value...
2842     if (UseCompressedOops) {
2843       lwz(Rpre_val, offset, Robj);
2844     } else {
2845       ld(Rpre_val, offset, Robj);
2846     }
2847     // Previous value has been loaded into Rpre_val.
2848   }
2849   assert(Rpre_val != noreg, "must have a real register");
2850 
2851   // Is the previous value null?
2852   cmpdi(CCR0, Rpre_val, 0);
2853   beq(CCR0, filtered);
2854 
2855   if (Robj != noreg && UseCompressedOops) {
2856     decode_heap_oop_not_null(Rpre_val);
2857   }
2858 
2859   // OK, it's not filtered, so we'll need to call enqueue. In the normal
2860   // case, pre_val will be a scratch G-reg, but there are some cases in
2861   // which it's an O-reg. In the first case, do a normal call. In the
2862   // latter, do a save here and call the frameless version.
2863 
2864   // Can we store original value in the thread's buffer?
2865   // Is index == 0?
2866   // (The index field is typed as size_t.)
2867   const Register Rbuffer = Rtmp1, Rindex = Rtmp2;
2868 
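       // The queue index counts down in bytes from the buffer capacity; a zero
       // index means the thread's SATB buffer is full.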
2869   ld(Rindex, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_index()), R16_thread);
2870   cmpdi(CCR0, Rindex, 0);
2871   beq(CCR0, runtime); // If index == 0, goto runtime.
2872   ld(Rbuffer, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_buf()), R16_thread);
2873 
2874   addi(Rindex, Rindex, -wordSize); // Decrement index.
2875   std(Rindex, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_index()), R16_thread);
2876 
2877   // Record the previous value.
2878   stdx(Rpre_val, Rbuffer, Rindex);
2879   b(filtered);
2880 
2881   bind(runtime);
2882 
2883   // The VM call needs a frame to access (write) the O registers.
2884   if (needs_frame) {
2885     save_LR_CR(Rtmp1);
2886     push_frame_reg_args(0, Rtmp2);
2887   }
2888 
2889   if (Rpre_val->is_volatile() && Robj == noreg) mr(R31, Rpre_val); // Save pre_val across C call if it was preloaded.
2890   call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), Rpre_val, R16_thread);
2891   if (Rpre_val->is_volatile() && Robj == noreg) mr(Rpre_val, R31); // restore
2892 
2893   if (needs_frame) {
2894     pop_frame();
2895     restore_LR_CR(Rtmp1);
2896   }
2897 
2898   bind(filtered);
2899 }
2900 
2901 // General G1 post-barrier generator
2902 // Store cross-region card.
2903 void MacroAssembler::g1_write_barrier_post(Register Rstore_addr, Register Rnew_val, Register Rtmp1, Register Rtmp2, Register Rtmp3, Label *filtered_ext) {
2904   Label runtime, filtered_int;
2905   Label& filtered = (filtered_ext != NULL) ? *filtered_ext : filtered_int;
2906   assert_different_registers(Rstore_addr, Rnew_val, Rtmp1, Rtmp2);
2907 
2908   G1SATBCardTableLoggingModRefBS* bs =
2909     barrier_set_cast<G1SATBCardTableLoggingModRefBS>(Universe::heap()->barrier_set());
2910 
2911   // Does store cross heap regions?
2912   if (G1RSBarrierRegionFilter) {
2913     xorr(Rtmp1, Rstore_addr, Rnew_val);
2914     srdi_(Rtmp1, Rtmp1, HeapRegion::LogOfHRGrainBytes);
2915     beq(CCR0, filtered);
2916   }
2917 
2918   // Crosses regions, storing NULL?
2919 #ifdef ASSERT
2920   cmpdi(CCR0, Rnew_val, 0);
2921   asm_assert_ne("null oop not allowed (G1)", 0x322); // Checked by caller on PPC64, so following branch is obsolete:
2922   //beq(CCR0, filtered);
2923 #endif
2924 
2925   // Storing region crossing non-NULL, is card already dirty?
2926   assert(sizeof(*bs->byte_map_base) == sizeof(jbyte), "adjust this code");
2927   const Register Rcard_addr = Rtmp1;
2928   Register Rbase = Rtmp2;
2929   load_const_optimized(Rbase, (address)bs->byte_map_base, /*temp*/ Rtmp3);
2930 
2931   srdi(Rcard_addr, Rstore_addr, CardTableModRefBS::card_shift);
2932 
2933   // Load the card value; stores into the young generation need no barrier.
2934   lbzx(/*card value*/ Rtmp3, Rbase, Rcard_addr);
2935   cmpwi(CCR0, Rtmp3, (int)G1SATBCardTableModRefBS::g1_young_card_val());
2936   beq(CCR0, filtered);
2937 
2938   membar(Assembler::StoreLoad);
2939   lbzx(/*card value*/ Rtmp3, Rbase, Rcard_addr);  // Reload after membar.
2940   cmpwi(CCR0, Rtmp3 /* card value */, CardTableModRefBS::dirty_card_val());
2941   beq(CCR0, filtered);
2942 
2943   // Storing a region crossing, non-NULL oop, card is clean.
2944   // Dirty card and log.
2945   li(Rtmp3, CardTableModRefBS::dirty_card_val());
2946   //release(); // G1: oops are allowed to get visible after dirty marking.
2947   stbx(Rtmp3, Rbase, Rcard_addr);
2948 
2949   add(Rcard_addr, Rbase, Rcard_addr); // This is the address which needs to get enqueued.
2950   Rbase = noreg; // end of lifetime
2951 
2952   const Register Rqueue_index = Rtmp2,
2953                  Rqueue_buf   = Rtmp3;
2954   ld(Rqueue_index, in_bytes(JavaThread::dirty_card_queue_offset() + DirtyCardQueue::byte_offset_of_index()), R16_thread);
2955   cmpdi(CCR0, Rqueue_index, 0);
2956   beq(CCR0, runtime); // If index == 0 (buffer full), go to the runtime.
2957   ld(Rqueue_buf, in_bytes(JavaThread::dirty_card_queue_offset() + DirtyCardQueue::byte_offset_of_buf()), R16_thread);
2958 
2959   addi(Rqueue_index, Rqueue_index, -wordSize); // decrement index
2960   std(Rqueue_index, in_bytes(JavaThread::dirty_card_queue_offset() + DirtyCardQueue::byte_offset_of_index()), R16_thread);
2961 
2962   stdx(Rcard_addr, Rqueue_buf, Rqueue_index); // store card
2963   b(filtered);
2964 
2965   bind(runtime);
2966 
2967   // Save the live input values.
2968   call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), Rcard_addr, R16_thread);
2969 
2970   bind(filtered_int);
2971 }
2972 #endif // INCLUDE_ALL_GCS
2973 
2974 // Values for last_Java_pc, and last_Java_sp must comply to the rules
2975 // in frame_ppc.hpp.
2976 void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) {
2977   // Always set last_Java_pc and flags first because once last_Java_sp
2978   // is visible, has_last_Java_frame is true and users will look at the
2979   // rest of the fields. (Note: flags should always be zero before we
2980   // get here, so they don't need to be set.)
2981 
2982   // Verify that last_Java_pc was zeroed on return to Java
2983   asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread,
2984                           "last_Java_pc not zeroed before leaving Java", 0x200);
2985 
2986   // When returning from a call out of Java, the frame anchor's
2987   // last_Java_pc is always reset to NULL. It is set here so that,
2988   // when we call native code (not the VM), we capture the known pc
2989   // and don't have to rely on the native call having a standard
2990   // frame linkage from which the pc could be recovered.
2991   if (last_Java_pc != noreg)
2992     std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
2993 
2994   // Set last_Java_sp last.
2995   std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
2996 }
2997 
2998 void MacroAssembler::reset_last_Java_frame(void) {
2999   asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()),
3000                              R16_thread, "SP was not set, still zero", 0x202);
3001 
3002   BLOCK_COMMENT("reset_last_Java_frame {");
3003   li(R0, 0);
3004 
3005   // _last_Java_sp = 0
3006   std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
3007 
3008   // _last_Java_pc = 0
3009   std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
3010   BLOCK_COMMENT("} reset_last_Java_frame");
3011 }
3012 
3013 void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1) {
3014   assert_different_registers(sp, tmp1);
3015 
3016   // sp points to a TOP_IJAVA_FRAME, retrieve frame's PC via
3017   // TOP_IJAVA_FRAME_ABI.
3018   // FIXME: assert that we really have a TOP_IJAVA_FRAME here!
3019   address entry = pc();
3020   load_const_optimized(tmp1, entry);
3021 
3022   set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1);
3023 }
3024 
3025 void MacroAssembler::get_vm_result(Register oop_result) {
3026   // Read:
3027   //   R16_thread
3028   //   R16_thread->in_bytes(JavaThread::vm_result_offset())
3029   //
3030   // Updated:
3031   //   oop_result
3032   //   R16_thread->in_bytes(JavaThread::vm_result_offset())
3033 
3034   verify_thread();
3035 
3036   ld(oop_result, in_bytes(JavaThread::vm_result_offset()), R16_thread);
3037   li(R0, 0);
3038   std(R0, in_bytes(JavaThread::vm_result_offset()), R16_thread);
3039 
3040   verify_oop(oop_result);
3041 }
3042 
3043 void MacroAssembler::get_vm_result_2(Register metadata_result) {
3044   // Read:
3045   //   R16_thread
3046   //   R16_thread->in_bytes(JavaThread::vm_result_2_offset())
3047   //
3048   // Updated:
3049   //   metadata_result
3050   //   R16_thread->in_bytes(JavaThread::vm_result_2_offset())
3051 
3052   ld(metadata_result, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
3053   li(R0, 0);
3054   std(R0, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
3055 }
3056 
3057 Register MacroAssembler::encode_klass_not_null(Register dst, Register src) {
3058   Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided.
3059   if (Universe::narrow_klass_base() != 0) {
3060     // Use dst as temp if it is free.
3061     sub_const_optimized(dst, current, Universe::narrow_klass_base(), R0);
3062     current = dst;
3063   }
3064   if (Universe::narrow_klass_shift() != 0) {
3065     srdi(dst, current, Universe::narrow_klass_shift());
3066     current = dst;
3067   }
3068   return current;
3069 }
3070 
3071 void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) {
3072   if (UseCompressedClassPointers) {
3073     Register compressedKlass = encode_klass_not_null(ck, klass);
3074     stw(compressedKlass, oopDesc::klass_offset_in_bytes(), dst_oop);
3075   } else {
3076     std(klass, oopDesc::klass_offset_in_bytes(), dst_oop);
3077   }
3078 }
3079 
3080 void MacroAssembler::store_klass_gap(Register dst_oop, Register val) {
3081   if (UseCompressedClassPointers) {
3082     if (val == noreg) {
3083       val = R0;
3084       li(val, 0);
3085     }
3086     stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop); // klass gap if compressed
3087   }
3088 }
3089 
3090 int MacroAssembler::instr_size_for_decode_klass_not_null() {
3091   if (!UseCompressedClassPointers) return 0;
3092   int num_instrs = 1;  // shift or move
3093   if (Universe::narrow_klass_base() != 0) num_instrs = 7;  // shift + load const + add
3094   return num_instrs * BytesPerInstWord;
3095 }
3096 
3097 void MacroAssembler::decode_klass_not_null(Register dst, Register src) {
3098   assert(dst != R0, "Dst reg may not be R0, as R0 is used here.");
3099   if (src == noreg) src = dst;
3100   Register shifted_src = src;
3101   if (Universe::narrow_klass_shift() != 0 ||
3102       Universe::narrow_klass_base() == 0 && src != dst) {  // Move required.
3103     shifted_src = dst;
3104     sldi(shifted_src, src, Universe::narrow_klass_shift());
3105   }
3106   if (Universe::narrow_klass_base() != 0) {
3107     add_const_optimized(dst, shifted_src, Universe::narrow_klass_base(), R0);
3108   }
3109 }
3110 
3111 void MacroAssembler::load_klass(Register dst, Register src) {
3112   if (UseCompressedClassPointers) {
3113     lwz(dst, oopDesc::klass_offset_in_bytes(), src);
3114     // Attention: no null check here!
3115     decode_klass_not_null(dst, dst);
3116   } else {
3117     ld(dst, oopDesc::klass_offset_in_bytes(), src);
3118   }
3119 }
3120 
3121 // Clear Array
3122 // Kills both input registers. tmp == R0 is allowed.
3123 void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp) {
3124   // Procedure for large arrays (uses data cache block zero instruction).
3125     Label startloop, fast, fastloop, small_rest, restloop, done;
3126     const int cl_size         = VM_Version::L1_data_cache_line_size(),
3127               cl_dwords       = cl_size>>3,
3128               cl_dw_addr_bits = exact_log2(cl_dwords),
3129               dcbz_min        = 1;                     // Min count of dcbz executions, needs to be >0.
3130 
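         // Clear in three phases: 8-byte stores up to the next cache-line boundary,
         // then dcbz of whole cache lines, then 8-byte stores for the remaining dwords.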
3131 //2:
3132     cmpdi(CCR1, cnt_dwords, ((dcbz_min+1)<<cl_dw_addr_bits)-1); // Big enough? (ensure >=dcbz_min lines included).
3133     blt(CCR1, small_rest);                                      // Too small.
3134     rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits);           // Extract dword offset within first cache line.
3135     beq(CCR0, fast);                                            // Already 128byte aligned.
3136 
3137     subfic(tmp, tmp, cl_dwords);
3138     mtctr(tmp);                        // Set ctr to hit 128byte boundary (0<ctr<cl_dwords).
3139     subf(cnt_dwords, tmp, cnt_dwords); // rest.
3140     li(tmp, 0);
3141 //10:
3142   bind(startloop);                     // Clear at the beginning to reach 128byte boundary.
3143     std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
3144     addi(base_ptr, base_ptr, 8);
3145     bdnz(startloop);
3146 //13:
3147   bind(fast);                                  // Clear 128byte blocks.
3148     srdi(tmp, cnt_dwords, cl_dw_addr_bits);    // Loop count for 128byte loop (>0).
3149     andi(cnt_dwords, cnt_dwords, cl_dwords-1); // Rest in dwords.
3150     mtctr(tmp);                                // Load counter.
3151 //16:
3152   bind(fastloop);
3153     dcbz(base_ptr);                    // Clear 128byte aligned block.
3154     addi(base_ptr, base_ptr, cl_size);
3155     bdnz(fastloop);
3156     if (InsertEndGroupPPC64) { endgroup(); } else { nop(); }
3157 //20:
3158   bind(small_rest);
3159     cmpdi(CCR0, cnt_dwords, 0);        // size 0?
3160     beq(CCR0, done);                   // rest == 0
3161     li(tmp, 0);
3162     mtctr(cnt_dwords);                 // Load counter.
3163 //24:
3164   bind(restloop);                      // Clear rest.
3165     std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
3166     addi(base_ptr, base_ptr, 8);
3167     bdnz(restloop);
3168 //27:
3169   bind(done);
3170 }
3171 
3172 /////////////////////////////////////////// String intrinsics ////////////////////////////////////////////
3173 
3174 #ifdef COMPILER2
3175 // Intrinsics for CompactStrings
3176 
3177 // Compress char[] to byte[] by compressing 16 bytes at once.
3178 void MacroAssembler::string_compress_16(Register src, Register dst, Register cnt,
3179                                         Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5,
3180                                         Label& Lfailure) {
3181 
3182   const Register tmp0 = R0;
3183   assert_different_registers(src, dst, cnt, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5);
3184   Label Lloop, Lslow;
3185 
3186   // Check if cnt >= 8 (= 16 bytes)
3187   lis(tmp1, 0xFF);                // tmp1 = 0x00FF00FF00FF00FF
3188   srwi_(tmp2, cnt, 3);
3189   beq(CCR0, Lslow);
3190   ori(tmp1, tmp1, 0xFF);
3191   rldimi(tmp1, tmp1, 32, 0);
3192   mtctr(tmp2);
3193 
3194   // 2x unrolled loop
3195   bind(Lloop);
3196   ld(tmp2, 0, src);               // _0_1_2_3 (Big Endian)
3197   ld(tmp4, 8, src);               // _4_5_6_7
3198 
3199   orr(tmp0, tmp2, tmp4);
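       // tmp0 collects the bits of all 8 chars; the andc_ below is non-zero iff some
       // char has bits outside 0x00FF, i.e. the input is not latin1-encodable.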
3200   rldicl(tmp3, tmp2, 6*8, 64-24); // _____1_2
3201   rldimi(tmp2, tmp2, 2*8, 2*8);   // _0_2_3_3
3202   rldicl(tmp5, tmp4, 6*8, 64-24); // _____5_6
3203   rldimi(tmp4, tmp4, 2*8, 2*8);   // _4_6_7_7
3204 
3205   andc_(tmp0, tmp0, tmp1);
3206   bne(CCR0, Lfailure);            // Not latin1.
3207   addi(src, src, 16);
3208 
3209   rlwimi(tmp3, tmp2, 0*8, 24, 31);// _____1_3
3210   srdi(tmp2, tmp2, 3*8);          // ____0_2_
3211   rlwimi(tmp5, tmp4, 0*8, 24, 31);// _____5_7
3212   srdi(tmp4, tmp4, 3*8);          // ____4_6_
3213 
3214   orr(tmp2, tmp2, tmp3);          // ____0123
3215   orr(tmp4, tmp4, tmp5);          // ____4567
3216 
3217   stw(tmp2, 0, dst);
3218   stw(tmp4, 4, dst);
3219   addi(dst, dst, 8);
3220   bdnz(Lloop);
3221 
3222   bind(Lslow);                    // Fallback to slow version
3223 }
3224 
3225 // Compress char[] to byte[]. cnt must be positive int.
3226 void MacroAssembler::string_compress(Register src, Register dst, Register cnt, Register tmp, Label& Lfailure) {
3227   Label Lloop;
3228   mtctr(cnt);
3229 
3230   bind(Lloop);
3231   lhz(tmp, 0, src);
3232   cmplwi(CCR0, tmp, 0xff);
3233   bgt(CCR0, Lfailure);            // Not latin1.
3234   addi(src, src, 2);
3235   stb(tmp, 0, dst);
3236   addi(dst, dst, 1);
3237   bdnz(Lloop);
3238 }
3239 
3240 // Inflate byte[] to char[] by inflating 16 bytes at once.
3241 void MacroAssembler::string_inflate_16(Register src, Register dst, Register cnt,
3242                                        Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) {
3243   const Register tmp0 = R0;
3244   assert_different_registers(src, dst, cnt, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5);
3245   Label Lloop, Lslow;
3246 
3247   // Check if cnt >= 8
3248   srwi_(tmp2, cnt, 3);
3249   beq(CCR0, Lslow);
3250   lis(tmp1, 0xFF);                // tmp1 = 0x00FF00FF
3251   ori(tmp1, tmp1, 0xFF);
3252   mtctr(tmp2);
3253 
3254   // 2x unrolled loop
3255   bind(Lloop);
3256   lwz(tmp2, 0, src);              // ____0123 (Big Endian)
3257   lwz(tmp4, 4, src);              // ____4567
3258   addi(src, src, 8);
3259 
3260   rldicl(tmp3, tmp2, 7*8, 64-8);  // _______2
3261   rlwimi(tmp2, tmp2, 3*8, 16, 23);// ____0113
3262   rldicl(tmp5, tmp4, 7*8, 64-8);  // _______6
3263   rlwimi(tmp4, tmp4, 3*8, 16, 23);// ____4557
3264 
3265   andc(tmp0, tmp2, tmp1);         // ____0_1_
3266   rlwimi(tmp2, tmp3, 2*8, 0, 23); // _____2_3
3267   andc(tmp3, tmp4, tmp1);         // ____4_5_
3268   rlwimi(tmp4, tmp5, 2*8, 0, 23); // _____6_7
3269 
3270   rldimi(tmp2, tmp0, 3*8, 0*8);   // _0_1_2_3
3271   rldimi(tmp4, tmp3, 3*8, 0*8);   // _4_5_6_7
3272 
3273   std(tmp2, 0, dst);
3274   std(tmp4, 8, dst);
3275   addi(dst, dst, 16);
3276   bdnz(Lloop);
3277 
3278   bind(Lslow);                    // Fallback to slow version
3279 }
3280 
3281 // Inflate byte[] to char[]. cnt must be positive int.
3282 void MacroAssembler::string_inflate(Register src, Register dst, Register cnt, Register tmp) {
3283   Label Lloop;
3284   mtctr(cnt);
3285 
3286   bind(Lloop);
3287   lbz(tmp, 0, src);
3288   addi(src, src, 1);
3289   sth(tmp, 0, dst);
3290   addi(dst, dst, 2);
3291   bdnz(Lloop);
3292 }
3293 
3294 void MacroAssembler::string_compare(Register str1, Register str2,
3295                                     Register cnt1, Register cnt2,
3296                                     Register tmp1, Register result, int ae) {
3297   const Register tmp0 = R0,
3298                  diff = tmp1;
3299 
3300   assert_different_registers(str1, str2, cnt1, cnt2, tmp0, tmp1, result);
3301   Label Ldone, Lslow, Lloop, Lreturn_diff;
3302 
3303   // Note: Making use of the fact that compareTo(a, b) == -compareTo(b, a)
3304   // we interchange str1 and str2 in the UL case and negate the result.
3305   // Like this, str1 is always latin1 encoded, except for the UU case.
3306   // In addition, we need to zero-extend the counts (sign extension would give the
3307   // same result since the sign bit is 0).
3307 
3308   if (ae == StrIntrinsicNode::UU) {
3309     srwi(cnt1, cnt1, 1);
3310   } else {
3311     clrldi(cnt1, cnt1, 32);
3312   }
3313 
3314   if (ae != StrIntrinsicNode::LL) {
3315     srwi(cnt2, cnt2, 1);
3316   } else {
3317     clrldi(cnt2, cnt2, 32);
3318   }
3319 
3320   // See if the lengths are different, and calculate min in cnt1.
3321   // Save diff in case we need it for a tie-breaker.
3322   subf_(diff, cnt2, cnt1); // diff = cnt1 - cnt2
3323   // if (diff > 0) { cnt1 = cnt2; }
3324   if (VM_Version::has_isel()) {
3325     isel(cnt1, CCR0, Assembler::greater, /*invert*/ false, cnt2);
3326   } else {
3327     Label Lskip;
3328     blt(CCR0, Lskip);
3329     mr(cnt1, cnt2);
3330     bind(Lskip);
3331   }
3332 
3333   // Rename registers
3334   Register chr1 = result;
3335   Register chr2 = tmp0;
3336 
3337   // Compare multiple characters in fast loop (only implemented for same encoding).
3338   int stride1 = 8, stride2 = 8;
3339   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3340     int log2_chars_per_iter = (ae == StrIntrinsicNode::LL) ? 3 : 2;
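         // The fast loop compares 8 bytes per iteration: 8 latin1 or 4 UTF-16 chars.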
3341     Label Lfastloop, Lskipfast;
3342 
3343     srwi_(tmp0, cnt1, log2_chars_per_iter);
3344     beq(CCR0, Lskipfast);
3345     rldicl(cnt2, cnt1, 0, 64 - log2_chars_per_iter); // Remaining characters.
3346     li(cnt1, 1 << log2_chars_per_iter); // Initialize for failure case: Rescan characters from current iteration.
3347     mtctr(tmp0);
3348 
3349     bind(Lfastloop);
3350     ld(chr1, 0, str1);
3351     ld(chr2, 0, str2);
3352     cmpd(CCR0, chr1, chr2);
3353     bne(CCR0, Lslow);
3354     addi(str1, str1, stride1);
3355     addi(str2, str2, stride2);
3356     bdnz(Lfastloop);
3357     mr(cnt1, cnt2); // Remaining characters.
3358     bind(Lskipfast);
3359   }
3360 
3361   // Loop which searches the first difference character by character.
3362   cmpwi(CCR0, cnt1, 0);
3363   beq(CCR0, Lreturn_diff);
3364   bind(Lslow);
3365   mtctr(cnt1);
3366 
3367   switch (ae) {
3368     case StrIntrinsicNode::LL: stride1 = 1; stride2 = 1; break;
3369     case StrIntrinsicNode::UL: // fallthru (see comment above)
3370     case StrIntrinsicNode::LU: stride1 = 1; stride2 = 2; break;
3371     case StrIntrinsicNode::UU: stride1 = 2; stride2 = 2; break;
3372     default: ShouldNotReachHere(); break;
3373   }
3374 
3375   bind(Lloop);
3376   if (stride1 == 1) { lbz(chr1, 0, str1); } else { lhz(chr1, 0, str1); }
3377   if (stride2 == 1) { lbz(chr2, 0, str2); } else { lhz(chr2, 0, str2); }
3378   subf_(result, chr2, chr1); // result = chr1 - chr2
3379   bne(CCR0, Ldone);
3380   addi(str1, str1, stride1);
3381   addi(str2, str2, stride2);
3382   bdnz(Lloop);
3383 
3384   // If strings are equal up to min length, return the length difference.
3385   bind(Lreturn_diff);
3386   mr(result, diff);
3387 
3388   // Otherwise, return the difference between the first mismatched chars.
3389   bind(Ldone);
3390   if (ae == StrIntrinsicNode::UL) {
3391     neg(result, result); // Negate result (see note above).
3392   }
3393 }
3394 
3395 void MacroAssembler::array_equals(bool is_array_equ, Register ary1, Register ary2,
3396                                   Register limit, Register tmp1, Register result, bool is_byte) {
3397   const Register tmp0 = R0;
3398   assert_different_registers(ary1, ary2, limit, tmp0, tmp1, result);
3399   Label Ldone, Lskiploop, Lloop, Lfastloop, Lskipfast;
3400   bool limit_needs_shift = false;
3401 
3402   if (is_array_equ) {
3403     const int length_offset = arrayOopDesc::length_offset_in_bytes();
3404     const int base_offset   = arrayOopDesc::base_offset_in_bytes(is_byte ? T_BYTE : T_CHAR);
3405 
3406     // Return true if the same array.
3407     cmpd(CCR0, ary1, ary2);
3408     beq(CCR0, Lskiploop);
3409 
3410     // Return false if one of them is NULL.
3411     cmpdi(CCR0, ary1, 0);
3412     cmpdi(CCR1, ary2, 0);
3413     li(result, 0);
3414     cror(CCR0, Assembler::equal, CCR1, Assembler::equal);
3415     beq(CCR0, Ldone);
3416 
3417     // Load the lengths of arrays.
3418     lwz(limit, length_offset, ary1);
3419     lwz(tmp0, length_offset, ary2);
3420 
3421     // Return false if the two arrays are not equal length.
3422     cmpw(CCR0, limit, tmp0);
3423     bne(CCR0, Ldone);
3424 
3425     // Load array addresses.
3426     addi(ary1, ary1, base_offset);
3427     addi(ary2, ary2, base_offset);
3428   } else {
3429     limit_needs_shift = !is_byte;
3430     li(result, 0); // Assume not equal.
3431   }
3432 
3433   // Rename registers
3434   Register chr1 = tmp0;
3435   Register chr2 = tmp1;
3436 
3437   // Compare 8 bytes per iteration in fast loop.
3438   const int log2_chars_per_iter = is_byte ? 3 : 2;
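       // That is 8 byte elements or 4 char elements per iteration.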
3439 
3440   srwi_(tmp0, limit, log2_chars_per_iter + (limit_needs_shift ? 1 : 0));
3441   beq(CCR0, Lskipfast);
3442   mtctr(tmp0);
3443 
3444   bind(Lfastloop);
3445   ld(chr1, 0, ary1);
3446   ld(chr2, 0, ary2);
3447   addi(ary1, ary1, 8);
3448   addi(ary2, ary2, 8);
3449   cmpd(CCR0, chr1, chr2);
3450   bne(CCR0, Ldone);
3451   bdnz(Lfastloop);
3452 
3453   bind(Lskipfast);
3454   rldicl_(limit, limit, limit_needs_shift ? 64 - 1 : 0, 64 - log2_chars_per_iter); // Remaining characters.
3455   beq(CCR0, Lskiploop);
3456   mtctr(limit);
3457 
3458   // Character by character.
3459   bind(Lloop);
3460   if (is_byte) {
3461     lbz(chr1, 0, ary1);
3462     lbz(chr2, 0, ary2);
3463     addi(ary1, ary1, 1);
3464     addi(ary2, ary2, 1);
3465   } else {
3466     lhz(chr1, 0, ary1);
3467     lhz(chr2, 0, ary2);
3468     addi(ary1, ary1, 2);
3469     addi(ary2, ary2, 2);
3470   }
3471   cmpw(CCR0, chr1, chr2);
3472   bne(CCR0, Ldone);
3473   bdnz(Lloop);
3474 
3475   bind(Lskiploop);
3476   li(result, 1); // All characters are equal.
3477   bind(Ldone);
3478 }
3479 
3480 void MacroAssembler::string_indexof(Register result, Register haystack, Register haycnt,
3481                                     Register needle, ciTypeArray* needle_values, Register needlecnt, int needlecntval,
3482                                     Register tmp1, Register tmp2, Register tmp3, Register tmp4, int ae) {
3483 
3484   // Ensure 0<needlecnt<=haycnt in ideal graph as prerequisite!
3485   Label L_TooShort, L_Found, L_NotFound, L_End;
3486   Register last_addr = haycnt, // Kill haycnt at the beginning.
3487   addr      = tmp1,
3488   n_start   = tmp2,
3489   ch1       = tmp3,
3490   ch2       = R0;
3491 
3492   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
3493   const int h_csize = (ae == StrIntrinsicNode::LL) ? 1 : 2;
3494   const int n_csize = (ae == StrIntrinsicNode::UU) ? 2 : 1;
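       // h_csize/n_csize give bytes per haystack/needle character; with UL the
       // haystack is UTF-16 while the needle is latin1.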
3495 
3496   // **************************************************************************************************
3497   // Prepare for main loop: optimized for needle count >=2, bail out otherwise.
3498   // **************************************************************************************************
3499 
3500   // Compute last haystack addr to use if no match gets found.
3501   clrldi(haycnt, haycnt, 32);         // Ensure positive int is valid as 64 bit value.
3502   addi(addr, haystack, -h_csize);     // Accesses use pre-increment.
3503   if (needlecntval == 0) { // variable needlecnt
3504    cmpwi(CCR6, needlecnt, 2);
3505    clrldi(needlecnt, needlecnt, 32);  // Ensure positive int is valid as 64 bit value.
3506    blt(CCR6, L_TooShort);             // Variable needlecnt: handle short needle separately.
3507   }
3508 
3509   if (n_csize == 2) { lwz(n_start, 0, needle); } else { lhz(n_start, 0, needle); } // Load first 2 characters of needle.
3510 
3511   if (needlecntval == 0) { // variable needlecnt
3512    subf(ch1, needlecnt, haycnt);      // Last character index to compare is haycnt-needlecnt.
3513    addi(needlecnt, needlecnt, -2);    // Rest of needle.
3514   } else { // constant needlecnt
3515   guarantee(needlecntval != 1, "IndexOf with single-character needle must be handled separately");
3516   assert((needlecntval & 0x7fff) == needlecntval, "wrong immediate");
3517    addi(ch1, haycnt, -needlecntval);  // Last character index to compare is haycnt-needlecnt.
3518    if (needlecntval > 3) { li(needlecnt, needlecntval - 2); } // Rest of needle.
3519   }
3520 
3521   if (h_csize == 2) { slwi(ch1, ch1, 1); } // Scale to number of bytes.
3522 
3523   if (ae == StrIntrinsicNode::UL) {
3524    srwi(tmp4, n_start, 1*8);          // ___0
3525    rlwimi(n_start, tmp4, 2*8, 0, 23); // _0_1
3526   }
3527 
3528   add(last_addr, haystack, ch1);      // Point to last address to compare (haystack+2*(haycnt-needlecnt)).
3529 
3530   // Main Loop (now we have at least 2 characters).
3531   Label L_OuterLoop, L_InnerLoop, L_FinalCheck, L_Comp1, L_Comp2;
3532   bind(L_OuterLoop); // Search for 1st 2 characters.
3533   Register addr_diff = tmp4;
3534    subf(addr_diff, addr, last_addr);  // Difference between already checked address and last address to check.
3535    addi(addr, addr, h_csize);         // This is the new address we want to use for comparing.
3536    srdi_(ch2, addr_diff, h_csize);
3537    beq(CCR0, L_FinalCheck);           // 2 characters left?
3538    mtctr(ch2);                        // num of characters / 2
3539   bind(L_InnerLoop);                  // Main work horse (2x unrolled search loop)
3540    if (h_csize == 2) {                // Load 2 characters of haystack (ignore alignment).
3541     lwz(ch1, 0, addr);
3542     lwz(ch2, 2, addr);
3543    } else {
3544     lhz(ch1, 0, addr);
3545     lhz(ch2, 1, addr);
3546    }
3547    cmpw(CCR0, ch1, n_start);          // Compare 2 characters (1 would be sufficient but try to reduce branches to CompLoop).
3548    cmpw(CCR1, ch2, n_start);
3549    beq(CCR0, L_Comp1);                // Did we find the needle start?
3550    beq(CCR1, L_Comp2);
3551    addi(addr, addr, 2 * h_csize);
3552    bdnz(L_InnerLoop);
3553   bind(L_FinalCheck);
3554    andi_(addr_diff, addr_diff, h_csize); // Remaining characters not covered by InnerLoop: (num of characters) & 1.
3555    beq(CCR0, L_NotFound);
3556    if (h_csize == 2) { lwz(ch1, 0, addr); } else { lhz(ch1, 0, addr); } // One position left at which we have to compare.
3557    cmpw(CCR1, ch1, n_start);
3558    beq(CCR1, L_Comp1);
3559   bind(L_NotFound);
3560    li(result, -1);                    // not found
3561    b(L_End);
3562 
3563    // **************************************************************************************************
3564    // Special Case: unfortunately, the variable needle case can be called with needlecnt<2
3565    // **************************************************************************************************
3566   if (needlecntval == 0) {           // We have to handle these cases separately.
3567   Label L_OneCharLoop;
3568   bind(L_TooShort);
3569    mtctr(haycnt);
3570    if (n_csize == 2) { lhz(n_start, 0, needle); } else { lbz(n_start, 0, needle); } // First character of needle
3571   bind(L_OneCharLoop);
3572    if (h_csize == 2) { lhzu(ch1, 2, addr); } else { lbzu(ch1, 1, addr); }
3573    cmpw(CCR1, ch1, n_start);
3574    beq(CCR1, L_Found);               // Did we find the one character needle?
3575    bdnz(L_OneCharLoop);
3576    li(result, -1);                   // Not found.
3577    b(L_End);
3578   }
3579 
3580   // **************************************************************************************************
3581   // Regular Case Part II: compare rest of needle (first 2 characters have been compared already)
3582   // **************************************************************************************************
3583 
3584   // Compare the rest
3585   bind(L_Comp2);
3586    addi(addr, addr, h_csize);        // First comparison has failed, 2nd one hit.
3587   bind(L_Comp1);                     // Addr points to possible needle start.
3588   if (needlecntval != 2) {           // Const needlecnt==2?
3589    if (needlecntval != 3) {
3590     if (needlecntval == 0) { beq(CCR6, L_Found); } // Variable needlecnt==2?
3591     Register n_ind = tmp4,
3592              h_ind = n_ind;
3593     li(n_ind, 2 * n_csize);          // First 2 characters are already compared, use index 2.
3594     mtctr(needlecnt);                // Decremented by 2, still > 0.
3595    Label L_CompLoop;
3596    bind(L_CompLoop);
3597     if (ae == StrIntrinsicNode::UL) {
3598       h_ind = ch1;
3599       sldi(h_ind, n_ind, 1);
3600     }
3601     if (n_csize == 2) { lhzx(ch2, needle, n_ind); } else { lbzx(ch2, needle, n_ind); }
3602     if (h_csize == 2) { lhzx(ch1, addr, h_ind); } else { lbzx(ch1, addr, h_ind); }
3603     cmpw(CCR1, ch1, ch2);
3604     bne(CCR1, L_OuterLoop);
3605     addi(n_ind, n_ind, n_csize);
3606     bdnz(L_CompLoop);
3607    } else { // No loop required if there's only one needle character left.
3608     if (n_csize == 2) { lhz(ch2, 2 * 2, needle); } else { lbz(ch2, 2 * 1, needle); }
3609     if (h_csize == 2) { lhz(ch1, 2 * 2, addr); } else { lbz(ch1, 2 * 1, addr); }
3610     cmpw(CCR1, ch1, ch2);
3611     bne(CCR1, L_OuterLoop);
3612    }
3613   }
3614   // Return index ...
3615   bind(L_Found);
3616    subf(result, haystack, addr);     // relative to haystack, ...
3617    if (h_csize == 2) { srdi(result, result, 1); } // in characters.
3618   bind(L_End);
3619 } // string_indexof
3620 
3621 void MacroAssembler::string_indexof_char(Register result, Register haystack, Register haycnt,
3622                                          Register needle, jchar needleChar, Register tmp1, Register tmp2, bool is_byte) {
3623   assert_different_registers(haystack, haycnt, needle, tmp1, tmp2);
3624 
3625   Label L_InnerLoop, L_FinalCheck, L_Found1, L_Found2, L_NotFound, L_End;
3626   Register addr = tmp1,
3627            ch1 = tmp2,
3628            ch2 = R0;
3629 
3630   const int h_csize = is_byte ? 1 : 2;
3631 
3632 //4:
3633    srwi_(tmp2, haycnt, 1);   // Shift right by exact_log2(UNROLL_FACTOR).
3634    mr(addr, haystack);
3635    beq(CCR0, L_FinalCheck);
3636    mtctr(tmp2);              // Move to count register.
3637 //8:
3638   bind(L_InnerLoop);         // Main work horse (2x unrolled search loop).
3639    if (!is_byte) {
3640     lhz(ch1, 0, addr);
3641     lhz(ch2, 2, addr);
3642    } else {
3643     lbz(ch1, 0, addr);
3644     lbz(ch2, 1, addr);
3645    }
3646    (needle != R0) ? cmpw(CCR0, ch1, needle) : cmplwi(CCR0, ch1, (unsigned int)needleChar);
3647    (needle != R0) ? cmpw(CCR1, ch2, needle) : cmplwi(CCR1, ch2, (unsigned int)needleChar);
3648    beq(CCR0, L_Found1);      // Did we find the needle?
3649    beq(CCR1, L_Found2);
3650    addi(addr, addr, 2 * h_csize);
3651    bdnz(L_InnerLoop);
3652 //16:
3653   bind(L_FinalCheck);
3654    andi_(R0, haycnt, 1);
3655    beq(CCR0, L_NotFound);
3656    if (!is_byte) { lhz(ch1, 0, addr); } else { lbz(ch1, 0, addr); } // One position left at which we have to compare.
3657    (needle != R0) ? cmpw(CCR1, ch1, needle) : cmplwi(CCR1, ch1, (unsigned int)needleChar);
3658    beq(CCR1, L_Found1);
3659 //21:
3660   bind(L_NotFound);
3661    li(result, -1);           // Not found.
3662    b(L_End);
3663 
3664   bind(L_Found2);
3665    addi(addr, addr, h_csize);
3666 //24:
3667   bind(L_Found1);            // Return index ...
3668    subf(result, haystack, addr); // relative to haystack, ...
3669    if (!is_byte) { srdi(result, result, 1); } // in characters.
3670   bind(L_End);
3671 } // string_indexof_char
3672 
3673 
3674 void MacroAssembler::has_negatives(Register src, Register cnt, Register result,
3675                                    Register tmp1, Register tmp2) {
3676   const Register tmp0 = R0;
3677   assert_different_registers(src, result, cnt, tmp0, tmp1, tmp2);
3678   Label Lfastloop, Lslow, Lloop, Lnoneg, Ldone;
3679 
3680   // Check if cnt >= 8 (= 16 bytes)
3681   lis(tmp1, (int)(short)0x8080);  // tmp1 = 0x8080808080808080
3682   srwi_(tmp2, cnt, 4);
3683   li(result, 1);                  // Assume there's a negative byte.
3684   beq(CCR0, Lslow);
3685   ori(tmp1, tmp1, 0x8080);
3686   rldimi(tmp1, tmp1, 32, 0);
3687   mtctr(tmp2);
3688 
3689   // 2x unrolled loop
3690   bind(Lfastloop);
3691   ld(tmp2, 0, src);
3692   ld(tmp0, 8, src);
3693 
3694   orr(tmp0, tmp2, tmp0);
3695 
3696   and_(tmp0, tmp0, tmp1);
3697   bne(CCR0, Ldone);               // Found negative byte.
3698   addi(src, src, 16);
3699 
3700   bdnz(Lfastloop);
3701 
3702   bind(Lslow);                    // Fallback to slow version
3703   rldicl_(tmp0, cnt, 0, 64-4);
3704   beq(CCR0, Lnoneg);
3705   mtctr(tmp0);
3706   bind(Lloop);
3707   lbz(tmp0, 0, src);
3708   addi(src, src, 1);
3709   andi_(tmp0, tmp0, 0x80);
3710   bne(CCR0, Ldone);               // Found negative byte.
3711   bdnz(Lloop);
3712   bind(Lnoneg);
3713   li(result, 0);
3714 
3715   bind(Ldone);
3716 }
3717 
3718 
3719 // Intrinsics for non-CompactStrings
3720 
3721 // Search for a single jchar in a jchar[].
3722 //
3723 // Assumes that result differs from all other registers.
3724 //
3725 // 'haystack' is the address of a jchar-array.
3726 // 'needle' is either the character to search for or R0.
3727 // 'needleChar' is the character to search for if 'needle' == R0.
3728 // 'haycnt' is the length of the haystack. We assume 'haycnt' >=1.
3729 //
3730 // Preserves haystack, haycnt, needle and kills all other registers.
3731 //
3732 // If needle == R0, we search for the constant needleChar.
3733 void MacroAssembler::string_indexof_1(Register result, Register haystack, Register haycnt,
3734                                       Register needle, jchar needleChar,
3735                                       Register tmp1, Register tmp2) {
3736 
3737   assert_different_registers(result, haystack, haycnt, needle, tmp1, tmp2);
3738 
3739   Label L_InnerLoop, L_FinalCheck, L_Found1, L_Found2, L_Found3, L_NotFound, L_End;
3740   Register addr = tmp1,
3741            ch1 = tmp2,
3742            ch2 = R0;
3743 
3744 //3:
3745    dcbtct(haystack, 0x00);                        // Indicate R/O access to haystack.
3746 
3747    srwi_(tmp2, haycnt, 1);   // Shift right by exact_log2(UNROLL_FACTOR).
3748    mr(addr, haystack);
3749    beq(CCR0, L_FinalCheck);
3750    mtctr(tmp2);              // Move to count register.
3751 //8:
3752   bind(L_InnerLoop);             // Main work horse (2x unrolled search loop).
3753    lhz(ch1, 0, addr);        // Load characters from haystack.
3754    lhz(ch2, 2, addr);
3755    (needle != R0) ? cmpw(CCR0, ch1, needle) : cmplwi(CCR0, ch1, needleChar);
3756    (needle != R0) ? cmpw(CCR1, ch2, needle) : cmplwi(CCR1, ch2, needleChar);
3757    beq(CCR0, L_Found1);   // Did we find the needle?
3758    beq(CCR1, L_Found2);
3759    addi(addr, addr, 4);
3760    bdnz(L_InnerLoop);
3761 //16:
3762   bind(L_FinalCheck);
3763    andi_(R0, haycnt, 1);
3764    beq(CCR0, L_NotFound);
3765    lhz(ch1, 0, addr);        // One position left at which we have to compare.
3766    (needle != R0) ? cmpw(CCR1, ch1, needle) : cmplwi(CCR1, ch1, needleChar);
3767    beq(CCR1, L_Found3);
3768 //21:
3769   bind(L_NotFound);
3770    li(result, -1);           // Not found.
3771    b(L_End);
3772 
3773   bind(L_Found2);
3774    addi(addr, addr, 2);
3775 //24:
3776   bind(L_Found1);
3777   bind(L_Found3);                  // Return index ...
3778    subf(addr, haystack, addr); // relative to haystack,
3779    srdi(result, addr, 1);      // in characters.
3780   bind(L_End);
3781 }
3782 
3783 
3784 // Implementation of IndexOf for jchar arrays.
3785 //
3786 // The length of haystack and needle are not constant, i.e. passed in a register.
3787 //
3788 // Preserves registers haystack, needle.
3789 // Kills registers haycnt, needlecnt.
3790 // Assumes that result differs from all other registers.
3791 // Haystack, needle are the addresses of jchar-arrays.
3792 // Haycnt, needlecnt are the lengths of them, respectively.
3793 //
3794 // Needlecntval must be zero or 15-bit unsigned immediate and > 1.
3795 void MacroAssembler::string_indexof(Register result, Register haystack, Register haycnt,
3796                                     Register needle, ciTypeArray* needle_values, Register needlecnt, int needlecntval,
3797                                     Register tmp1, Register tmp2, Register tmp3, Register tmp4) {
3798 
3799   // Ensure 0<needlecnt<=haycnt in ideal graph as prerequisite!
3800   Label L_TooShort, L_Found, L_NotFound, L_End;
3801   Register last_addr = haycnt, // Kill haycnt at the beginning.
3802            addr      = tmp1,
3803            n_start   = tmp2,
3804            ch1       = tmp3,
3805            ch2       = R0;
3806 
3807   // **************************************************************************************************
3808   // Prepare for main loop: optimized for needle count >=2, bail out otherwise.
3809   // **************************************************************************************************
3810 
3811 //1 (variable) or 3 (const):
3812    dcbtct(needle, 0x00);    // Indicate R/O access to str1.
3813    dcbtct(haystack, 0x00);  // Indicate R/O access to str2.
3814 
3815   // Compute last haystack addr to use if no match gets found.
3816   if (needlecntval == 0) { // variable needlecnt
3817 //3:
3818    subf(ch1, needlecnt, haycnt);      // Last character index to compare is haycnt-needlecnt.
3819    addi(addr, haystack, -2);          // Accesses use pre-increment.
3820    cmpwi(CCR6, needlecnt, 2);
3821    blt(CCR6, L_TooShort);          // Variable needlecnt: handle short needle separately.
3822    slwi(ch1, ch1, 1);                 // Scale to number of bytes.
3823    lwz(n_start, 0, needle);           // Load first 2 characters of needle.
3824    add(last_addr, haystack, ch1);     // Point to last address to compare (haystack+2*(haycnt-needlecnt)).
3825    addi(needlecnt, needlecnt, -2);    // Rest of needle.
3826   } else { // constant needlecnt
3827   guarantee(needlecntval != 1, "IndexOf with single-character needle must be handled separately");
3828   assert((needlecntval & 0x7fff) == needlecntval, "wrong immediate");
3829 //5:
3830    addi(ch1, haycnt, -needlecntval);  // Last character index to compare is haycnt-needlecnt.
3831    lwz(n_start, 0, needle);           // Load first 2 characters of needle.
3832    addi(addr, haystack, -2);          // Accesses use pre-increment.
3833    slwi(ch1, ch1, 1);                 // Scale to number of bytes.
3834    add(last_addr, haystack, ch1);     // Point to last address to compare (haystack+2*(haycnt-needlecnt)).
3835    li(needlecnt, needlecntval-2);     // Rest of needle.
3836   }
3837 
3838   // Main Loop (now we have at least 3 characters).
3839 //11:
3840   Label L_OuterLoop, L_InnerLoop, L_FinalCheck, L_Comp1, L_Comp2, L_Comp3;
3841   bind(L_OuterLoop); // Search for 1st 2 characters.
3842   Register addr_diff = tmp4;
3843    subf(addr_diff, addr, last_addr); // Difference between already checked address and last address to check.
3844    addi(addr, addr, 2);              // This is the new address we want to use for comparing.
3845    srdi_(ch2, addr_diff, 2);
3846    beq(CCR0, L_FinalCheck);       // 2 characters left?
3847    mtctr(ch2);                       // addr_diff/4
3848 //16:
3849   bind(L_InnerLoop);                // Main work horse (2x unrolled search loop)
3850    lwz(ch1, 0, addr);           // Load 2 characters of haystack (ignore alignment).
3851    lwz(ch2, 2, addr);
3852    cmpw(CCR0, ch1, n_start); // Compare 2 characters (1 would be sufficient but try to reduce branches to CompLoop).
3853    cmpw(CCR1, ch2, n_start);
3854    beq(CCR0, L_Comp1);       // Did we find the needle start?
3855    beq(CCR1, L_Comp2);
3856    addi(addr, addr, 4);
3857    bdnz(L_InnerLoop);
3858 //24:
3859   bind(L_FinalCheck);
3860    rldicl_(addr_diff, addr_diff, 64-1, 63); // Remaining characters not covered by InnerLoop: (addr_diff>>1)&1.
3861    beq(CCR0, L_NotFound);
3862    lwz(ch1, 0, addr);                       // One position left at which we have to compare.
3863    cmpw(CCR1, ch1, n_start);
3864    beq(CCR1, L_Comp3);
3865 //29:
3866   bind(L_NotFound);
3867    li(result, -1); // not found
3868    b(L_End);
3869 
3870 
3871    // **************************************************************************************************
3872    // Special Case: unfortunately, the variable needle case can be called with needlecnt<2
3873    // **************************************************************************************************
3874 //31:
3875  if ((needlecntval >> 1) != 1) { // Const needlecnt is 2 or 3? Reduce code size.
3876   int nopcnt = 5;
3877   if (needlecntval != 0) ++nopcnt; // Balance alignment (other case: see below).
3878   if (needlecntval == 0) {         // We have to handle these cases separately.
3879   Label L_OneCharLoop;
3880   bind(L_TooShort);
3881    mtctr(haycnt);
3882    lhz(n_start, 0, needle);    // First character of needle
3883   bind(L_OneCharLoop);
3884    lhzu(ch1, 2, addr);
3885    cmpw(CCR1, ch1, n_start);
3886    beq(CCR1, L_Found);      // Did we find the one character needle?
3887    bdnz(L_OneCharLoop);
3888    li(result, -1);             // Not found.
3889    b(L_End);
3890   } // 8 instructions, so no impact on alignment.
3891   for (int x = 0; x < nopcnt; ++x) nop();
3892  }
3893 
3894   // **************************************************************************************************
3895   // Regular Case Part II: compare rest of needle (first 2 characters have been compared already)
3896   // **************************************************************************************************
3897 
3898   // Compare the rest
3899 //36 if needlecntval==0, else 37:
3900   bind(L_Comp2);
3901    addi(addr, addr, 2); // First comparison has failed, 2nd one hit.
3902   bind(L_Comp1);            // Addr points to possible needle start.
3903   bind(L_Comp3);            // Could have created a copy and used a different return address, but we save code size here.
3904   if (needlecntval != 2) {  // Const needlecnt==2?
3905    if (needlecntval != 3) {
3906     if (needlecntval == 0) beq(CCR6, L_Found); // Variable needlecnt==2?
3907     Register ind_reg = tmp4;
3908     li(ind_reg, 2*2);   // First 2 characters are already compared, use index 2.
3909     mtctr(needlecnt);   // Decremented by 2, still > 0.
3910 //40:
3911    Label L_CompLoop;
3912    bind(L_CompLoop);
3913     lhzx(ch2, needle, ind_reg);
3914     lhzx(ch1, addr, ind_reg);
3915     cmpw(CCR1, ch1, ch2);
3916     bne(CCR1, L_OuterLoop);
3917     addi(ind_reg, ind_reg, 2);
3918     bdnz(L_CompLoop);
3919    } else { // No loop required if there's only one needle character left.
3920     lhz(ch2, 2*2, needle);
3921     lhz(ch1, 2*2, addr);
3922     cmpw(CCR1, ch1, ch2);
3923     bne(CCR1, L_OuterLoop);
3924    }
3925   }
3926   // Return index ...
3927 //46:
3928   bind(L_Found);
3929    subf(addr, haystack, addr); // relative to haystack, ...
3930    srdi(result, addr, 1);      // in characters.
3931 //48:
3932   bind(L_End);
3933 }
3934 
3935 // Implementation of Compare for jchar arrays.
3936 //
3937 // Kills the registers str1, str2, cnt1, cnt2.
3938 // Kills cr0, ctr.
3939 // Assumes that result differs from the input registers.
3940 void MacroAssembler::string_compare(Register str1_reg, Register str2_reg, Register cnt1_reg, Register cnt2_reg,
3941                                     Register result_reg, Register tmp_reg) {
3942    assert_different_registers(result_reg, str1_reg, str2_reg, cnt1_reg, cnt2_reg, tmp_reg);
3943 
3944    Label Ldone, Lslow_case, Lslow_loop, Lfast_loop;
3945    Register cnt_diff = R0,
3946             limit_reg = cnt1_reg,
3947             chr1_reg = result_reg,
3948             chr2_reg = cnt2_reg,
3949             addr_diff = str2_reg;
3950 
3951    // 'cnt_reg' contains the number of characters in the string's character array for the
3952    // pre-CompactStrings strings implementation and the number of bytes in the string's
3953    // byte array for the CompactStrings strings implementation.
3954    const int HAS_COMPACT_STRING = java_lang_String::has_coder_field() ? 1 : 0; // '1' = byte array, '0' = char array
3955 
3956    // Offset 0 should be 32 byte aligned.
3957 //-6:
3958     srawi(cnt1_reg, cnt1_reg, HAS_COMPACT_STRING);
3959     srawi(cnt2_reg, cnt2_reg, HAS_COMPACT_STRING);
3960 //-4:
3961     dcbtct(str1_reg, 0x00);  // Indicate R/O access to str1.
3962     dcbtct(str2_reg, 0x00);  // Indicate R/O access to str2.
3963 //-2:
3964    // Compute min(cnt1, cnt2) and check if 0 (bail out if we don't need to compare characters).
3965     subf(result_reg, cnt2_reg, cnt1_reg);  // difference between cnt1/2
3966     subf_(addr_diff, str1_reg, str2_reg);  // alias?
3967     beq(CCR0, Ldone);                   // return cnt difference if both ones are identical
3968     srawi(limit_reg, result_reg, 31);      // generate signmask (cnt1/2 must be non-negative so cnt_diff can't overflow)
3969     mr(cnt_diff, result_reg);
3970     andr(limit_reg, result_reg, limit_reg); // difference or zero (negative): cnt1<cnt2 ? cnt1-cnt2 : 0
3971     add_(limit_reg, cnt2_reg, limit_reg);  // min(cnt1, cnt2)==0?
3972     beq(CCR0, Ldone);                   // return cnt difference if one has 0 length
3973 
3974     lhz(chr1_reg, 0, str1_reg);            // optional: early out if first characters mismatch
3975     lhzx(chr2_reg, str1_reg, addr_diff);   // optional: early out if first characters mismatch
3976     addi(tmp_reg, limit_reg, -1);          // min(cnt1, cnt2)-1
3977     subf_(result_reg, chr2_reg, chr1_reg); // optional: early out if first characters mismatch
3978     bne(CCR0, Ldone);                   // optional: early out if first characters mismatch
3979 
3980    // Set loop counter by scaling down tmp_reg
3981     srawi_(chr2_reg, tmp_reg, exact_log2(4)); // (min(cnt1, cnt2)-1)/4
3982     ble(CCR0, Lslow_case);                 // need >4 characters for fast loop
3983     andi(limit_reg, tmp_reg, 4-1);            // remaining characters
3984 
3985    // Adapt str1_reg str2_reg for the first loop iteration
3986     mtctr(chr2_reg);                 // (min(cnt1, cnt2)-1)/4
3987     addi(limit_reg, limit_reg, 4+1); // compare last 5-8 characters in slow_case if mismatch found in fast_loop
3988 //16:
3989    // Compare the rest of the characters
3990    bind(Lfast_loop);
3991     ld(chr1_reg, 0, str1_reg);
3992     ldx(chr2_reg, str1_reg, addr_diff);
3993     cmpd(CCR0, chr2_reg, chr1_reg);
3994     bne(CCR0, Lslow_case); // return chr1_reg
3995     addi(str1_reg, str1_reg, 4*2);
3996     bdnz(Lfast_loop);
3997     addi(limit_reg, limit_reg, -4); // no mismatch found in fast_loop, only 1-4 characters missing
3998 //23:
3999    bind(Lslow_case);
4000     mtctr(limit_reg);
4001 //24:
4002    bind(Lslow_loop);
4003     lhz(chr1_reg, 0, str1_reg);
4004     lhzx(chr2_reg, str1_reg, addr_diff);
4005     subf_(result_reg, chr2_reg, chr1_reg);
4006     bne(CCR0, Ldone); // return chr1_reg
4007     addi(str1_reg, str1_reg, 1*2);
4008     bdnz(Lslow_loop);
4009 //30:
4010    // If strings are equal up to min length, return the length difference.
4011     mr(result_reg, cnt_diff);
4012     nop(); // alignment
4013 //32:
4014    // Otherwise, return the difference between the first mismatched chars.
4015    bind(Ldone);
4016 }
4017 
4018 
4019 // Compare char[] arrays.
4020 //
4021 // str1_reg   USE only
4022 // str2_reg   USE only
4023 // cnt_reg    USE_DEF, due to tmp reg shortage
4024 // result_reg DEF only, might clobber USE-only registers
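//
// Conceptually this emits (a sketch, not emitted code):
//   result = 1;
//   for (int i = 0; i < cnt; i++) {
//     if (str1[i] != str2[i]) { result = 0; break; }
//   }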
4025 void MacroAssembler::char_arrays_equals(Register str1_reg, Register str2_reg, Register cnt_reg, Register result_reg,
4026                                         Register tmp1_reg, Register tmp2_reg, Register tmp3_reg, Register tmp4_reg,
4027                                         Register tmp5_reg) {
4028 
4029   // Str1 may be the same register as str2 which can occur e.g. after scalar replacement.
4030   assert_different_registers(result_reg, str1_reg, cnt_reg, tmp1_reg, tmp2_reg, tmp3_reg, tmp4_reg, tmp5_reg);
4031   assert_different_registers(result_reg, str2_reg, cnt_reg, tmp1_reg, tmp2_reg, tmp3_reg, tmp4_reg, tmp5_reg);
4032 
4033   // Offset 0 should be 32 byte aligned.
4034   Label Linit_cbc, Lcbc, Lloop, Ldone_true, Ldone_false;
4035   Register index_reg = tmp5_reg;
4036   Register cbc_iter  = tmp4_reg;
4037 
4038   // 'cnt_reg' contains the number of characters in the string's character array for the
4039   // pre-CompactStrings strings implementation and the number of bytes in the string's
4040   // byte array for the CompactStrings strings implementation.
4041   const int HAS_COMPACT_STRING = java_lang_String::has_coder_field() ? 1 : 0; // '1' = byte array, '0' = char array
4042 
4043 //-1:
4044   dcbtct(str1_reg, 0x00);  // Indicate R/O access to str1.
4045   dcbtct(str2_reg, 0x00);  // Indicate R/O access to str2.
4046 //1:
4047   // cbc_iter: remaining characters after the '4 java characters per iteration' loop.
4048   rlwinm(cbc_iter, cnt_reg, 32 - HAS_COMPACT_STRING, 30, 31); // (cnt_reg % (HAS_COMPACT_STRING ? 8 : 4)) >> HAS_COMPACT_STRING
4049   li(index_reg, 0); // init
4050   li(result_reg, 0); // assume false
4051   // tmp2_reg: units of 4 java characters (i.e. 8 bytes) per iteration (main loop).
4052   srwi_(tmp2_reg, cnt_reg, exact_log2(4 << HAS_COMPACT_STRING)); // cnt_reg / (HAS_COMPACT_STRING ? 8 : 4)
4053 
4054   cmpwi(CCR1, cbc_iter, 0);             // CCR1 = (cbc_iter==0)
4055   beq(CCR0, Linit_cbc);                 // too short
4056     mtctr(tmp2_reg);
4057 //8:
4058     bind(Lloop);
4059       ldx(tmp1_reg, str1_reg, index_reg);
4060       ldx(tmp2_reg, str2_reg, index_reg);
4061       cmpd(CCR0, tmp1_reg, tmp2_reg);
4062       bne(CCR0, Ldone_false);  // Unequal char pair found -> done.
4063       addi(index_reg, index_reg, 4*sizeof(jchar));
4064       bdnz(Lloop);
4065 //14:
4066   bind(Linit_cbc);
4067   beq(CCR1, Ldone_true);
4068     mtctr(cbc_iter);
4069 //16:
4070     bind(Lcbc);
4071       lhzx(tmp1_reg, str1_reg, index_reg);
4072       lhzx(tmp2_reg, str2_reg, index_reg);
4073       cmpw(CCR0, tmp1_reg, tmp2_reg);
4074       bne(CCR0, Ldone_false);  // Unequal char pair found -> done.
4075       addi(index_reg, index_reg, 1*sizeof(jchar));
4076       bdnz(Lcbc);
4077     nop();
4078   bind(Ldone_true);
4079   li(result_reg, 1);
4080 //24:
4081   bind(Ldone_false);
4082 }
4083 
4084 
4085 void MacroAssembler::char_arrays_equalsImm(Register str1_reg, Register str2_reg, int cntval, Register result_reg,
4086                                            Register tmp1_reg, Register tmp2_reg) {
4087   // Str1 may be the same register as str2 which can occur e.g. after scalar replacement.
4088   assert_different_registers(result_reg, str1_reg, tmp1_reg, tmp2_reg);
4089   assert_different_registers(result_reg, str2_reg, tmp1_reg, tmp2_reg);
4090   assert(sizeof(jchar) == 2, "must be");
4091   assert(cntval >= 0 && ((cntval & 0x7fff) == cntval), "wrong immediate");
4092 
4093   // 'cntval' contains the number of characters in the string's character array for the
4094   // pre-CompactStrings strings implementation and the number of bytes in the string's
4095   // byte array for the CompactStrings strings implementation.
4096   cntval >>= (java_lang_String::has_coder_field() ? 1 : 0); // '1' = byte array strings, '0' = char array strings
4097 
4098   Label Ldone_false;
4099 
4100   if (cntval < 16) { // short case
4101     if (cntval != 0) li(result_reg, 0); // assume false
4102 
4103     const int num_bytes = cntval*sizeof(jchar);
4104     int index = 0;
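    // Fully unrolled compare: 4 chars (8 bytes) per ld/cmpd step below, then one
    // 2-char (4-byte) step if cntval & 2, then a single char if cntval & 1.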
4105     for (int next_index; (next_index = index + 8) <= num_bytes; index = next_index) {
4106       ld(tmp1_reg, index, str1_reg);
4107       ld(tmp2_reg, index, str2_reg);
4108       cmpd(CCR0, tmp1_reg, tmp2_reg);
4109       bne(CCR0, Ldone_false);
4110     }
4111     if (cntval & 2) {
4112       lwz(tmp1_reg, index, str1_reg);
4113       lwz(tmp2_reg, index, str2_reg);
4114       cmpw(CCR0, tmp1_reg, tmp2_reg);
4115       bne(CCR0, Ldone_false);
4116       index += 4;
4117     }
4118     if (cntval & 1) {
4119       lhz(tmp1_reg, index, str1_reg);
4120       lhz(tmp2_reg, index, str2_reg);
4121       cmpw(CCR0, tmp1_reg, tmp2_reg);
4122       bne(CCR0, Ldone_false);
4123     }
4124     // fallthrough: true
4125   } else {
4126     Label Lloop;
4127     Register index_reg = tmp1_reg;
4128     const int loopcnt = cntval/4;
4129     assert(loopcnt > 0, "must be");
4130     // Offset 0 should be 32 byte aligned.
4131     //2:
4132     dcbtct(str1_reg, 0x00);  // Indicate R/O access to str1.
4133     dcbtct(str2_reg, 0x00);  // Indicate R/O access to str2.
4134     li(tmp2_reg, loopcnt);
4135     li(index_reg, 0); // init
4136     li(result_reg, 0); // assume false
4137     mtctr(tmp2_reg);
4138     //8:
4139     bind(Lloop);
4140     ldx(R0, str1_reg, index_reg);
4141     ldx(tmp2_reg, str2_reg, index_reg);
4142     cmpd(CCR0, R0, tmp2_reg);
4143     bne(CCR0, Ldone_false);  // Unequal char pair found -> done.
4144     addi(index_reg, index_reg, 4*sizeof(jchar));
4145     bdnz(Lloop);
4146     //14:
4147     if (cntval & 2) {
4148       lwzx(R0, str1_reg, index_reg);
4149       lwzx(tmp2_reg, str2_reg, index_reg);
4150       cmpw(CCR0, R0, tmp2_reg);
4151       bne(CCR0, Ldone_false);
4152       if (cntval & 1) addi(index_reg, index_reg, 2*sizeof(jchar));
4153     }
4154     if (cntval & 1) {
4155       lhzx(R0, str1_reg, index_reg);
4156       lhzx(tmp2_reg, str2_reg, index_reg);
4157       cmpw(CCR0, R0, tmp2_reg);
4158       bne(CCR0, Ldone_false);
4159     }
4160     // fallthru: true
4161   }
4162   li(result_reg, 1);
4163   bind(Ldone_false);
4164 }
4165 
4166 #endif // COMPILER2
4167 
4168 // Helpers for Intrinsic Emitters
4169 //
4170 // Reverse the byte order of a 32-bit value in a register
4171 //   src: 0x44556677
4172 //   dst: 0x77665544
4173 // Three steps to obtain the result:
4174 //  1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word
4175 //     into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared.
4176 //     This value initializes dst.
4177 //  2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost
4178 //     byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go.
4179 //     This value is mask inserted into dst with a [0..23] mask of 1s.
4180 //  3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position.
4181 //     This value is mask inserted into dst with a [8..15] mask of 1s.
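// In C terms, the net effect is a plain 32-bit byte swap (a sketch, assuming the
// value lives in the low word of the register):
//   uint32_t byte_swap_32(uint32_t x) {
//     return (x << 24) | ((x & 0x0000ff00) << 8) | ((x >> 8) & 0x0000ff00) | (x >> 24);
//   }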
4182 void MacroAssembler::load_reverse_32(Register dst, Register src) {
4183   assert_different_registers(dst, src);
4184 
4185   rldicl(dst, src, (4+1)*8, 56);       // Rotate byte 4 into position 7 (rightmost), clear all to the left.
4186   rlwimi(dst, src,     3*8,  0, 23);   // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone.
4187   rlwimi(dst, src,     1*8,  8, 15);   // Insert byte 6 into position 5, leave the rest alone.
4188 }
4189 
4190 // Calculate the column addresses of the crc32 lookup table into distinct registers.
4191 // This loop-invariant calculation is moved out of the loop body, reducing the loop
4192 // body size from 20 to 16 instructions.
4193 // Returns the offset that was used to calculate the address of column tc3.
4194 // Due to register shortage, setting tc3 may overwrite table. With the return offset
4195 // at hand, the original table address can be easily reconstructed.
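// Callers restore the original table address after their main loop by subtracting
// the returned offset again, i.e. addi(table, table, -reconstructTableOffset).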
4196 int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) {
4197 
4198 #ifdef VM_LITTLE_ENDIAN
4199   // This is what we implement (the DOLIT4 part):
4200   // =========================================================================
4201   // #define DOLIT4 c ^= *buf4++; \
4202   //         c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^ \
4203   //             crc_table[1][(c >> 16) & 0xff] ^ crc_table[0][c >> 24]
4204   // #define DOLIT32 DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4
4205   // =========================================================================
4206   const int ix0 = 3*(4*CRC32_COLUMN_SIZE);
4207   const int ix1 = 2*(4*CRC32_COLUMN_SIZE);
4208   const int ix2 = 1*(4*CRC32_COLUMN_SIZE);
4209   const int ix3 = 0*(4*CRC32_COLUMN_SIZE);
4210 #else
4211   // This is what we implement (the DOBIG4 part):
4212   // =========================================================================
4213   // #define DOBIG4 c ^= *++buf4; \
4214   //         c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \
4215   //             crc_table[6][(c >> 16) & 0xff] ^ crc_table[7][c >> 24]
4216   // #define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4
4217   // =========================================================================
4218   const int ix0 = 4*(4*CRC32_COLUMN_SIZE);
4219   const int ix1 = 5*(4*CRC32_COLUMN_SIZE);
4220   const int ix2 = 6*(4*CRC32_COLUMN_SIZE);
4221   const int ix3 = 7*(4*CRC32_COLUMN_SIZE);
4222 #endif
4223   assert_different_registers(table, tc0, tc1, tc2);
4224   assert(table == tc3, "must be!");
4225 
4226   addi(tc0, table, ix0);
4227   addi(tc1, table, ix1);
4228   addi(tc2, table, ix2);
4229   if (ix3 != 0) addi(tc3, table, ix3);
4230 
4231   return ix3;
4232 }
4233 
4234 /**
4235  * uint32_t crc;
4236  * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
4237  */
4238 void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) {
4239   assert_different_registers(crc, table, tmp);
4240   assert_different_registers(val, table);
4241 
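  // The table entries are 4-byte words, so the byte value must be scaled by 4 to
  // form the index; this is the "shifted left by 2" performed by rlwinm below.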
4242   if (crc == val) {                   // Must rotate first to use the unmodified value.
4243     rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
4244                                       // As we use a word (4-byte) instruction, we have to adapt the mask bit positions.
4245     srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
4246   } else {
4247     srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
4248     rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
4249   }
4250   lwzx(tmp, table, tmp);
4251   xorr(crc, crc, tmp);
4252 }
4253 
4254 /**
4255  * uint32_t crc;
4256  * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
4257  */
4258 void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
4259   fold_byte_crc32(crc, crc, table, tmp);
4260 }
4261 
4262 /**
4263  * Emits code to update CRC-32 with a byte value according to constants in table.
4264  *
4265  * @param [in,out]crc   Register containing the crc.
4266  * @param [in]val       Register containing the byte to fold into the CRC.
4267  * @param [in]table     Register containing the table of crc constants.
4268  *
4269  * uint32_t crc;
4270  * val = crc_table[(val ^ crc) & 0xFF];
4271  * crc = val ^ (crc >> 8);
4272  */
4273 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
4274   BLOCK_COMMENT("update_byte_crc32:");
4275   xorr(val, val, crc);
4276   fold_byte_crc32(crc, val, table, val);
4277 }
4278 
4279 /**
4280  * @param crc   register containing existing CRC (32-bit)
4281  * @param buf   register pointing to input byte buffer (byte*)
4282  * @param len   register containing number of bytes
4283  * @param table register pointing to CRC table
4284  */
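// A C-level reference for the loop emitted here (a sketch, not emitted code):
//   if (invertCRC) crc = ~crc;
//   while (len-- > 0) crc = table[(crc ^ *buf++) & 0xff] ^ (crc >> 8);
//   if (invertCRC) crc = ~crc;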
4285 void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table,
4286                                            Register data, bool loopAlignment, bool invertCRC) {
4287   assert_different_registers(crc, buf, len, table, data);
4288 
4289   Label L_mainLoop, L_done;
4290   const int mainLoop_stepping  = 1;
4291   const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4;
4292 
4293   // Process all bytes in a single-byte loop.
4294   clrldi_(len, len, 32);                         // Enforce 32 bit. Anything to do?
4295   beq(CCR0, L_done);
4296 
4297   if (invertCRC) {
4298     nand(crc, crc, crc);                         // ~c
4299   }
4300 
4301   mtctr(len);
4302   align(mainLoop_alignment);
4303   BIND(L_mainLoop);
4304     lbz(data, 0, buf);                           // Byte from buffer, zero-extended.
4305     addi(buf, buf, mainLoop_stepping);           // Advance buffer position.
4306     update_byte_crc32(crc, data, table);
4307     bdnz(L_mainLoop);                            // Iterate.
4308 
4309   if (invertCRC) {
4310     nand(crc, crc, crc);                         // ~c
4311   }
4312 
4313   bind(L_done);
4314 }
4315 
4316 /**
4317  * Emits code to update CRC-32 with a 4-byte value according to constants in table
4318  * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c
4319  */
4320 // A note on the lookup table address(es):
4321 // The lookup table consists of two sets of four columns each.
4322 // The columns {0..3} are used for little-endian machines.
4323 // The columns {4..7} are used for big-endian machines.
4324 // To save the effort of adding the column offset to the table address each time
4325 // a table element is looked up, it is possible to pass the pre-calculated
4326 // column addresses.
4327 // Uses R9..R12 as work registers. They must be saved/restored by the caller, if necessary.
4328 void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc,
4329                                         Register t0,  Register t1,  Register t2,  Register t3,
4330                                         Register tc0, Register tc1, Register tc2, Register tc3) {
4331   assert_different_registers(crc, t3);
4332 
4333   // XOR crc with next four bytes of buffer.
4334   lwz(t3, bufDisp, buf);
4335   if (bufInc != 0) {
4336     addi(buf, buf, bufInc);
4337   }
4338   xorr(t3, t3, crc);
4339 
4340   // Chop t3 (crc ^ data) into 4 single-byte pieces, each shifted left 2 bits, to form the table indices.
4341   rlwinm(t0, t3,  2,         24-2, 31-2);  // ((t3 >>  0) & 0xff) << 2
4342   rlwinm(t1, t3,  32+(2- 8), 24-2, 31-2);  // ((t3 >>  8) & 0xff) << 2
4343   rlwinm(t2, t3,  32+(2-16), 24-2, 31-2);  // ((t3 >> 16) & 0xff) << 2
4344   rlwinm(t3, t3,  32+(2-24), 24-2, 31-2);  // ((t3 >> 24) & 0xff) << 2
4345 
4346   // Use the pre-calculated column addresses.
4347   // Load pre-calculated table values.
4348   lwzx(t0, tc0, t0);
4349   lwzx(t1, tc1, t1);
4350   lwzx(t2, tc2, t2);
4351   lwzx(t3, tc3, t3);
4352 
4353   // Calculate new crc from table values.
4354   xorr(t0,  t0, t1);
4355   xorr(t2,  t2, t3);
4356   xorr(crc, t0, t2);  // Now crc contains the final checksum value.
4357 }
4358 
4359 /**
4360  * @param crc   register containing existing CRC (32-bit)
4361  * @param buf   register pointing to input byte buffer (byte*)
4362  * @param len   register containing number of bytes
4363  * @param table register pointing to CRC table
4364  *
4365  * Uses R9..R12 as work registers. They must be saved/restored by the caller!
4366  */
4367 void MacroAssembler::kernel_crc32_2word(Register crc, Register buf, Register len, Register table,
4368                                         Register t0,  Register t1,  Register t2,  Register t3,
4369                                         Register tc0, Register tc1, Register tc2, Register tc3) {
4370   assert_different_registers(crc, buf, len, table);
4371 
4372   Label L_mainLoop, L_tail;
4373   Register  tmp  = t0;
4374   Register  data = t0;
4375   Register  tmp2 = t1;
4376   const int mainLoop_stepping  = 8;
4377   const int tailLoop_stepping  = 1;
4378   const int log_stepping       = exact_log2(mainLoop_stepping);
4379   const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
4380   const int complexThreshold   = 2*mainLoop_stepping;
4381 
4382   // Don't test for len <= 0 here. This pathological case should not occur anyway.
4383   // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles.
4384   // The situation itself is detected and handled correctly by the conditional branches
4385   // following the adjustments of len by -stepping and +stepping.
4386   assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
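  // Overall structure (sketch of what is emitted below): consume 0..7 leading bytes
  // byte-wise to align buf to an 8-byte boundary, then process 8 bytes per main-loop
  // iteration via two 4-byte table lookups, then handle the remaining tail bytes
  // byte-wise at L_tail.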
4387 
4388   BLOCK_COMMENT("kernel_crc32_2word {");
4389 
4390   nand(crc, crc, crc);                           // ~c
4391 
4392   // Check for short (<mainLoop_stepping) buffer.
4393   cmpdi(CCR0, len, complexThreshold);
4394   blt(CCR0, L_tail);
4395 
4396   // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
4397   // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
4398   {
4399     // Align buf addr to mainLoop_stepping boundary.
4400     neg(tmp2, buf);                           // Calculate # preLoop iterations for alignment.
4401     rldicl(tmp2, tmp2, 0, 64-log_stepping);   // Keep only the low log_stepping bits of tmp2 (AND with mainLoop_stepping-1).
4402 
4403     if (complexThreshold > mainLoop_stepping) {
4404       sub(len, len, tmp2);                       // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4405     } else {
4406       sub(tmp, len, tmp2);                       // Remaining bytes for main loop.
4407       cmpdi(CCR0, tmp, mainLoop_stepping);
4408       blt(CCR0, L_tail);                         // For less than one mainloop_stepping left, do only tail processing
4409       mr(len, tmp);                              // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4410     }
4411     update_byteLoop_crc32(crc, buf, tmp2, table, data, false, false);
4412   }
4413 
4414   srdi(tmp2, len, log_stepping);                 // #iterations for mainLoop
4415   andi(len, len, mainLoop_stepping-1);           // remaining bytes for tailLoop
4416   mtctr(tmp2);
4417 
4418 #ifdef VM_LITTLE_ENDIAN
4419   Register crc_rv = crc;
4420 #else
4421   Register crc_rv = tmp;                         // Load_reverse needs separate registers to work on.
4422                                                  // Occupies tmp, but frees up crc.
4423   load_reverse_32(crc_rv, crc);                  // Reverse byte order because we are dealing with big-endian data.
4424   tmp = crc;
4425 #endif
4426 
4427   int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
4428 
4429   align(mainLoop_alignment);                     // Octoword-aligned loop address. Shows 2% improvement.
4430   BIND(L_mainLoop);
4431     update_1word_crc32(crc_rv, buf, table, 0, 0, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
4432     update_1word_crc32(crc_rv, buf, table, 4, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
4433     bdnz(L_mainLoop);
4434 
4435 #ifndef VM_LITTLE_ENDIAN
4436   load_reverse_32(crc, crc_rv);                  // Reverse byte order because we are dealing with big-endian data.
4437   tmp = crc_rv;                                  // Tmp uses its original register again.
4438 #endif
4439 
4440   // Restore original table address for tailLoop.
4441   if (reconstructTableOffset != 0) {
4442     addi(table, table, -reconstructTableOffset);
4443   }
4444 
4445   // Process last few (<complexThreshold) bytes of buffer.
4446   BIND(L_tail);
4447   update_byteLoop_crc32(crc, buf, len, table, data, false, false);
4448 
4449   nand(crc, crc, crc);                           // ~c
4450   BLOCK_COMMENT("} kernel_crc32_2word");
4451 }
4452 
4453 /**
4454  * @param crc   register containing existing CRC (32-bit)
4455  * @param buf   register pointing to input byte buffer (byte*)
4456  * @param len   register containing number of bytes
4457  * @param table register pointing to CRC table
4458  *
4459  * Uses R9..R12 as work registers. They must be saved/restored by the caller!
4460  */
4461 void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table,
4462                                         Register t0,  Register t1,  Register t2,  Register t3,
4463                                         Register tc0, Register tc1, Register tc2, Register tc3) {
4464   assert_different_registers(crc, buf, len, table);
4465 
4466   Label L_mainLoop, L_tail;
4467   Register  tmp          = t0;
4468   Register  data         = t0;
4469   Register  tmp2         = t1;
4470   const int mainLoop_stepping  = 4;
4471   const int tailLoop_stepping  = 1;
4472   const int log_stepping       = exact_log2(mainLoop_stepping);
4473   const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
4474   const int complexThreshold   = 2*mainLoop_stepping;
4475 
4476   // Don't test for len <= 0 here. This pathological case should not occur anyway.
4477   // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles.
4478   // The situation itself is detected and handled correctly by the conditional branches
4479   // following the adjustments of len by -stepping and +stepping.
4480   assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
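  // Overall structure (sketch of what is emitted below): consume 0..3 leading bytes
  // byte-wise to align buf to a 4-byte boundary, then process 4 bytes per main-loop
  // iteration via a single 4-byte table lookup, then handle the remaining tail bytes
  // byte-wise at L_tail.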
4481 
4482   BLOCK_COMMENT("kernel_crc32_1word {");
4483 
4484   nand(crc, crc, crc);                           // ~c
4485 
4486   // Check for short (<mainLoop_stepping) buffer.
4487   cmpdi(CCR0, len, complexThreshold);
4488   blt(CCR0, L_tail);
4489 
4490   // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
4491   // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
4492   {
4493     // Align buf addr to mainLoop_stepping boundary.
4494     neg(tmp2, buf);                              // Calculate # preLoop iterations for alignment.
4495     rldicl(tmp2, tmp2, 0, 64-log_stepping);      // Keep only the low log_stepping bits of tmp2 (AND with mainLoop_stepping-1).
4496 
4497     if (complexThreshold > mainLoop_stepping) {
4498       sub(len, len, tmp2);                       // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4499     } else {
4500       sub(tmp, len, tmp2);                       // Remaining bytes for main loop.
4501       cmpdi(CCR0, tmp, mainLoop_stepping);
4502       blt(CCR0, L_tail);                         // For less than one mainloop_stepping left, do only tail processing
4503       mr(len, tmp);                              // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4504     }
4505     update_byteLoop_crc32(crc, buf, tmp2, table, data, false, false);
4506   }
4507 
4508   srdi(tmp2, len, log_stepping);                 // #iterations for mainLoop
4509   andi(len, len, mainLoop_stepping-1);           // remaining bytes for tailLoop
4510   mtctr(tmp2);
4511 
4512 #ifdef VM_LITTLE_ENDIAN
4513   Register crc_rv = crc;
4514 #else
4515   Register crc_rv = tmp;                         // Load_reverse needs separate registers to work on.
4516                                                  // Occupies tmp, but frees up crc.
4517   load_reverse_32(crc_rv, crc);                  // Reverse byte order because we are dealing with big-endian data.
4518   tmp = crc;
4519 #endif
4520 
4521   int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
4522 
4523   align(mainLoop_alignment);                     // Octoword-aligned loop address. Shows 2% improvement.
4524   BIND(L_mainLoop);
4525     update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
4526     bdnz(L_mainLoop);
4527 
4528 #ifndef VM_LITTLE_ENDIAN
4529   load_reverse_32(crc, crc_rv);                  // Reverse byte order because we are dealing with big-endian data.
4530   tmp = crc_rv;                                  // Tmp uses its original register again.
4531 #endif
4532 
4533   // Restore original table address for tailLoop.
4534   if (reconstructTableOffset != 0) {
4535     addi(table, table, -reconstructTableOffset);
4536   }
4537 
4538   // Process last few (<complexThreshold) bytes of buffer.
4539   BIND(L_tail);
4540   update_byteLoop_crc32(crc, buf, len, table, data, false, false);
4541 
4542   nand(crc, crc, crc);                           // ~c
4543   BLOCK_COMMENT("} kernel_crc32_1word");
4544 }
4545 
4546 /**
4547  * @param crc   register containing existing CRC (32-bit)
4548  * @param buf   register pointing to input byte buffer (byte*)
4549  * @param len   register containing number of bytes
4550  * @param table register pointing to CRC table
4551  *
4552  * Uses R7_ARG5, R8_ARG6 as work registers.
4553  */
4554 void MacroAssembler::kernel_crc32_1byte(Register crc, Register buf, Register len, Register table,
4555                                         Register t0,  Register t1,  Register t2,  Register t3) {
4556   assert_different_registers(crc, buf, len, table);
4557 
4558   Register  data = t0;                   // Holds the current byte to be folded into crc.
4559 
4560   BLOCK_COMMENT("kernel_crc32_1byte {");
4561 
4562   // Process all bytes in a single-byte loop.
4563   update_byteLoop_crc32(crc, buf, len, table, data, true, true);
4564 
4565   BLOCK_COMMENT("} kernel_crc32_1byte");
4566 }
4567 
4568 void MacroAssembler::kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp) {
4569   assert_different_registers(crc, buf, /* len,  not used!! */ table, tmp);
4570 
4571   BLOCK_COMMENT("kernel_crc32_singleByte:");
4572   nand(crc, crc, crc);       // ~c
4573 
4574   lbz(tmp, 0, buf);          // Byte from buffer, zero-extended.
4575   update_byte_crc32(crc, tmp, table);
4576 
4577   nand(crc, crc, crc);       // ~c
4578 }
4579 
4580 // dest_lo += src1 + src2
4581 // dest_hi += carry1 + carry2
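// In 128-bit terms (a sketch, not emitted code):
//   unsigned __int128 sum = (unsigned __int128)dest_lo + src1 + src2;
//   dest_lo  = (uint64_t)sum;
//   dest_hi += (uint64_t)(sum >> 64);   // 0, 1 or 2 carries folded into dest_hi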
4582 void MacroAssembler::add2_with_carry(Register dest_hi,
4583                                      Register dest_lo,
4584                                      Register src1, Register src2) {
4585   li(R0, 0);
4586   addc(dest_lo, dest_lo, src1);
4587   adde(dest_hi, dest_hi, R0);
4588   addc(dest_lo, dest_lo, src2);
4589   adde(dest_hi, dest_hi, R0);
4590 }
4591 
4592 // Multiply 64 bit by 64 bit first loop.
4593 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart,
4594                                            Register x_xstart,
4595                                            Register y, Register y_idx,
4596                                            Register z,
4597                                            Register carry,
4598                                            Register product_high, Register product,
4599                                            Register idx, Register kdx,
4600                                            Register tmp) {
4601   //  jlong carry, x[], y[], z[];
4602   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
4603   //    huge_128 product = y[idx] * x[xstart] + carry;
4604   //    z[kdx] = (jlong)product;
4605   //    carry  = (jlong)(product >>> 64);
4606   //  }
4607   //  z[xstart] = carry;
4608 
4609   Label L_first_loop, L_first_loop_exit;
4610   Label L_one_x, L_one_y, L_multiply;
4611 
4612   addic_(xstart, xstart, -1);
4613   blt(CCR0, L_one_x);   // Special case: length of x is 1.
4614 
4615   // Load next two integers of x.
4616   sldi(tmp, xstart, LogBytesPerInt);
4617   ldx(x_xstart, x, tmp);
4618 #ifdef VM_LITTLE_ENDIAN
4619   rldicl(x_xstart, x_xstart, 32, 0);
4620 #endif
4621 
4622   align(32, 16);
4623   bind(L_first_loop);
4624 
4625   cmpdi(CCR0, idx, 1);
4626   blt(CCR0, L_first_loop_exit);
4627   addi(idx, idx, -2);
4628   beq(CCR0, L_one_y);
4629 
4630   // Load next two integers of y.
4631   sldi(tmp, idx, LogBytesPerInt);
4632   ldx(y_idx, y, tmp);
4633 #ifdef VM_LITTLE_ENDIAN
4634   rldicl(y_idx, y_idx, 32, 0);
4635 #endif
4636 
4637 
4638   bind(L_multiply);
4639   multiply64(product_high, product, x_xstart, y_idx);
4640 
4641   li(tmp, 0);
4642   addc(product, product, carry);         // Add carry to result.
4643   adde(product_high, product_high, tmp); // Add carry of the last addition.
4644   addi(kdx, kdx, -2);
4645 
4646   // Store result.
4647 #ifdef VM_LITTLE_ENDIAN
4648   rldicl(product, product, 32, 0);
4649 #endif
4650   sldi(tmp, kdx, LogBytesPerInt);
4651   stdx(product, z, tmp);
4652   mr_if_needed(carry, product_high);
4653   b(L_first_loop);
4654 
4655 
4656   bind(L_one_y); // Load one 32 bit portion of y as (0,value).
4657 
4658   lwz(y_idx, 0, y);
4659   b(L_multiply);
4660 
4661 
4662   bind(L_one_x); // Load one 32 bit portion of x as (0,value).
4663 
4664   lwz(x_xstart, 0, x);
4665   b(L_first_loop);
4666 
4667   bind(L_first_loop_exit);
4668 }
4669 
4670 // Multiply 64 bit by 64 bit and add 128 bit.
4671 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y,
4672                                             Register z, Register yz_idx,
4673                                             Register idx, Register carry,
4674                                             Register product_high, Register product,
4675                                             Register tmp, int offset) {
4676 
4677   //  huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
4678   //  z[kdx] = (jlong)product;
4679 
4680   sldi(tmp, idx, LogBytesPerInt);
4681   if (offset) {
4682     addi(tmp, tmp, offset);
4683   }
4684   ldx(yz_idx, y, tmp);
4685 #ifdef VM_LITTLE_ENDIAN
4686   rldicl(yz_idx, yz_idx, 32, 0);
4687 #endif
4688 
4689   multiply64(product_high, product, x_xstart, yz_idx);
4690   ldx(yz_idx, z, tmp);
4691 #ifdef VM_LITTLE_ENDIAN
4692   rldicl(yz_idx, yz_idx, 32, 0);
4693 #endif
4694 
4695   add2_with_carry(product_high, product, carry, yz_idx);
4696 
4697   sldi(tmp, idx, LogBytesPerInt);
4698   if (offset) {
4699     addi(tmp, tmp, offset);
4700   }
4701 #ifdef VM_LITTLE_ENDIAN
4702   rldicl(product, product, 32, 0);
4703 #endif
4704   stdx(product, z, tmp);
4705 }
4706 
4707 // Multiply 128 bit by 128 bit. Unrolled inner loop.
4708 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart,
4709                                              Register y, Register z,
4710                                              Register yz_idx, Register idx, Register carry,
4711                                              Register product_high, Register product,
4712                                              Register carry2, Register tmp) {
4713 
4714   //  jlong carry, x[], y[], z[];
4715   //  int kdx = ystart+1;
4716   //  for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
4717   //    huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
4718   //    z[kdx+idx+1] = (jlong)product;
4719   //    jlong carry2 = (jlong)(product >>> 64);
4720   //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
4721   //    z[kdx+idx] = (jlong)product;
4722   //    carry = (jlong)(product >>> 64);
4723   //  }
4724   //  idx += 2;
4725   //  if (idx > 0) {
4726   //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
4727   //    z[kdx+idx] = (jlong)product;
4728   //    carry = (jlong)(product >>> 64);
4729   //  }
4730 
4731   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
4732   const Register jdx = R0;
4733 
4734   // Scale the index.
4735   srdi_(jdx, idx, 2);
4736   beq(CCR0, L_third_loop_exit);
4737   mtctr(jdx);
4738 
4739   align(32, 16);
4740   bind(L_third_loop);
4741 
4742   addi(idx, idx, -4);
4743 
4744   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8);
4745   mr_if_needed(carry2, product_high);
4746 
4747   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0);
4748   mr_if_needed(carry, product_high);
4749   bdnz(L_third_loop);
4750 
4751   bind(L_third_loop_exit);  // Handle any left-over operand parts.
4752 
4753   andi_(idx, idx, 0x3);
4754   beq(CCR0, L_post_third_loop_done);
4755 
4756   Label L_check_1;
4757 
4758   addic_(idx, idx, -2);
4759   blt(CCR0, L_check_1);
4760 
4761   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0);
4762   mr_if_needed(carry, product_high);
4763 
4764   bind(L_check_1);
4765 
4766   addi(idx, idx, 0x2);
4767   andi_(idx, idx, 0x1);
4768   addic_(idx, idx, -1);
4769   blt(CCR0, L_post_third_loop_done);
4770 
4771   sldi(tmp, idx, LogBytesPerInt);
4772   lwzx(yz_idx, y, tmp);
4773   multiply64(product_high, product, x_xstart, yz_idx);
4774   lwzx(yz_idx, z, tmp);
4775 
4776   add2_with_carry(product_high, product, yz_idx, carry);
4777 
4778   sldi(tmp, idx, LogBytesPerInt);
4779   stwx(product, z, tmp);
4780   srdi(product, product, 32);
4781 
4782   sldi(product_high, product_high, 32);
4783   orr(product, product, product_high);
4784   mr_if_needed(carry, product);
4785 
4786   bind(L_post_third_loop_done);
4787 }   // multiply_128_x_128_loop
4788 
4789 void MacroAssembler::multiply_to_len(Register x, Register xlen,
4790                                      Register y, Register ylen,
4791                                      Register z, Register zlen,
4792                                      Register tmp1, Register tmp2,
4793                                      Register tmp3, Register tmp4,
4794                                      Register tmp5, Register tmp6,
4795                                      Register tmp7, Register tmp8,
4796                                      Register tmp9, Register tmp10,
4797                                      Register tmp11, Register tmp12,
4798                                      Register tmp13) {
4799 
4800   ShortBranchVerifier sbv(this);
4801 
4802   assert_different_registers(x, xlen, y, ylen, z, zlen,
4803                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
4804   assert_different_registers(x, xlen, y, ylen, z, zlen,
4805                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp7);
4806   assert_different_registers(x, xlen, y, ylen, z, zlen,
4807                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp8);
4808 
4809   const Register idx = tmp1;
4810   const Register kdx = tmp2;
4811   const Register xstart = tmp3;
4812 
4813   const Register y_idx = tmp4;
4814   const Register carry = tmp5;
4815   const Register product = tmp6;
4816   const Register product_high = tmp7;
4817   const Register x_xstart = tmp8;
4818   const Register tmp = tmp9;
4819 
4820   // First Loop.
4821   //
4822   //  final static long LONG_MASK = 0xffffffffL;
4823   //  int xstart = xlen - 1;
4824   //  int ystart = ylen - 1;
4825   //  long carry = 0;
4826   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
4827   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
4828   //    z[kdx] = (int)product;
4829   //    carry = product >>> 32;
4830   //  }
4831   //  z[xstart] = (int)carry;
4832 
4833   mr_if_needed(idx, ylen);        // idx = ylen
4834   mr_if_needed(kdx, zlen);        // kdx = xlen + ylen
4835   li(carry, 0);                   // carry = 0
4836 
4837   Label L_done;
4838 
4839   addic_(xstart, xlen, -1);
4840   blt(CCR0, L_done);
4841 
4842   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z,
4843                         carry, product_high, product, idx, kdx, tmp);
4844 
4845   Label L_second_loop;
4846 
4847   cmpdi(CCR0, kdx, 0);
4848   beq(CCR0, L_second_loop);
4849 
4850   Label L_carry;
4851 
4852   addic_(kdx, kdx, -1);
4853   beq(CCR0, L_carry);
4854 
4855   // Store lower 32 bits of carry.
4856   sldi(tmp, kdx, LogBytesPerInt);
4857   stwx(carry, z, tmp);
4858   srdi(carry, carry, 32);
4859   addi(kdx, kdx, -1);
4860 
4861 
4862   bind(L_carry);
4863 
4864   // Store upper 32 bits of carry.
4865   sldi(tmp, kdx, LogBytesPerInt);
4866   stwx(carry, z, tmp);
4867 
4868   // Second and third (nested) loops.
4869   //
4870   //  for (int i = xstart-1; i >= 0; i--) { // Second loop
4871   //    carry = 0;
4872   //    for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
4873   //      long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
4874   //                     (z[k] & LONG_MASK) + carry;
4875   //      z[k] = (int)product;
4876   //      carry = product >>> 32;
4877   //    }
4878   //    z[i] = (int)carry;
4879   //  }
4880   //
4881   //  i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx
4882 
4883   bind(L_second_loop);
4884 
4885   li(carry, 0);                   // carry = 0;
4886 
4887   addic_(xstart, xstart, -1);     // i = xstart-1;
4888   blt(CCR0, L_done);
4889 
4890   Register zsave = tmp10;
4891 
4892   mr(zsave, z);
4893 
4894 
4895   Label L_last_x;
4896 
4897   sldi(tmp, xstart, LogBytesPerInt);
4898   add(z, z, tmp);                 // z = z + k - j
4899   addi(z, z, 4);
4900   addic_(xstart, xstart, -1);     // i = xstart-1;
4901   blt(CCR0, L_last_x);
4902 
4903   sldi(tmp, xstart, LogBytesPerInt);
4904   ldx(x_xstart, x, tmp);
4905 #ifdef VM_LITTLE_ENDIAN
4906   rldicl(x_xstart, x_xstart, 32, 0);
4907 #endif
4908 
4909 
4910   Label L_third_loop_prologue;
4911 
4912   bind(L_third_loop_prologue);
4913 
4914   Register xsave = tmp11;
4915   Register xlensave = tmp12;
4916   Register ylensave = tmp13;
4917 
4918   mr(xsave, x);
4919   mr(xlensave, xstart);
4920   mr(ylensave, ylen);
4921 
4922 
4923   multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen,
4924                           carry, product_high, product, x, tmp);
4925 
4926   mr(z, zsave);
4927   mr(x, xsave);
4928   mr(xlen, xlensave);   // This is the decrement of the loop counter!
4929   mr(ylen, ylensave);
4930 
4931   addi(tmp3, xlen, 1);
4932   sldi(tmp, tmp3, LogBytesPerInt);
4933   stwx(carry, z, tmp);
4934   addic_(tmp3, tmp3, -1);
4935   blt(CCR0, L_done);
4936 
4937   srdi(carry, carry, 32);
4938   sldi(tmp, tmp3, LogBytesPerInt);
4939   stwx(carry, z, tmp);
4940   b(L_second_loop);
4941 
4942   // Next infrequent code is moved outside loops.
4943   bind(L_last_x);
4944 
4945   lwz(x_xstart, 0, x);
4946   b(L_third_loop_prologue);
4947 
4948   bind(L_done);
4949 }   // multiply_to_len
4950 
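// Debug-only assertion helper. The condition to check must already be in CCR0.
// Typical use (a sketch; register, message and id are made up for illustration):
//   cmpdi(CCR0, some_reg, 0);
//   asm_assert(true /*check_equal*/, "some_reg must be zero", 0x123);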
4951 void MacroAssembler::asm_assert(bool check_equal, const char *msg, int id) {
4952 #ifdef ASSERT
4953   Label ok;
4954   if (check_equal) {
4955     beq(CCR0, ok);
4956   } else {
4957     bne(CCR0, ok);
4958   }
4959   stop(msg, id);
4960   bind(ok);
4961 #endif
4962 }
4963 
4964 void MacroAssembler::asm_assert_mems_zero(bool check_equal, int size, int mem_offset,
4965                                           Register mem_base, const char* msg, int id) {
4966 #ifdef ASSERT
4967   switch (size) {
4968     case 4:
4969       lwz(R0, mem_offset, mem_base);
4970       cmpwi(CCR0, R0, 0);
4971       break;
4972     case 8:
4973       ld(R0, mem_offset, mem_base);
4974       cmpdi(CCR0, R0, 0);
4975       break;
4976     default:
4977       ShouldNotReachHere();
4978   }
4979   asm_assert(check_equal, msg, id);
4980 #endif // ASSERT
4981 }
4982 
4983 void MacroAssembler::verify_thread() {
4984   if (VerifyThread) {
4985     unimplemented("'VerifyThread' currently not implemented on PPC");
4986   }
4987 }
4988 
4989 // READ: oop. KILL: R0. Volatile floating-point registers may be clobbered as well.
4990 void MacroAssembler::verify_oop(Register oop, const char* msg) {
4991   if (!VerifyOops) {
4992     return;
4993   }
4994 
4995   address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
4996   const Register tmp = R11; // Will be preserved.
4997   const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
4998   save_volatile_gprs(R1_SP, -nbytes_save); // except R0
4999 
5000   mr_if_needed(R4_ARG2, oop);
5001   save_LR_CR(tmp); // save in old frame
5002   push_frame_reg_args(nbytes_save, tmp);
5003   // load FunctionDescriptor** / entry_address *
5004   load_const_optimized(tmp, fd, R0);
5005   // load FunctionDescriptor* / entry_address
5006   ld(tmp, 0, tmp);
5007   load_const_optimized(R3_ARG1, (address)msg, R0);
5008   // Call destination for its side effect.
5009   call_c(tmp);
5010 
5011   pop_frame();
5012   restore_LR_CR(tmp);
5013   restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
5014 }
5015 
5016 void MacroAssembler::verify_oop_addr(RegisterOrConstant offs, Register base, const char* msg) {
5017   if (!VerifyOops) {
5018     return;
5019   }
5020 
5021   address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
5022   const Register tmp = R11; // Will be preserved.
5023   const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
5024   save_volatile_gprs(R1_SP, -nbytes_save); // except R0
5025 
5026   ld(R4_ARG2, offs, base);
5027   save_LR_CR(tmp); // save in old frame
5028   push_frame_reg_args(nbytes_save, tmp);
5029   // load FunctionDescriptor** / entry_address *
5030   load_const_optimized(tmp, fd, R0);
5031   // load FunctionDescriptor* / entry_address
5032   ld(tmp, 0, tmp);
5033   load_const_optimized(R3_ARG1, (address)msg, R0);
5034   // Call destination for its side effect.
5035   call_c(tmp);
5036 
5037   pop_frame();
5038   restore_LR_CR(tmp);
5039   restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
5040 }
5041 
5042 const char* stop_types[] = {
5043   "stop",
5044   "untested",
5045   "unimplemented",
5046   "shouldnotreachhere"
5047 };
5048 
5049 static void stop_on_request(int tp, const char* msg) {
5050   tty->print("PPC assembly code requires stop: (%s) %s\n", stop_types[tp%/*stop_end*/4], msg);
5051   guarantee(false, "PPC assembly code requires stop: %s", msg);
5052 }
5053 
5054 // Call a C-function that prints output.
5055 void MacroAssembler::stop(int type, const char* msg, int id) {
5056 #ifndef PRODUCT
5057   block_comment(err_msg("stop: %s %s {", stop_types[type%stop_end], msg));
5058 #else
5059   block_comment("stop {");
5060 #endif
5061 
5062   // setup arguments
5063   load_const_optimized(R3_ARG1, type);
5064   load_const_optimized(R4_ARG2, (void *)msg, /*tmp=*/R0);
5065   call_VM_leaf(CAST_FROM_FN_PTR(address, stop_on_request), R3_ARG1, R4_ARG2);
5066   illtrap();
5067   emit_int32(id);
5068   block_comment("} stop;");
5069 }
5070 
5071 #ifndef PRODUCT
5072 // Write pattern 0x0101010101010101 in memory region [low-before, high+after].
5073 // Val, addr are temp registers.
5074 // If low == addr, addr is killed.
5075 // High is preserved.
5076 void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) {
5077   if (!ZapMemory) return;
5078 
5079   assert_different_registers(low, val);
5080 
5081   BLOCK_COMMENT("zap memory region {");
5082   load_const_optimized(val, 0x0101010101010101);
5083   int size = before + after;
5084   if (low == high && size < 5 && size > 0) {
5085     int offset = -before*BytesPerWord;
5086     for (int i = 0; i < size; ++i) {
5087       std(val, offset, low);
5088       offset += (1*BytesPerWord);
5089     }
5090   } else {
5091     addi(addr, low, -before*BytesPerWord);
5092     assert_different_registers(high, val);
5093     if (after) addi(high, high, after * BytesPerWord);
5094     Label loop;
5095     bind(loop);
5096     std(val, 0, addr);
5097     addi(addr, addr, 8);
5098     cmpd(CCR6, addr, high);
5099     ble(CCR6, loop);
5100     if (after) addi(high, high, -after * BytesPerWord);  // Correct back to old value.
5101   }
5102   BLOCK_COMMENT("} zap memory region");
5103 }
5104 
5105 #endif // !PRODUCT
5106 
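// RAII helper: the constructor emits a load and compare of the given bool flag and
// a branch that skips everything emitted inside the scope if the flag is zero at
// runtime; the destructor binds the skip label. Typical use (a sketch; the flag
// name is hypothetical):
//   {
//     SkipIfEqualZero skip_if_off(masm, temp_reg, &SomeBoolFlag);
//     // ... code emitted here is executed only if *flag_addr != 0 ...
//   }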
5107 SkipIfEqualZero::SkipIfEqualZero(MacroAssembler* masm, Register temp, const bool* flag_addr) : _masm(masm), _label() {
5108   int simm16_offset = masm->load_const_optimized(temp, (address)flag_addr, R0, true);
5109   assert(sizeof(bool) == 1, "PowerPC ABI");
5110   masm->lbz(temp, simm16_offset, temp);
5111   masm->cmpwi(CCR0, temp, 0);
5112   masm->beq(CCR0, _label);
5113 }
5114 
5115 SkipIfEqualZero::~SkipIfEqualZero() {
5116   _masm->bind(_label);
5117 }