1 /*
   2  * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright 2012, 2015 SAP AG. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.inline.hpp"
  28 #include "compiler/disassembler.hpp"
  29 #include "gc/shared/cardTableModRefBS.hpp"
  30 #include "gc/shared/collectedHeap.inline.hpp"
  31 #include "interpreter/interpreter.hpp"
  32 #include "memory/resourceArea.hpp"
  33 #include "prims/methodHandles.hpp"
  34 #include "runtime/biasedLocking.hpp"
  35 #include "runtime/icache.hpp"
  36 #include "runtime/interfaceSupport.hpp"
  37 #include "runtime/objectMonitor.hpp"
  38 #include "runtime/os.hpp"
  39 #include "runtime/sharedRuntime.hpp"
  40 #include "runtime/stubRoutines.hpp"
  41 #include "utilities/macros.hpp"
  42 #if INCLUDE_ALL_GCS
  43 #include "gc/g1/g1CollectedHeap.inline.hpp"
  44 #include "gc/g1/g1SATBCardTableModRefBS.hpp"
  45 #include "gc/g1/heapRegion.hpp"
  46 #endif // INCLUDE_ALL_GCS
  47 
  48 #ifdef PRODUCT
  49 #define BLOCK_COMMENT(str) // nothing
  50 #else
  51 #define BLOCK_COMMENT(str) block_comment(str)
  52 #endif
  53 
  54 #ifdef ASSERT
  55 // On RISC, there's no benefit to verifying instruction boundaries.
  56 bool AbstractAssembler::pd_check_instruction_mark() { return false; }
  57 #endif
  58 
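     // Load a doubleword from a 31 bit, non-negative offset. Offsets that fit
     // into 16 bits use a single ld, optionally followed by a filler nop so that
     // both forms occupy two instructions; larger offsets are split into an
     // addis/ld pair with d as the intermediate base. No register checks are
     // performed (cf. ld_largeoffset below).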
  59 void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) {
  60   assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range");
  61   if (Assembler::is_simm(si31, 16)) {
  62     ld(d, si31, a);
  63     if (emit_filler_nop) nop();
  64   } else {
  65     const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31);
  66     const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31);
  67     addis(d, a, hi);
  68     ld(d, lo, d);
  69   }
  70 }
  71 
  72 void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) {
  73   assert_different_registers(d, a);
  74   ld_largeoffset_unchecked(d, si31, a, emit_filler_nop);
  75 }
  76 
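     // Load a value of size_in_bytes (1, 2, 4 or 8) from (base + offs) into dst,
     // sign- or zero-extending as requested.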
  77 void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base,
  78                                       size_t size_in_bytes, bool is_signed) {
  79   switch (size_in_bytes) {
  80   case  8:              ld(dst, offs, base);                         break;
  81   case  4:  is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break;
  82   case  2:  is_signed ? lha(dst, offs, base) : lhz(dst, offs, base); break;
  83   case  1:  lbz(dst, offs, base); if (is_signed) extsb(dst, dst);    break; // lba doesn't exist :(
  84   default:  ShouldNotReachHere();
  85   }
  86 }
  87 
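     // Store the low-order size_in_bytes (1, 2, 4 or 8) of register dst to (base + offs).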
  88 void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base,
  89                                        size_t size_in_bytes) {
  90   switch (size_in_bytes) {
  91   case  8:  std(dst, offs, base); break;
  92   case  4:  stw(dst, offs, base); break;
  93   case  2:  sth(dst, offs, base); break;
  94   case  1:  stb(dst, offs, base); break;
  95   default:  ShouldNotReachHere();
  96   }
  97 }
  98 
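     // Pad with nops until offset() % modulus == rem; emit nothing if more than
     // max bytes of padding would be required.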
  99 void MacroAssembler::align(int modulus, int max, int rem) {
 100   int padding = (rem + modulus - (offset() % modulus)) % modulus;
 101   if (padding > max) return;
 102   for (int c = (padding >> 2); c > 0; --c) { nop(); }
 103 }
 104 
 105 // Issue instructions that calculate the given address from the global TOC.
 106 void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16,
 107                                                        bool add_relocation, bool emit_dummy_addr) {
 108   int offset = -1;
 109   if (emit_dummy_addr) {
 110     offset = -128; // dummy address
 111   } else if (addr != (address)(intptr_t)-1) {
 112     offset = MacroAssembler::offset_to_global_toc(addr);
 113   }
 114 
 115   if (hi16) {
 116     addis(dst, R29, MacroAssembler::largeoffset_si16_si16_hi(offset));
 117   }
 118   if (lo16) {
 119     if (add_relocation) {
 120       // Relocate at the addi to avoid confusion with a load from the method's TOC.
 121       relocate(internal_word_Relocation::spec(addr));
 122     }
 123     addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset));
 124   }
 125 }
 126 
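     // Patch the addis/addi sequence emitted by calculate_address_from_global_toc
     // so that it materializes 'addr'. 'a' points to the addi; the matching addis
     // is searched backwards, but not past 'bound'. Returns the distance from the
     // patched addis to 'addr'.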
 127 int MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) {
 128   const int offset = MacroAssembler::offset_to_global_toc(addr);
 129 
 130   const address inst2_addr = a;
 131   const int inst2 = *(int *)inst2_addr;
 132 
 133   // The relocation points to the second instruction, the addi,
 134   // and the addi reads and writes the same register dst.
 135   const int dst = inv_rt_field(inst2);
 136   assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");
 137 
 138   // Now, find the preceding addis which writes to dst.
 139   int inst1 = 0;
 140   address inst1_addr = inst2_addr - BytesPerInstWord;
 141   while (inst1_addr >= bound) {
 142     inst1 = *(int *) inst1_addr;
 143     if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
 144       // Stop, found the addis which writes dst.
 145       break;
 146     }
 147     inst1_addr -= BytesPerInstWord;
 148   }
 149 
 150   assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
 151   set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset));
 152   set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset));
 153   return (int)((intptr_t)addr - (intptr_t)inst1_addr);
 154 }
 155 
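     // Inverse operation: recover the address encoded by such an addis/addi
     // sequence. Returns (address)-1 if the sequence still contains the unset (-1)
     // offset.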
 156 address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) {
 157   const address inst2_addr = a;
 158   const int inst2 = *(int *)inst2_addr;
 159 
 160   // The relocation points to the second instruction, the addi,
 161   // and the addi reads and writes the same register dst.
 162   const int dst = inv_rt_field(inst2);
 163   assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");
 164 
 165   // Now, find the preceding addis which writes to dst.
 166   int inst1 = 0;
 167   address inst1_addr = inst2_addr - BytesPerInstWord;
 168   while (inst1_addr >= bound) {
 169     inst1 = *(int *) inst1_addr;
 170     if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
 171       // stop, found the addis which writes dst
 172       break;
 173     }
 174     inst1_addr -= BytesPerInstWord;
 175   }
 176 
 177   assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
 178 
 179   int offset = (get_imm(inst1_addr, 0) << 16) + get_imm(inst2_addr, 0);
 180   // -1 is a special case
 181   if (offset == -1) {
 182     return (address)(intptr_t)-1;
 183   } else {
 184     return global_toc() + offset;
 185   }
 186 }
 187 
 188 #ifdef _LP64
 189 // Patch compressed oops or klass constants.
 190 // Assembler sequence is
 191 // 1) compressed oops:
 192 //    lis  rx = const.hi
 193 //    ori rx = rx | const.lo
 194 // 2) compressed klass:
 195 //    lis  rx = const.hi
 196 //    clrldi rx = rx & 0xFFFFffff // clearMS32b, optional
 197 //    ori rx = rx | const.lo
 198 // The clrldi, if present, is skipped over when patching.
 199 int MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) {
 200   assert(UseCompressedOops, "Should only patch compressed oops");
 201 
 202   const address inst2_addr = a;
 203   const int inst2 = *(int *)inst2_addr;
 204 
 205   // The relocation points to the second instruction, the ori,
 206   // and the ori reads and writes the same register dst.
 207   const int dst = inv_rta_field(inst2);
 208   assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
 209   // Now, find the preceding addis which writes to dst.
 210   int inst1 = 0;
 211   address inst1_addr = inst2_addr - BytesPerInstWord;
 212   bool inst1_found = false;
 213   while (inst1_addr >= bound) {
 214     inst1 = *(int *)inst1_addr;
 215     if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
 216     inst1_addr -= BytesPerInstWord;
 217   }
 218   assert(inst1_found, "inst is not lis");
 219 
 220   int xc = (data >> 16) & 0xffff;
 221   int xd = (data >>  0) & 0xffff;
 222 
 223   set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo
 224   set_imm((int *)inst2_addr,        (xd)); // unsigned int
 225   return (int)((intptr_t)inst2_addr - (intptr_t)inst1_addr);
 226 }
 227 
 228 // Get compressed oop or klass constant.
 229 narrowOop MacroAssembler::get_narrow_oop(address a, address bound) {
 230   assert(UseCompressedOops, "Should only patch compressed oops");
 231 
 232   const address inst2_addr = a;
 233   const int inst2 = *(int *)inst2_addr;
 234 
 235   // The relocation points to the second instruction, the ori,
 236   // and the ori reads and writes the same register dst.
 237   const int dst = inv_rta_field(inst2);
 238   assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
 239   // Now, find the preceding lis which writes to dst.
 240   int inst1 = 0;
 241   address inst1_addr = inst2_addr - BytesPerInstWord;
 242   bool inst1_found = false;
 243 
 244   while (inst1_addr >= bound) {
 245     inst1 = *(int *) inst1_addr;
 246     if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break;}
 247     inst1_addr -= BytesPerInstWord;
 248   }
 249   assert(inst1_found, "inst is not lis");
 250 
 251   uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff));
 252   uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16);
 253 
 254   return (int) (xl | xh);
 255 }
 256 #endif // _LP64
 257 
 258 void MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a, Register toc) {
 259   int toc_offset = 0;
 260   // Use RelocationHolder::none for the constant pool entry, otherwise
 261   // we will end up with a failing NativeCall::verify(x) where x is
 262   // the address of the constant pool entry.
 263   // FIXME: We should insert relocation information for oops at the constant
 264   // pool entries instead of inserting it at the loads; patching of a constant
 265   // pool entry should be less expensive.
 266   address oop_address = address_constant((address)a.value(), RelocationHolder::none);
 267   // Relocate at the pc of the load.
 268   relocate(a.rspec());
 269   toc_offset = (int)(oop_address - code()->consts()->start());
 270   ld_largeoffset_unchecked(dst, toc_offset, toc, true);
 271 }
 272 
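     // Returns true if 'a' points into a load_const_from_method_toc sequence,
     // i.e. at the ld itself or at the addis of an addis/ld pair.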
 273 bool MacroAssembler::is_load_const_from_method_toc_at(address a) {
 274   const address inst1_addr = a;
 275   const int inst1 = *(int *)inst1_addr;
 276 
 277   // The relocation points to the ld or the addis.
 278   return (is_ld(inst1)) ||
 279          (is_addis(inst1) && inv_ra_field(inst1) != 0);
 280 }
 281 
 282 int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) {
 283   assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc");
 284 
 285   const address inst1_addr = a;
 286   const int inst1 = *(int *)inst1_addr;
 287 
 288   if (is_ld(inst1)) {
 289     return inv_d1_field(inst1);
 290   } else if (is_addis(inst1)) {
 291     const int dst = inv_rt_field(inst1);
 292 
 293     // Now, find the succeeding ld which reads and writes to dst.
 294     address inst2_addr = inst1_addr + BytesPerInstWord;
 295     int inst2 = 0;
 296     while (true) {
 297       inst2 = *(int *) inst2_addr;
 298       if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) {
 299         // Stop, found the ld which reads and writes dst.
 300         break;
 301       }
 302       inst2_addr += BytesPerInstWord;
 303     }
 304     return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2);
 305   }
 306   ShouldNotReachHere();
 307   return 0;
 308 }
 309 
 310 // Get the constant from a `load_const' sequence.
 311 long MacroAssembler::get_const(address a) {
 312   assert(is_load_const_at(a), "not a load of a constant");
 313   const int *p = (const int*) a;
 314   unsigned long x = (((unsigned long) (get_imm(a,0) & 0xffff)) << 48);
 315   if (is_ori(*(p+1))) {
 316     x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 32);
 317     x |= (((unsigned long) (get_imm(a,3) & 0xffff)) << 16);
 318     x |= (((unsigned long) (get_imm(a,4) & 0xffff)));
 319   } else if (is_lis(*(p+1))) {
 320     x |= (((unsigned long) (get_imm(a,2) & 0xffff)) << 32);
 321     x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 16);
 322     x |= (((unsigned long) (get_imm(a,3) & 0xffff)));
 323   } else {
 324     ShouldNotReachHere();
 325     return (long) 0;
 326   }
 327   return (long) x;
 328 }
 329 
 330 // Patch the 64 bit constant of a `load_const' sequence. This is a
 331 // low-level procedure: it neither flushes the instruction cache nor
 332 // is it MT-safe.
 333 void MacroAssembler::patch_const(address a, long x) {
 334   assert(is_load_const_at(a), "not a load of a constant");
 335   int *p = (int*) a;
 336   if (is_ori(*(p+1))) {
 337     set_imm(0 + p, (x >> 48) & 0xffff);
 338     set_imm(1 + p, (x >> 32) & 0xffff);
 339     set_imm(3 + p, (x >> 16) & 0xffff);
 340     set_imm(4 + p, x & 0xffff);
 341   } else if (is_lis(*(p+1))) {
 342     set_imm(0 + p, (x >> 48) & 0xffff);
 343     set_imm(2 + p, (x >> 32) & 0xffff);
 344     set_imm(1 + p, (x >> 16) & 0xffff);
 345     set_imm(3 + p, x & 0xffff);
 346   } else {
 347     ShouldNotReachHere();
 348   }
 349 }
 350 
 351 AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) {
 352   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
 353   int index = oop_recorder()->allocate_metadata_index(obj);
 354   RelocationHolder rspec = metadata_Relocation::spec(index);
 355   return AddressLiteral((address)obj, rspec);
 356 }
 357 
 358 AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) {
 359   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
 360   int index = oop_recorder()->find_index(obj);
 361   RelocationHolder rspec = metadata_Relocation::spec(index);
 362   return AddressLiteral((address)obj, rspec);
 363 }
 364 
 365 AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) {
 366   assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
 367   int oop_index = oop_recorder()->allocate_oop_index(obj);
 368   return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
 369 }
 370 
 371 AddressLiteral MacroAssembler::constant_oop_address(jobject obj) {
 372   assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
 373   int oop_index = oop_recorder()->find_index(obj);
 374   return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
 375 }
 376 
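     // If the delayed value is already known, return it (plus offset) as a
     // constant; otherwise emit code that loads it indirectly into tmp at runtime.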
 377 RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
 378                                                       Register tmp, int offset) {
 379   intptr_t value = *delayed_value_addr;
 380   if (value != 0) {
 381     return RegisterOrConstant(value + offset);
 382   }
 383 
 384   // Load indirectly to solve generation ordering problem.
 385   // static address, no relocation
 386   int simm16_offset = load_const_optimized(tmp, delayed_value_addr, noreg, true);
 387   ld(tmp, simm16_offset, tmp); // must be aligned ((xa & 3) == 0)
 388 
 389   if (offset != 0) {
 390     addi(tmp, tmp, offset);
 391   }
 392 
 393   return RegisterOrConstant(tmp);
 394 }
 395 
 396 #ifndef PRODUCT
 397 void MacroAssembler::pd_print_patched_instruction(address branch) {
 398   Unimplemented(); // TODO: PPC port
 399 }
 400 #endif // ndef PRODUCT
 401 
 402 // Conditional far branch for destinations encodable in 24+2 bits.
 403 void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) {
 404 
 405   // If requested by flag optimize, relocate the bc_far as a
 406   // runtime_call and prepare for optimizing it when the code gets
 407   // relocated.
 408   if (optimize == bc_far_optimize_on_relocate) {
 409     relocate(relocInfo::runtime_call_type);
 410   }
 411 
 412   // variant 2:
 413   //
 414   //    b!cxx SKIP
 415   //    bxx   DEST
 416   //  SKIP:
 417   //
 418 
 419   const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
 420                                                 opposite_bcond(inv_boint_bcond(boint)));
 421 
 422   // We emit two branches.
 423   // First, a conditional branch which jumps around the far branch.
 424   const address not_taken_pc = pc() + 2 * BytesPerInstWord;
 425   const address bc_pc        = pc();
 426   bc(opposite_boint, biint, not_taken_pc);
 427 
 428   const int bc_instr = *(int*)bc_pc;
 429   assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition");
 430   assert(opposite_boint == inv_bo_field(bc_instr), "postcondition");
 431   assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))),
 432                                      opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))),
 433          "postcondition");
 434   assert(biint == inv_bi_field(bc_instr), "postcondition");
 435 
 436   // Second, an unconditional far branch which jumps to dest.
 437   // Note: target(dest) remembers the current pc (see CodeSection::target)
 438   //       and returns the current pc if the label is not bound yet; when
 439   //       the label gets bound, the unconditional far branch will be patched.
 440   const address target_pc = target(dest);
 441   const address b_pc  = pc();
 442   b(target_pc);
 443 
 444   assert(not_taken_pc == pc(),                     "postcondition");
 445   assert(dest.is_bound() || target_pc == b_pc, "postcondition");
 446 }
 447 
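     // Returns true if 'instruction_addr' points to any of the three bc_far variants.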
 448 bool MacroAssembler::is_bc_far_at(address instruction_addr) {
 449   return is_bc_far_variant1_at(instruction_addr) ||
 450          is_bc_far_variant2_at(instruction_addr) ||
 451          is_bc_far_variant3_at(instruction_addr);
 452 }
 453 
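     // Extract the branch destination from a bc_far sequence, independent of its variant.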
 454 address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) {
 455   if (is_bc_far_variant1_at(instruction_addr)) {
 456     const address instruction_1_addr = instruction_addr;
 457     const int instruction_1 = *(int*)instruction_1_addr;
 458     return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr);
 459   } else if (is_bc_far_variant2_at(instruction_addr)) {
 460     const address instruction_2_addr = instruction_addr + 4;
 461     return bxx_destination(instruction_2_addr);
 462   } else if (is_bc_far_variant3_at(instruction_addr)) {
 463     return instruction_addr + 8;
 464   }
 465   // variant 4 ???
 466   ShouldNotReachHere();
 467   return NULL;
 468 }
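
     // Patch the destination of a bc_far sequence in place, possibly rewriting it
     // to a cheaper variant, and flush the instruction cache for the patched range.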
 469 void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) {
 470 
 471   if (is_bc_far_variant3_at(instruction_addr)) {
 472     // variant 3, far cond branch to the next instruction, already patched to nops:
 473     //
 474     //    nop
 475     //    endgroup
 476     //  SKIP/DEST:
 477     //
 478     return;
 479   }
 480 
 481   // first, extract boint and biint from the current branch
 482   int boint = 0;
 483   int biint = 0;
 484 
 485   ResourceMark rm;
 486   const int code_size = 2 * BytesPerInstWord;
 487   CodeBuffer buf(instruction_addr, code_size);
 488   MacroAssembler masm(&buf);
 489   if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) {
 490     // Far branch to next instruction: Optimize it by patching nops (produce variant 3).
 491     masm.nop();
 492     masm.endgroup();
 493   } else {
 494     if (is_bc_far_variant1_at(instruction_addr)) {
 495       // variant 1, the 1st instruction contains the destination address:
 496       //
 497       //    bcxx  DEST
 498       //    endgroup
 499       //
 500       const int instruction_1 = *(int*)(instruction_addr);
 501       boint = inv_bo_field(instruction_1);
 502       biint = inv_bi_field(instruction_1);
 503     } else if (is_bc_far_variant2_at(instruction_addr)) {
 504       // variant 2, the 2nd instruction contains the destination address:
 505       //
 506       //    b!cxx SKIP
 507       //    bxx   DEST
 508       //  SKIP:
 509       //
 510       const int instruction_1 = *(int*)(instruction_addr);
 511       boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))),
 512           opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1))));
 513       biint = inv_bi_field(instruction_1);
 514     } else {
 515       // variant 4???
 516       ShouldNotReachHere();
 517     }
 518 
 519     // second, set the new branch destination and optimize the code
 520     if (dest != instruction_addr + 4 && // the bc_far is still unbound!
 521         masm.is_within_range_of_bcxx(dest, instruction_addr)) {
 522       // variant 1:
 523       //
 524       //    bcxx  DEST
 525       //    endgroup
 526       //
 527       masm.bc(boint, biint, dest);
 528       masm.endgroup();
 529     } else {
 530       // variant 2:
 531       //
 532       //    b!cxx SKIP
 533       //    bxx   DEST
 534       //  SKIP:
 535       //
 536       const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
 537                                                     opposite_bcond(inv_boint_bcond(boint)));
 538       const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord;
 539       masm.bc(opposite_boint, biint, not_taken_pc);
 540       masm.b(dest);
 541     }
 542   }
 543   ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
 544 }
 545 
 546 // Emit a patchable (but NOT MT-safe) 64 bit absolute call/jump.
 547 void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) {
 548   // get current pc
 549   uint64_t start_pc = (uint64_t) pc();
 550 
 551   const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last
 552   const address pc_of_b  = (address) (start_pc + (0*BytesPerInstWord)); // b is first
 553 
 554   // relocate here
 555   if (rt != relocInfo::none) {
 556     relocate(rt);
 557   }
 558 
 559   if ( ReoptimizeCallSequences &&
 560        (( link && is_within_range_of_b(dest, pc_of_bl)) ||
 561         (!link && is_within_range_of_b(dest, pc_of_b)))) {
 562     // variant 2:
 563     // Emit an optimized, pc-relative call/jump.
 564 
 565     if (link) {
 566       // some padding
 567       nop();
 568       nop();
 569       nop();
 570       nop();
 571       nop();
 572       nop();
 573 
 574       // do the call
 575       assert(pc() == pc_of_bl, "just checking");
 576       bl(dest, relocInfo::none);
 577     } else {
 578       // do the jump
 579       assert(pc() == pc_of_b, "just checking");
 580       b(dest, relocInfo::none);
 581 
 582       // some padding
 583       nop();
 584       nop();
 585       nop();
 586       nop();
 587       nop();
 588       nop();
 589     }
 590 
 591     // Assert that we can identify the emitted call/jump.
 592     assert(is_bxx64_patchable_variant2_at((address)start_pc, link),
 593            "can't identify emitted call");
 594   } else {
 595     // variant 1:
 596 #if defined(ABI_ELFv2)
 597     nop();
 598     calculate_address_from_global_toc(R12, dest, true, true, false);
 599     mtctr(R12);
 600     nop();
 601     nop();
 602 #else
 603     mr(R0, R11);  // spill R11 -> R0.
 604 
 605     // Load the destination address into CTR,
 606     // calculate destination relative to global toc.
 607     calculate_address_from_global_toc(R11, dest, true, true, false);
 608 
 609     mtctr(R11);
 610     mr(R11, R0);  // restore R11 <- R0.
 611     nop();
 612 #endif
 613 
 614     // do the call/jump
 615     if (link) {
 616       bctrl();
 617     } else {
 618       bctr();
 619     }
 620     // Assert that we can identify the emitted call/jump.
 621     assert(is_bxx64_patchable_variant1b_at((address)start_pc, link),
 622            "can't identify emitted call");
 623   }
 624 
 625   // Assert that we can identify the emitted call/jump.
 626   assert(is_bxx64_patchable_at((address)start_pc, link),
 627          "can't identify emitted call");
 628   assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest,
 629          "wrong encoding of dest address");
 630 }
 631 
 632 // Identify a bxx64_patchable instruction.
 633 bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) {
 634   return is_bxx64_patchable_variant1b_at(instruction_addr, link)
 635     //|| is_bxx64_patchable_variant1_at(instruction_addr, link)
 636       || is_bxx64_patchable_variant2_at(instruction_addr, link);
 637 }
 638 
 639 // Does the call64_patchable instruction use a pc-relative encoding of
 640 // the call destination?
 641 bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) {
 642   // variant 2 is pc-relative
 643   return is_bxx64_patchable_variant2_at(instruction_addr, link);
 644 }
 645 
 646 // Identify variant 1.
 647 bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) {
 648   unsigned int* instr = (unsigned int*) instruction_addr;
 649   return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
 650     && is_mtctr(instr[5]) // mtctr
 651     && is_load_const_at(instruction_addr);
 652 }
 653 
 654 // Identify variant 1b: load destination relative to global toc.
 655 bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) {
 656   unsigned int* instr = (unsigned int*) instruction_addr;
 657   return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
 658     && is_mtctr(instr[3]) // mtctr
 659     && is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr);
 660 }
 661 
 662 // Identify variant 2.
 663 bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) {
 664   unsigned int* instr = (unsigned int*) instruction_addr;
 665   if (link) {
 666     return is_bl (instr[6])  // bl dest is last
 667       && is_nop(instr[0])  // nop
 668       && is_nop(instr[1])  // nop
 669       && is_nop(instr[2])  // nop
 670       && is_nop(instr[3])  // nop
 671       && is_nop(instr[4])  // nop
 672       && is_nop(instr[5]); // nop
 673   } else {
 674     return is_b  (instr[0])  // b  dest is first
 675       && is_nop(instr[1])  // nop
 676       && is_nop(instr[2])  // nop
 677       && is_nop(instr[3])  // nop
 678       && is_nop(instr[4])  // nop
 679       && is_nop(instr[5])  // nop
 680       && is_nop(instr[6]); // nop
 681   }
 682 }
 683 
 684 // Set dest address of a bxx64_patchable instruction.
 685 void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) {
 686   ResourceMark rm;
 687   int code_size = MacroAssembler::bxx64_patchable_size;
 688   CodeBuffer buf(instruction_addr, code_size);
 689   MacroAssembler masm(&buf);
 690   masm.bxx64_patchable(dest, relocInfo::none, link);
 691   ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
 692 }
 693 
 694 // Get dest address of a bxx64_patchable instruction.
 695 address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) {
 696   if (is_bxx64_patchable_variant1_at(instruction_addr, link)) {
 697     return (address) (unsigned long) get_const(instruction_addr);
 698   } else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) {
 699     unsigned int* instr = (unsigned int*) instruction_addr;
 700     if (link) {
 701       const int instr_idx = 6; // bl is last
 702       int branchoffset = branch_destination(instr[instr_idx], 0);
 703       return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
 704     } else {
 705       const int instr_idx = 0; // b is first
 706       int branchoffset = branch_destination(instr[instr_idx], 0);
 707       return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
 708     }
 709   // Load dest relative to global toc.
 710   } else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) {
 711     return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord,
 712                                                                instruction_addr);
 713   } else {
 714     ShouldNotReachHere();
 715     return NULL;
 716   }
 717 }
 718 
 719 // Uses ordering which corresponds to ABI:
 720 //    _savegpr0_14:  std  r14,-144(r1)
 721 //    _savegpr0_15:  std  r15,-136(r1)
 722 //    _savegpr0_16:  std  r16,-128(r1)
 723 void MacroAssembler::save_nonvolatile_gprs(Register dst, int offset) {
 724   std(R14, offset, dst);   offset += 8;
 725   std(R15, offset, dst);   offset += 8;
 726   std(R16, offset, dst);   offset += 8;
 727   std(R17, offset, dst);   offset += 8;
 728   std(R18, offset, dst);   offset += 8;
 729   std(R19, offset, dst);   offset += 8;
 730   std(R20, offset, dst);   offset += 8;
 731   std(R21, offset, dst);   offset += 8;
 732   std(R22, offset, dst);   offset += 8;
 733   std(R23, offset, dst);   offset += 8;
 734   std(R24, offset, dst);   offset += 8;
 735   std(R25, offset, dst);   offset += 8;
 736   std(R26, offset, dst);   offset += 8;
 737   std(R27, offset, dst);   offset += 8;
 738   std(R28, offset, dst);   offset += 8;
 739   std(R29, offset, dst);   offset += 8;
 740   std(R30, offset, dst);   offset += 8;
 741   std(R31, offset, dst);   offset += 8;
 742 
 743   stfd(F14, offset, dst);   offset += 8;
 744   stfd(F15, offset, dst);   offset += 8;
 745   stfd(F16, offset, dst);   offset += 8;
 746   stfd(F17, offset, dst);   offset += 8;
 747   stfd(F18, offset, dst);   offset += 8;
 748   stfd(F19, offset, dst);   offset += 8;
 749   stfd(F20, offset, dst);   offset += 8;
 750   stfd(F21, offset, dst);   offset += 8;
 751   stfd(F22, offset, dst);   offset += 8;
 752   stfd(F23, offset, dst);   offset += 8;
 753   stfd(F24, offset, dst);   offset += 8;
 754   stfd(F25, offset, dst);   offset += 8;
 755   stfd(F26, offset, dst);   offset += 8;
 756   stfd(F27, offset, dst);   offset += 8;
 757   stfd(F28, offset, dst);   offset += 8;
 758   stfd(F29, offset, dst);   offset += 8;
 759   stfd(F30, offset, dst);   offset += 8;
 760   stfd(F31, offset, dst);
 761 }
 762 
 763 // Uses ordering which corresponds to ABI:
 764 //    _restgpr0_14:  ld   r14,-144(r1)
 765 //    _restgpr0_15:  ld   r15,-136(r1)
 766 //    _restgpr0_16:  ld   r16,-128(r1)
 767 void MacroAssembler::restore_nonvolatile_gprs(Register src, int offset) {
 768   ld(R14, offset, src);   offset += 8;
 769   ld(R15, offset, src);   offset += 8;
 770   ld(R16, offset, src);   offset += 8;
 771   ld(R17, offset, src);   offset += 8;
 772   ld(R18, offset, src);   offset += 8;
 773   ld(R19, offset, src);   offset += 8;
 774   ld(R20, offset, src);   offset += 8;
 775   ld(R21, offset, src);   offset += 8;
 776   ld(R22, offset, src);   offset += 8;
 777   ld(R23, offset, src);   offset += 8;
 778   ld(R24, offset, src);   offset += 8;
 779   ld(R25, offset, src);   offset += 8;
 780   ld(R26, offset, src);   offset += 8;
 781   ld(R27, offset, src);   offset += 8;
 782   ld(R28, offset, src);   offset += 8;
 783   ld(R29, offset, src);   offset += 8;
 784   ld(R30, offset, src);   offset += 8;
 785   ld(R31, offset, src);   offset += 8;
 786 
 787   // FP registers
 788   lfd(F14, offset, src);   offset += 8;
 789   lfd(F15, offset, src);   offset += 8;
 790   lfd(F16, offset, src);   offset += 8;
 791   lfd(F17, offset, src);   offset += 8;
 792   lfd(F18, offset, src);   offset += 8;
 793   lfd(F19, offset, src);   offset += 8;
 794   lfd(F20, offset, src);   offset += 8;
 795   lfd(F21, offset, src);   offset += 8;
 796   lfd(F22, offset, src);   offset += 8;
 797   lfd(F23, offset, src);   offset += 8;
 798   lfd(F24, offset, src);   offset += 8;
 799   lfd(F25, offset, src);   offset += 8;
 800   lfd(F26, offset, src);   offset += 8;
 801   lfd(F27, offset, src);   offset += 8;
 802   lfd(F28, offset, src);   offset += 8;
 803   lfd(F29, offset, src);   offset += 8;
 804   lfd(F30, offset, src);   offset += 8;
 805   lfd(F31, offset, src);
 806 }
 807 
 808 // For verify_oops.
 809 void MacroAssembler::save_volatile_gprs(Register dst, int offset) {
 810   std(R2,  offset, dst);   offset += 8;
 811   std(R3,  offset, dst);   offset += 8;
 812   std(R4,  offset, dst);   offset += 8;
 813   std(R5,  offset, dst);   offset += 8;
 814   std(R6,  offset, dst);   offset += 8;
 815   std(R7,  offset, dst);   offset += 8;
 816   std(R8,  offset, dst);   offset += 8;
 817   std(R9,  offset, dst);   offset += 8;
 818   std(R10, offset, dst);   offset += 8;
 819   std(R11, offset, dst);   offset += 8;
 820   std(R12, offset, dst);
 821 }
 822 
 823 // For verify_oops.
 824 void MacroAssembler::restore_volatile_gprs(Register src, int offset) {
 825   ld(R2,  offset, src);   offset += 8;
 826   ld(R3,  offset, src);   offset += 8;
 827   ld(R4,  offset, src);   offset += 8;
 828   ld(R5,  offset, src);   offset += 8;
 829   ld(R6,  offset, src);   offset += 8;
 830   ld(R7,  offset, src);   offset += 8;
 831   ld(R8,  offset, src);   offset += 8;
 832   ld(R9,  offset, src);   offset += 8;
 833   ld(R10, offset, src);   offset += 8;
 834   ld(R11, offset, src);   offset += 8;
 835   ld(R12, offset, src);
 836 }
 837 
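     // Save CR and LR into their ABI save slots relative to R1_SP.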
 838 void MacroAssembler::save_LR_CR(Register tmp) {
 839   mfcr(tmp);
 840   std(tmp, _abi(cr), R1_SP);
 841   mflr(tmp);
 842   std(tmp, _abi(lr), R1_SP);
 843   // Tmp must contain lr on exit! (see return_addr and prolog in ppc64.ad)
 844 }
 845 
 846 void MacroAssembler::restore_LR_CR(Register tmp) {
 847   assert(tmp != R1_SP, "must be distinct");
 848   ld(tmp, _abi(lr), R1_SP);
 849   mtlr(tmp);
 850   ld(tmp, _abi(cr), R1_SP);
 851   mtcr(tmp);
 852 }
 853 
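     // Return the current PC obtained via a branch-and-link to the next
     // instruction; LR is clobbered (hence "trash_LR").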
 854 address MacroAssembler::get_PC_trash_LR(Register result) {
 855   Label L;
 856   bl(L);
 857   bind(L);
 858   address lr_pc = pc();
 859   mflr(result);
 860   return lr_pc;
 861 }
 862 
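     // Resize the current stack frame by 'offset' bytes: the caller's SP (back
     // chain) is re-stored at the new top of stack and R1_SP is updated with a
     // single atomic stdux.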
 863 void MacroAssembler::resize_frame(Register offset, Register tmp) {
 864 #ifdef ASSERT
 865   assert_different_registers(offset, tmp, R1_SP);
 866   andi_(tmp, offset, frame::alignment_in_bytes-1);
 867   asm_assert_eq("resize_frame: unaligned", 0x204);
 868 #endif
 869 
 870   // tmp <- *(SP)
 871   ld(tmp, _abi(callers_sp), R1_SP);
 872   // addr <- SP + offset;
 873   // *(addr) <- tmp;
 874   // SP <- addr
 875   stdux(tmp, R1_SP, offset);
 876 }
 877 
 878 void MacroAssembler::resize_frame(int offset, Register tmp) {
 879   assert(is_simm(offset, 16), "too big an offset");
 880   assert_different_registers(tmp, R1_SP);
 881   assert((offset & (frame::alignment_in_bytes-1))==0, "resize_frame: unaligned");
 882   // tmp <- *(SP)
 883   ld(tmp, _abi(callers_sp), R1_SP);
 884   // addr <- SP + offset;
 885   // *(addr) <- tmp;
 886   // SP <- addr
 887   stdu(tmp, offset, R1_SP);
 888 }
 889 
 890 void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) {
 891   // (addr == tmp1) || (addr == tmp2) is allowed here!
 892   assert(tmp1 != tmp2, "must be distinct");
 893 
 894   // compute offset w.r.t. current stack pointer
 895   // tmp_1 <- addr - SP (!)
 896   subf(tmp1, R1_SP, addr);
 897 
 898   // atomically update SP keeping back link.
 899   resize_frame(tmp1/* offset */, tmp2/* tmp */);
 900 }
 901 
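     // Push a new frame whose size is given in register 'bytes'; the back chain
     // is written with a single atomic stdux.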
 902 void MacroAssembler::push_frame(Register bytes, Register tmp) {
 903 #ifdef ASSERT
 904   assert(bytes != R0, "r0 not allowed here");
 905   andi_(R0, bytes, frame::alignment_in_bytes-1);
 906   asm_assert_eq("push_frame(Reg, Reg): unaligned", 0x203);
 907 #endif
 908   neg(tmp, bytes);
 909   stdux(R1_SP, R1_SP, tmp);
 910 }
 911 
 912 // Push a frame of size `bytes'.
 913 void MacroAssembler::push_frame(unsigned int bytes, Register tmp) {
 914   long offset = align_addr(bytes, frame::alignment_in_bytes);
 915   if (is_simm(-offset, 16)) {
 916     stdu(R1_SP, -offset, R1_SP);
 917   } else {
 918     load_const(tmp, -offset);
 919     stdux(R1_SP, R1_SP, tmp);
 920   }
 921 }
 922 
 923 // Push a frame of size `bytes' plus abi_reg_args on top.
 924 void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) {
 925   push_frame(bytes + frame::abi_reg_args_size, tmp);
 926 }
 927 
 928 // Set up a new C frame with a spill area for non-volatile GPRs and
 929 // additional space for local variables.
 930 void MacroAssembler::push_frame_reg_args_nonvolatiles(unsigned int bytes,
 931                                                       Register tmp) {
 932   push_frame(bytes + frame::abi_reg_args_size + frame::spill_nonvolatiles_size, tmp);
 933 }
 934 
 935 // Pop current C frame.
 936 void MacroAssembler::pop_frame() {
 937   ld(R1_SP, _abi(callers_sp), R1_SP);
 938 }
 939 
 940 #if defined(ABI_ELFv2)
 941 address MacroAssembler::branch_to(Register r_function_entry, bool and_link) {
 942   // TODO(asmundak): make sure the caller uses R12 as function descriptor
 943   // most of the time.
 944   if (R12 != r_function_entry) {
 945     mr(R12, r_function_entry);
 946   }
 947   mtctr(R12);
 948   // Do a call or a branch.
 949   if (and_link) {
 950     bctrl();
 951   } else {
 952     bctr();
 953   }
 954   _last_calls_return_pc = pc();
 955 
 956   return _last_calls_return_pc;
 957 }
 958 
 959 // Call a C function via a function descriptor and use full C
 960 // calling conventions. Updates and returns _last_calls_return_pc.
 961 address MacroAssembler::call_c(Register r_function_entry) {
 962   return branch_to(r_function_entry, /*and_link=*/true);
 963 }
 964 
 965 // For tail calls: only branch, don't link, so callee returns to caller of this function.
 966 address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) {
 967   return branch_to(r_function_entry, /*and_link=*/false);
 968 }
 969 
 970 address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) {
 971   load_const(R12, function_entry, R0);
 972   return branch_to(R12,  /*and_link=*/true);
 973 }
 974 
 975 #else
 976 // Generic version of a call to C function via a function descriptor
 977 // with variable support for C calling conventions (TOC, ENV, etc.).
 978 // Updates and returns _last_calls_return_pc.
 979 address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call,
 980                                   bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) {
 981   // we emit standard ptrgl glue code here
 982   assert((function_descriptor != R0), "function_descriptor cannot be R0");
 983 
 984   // retrieve necessary entries from the function descriptor
 985   ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor);
 986   mtctr(R0);
 987 
 988   if (load_toc_of_callee) {
 989     ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor);
 990   }
 991   if (load_env_of_callee) {
 992     ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor);
 993   } else if (load_toc_of_callee) {
 994     li(R11, 0);
 995   }
 996 
 997   // do a call or a branch
 998   if (and_link) {
 999     bctrl();
1000   } else {
1001     bctr();
1002   }
1003   _last_calls_return_pc = pc();
1004 
1005   return _last_calls_return_pc;
1006 }
1007 
1008 // Call a C function via a function descriptor and use full C calling
1009 // conventions.
1010 // We don't use the TOC in generated code, so there is no need to save
1011 // and restore its value.
1012 address MacroAssembler::call_c(Register fd) {
1013   return branch_to(fd, /*and_link=*/true,
1014                        /*save toc=*/false,
1015                        /*restore toc=*/false,
1016                        /*load toc=*/true,
1017                        /*load env=*/true);
1018 }
1019 
1020 address MacroAssembler::call_c_and_return_to_caller(Register fd) {
1021   return branch_to(fd, /*and_link=*/false,
1022                        /*save toc=*/false,
1023                        /*restore toc=*/false,
1024                        /*load toc=*/true,
1025                        /*load env=*/true);
1026 }
1027 
1028 address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) {
1029   if (rt != relocInfo::none) {
1030     // this call needs to be relocatable
1031     if (!ReoptimizeCallSequences
1032         || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
1033         || fd == NULL   // support code-size estimation
1034         || !fd->is_friend_function()
1035         || fd->entry() == NULL) {
1036       // it's not a friend function as defined by class FunctionDescriptor,
1037       // so do a full call-c here.
1038       load_const(R11, (address)fd, R0);
1039 
1040       bool has_env = (fd != NULL && fd->env() != NULL);
1041       return branch_to(R11, /*and_link=*/true,
1042                             /*save toc=*/false,
1043                             /*restore toc=*/false,
1044                             /*load toc=*/true,
1045                             /*load env=*/has_env);
1046     } else {
1047       // It's a friend function. Load the entry point and don't care about
1048       // toc and env. Use an optimizable call instruction, but ensure the
1049       // same code-size as in the case of a non-friend function.
1050       nop();
1051       nop();
1052       nop();
1053       bl64_patchable(fd->entry(), rt);
1054       _last_calls_return_pc = pc();
1055       return _last_calls_return_pc;
1056     }
1057   } else {
1058     // This call does not need to be relocatable, do more aggressive
1059     // optimizations.
1060     if (!ReoptimizeCallSequences
1061       || !fd->is_friend_function()) {
1062       // It's not a friend function as defined by class FunctionDescriptor,
1063       // so do a full call-c here.
1064       load_const(R11, (address)fd, R0);
1065       return branch_to(R11, /*and_link=*/true,
1066                             /*save toc=*/false,
1067                             /*restore toc=*/false,
1068                             /*load toc=*/true,
1069                             /*load env=*/true);
1070     } else {
1071       // it's a friend function, load the entry point and don't care about
1072       // toc and env.
1073       address dest = fd->entry();
1074       if (is_within_range_of_b(dest, pc())) {
1075         bl(dest);
1076       } else {
1077         bl64_patchable(dest, rt);
1078       }
1079       _last_calls_return_pc = pc();
1080       return _last_calls_return_pc;
1081     }
1082   }
1083 }
1084 
1085 // Call a C function.  All constants needed reside in TOC.
1086 //
1087 // Read the address to call from the TOC.
1088 // Read env from TOC, if fd specifies an env.
1089 // Read new TOC from TOC.
1090 address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd,
1091                                          relocInfo::relocType rt, Register toc) {
1092   if (!ReoptimizeCallSequences
1093     || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
1094     || !fd->is_friend_function()) {
1095     // It's not a friend function as defined by class FunctionDescriptor,
1096     // so do a full call-c here.
1097     assert(fd->entry() != NULL, "function must be linked");
1098 
1099     AddressLiteral fd_entry(fd->entry());
1100     load_const_from_method_toc(R11, fd_entry, toc);
1101     mtctr(R11);
1102     if (fd->env() == NULL) {
1103       li(R11, 0);
1104       nop();
1105     } else {
1106       AddressLiteral fd_env(fd->env());
1107       load_const_from_method_toc(R11, fd_env, toc);
1108     }
1109     AddressLiteral fd_toc(fd->toc());
1110     load_toc_from_toc(R2_TOC, fd_toc, toc);
1111     // R2_TOC is killed.
1112     bctrl();
1113     _last_calls_return_pc = pc();
1114   } else {
1115     // It's a friend function, load the entry point and don't care about
1116     // toc and env. Use an optimizable call instruction, but ensure the
1117     // same code-size as in the case of a non-friend function.
1118     nop();
1119     bl64_patchable(fd->entry(), rt);
1120     _last_calls_return_pc = pc();
1121   }
1122   return _last_calls_return_pc;
1123 }
1124 #endif // ABI_ELFv2
1125 
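     // Common code for the call_VM variants: set up the last Java frame, pass the
     // current thread in R3_ARG1, call the VM entry point, reset the Java frame,
     // and fetch a pending oop result if requested.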
1126 void MacroAssembler::call_VM_base(Register oop_result,
1127                                   Register last_java_sp,
1128                                   address  entry_point,
1129                                   bool     check_exceptions) {
1130   BLOCK_COMMENT("call_VM {");
1131   // Determine last_java_sp register.
1132   if (!last_java_sp->is_valid()) {
1133     last_java_sp = R1_SP;
1134   }
1135   set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1);
1136 
1137   // ARG1 must hold thread address.
1138   mr(R3_ARG1, R16_thread);
1139 #if defined(ABI_ELFv2)
1140   address return_pc = call_c(entry_point, relocInfo::none);
1141 #else
1142   address return_pc = call_c((FunctionDescriptor*)entry_point, relocInfo::none);
1143 #endif
1144 
1145   reset_last_Java_frame();
1146 
1147   // Check for pending exceptions.
1148   if (check_exceptions) {
1149     // We don't check for exceptions here.
1150     ShouldNotReachHere();
1151   }
1152 
1153   // Get oop result if there is one and reset the value in the thread.
1154   if (oop_result->is_valid()) {
1155     get_vm_result(oop_result);
1156   }
1157 
1158   _last_calls_return_pc = return_pc;
1159   BLOCK_COMMENT("} call_VM");
1160 }
1161 
1162 void MacroAssembler::call_VM_leaf_base(address entry_point) {
1163   BLOCK_COMMENT("call_VM_leaf {");
1164 #if defined(ABI_ELFv2)
1165   call_c(entry_point, relocInfo::none);
1166 #else
1167   call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, entry_point), relocInfo::none);
1168 #endif
1169   BLOCK_COMMENT("} call_VM_leaf");
1170 }
1171 
1172 void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions) {
1173   call_VM_base(oop_result, noreg, entry_point, check_exceptions);
1174 }
1175 
1176 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1,
1177                              bool check_exceptions) {
1178   // R3_ARG1 is reserved for the thread.
1179   mr_if_needed(R4_ARG2, arg_1);
1180   call_VM(oop_result, entry_point, check_exceptions);
1181 }
1182 
1183 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2,
1184                              bool check_exceptions) {
1185   // R3_ARG1 is reserved for the thread
1186   mr_if_needed(R4_ARG2, arg_1);
1187   assert(arg_2 != R4_ARG2, "smashed argument");
1188   mr_if_needed(R5_ARG3, arg_2);
1189   call_VM(oop_result, entry_point, check_exceptions);
1190 }
1191 
1192 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3,
1193                              bool check_exceptions) {
1194   // R3_ARG1 is reserved for the thread
1195   mr_if_needed(R4_ARG2, arg_1);
1196   assert(arg_2 != R4_ARG2, "smashed argument");
1197   mr_if_needed(R5_ARG3, arg_2);
1198   mr_if_needed(R6_ARG4, arg_3);
1199   call_VM(oop_result, entry_point, check_exceptions);
1200 }
1201 
1202 void MacroAssembler::call_VM_leaf(address entry_point) {
1203   call_VM_leaf_base(entry_point);
1204 }
1205 
1206 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) {
1207   mr_if_needed(R3_ARG1, arg_1);
1208   call_VM_leaf(entry_point);
1209 }
1210 
1211 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) {
1212   mr_if_needed(R3_ARG1, arg_1);
1213   assert(arg_2 != R3_ARG1, "smashed argument");
1214   mr_if_needed(R4_ARG2, arg_2);
1215   call_VM_leaf(entry_point);
1216 }
1217 
1218 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) {
1219   mr_if_needed(R3_ARG1, arg_1);
1220   assert(arg_2 != R3_ARG1, "smashed argument");
1221   mr_if_needed(R4_ARG2, arg_2);
1222   assert(arg_3 != R3_ARG1 && arg_3 != R4_ARG2, "smashed argument");
1223   mr_if_needed(R5_ARG3, arg_3);
1224   call_VM_leaf(entry_point);
1225 }
1226 
1227 // Check whether instruction is a read access to the polling page
1228 // which was emitted by load_from_polling_page(..).
1229 bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext,
1230                                                address* polling_address_ptr) {
1231   if (!is_ld(instruction))
1232     return false; // It's not a ld. Fail.
1233 
1234   int rt = inv_rt_field(instruction);
1235   int ra = inv_ra_field(instruction);
1236   int ds = inv_ds_field(instruction);
1237   if (!(ds == 0 && ra != 0 && rt == 0)) {
1238     return false; // It's not a ld(r0, X, ra). Fail.
1239   }
1240 
1241   if (!ucontext) {
1242     // Set polling address.
1243     if (polling_address_ptr != NULL) {
1244       *polling_address_ptr = NULL;
1245     }
1246     return true; // No ucontext given. Can't check value of ra. Assume true.
1247   }
1248 
1249 #ifdef LINUX
1250   // Ucontext given. Check that register ra contains the address of
1251   // the safepoint polling page.
1252   ucontext_t* uc = (ucontext_t*) ucontext;
1253   // Set polling address.
1254   address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds;
1255   if (polling_address_ptr != NULL) {
1256     *polling_address_ptr = addr;
1257   }
1258   return os::is_poll_address(addr);
1259 #else
1260   // Not on Linux, ucontext must be NULL.
1261   ShouldNotReachHere();
1262   return false;
1263 #endif
1264 }
1265 
1266 bool MacroAssembler::is_memory_serialization(int instruction, JavaThread* thread, void* ucontext) {
1267 #ifdef LINUX
1268   ucontext_t* uc = (ucontext_t*) ucontext;
1269 
1270   if (is_stwx(instruction) || is_stwux(instruction)) {
1271     int ra = inv_ra_field(instruction);
1272     int rb = inv_rb_field(instruction);
1273 
1274     // look up content of ra and rb in ucontext
1275     address ra_val=(address)uc->uc_mcontext.regs->gpr[ra];
1276     long rb_val=(long)uc->uc_mcontext.regs->gpr[rb];
1277     return os::is_memory_serialize_page(thread, ra_val+rb_val);
1278   } else if (is_stw(instruction) || is_stwu(instruction)) {
1279     int ra = inv_ra_field(instruction);
1280     int d1 = inv_d1_field(instruction);
1281 
1282     // look up content of ra in ucontext
1283     address ra_val=(address)uc->uc_mcontext.regs->gpr[ra];
1284     return os::is_memory_serialize_page(thread, ra_val+d1);
1285   } else {
1286     return false;
1287   }
1288 #else
1289   // workaround not needed on !LINUX :-)
1290   ShouldNotCallThis();
1291   return false;
1292 #endif
1293 }
1294 
1295 void MacroAssembler::bang_stack_with_offset(int offset) {
1296   // When increasing the stack, the old stack pointer will be written
1297   // to the new top of stack according to the PPC64 ABI.
1298   // Therefore, stack banging is not necessary when increasing
1299   // the stack by <= os::vm_page_size() bytes.
1300   // When increasing the stack by a larger amount, this method is
1301   // called repeatedly to bang the intermediate pages.
1302 
1303   // Stack grows down, caller passes positive offset.
1304   assert(offset > 0, "must bang with positive offset");
1305 
1306   long stdoffset = -offset;
1307 
1308   if (is_simm(stdoffset, 16)) {
1309     // Signed 16 bit offset, a simple std is ok.
1310     if (UseLoadInstructionsForStackBangingPPC64) {
1311       ld(R0, (int)(signed short)stdoffset, R1_SP);
1312     } else {
1313       std(R0,(int)(signed short)stdoffset, R1_SP);
1314     }
1315   } else if (is_simm(stdoffset, 31)) {
1316     const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset);
1317     const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset);
1318 
1319     Register tmp = R11;
1320     addis(tmp, R1_SP, hi);
1321     if (UseLoadInstructionsForStackBangingPPC64) {
1322       ld(R0,  lo, tmp);
1323     } else {
1324       std(R0, lo, tmp);
1325     }
1326   } else {
1327     ShouldNotReachHere();
1328   }
1329 }
1330 
1331 // If instruction is a stack bang of the form
1332 //    std    R0,    x(Ry),       (see bang_stack_with_offset())
1333 //    stdu   R1_SP, x(R1_SP),    (see push_frame(), resize_frame())
1334 // or stdux  R1_SP, Rx, R1_SP    (see push_frame(), resize_frame())
1335 // return the banged address. Otherwise, return 0.
1336 address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) {
1337 #ifdef LINUX
1338   ucontext_t* uc = (ucontext_t*) ucontext;
1339   int rs = inv_rs_field(instruction);
1340   int ra = inv_ra_field(instruction);
1341   if (   (is_ld(instruction)   && rs == 0 &&  UseLoadInstructionsForStackBangingPPC64)
1342       || (is_std(instruction)  && rs == 0 && !UseLoadInstructionsForStackBangingPPC64)
1343       || (is_stdu(instruction) && rs == 1)) {
1344     int ds = inv_ds_field(instruction);
1345     // return banged address
1346     return ds+(address)uc->uc_mcontext.regs->gpr[ra];
1347   } else if (is_stdux(instruction) && rs == 1) {
1348     int rb = inv_rb_field(instruction);
1349     address sp = (address)uc->uc_mcontext.regs->gpr[1];
1350     long rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
1351     return ra != 1 || rb_val >= 0 ? NULL         // not a stack bang
1352                                   : sp + rb_val; // banged address
1353   }
1354   return NULL; // not a stack bang
1355 #else
1356   // workaround not needed on !LINUX :-)
1357   ShouldNotCallThis();
1358   return NULL;
1359 #endif
1360 }
1361 
1362 // CmpxchgX sets condition register to cmpX(current, compare).
1363 void MacroAssembler::cmpxchgw(ConditionRegister flag, Register dest_current_value,
1364                               Register compare_value, Register exchange_value,
1365                               Register addr_base, int semantics, bool cmpxchgx_hint,
1366                               Register int_flag_success, bool contention_hint) {
1367   Label retry;
1368   Label failed;
1369   Label done;
1370 
1371   // Save one branch if result is returned via register and
1372   // result register is different from the other ones.
1373   bool use_result_reg    = (int_flag_success != noreg);
1374   bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value &&
1375                             int_flag_success != exchange_value && int_flag_success != addr_base);
1376 
1377   // release/fence semantics
1378   if (semantics & MemBarRel) {
1379     release();
1380   }
1381 
1382   if (use_result_reg && preset_result_reg) {
1383     li(int_flag_success, 0); // preset (assume cas failed)
1384   }
1385 
1386   // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1387   if (contention_hint) { // Don't try to reserve if cmp fails.
1388     lwz(dest_current_value, 0, addr_base);
1389     cmpw(flag, dest_current_value, compare_value);
1390     bne(flag, failed);
1391   }
1392 
1393   // atomic emulation loop
1394   bind(retry);
1395 
1396   lwarx(dest_current_value, addr_base, cmpxchgx_hint);
1397   cmpw(flag, dest_current_value, compare_value);
1398   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1399     bne_predict_not_taken(flag, failed);
1400   } else {
1401     bne(                  flag, failed);
1402   }
1403   // branch to done  => (flag == ne), (dest_current_value != compare_value)
1404   // fall through    => (flag == eq), (dest_current_value == compare_value)
1405 
1406   stwcx_(exchange_value, addr_base);
1407   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1408     bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1409   } else {
1410     bne(                  CCR0, retry); // StXcx_ sets CCR0.
1411   }
1412   // fall through    => (flag == eq), (dest_current_value == compare_value), (swapped)
1413 
1414   // Result in register (must do this at the end because int_flag_success can be the
1415   // same register as one above).
1416   if (use_result_reg) {
1417     li(int_flag_success, 1);
1418   }
1419 
1420   if (semantics & MemBarFenceAfter) {
1421     fence();
1422   } else if (semantics & MemBarAcq) {
1423     isync();
1424   }
1425 
1426   if (use_result_reg && !preset_result_reg) {
1427     b(done);
1428   }
1429 
1430   bind(failed);
1431   if (use_result_reg && !preset_result_reg) {
1432     li(int_flag_success, 0);
1433   }
1434 
1435   bind(done);
1436   // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1437   // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1438 }
1439 
1440 // Performs an atomic compare-and-exchange:
1441 //   if (compare_value == *addr_base) {
1442 //     *addr_base = exchange_value;
1443 //     int_flag_success = 1;
1444 //   } else
1445 //     int_flag_success = 0;
1446 //
1447 // ConditionRegister flag       = cmp(compare_value, *addr_base)
1448 // Register dest_current_value  = *addr_base
1449 // Register compare_value       Used to compare with value in memory
1450 // Register exchange_value      Written to memory if compare_value == *addr_base
1451 // Register addr_base           The memory location to compareXChange
1452 // Register int_flag_success    Set to 1 if exchange_value was written to *addr_base
1453 //
1454 // To avoid the costly compare-and-exchange, the value can be tested beforehand (contention_hint).
1455 // Several special cases exist to avoid generating unnecessary code.
1456 //
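// With contention_hint the guard emitted before the reservation loop is roughly:
//   if (*addr_base != compare_value) goto failed;   // plain ld + cmpd, no reservation taken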
1457 void MacroAssembler::cmpxchgd(ConditionRegister flag,
1458                               Register dest_current_value, RegisterOrConstant compare_value, Register exchange_value,
1459                               Register addr_base, int semantics, bool cmpxchgx_hint,
1460                               Register int_flag_success, Label* failed_ext, bool contention_hint) {
1461   Label retry;
1462   Label failed_int;
1463   Label& failed = (failed_ext != NULL) ? *failed_ext : failed_int;
1464   Label done;
1465 
1466   // Save one branch if result is returned via register and result register is different from the other ones.
1467   bool use_result_reg    = (int_flag_success!=noreg);
1468   bool preset_result_reg = (int_flag_success!=dest_current_value && int_flag_success!=compare_value.register_or_noreg() &&
1469                             int_flag_success!=exchange_value && int_flag_success!=addr_base);
1470   assert(int_flag_success == noreg || failed_ext == NULL, "cannot have both");
1471 
1472   // release/fence semantics
1473   if (semantics & MemBarRel) {
1474     release();
1475   }
1476 
1477   if (use_result_reg && preset_result_reg) {
1478     li(int_flag_success, 0); // preset (assume cas failed)
1479   }
1480 
1481   // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1482   if (contention_hint) { // Don't try to reserve if cmp fails.
1483     ld(dest_current_value, 0, addr_base);
1484     cmpd(flag, compare_value, dest_current_value);
1485     bne(flag, failed);
1486   }
1487 
1488   // atomic emulation loop
1489   bind(retry);
1490 
1491   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1492   cmpd(flag, compare_value, dest_current_value);
1493   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1494     bne_predict_not_taken(flag, failed);
1495   } else {
1496     bne(                  flag, failed);
1497   }
1498 
1499   stdcx_(exchange_value, addr_base);
1500   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1501     bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
1502   } else {
1503     bne(                  CCR0, retry); // stXcx_ sets CCR0
1504   }
1505 
1506   // result in register (must do this at the end because int_flag_success can be the same register as one above)
1507   if (use_result_reg) {
1508     li(int_flag_success, 1);
1509   }
1510 
1511   // POWER6 doesn't need an isync in CAS,
1512   // but we always emit one to be on the safe side.
1513   if (semantics & MemBarFenceAfter) {
1514     fence();
1515   } else if (semantics & MemBarAcq) {
1516     isync();
1517   }
1518 
1519   if (use_result_reg && !preset_result_reg) {
1520     b(done);
1521   }
1522 
1523   bind(failed_int);
1524   if (use_result_reg && !preset_result_reg) {
1525     li(int_flag_success, 0);
1526   }
1527 
1528   bind(done);
1529   // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1530   // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1531 }
1532 
1533 // Look up the method for a megamorphic invokeinterface call.
1534 // The target method is determined by <intf_klass, itable_index>.
1535 // The receiver klass is in recv_klass.
1536 // On success, the result will be in method_result, and execution falls through.
1537 // On failure, execution transfers to the given label.
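// Illustrative sketch of the itable layout assumed below (see itableOffsetEntry and
// itableMethodEntry for the authoritative definitions):
//   [ vtable: vtable_length entries                     ]
//   [ itableOffsetEntry { interface klass, offset } ... ]  <- scanned linearly
//   [ itableMethodEntry { Method* } ...                 ]  <- reached via the matching offset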
1538 void MacroAssembler::lookup_interface_method(Register recv_klass,
1539                                              Register intf_klass,
1540                                              RegisterOrConstant itable_index,
1541                                              Register method_result,
1542                                              Register scan_temp,
1543                                              Register sethi_temp,
1544                                              Label& L_no_such_interface) {
1545   assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);
1546   assert(itable_index.is_constant() || itable_index.as_register() == method_result,
1547          "caller must use same register for non-constant itable index as for method");
1548 
1549   // Compute start of first itableOffsetEntry (which is at the end of the vtable).
1550   int vtable_base = InstanceKlass::vtable_start_offset() * wordSize;
1551   int itentry_off = itableMethodEntry::method_offset_in_bytes();
1552   int logMEsize   = exact_log2(itableMethodEntry::size() * wordSize);
1553   int scan_step   = itableOffsetEntry::size() * wordSize;
1554   int log_vte_size= exact_log2(vtableEntry::size() * wordSize);
1555 
1556   lwz(scan_temp, InstanceKlass::vtable_length_offset() * wordSize, recv_klass);
1557   // %%% We should store the aligned, prescaled offset in the klassoop.
1558   // Then the next several instructions would fold away.
1559 
1560   sldi(scan_temp, scan_temp, log_vte_size);
1561   addi(scan_temp, scan_temp, vtable_base);
1562   add(scan_temp, recv_klass, scan_temp);
1563 
1564   // Adjust recv_klass by scaled itable_index, so we can free itable_index.
1565   if (itable_index.is_register()) {
1566     Register itable_offset = itable_index.as_register();
1567     sldi(itable_offset, itable_offset, logMEsize);
1568     if (itentry_off) addi(itable_offset, itable_offset, itentry_off);
1569     add(recv_klass, itable_offset, recv_klass);
1570   } else {
1571     long itable_offset = (long)itable_index.as_constant();
1572     load_const_optimized(sethi_temp, (itable_offset<<logMEsize)+itentry_off); // static address, no relocation
1573     add(recv_klass, sethi_temp, recv_klass);
1574   }
1575 
1576   // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
1577   //   if (scan->interface() == intf) {
1578   //     result = (klass + scan->offset() + itable_index);
1579   //   }
1580   // }
1581   Label search, found_method;
1582 
1583   for (int peel = 1; peel >= 0; peel--) {
1584     // %%%% Could load both offset and interface in one ldx, if they were
1585     // in the opposite order. This would save a load.
1586     ld(method_result, itableOffsetEntry::interface_offset_in_bytes(), scan_temp);
1587 
1588     // Check that this entry is non-null. A null entry means that
1589     // the receiver class doesn't implement the interface, and wasn't the
1590     // same as when the caller was compiled.
1591     cmpd(CCR0, method_result, intf_klass);
1592 
1593     if (peel) {
1594       beq(CCR0, found_method);
1595     } else {
1596       bne(CCR0, search);
1597       // (invert the test to fall through to found_method...)
1598     }
1599 
1600     if (!peel) break;
1601 
1602     bind(search);
1603 
1604     cmpdi(CCR0, method_result, 0);
1605     beq(CCR0, L_no_such_interface);
1606     addi(scan_temp, scan_temp, scan_step);
1607   }
1608 
1609   bind(found_method);
1610 
1611   // Got a hit.
1612   int ito_offset = itableOffsetEntry::offset_offset_in_bytes();
1613   lwz(scan_temp, ito_offset, scan_temp);
1614   ldx(method_result, scan_temp, recv_klass);
1615 }
1616 
1617 // virtual method calling
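// In effect (illustrative):
//   R19_method = *(recv_klass + vtable_start_offset * wordSize
//                  + vtable_index * wordSize + vtableEntry::method_offset)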
1618 void MacroAssembler::lookup_virtual_method(Register recv_klass,
1619                                            RegisterOrConstant vtable_index,
1620                                            Register method_result) {
1621 
1622   assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg());
1623 
1624   const int base = InstanceKlass::vtable_start_offset() * wordSize;
1625   assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
1626 
1627   if (vtable_index.is_register()) {
1628     sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord);
1629     add(recv_klass, vtable_index.as_register(), recv_klass);
1630   } else {
1631     addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord);
1632   }
1633   ld(R19_method, base + vtableEntry::method_offset_in_bytes(), recv_klass);
1634 }
1635 
1636 /////////////////////////////////////////// subtype checking ////////////////////////////////////////////
1637 
1638 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
1639                                                    Register super_klass,
1640                                                    Register temp1_reg,
1641                                                    Register temp2_reg,
1642                                                    Label& L_success,
1643                                                    Label& L_failure) {
1644 
1645   const Register check_cache_offset = temp1_reg;
1646   const Register cached_super       = temp2_reg;
1647 
1648   assert_different_registers(sub_klass, super_klass, check_cache_offset, cached_super);
1649 
1650   int sco_offset = in_bytes(Klass::super_check_offset_offset());
1651   int sc_offset  = in_bytes(Klass::secondary_super_cache_offset());
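  // Illustrative C-like sketch of the fast path emitted below:
  //   if (sub_klass == super_klass)                    goto L_success;
  //   sco = super_klass->super_check_offset;
  //   if (*(sub_klass + sco) == super_klass)           goto L_success;
  //   if (sco != secondary_super_cache_offset)         goto L_failure;
  //   /* otherwise fall through to the slow path */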
1652 
1653   // If the pointers are equal, we are done (e.g., String[] elements).
1654   // This self-check enables sharing of secondary supertype arrays among
1655   // non-primary types such as array-of-interface. Otherwise, each such
1656   // type would need its own customized SSA.
1657   // We move this check to the front of the fast path because many
1658   // type checks are in fact trivially successful in this manner,
1659   // so we get a nicely predicted branch right at the start of the check.
1660   cmpd(CCR0, sub_klass, super_klass);
1661   beq(CCR0, L_success);
1662 
1663   // Check the supertype display:
1664   lwz(check_cache_offset, sco_offset, super_klass);
1665   // The loaded value is the offset from KlassOopDesc.
1666 
1667   ldx(cached_super, check_cache_offset, sub_klass);
1668   cmpd(CCR0, cached_super, super_klass);
1669   beq(CCR0, L_success);
1670 
1671   // This check has worked decisively for primary supers.
1672   // Secondary supers are sought in the super_cache ('super_cache_addr').
1673   // (Secondary supers are interfaces and very deeply nested subtypes.)
1674   // This works in the same check above because of a tricky aliasing
1675   // between the super_cache and the primary super display elements.
1676   // (The 'super_check_addr' can address either, as the case requires.)
1677   // Note that the cache is updated below if it does not help us find
1678   // what we need immediately.
1679   // So if it was a primary super, we can just fail immediately.
1680   // Otherwise, it's the slow path for us (no success at this point).
1681 
1682   cmpwi(CCR0, check_cache_offset, sc_offset);
1683   bne(CCR0, L_failure);
1684   // bind(slow_path); // fallthru
1685 }
1686 
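// Slow path of the subtype check: linearly scan the secondary supers array and, on a hit,
// install super_klass into the secondary super cache. Roughly (illustrative C-like sketch):
//   Array<Klass*>* ss = sub_klass->secondary_supers;
//   for (int i = 0; i < ss->length(); i++) {
//     if (ss->at(i) == super_klass) {
//       sub_klass->secondary_super_cache = super_klass;
//       result = 0 /* hit */; goto L_success;
//     }
//   }
//   result = 1 /* miss */;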
1687 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
1688                                                    Register super_klass,
1689                                                    Register temp1_reg,
1690                                                    Register temp2_reg,
1691                                                    Label* L_success,
1692                                                    Register result_reg) {
1693   const Register array_ptr = temp1_reg; // current value from cache array
1694   const Register temp      = temp2_reg;
1695 
1696   assert_different_registers(sub_klass, super_klass, array_ptr, temp);
1697 
1698   int source_offset = in_bytes(Klass::secondary_supers_offset());
1699   int target_offset = in_bytes(Klass::secondary_super_cache_offset());
1700 
1701   int length_offset = Array<Klass*>::length_offset_in_bytes();
1702   int base_offset   = Array<Klass*>::base_offset_in_bytes();
1703 
1704   Label hit, loop, failure, fallthru;
1705 
1706   ld(array_ptr, source_offset, sub_klass);
1707 
1708   //assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated.");
1709   lwz(temp, length_offset, array_ptr);
1710   cmpwi(CCR0, temp, 0);
1711   beq(CCR0, result_reg!=noreg ? failure : fallthru); // length 0
1712 
1713   mtctr(temp); // load ctr
1714 
1715   bind(loop);
1716   // Oops in the table are no longer compressed.
1717   ld(temp, base_offset, array_ptr);
1718   cmpd(CCR0, temp, super_klass);
1719   beq(CCR0, hit);
1720   addi(array_ptr, array_ptr, BytesPerWord);
1721   bdnz(loop);
1722 
1723   bind(failure);
1724   if (result_reg!=noreg) li(result_reg, 1); // load non-zero result (indicates a miss)
1725   b(fallthru);
1726 
1727   bind(hit);
1728   std(super_klass, target_offset, sub_klass); // save result to cache
1729   if (result_reg != noreg) li(result_reg, 0); // load zero result (indicates a hit)
1730   if (L_success != NULL) b(*L_success);
1731 
1732   bind(fallthru);
1733 }
1734 
1735 // Try fast path, then go to slow one if not successful
1736 void MacroAssembler::check_klass_subtype(Register sub_klass,
1737                          Register super_klass,
1738                          Register temp1_reg,
1739                          Register temp2_reg,
1740                          Label& L_success) {
1741   Label L_failure;
1742   check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, L_success, L_failure);
1743   check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success);
1744   bind(L_failure); // Fallthru if not successful.
1745 }
1746 
1747 void MacroAssembler::check_method_handle_type(Register mtype_reg, Register mh_reg,
1748                                               Register temp_reg,
1749                                               Label& wrong_method_type) {
1750   assert_different_registers(mtype_reg, mh_reg, temp_reg);
1751   // Compare method type against that of the receiver.
1752   load_heap_oop_not_null(temp_reg, delayed_value(java_lang_invoke_MethodHandle::type_offset_in_bytes, temp_reg), mh_reg);
1753   cmpd(CCR0, temp_reg, mtype_reg);
1754   bne(CCR0, wrong_method_type);
1755 }
1756 
1757 RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot,
1758                                                    Register temp_reg,
1759                                                    int extra_slot_offset) {
1760   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
1761   int stackElementSize = Interpreter::stackElementSize;
1762   int offset = extra_slot_offset * stackElementSize;
1763   if (arg_slot.is_constant()) {
1764     offset += arg_slot.as_constant() * stackElementSize;
1765     return offset;
1766   } else {
1767     assert(temp_reg != noreg, "must specify");
1768     sldi(temp_reg, arg_slot.as_register(), exact_log2(stackElementSize));
1769     if (offset != 0)
1770       addi(temp_reg, temp_reg, offset);
1771     return temp_reg;
1772   }
1773 }
1774 
1775 void MacroAssembler::biased_locking_enter(ConditionRegister cr_reg, Register obj_reg,
1776                                           Register mark_reg, Register temp_reg,
1777                                           Register temp2_reg, Label& done, Label* slow_case) {
1778   assert(UseBiasedLocking, "why call this otherwise?");
1779 
1780 #ifdef ASSERT
1781   assert_different_registers(obj_reg, mark_reg, temp_reg, temp2_reg);
1782 #endif
1783 
1784   Label cas_label;
1785 
1786   // Branch to done if fast path fails and no slow_case provided.
1787   Label *slow_case_int = (slow_case != NULL) ? slow_case : &done;
1788 
1789   // Biased locking
1790   // See whether the lock is currently biased toward our thread and
1791   // whether the epoch is still valid
1792   // Note that the runtime guarantees sufficient alignment of JavaThread
1793   // pointers to allow age to be placed into low bits
1794   assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits,
1795          "biased locking makes assumptions about bit layout");
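  // Roughly, the low end of the mark word looks like (illustrative, see markOop.hpp):
  //   [ owning JavaThread* | epoch | ... | age (4) | biased_lock (1) | lock (2) ]
  // i.e. the age bits sit directly above the 3 low lock bits, as asserted above.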
1796 
1797   if (PrintBiasedLockingStatistics) {
1798     load_const(temp_reg, (address) BiasedLocking::total_entry_count_addr(), temp2_reg);
1799     lwz(temp2_reg, 0, temp_reg);
1800     addi(temp2_reg, temp2_reg, 1);
1801     stw(temp2_reg, 0, temp_reg);
1802   }
1803 
1804   andi(temp_reg, mark_reg, markOopDesc::biased_lock_mask_in_place);
1805   cmpwi(cr_reg, temp_reg, markOopDesc::biased_lock_pattern);
1806   bne(cr_reg, cas_label);
1807 
1808   load_klass(temp_reg, obj_reg);
1809 
1810   load_const_optimized(temp2_reg, ~((int) markOopDesc::age_mask_in_place));
1811   ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
1812   orr(temp_reg, R16_thread, temp_reg);
1813   xorr(temp_reg, mark_reg, temp_reg);
1814   andr(temp_reg, temp_reg, temp2_reg);
1815   cmpdi(cr_reg, temp_reg, 0);
1816   if (PrintBiasedLockingStatistics) {
1817     Label l;
1818     bne(cr_reg, l);
1819     load_const(mark_reg, (address) BiasedLocking::biased_lock_entry_count_addr());
1820     lwz(temp2_reg, 0, mark_reg);
1821     addi(temp2_reg, temp2_reg, 1);
1822     stw(temp2_reg, 0, mark_reg);
1823     // restore mark_reg
1824     ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg);
1825     bind(l);
1826   }
1827   beq(cr_reg, done);
1828 
1829   Label try_revoke_bias;
1830   Label try_rebias;
1831 
1832   // At this point we know that the header has the bias pattern and
1833   // that we are not the bias owner in the current epoch. We need to
1834   // figure out more details about the state of the header in order to
1835   // know what operations can be legally performed on the object's
1836   // header.
1837 
1838   // If the low three bits in the xor result aren't clear, that means
1839   // the prototype header is no longer biased and we have to revoke
1840   // the bias on this object.
1841   andi(temp2_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
1842   cmpwi(cr_reg, temp2_reg, 0);
1843   bne(cr_reg, try_revoke_bias);
1844 
1845   // Biasing is still enabled for this data type. See whether the
1846   // epoch of the current bias is still valid, meaning that the epoch
1847   // bits of the mark word are equal to the epoch bits of the
1848   // prototype header. (Note that the prototype header's epoch bits
1849   // only change at a safepoint.) If not, attempt to rebias the object
1850   // toward the current thread. Note that we must be absolutely sure
1851   // that the current epoch is invalid in order to do this because
1852   // otherwise the manipulations it performs on the mark word are
1853   // illegal.
1854 
1855   int shift_amount = 64 - markOopDesc::epoch_shift;
1856   // rotate epoch bits to right (little) end and set other bits to 0
1857   // [ big part | epoch | little part ] -> [ 0..0 | epoch ]
1858   rldicl_(temp2_reg, temp_reg, shift_amount, 64 - markOopDesc::epoch_bits);
1859   // branch if epoch bits are != 0, i.e. they differ, because the epoch has been incremented
1860   bne(CCR0, try_rebias);
1861 
1862   // The epoch of the current bias is still valid but we know nothing
1863   // about the owner; it might be set or it might be clear. Try to
1864   // acquire the bias of the object using an atomic operation. If this
1865   // fails we will go in to the runtime to revoke the object's bias.
1866   // Note that we first construct the presumed unbiased header so we
1867   // don't accidentally blow away another thread's valid bias.
1868   andi(mark_reg, mark_reg, (markOopDesc::biased_lock_mask_in_place |
1869                                 markOopDesc::age_mask_in_place |
1870                                 markOopDesc::epoch_mask_in_place));
1871   orr(temp_reg, R16_thread, mark_reg);
1872 
1873   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
1874 
1875   // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg).
1876   cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
1877            /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
1878            /*where=*/obj_reg,
1879            MacroAssembler::MemBarAcq,
1880            MacroAssembler::cmpxchgx_hint_acquire_lock(),
1881            noreg, slow_case_int); // bail out if failed
1882 
1883   // If the biasing toward our thread failed, this means that
1884   // another thread succeeded in biasing it toward itself and we
1885   // need to revoke that bias. The revocation will occur in the
1886   // interpreter runtime in the slow case.
1887   if (PrintBiasedLockingStatistics) {
1888     load_const(temp_reg, (address) BiasedLocking::anonymously_biased_lock_entry_count_addr(), temp2_reg);
1889     lwz(temp2_reg, 0, temp_reg);
1890     addi(temp2_reg, temp2_reg, 1);
1891     stw(temp2_reg, 0, temp_reg);
1892   }
1893   b(done);
1894 
1895   bind(try_rebias);
1896   // At this point we know the epoch has expired, meaning that the
1897   // current "bias owner", if any, is actually invalid. Under these
1898   // circumstances _only_, we are allowed to use the current header's
1899   // value as the comparison value when doing the cas to acquire the
1900   // bias in the current epoch. In other words, we allow transfer of
1901   // the bias from one thread to another directly in this situation.
1902   andi(temp_reg, mark_reg, markOopDesc::age_mask_in_place);
1903   orr(temp_reg, R16_thread, temp_reg);
1904   load_klass(temp2_reg, obj_reg);
1905   ld(temp2_reg, in_bytes(Klass::prototype_header_offset()), temp2_reg);
1906   orr(temp_reg, temp_reg, temp2_reg);
1907 
1908   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
1909 
1910   // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg).
1911   cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
1912                  /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
1913                  /*where=*/obj_reg,
1914                  MacroAssembler::MemBarAcq,
1915                  MacroAssembler::cmpxchgx_hint_acquire_lock(),
1916                  noreg, slow_case_int); // bail out if failed
1917 
1918   // If the biasing toward our thread failed, this means that
1919   // another thread succeeded in biasing it toward itself and we
1920   // need to revoke that bias. The revocation will occur in the
1921   // interpreter runtime in the slow case.
1922   if (PrintBiasedLockingStatistics) {
1923     load_const(temp_reg, (address) BiasedLocking::rebiased_lock_entry_count_addr(), temp2_reg);
1924     lwz(temp2_reg, 0, temp_reg);
1925     addi(temp2_reg, temp2_reg, 1);
1926     stw(temp2_reg, 0, temp_reg);
1927   }
1928   b(done);
1929 
1930   bind(try_revoke_bias);
1931   // The prototype mark in the klass doesn't have the bias bit set any
1932   // more, indicating that objects of this data type are not supposed
1933   // to be biased any more. We are going to try to reset the mark of
1934   // this object to the prototype value and fall through to the
1935   // CAS-based locking scheme. Note that if our CAS fails, it means
1936   // that another thread raced us for the privilege of revoking the
1937   // bias of this particular object, so it's okay to continue in the
1938   // normal locking code.
1939   load_klass(temp_reg, obj_reg);
1940   ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
1941   andi(temp2_reg, mark_reg, markOopDesc::age_mask_in_place);
1942   orr(temp_reg, temp_reg, temp2_reg);
1943 
1944   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
1945 
1946   // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg).
1947   cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
1948                  /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
1949                  /*where=*/obj_reg,
1950                  MacroAssembler::MemBarAcq,
1951                  MacroAssembler::cmpxchgx_hint_acquire_lock());
1952 
1953   // reload markOop in mark_reg before continuing with lightweight locking
1954   ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg);
1955 
1956   // Fall through to the normal CAS-based lock, because no matter what
1957   // the result of the above CAS, some thread must have succeeded in
1958   // removing the bias bit from the object's header.
1959   if (PrintBiasedLockingStatistics) {
1960     Label l;
1961     bne(cr_reg, l);
1962     load_const(temp_reg, (address) BiasedLocking::revoked_lock_entry_count_addr(), temp2_reg);
1963     lwz(temp2_reg, 0, temp_reg);
1964     addi(temp2_reg, temp2_reg, 1);
1965     stw(temp2_reg, 0, temp_reg);
1966     bind(l);
1967   }
1968 
1969   bind(cas_label);
1970 }
1971 
1972 void MacroAssembler::biased_locking_exit (ConditionRegister cr_reg, Register mark_addr, Register temp_reg, Label& done) {
1973   // Check for biased locking unlock case, which is a no-op
1974   // Note: we do not have to check the thread ID for two reasons.
1975   // First, the interpreter checks for IllegalMonitorStateException at
1976   // a higher level. Second, if the bias was revoked while we held the
1977   // lock, the object could not be rebiased toward another thread, so
1978   // the bias bit would be clear.
1979 
1980   ld(temp_reg, 0, mark_addr);
1981   andi(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
1982 
1983   cmpwi(cr_reg, temp_reg, markOopDesc::biased_lock_pattern);
1984   beq(cr_reg, done);
1985 }
1986 
1987 // TM on PPC64.
1988 void MacroAssembler::atomic_inc_ptr(Register addr, Register result, int simm16) {
1989   Label retry;
1990   bind(retry);
1991   ldarx(result, addr, /*hint*/ false);
1992   addi(result, result, simm16);
1993   stdcx_(result, addr);
1994   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1995     bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
1996   } else {
1997     bne(                  CCR0, retry); // stXcx_ sets CCR0
1998   }
1999 }
2000 
2001 void MacroAssembler::atomic_ori_int(Register addr, Register result, int uimm16) {
2002   Label retry;
2003   bind(retry);
2004   lwarx(result, addr, /*hint*/ false);
2005   ori(result, result, uimm16);
2006   stwcx_(result, addr);
2007   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
2008     bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
2009   } else {
2010     bne(                  CCR0, retry); // stXcx_ sets CCR0
2011   }
2012 }
2013 
2014 #if INCLUDE_RTM_OPT
2015 
2016 // Update rtm_counters based on abort status
2017 // input: abort_status
2018 //        rtm_counters (RTMLockingCounters*)
2019 void MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters_Reg) {
2020   // Mapping to keep PreciseRTMLockingStatistics similar to x86.
2021   // x86 ppc (! means inverted, ? means not the same)
2022   //  0   31  Set if abort caused by XABORT instruction.
2023   //  1  ! 7  If set, the transaction may succeed on a retry. This bit is always clear if bit 0 is set.
2024   //  2   13  Set if another logical processor conflicted with a memory address that was part of the transaction that aborted.
2025   //  3   10  Set if an internal buffer overflowed.
2026   //  4  ?12  Set if a debug breakpoint was hit.
2027   //  5  ?32  Set if an abort occurred during execution of a nested transaction.
2028   const  int tm_failure_bit[] = {Assembler::tm_tabort, // Note: Seems like signal handler sets this, too.
2029                                  Assembler::tm_failure_persistent, // inverted: transient
2030                                  Assembler::tm_trans_cf,
2031                                  Assembler::tm_footprint_of,
2032                                  Assembler::tm_non_trans_cf,
2033                                  Assembler::tm_suspended};
2034   const bool tm_failure_inv[] = {false, true, false, false, false, false};
2035   assert(sizeof(tm_failure_bit)/sizeof(int) == RTMLockingCounters::ABORT_STATUS_LIMIT, "adapt mapping!");
2036 
2037   const Register addr_Reg = R0;
2038   // Keep track of offset to where rtm_counters_Reg had pointed to.
2039   int counters_offs = RTMLockingCounters::abort_count_offset();
2040   addi(addr_Reg, rtm_counters_Reg, counters_offs);
2041   const Register temp_Reg = rtm_counters_Reg;
2042 
2043   //atomic_inc_ptr(addr_Reg, temp_Reg); We don't increment atomically
2044   ldx(temp_Reg, addr_Reg);
2045   addi(temp_Reg, temp_Reg, 1);
2046   stdx(temp_Reg, addr_Reg);
2047 
2048   if (PrintPreciseRTMLockingStatistics) {
2049     int counters_offs_delta = RTMLockingCounters::abortX_count_offset() - counters_offs;
2050 
2051     //mftexasr(abort_status); done by caller
2052     for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
2053       counters_offs += counters_offs_delta;
2054       li(temp_Reg, counters_offs_delta); // can't use addi with R0
2055       add(addr_Reg, addr_Reg, temp_Reg); // point to next counter
2056       counters_offs_delta = sizeof(uintx);
2057 
2058       Label check_abort;
2059       rldicr_(temp_Reg, abort_status, tm_failure_bit[i], 0);
2060       if (tm_failure_inv[i]) {
2061         bne(CCR0, check_abort);
2062       } else {
2063         beq(CCR0, check_abort);
2064       }
2065       //atomic_inc_ptr(addr_Reg, temp_Reg); We don't increment atomically
2066       ldx(temp_Reg, addr_Reg);
2067       addi(temp_Reg, temp_Reg, 1);
2068       stdx(temp_Reg, addr_Reg);
2069       bind(check_abort);
2070     }
2071   }
2072   li(temp_Reg, -counters_offs); // can't use addi with R0
2073   add(rtm_counters_Reg, addr_Reg, temp_Reg); // restore
2074 }
2075 
2076 // Branch if ((random & (count-1)) != 0); count must be a power of 2.
2077 // tmp and CR0 are killed
2078 void MacroAssembler::branch_on_random_using_tb(Register tmp, int count, Label& brLabel) {
2079   mftb(tmp);
2080   andi_(tmp, tmp, count-1);
2081   bne(CCR0, brLabel);
2082 }
2083 
2084 // Perform abort ratio calculation, set no_rtm bit if high ratio.
2085 // input:  rtm_counters_Reg (RTMLockingCounters* address) - KILLED
2086 void MacroAssembler::rtm_abort_ratio_calculation(Register rtm_counters_Reg,
2087                                                  RTMLockingCounters* rtm_counters,
2088                                                  Metadata* method_data) {
2089   Label L_done, L_check_always_rtm1, L_check_always_rtm2;
2090 
2091   if (RTMLockingCalculationDelay > 0) {
2092     // Delay calculation.
2093     ld(rtm_counters_Reg, (RegisterOrConstant)(intptr_t)RTMLockingCounters::rtm_calculation_flag_addr());
2094     cmpdi(CCR0, rtm_counters_Reg, 0);
2095     beq(CCR0, L_done);
2096     load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload
2097   }
2098   // Abort ratio calculation only if abort_count > RTMAbortThreshold.
2099   //   Aborted transactions = abort_count * 100
2100   //   All transactions = total_count *  RTMTotalCountIncrRate
2101   //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
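  //   i.e. set no_rtm if (abort_count * 100 >= total_count * RTMTotalCountIncrRate * RTMAbortRatio).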
2102   ld(R0, RTMLockingCounters::abort_count_offset(), rtm_counters_Reg);
2103   cmpdi(CCR0, R0, RTMAbortThreshold);
2104   blt(CCR0, L_check_always_rtm2);
2105   mulli(R0, R0, 100);
2106 
2107   const Register tmpReg = rtm_counters_Reg;
2108   ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg);
2109   mulli(tmpReg, tmpReg, RTMTotalCountIncrRate);
2110   mulli(tmpReg, tmpReg, RTMAbortRatio);
2111   cmpd(CCR0, R0, tmpReg);
2112   blt(CCR0, L_check_always_rtm1); // jump to reload
2113   if (method_data != NULL) {
2114     // Set rtm_state to "no rtm" in MDO.
2115     // Not using a metadata relocation. Method and Class Loader are kept alive anyway.
2116     // (See nmethod::metadata_do and CodeBuffer::finalize_oop_references.)
2117     load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg);
2118     atomic_ori_int(R0, tmpReg, NoRTM);
2119   }
2120   b(L_done);
2121 
2122   bind(L_check_always_rtm1);
2123   load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload
2124   bind(L_check_always_rtm2);
2125   ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg);
2126   cmpdi(CCR0, tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
2127   blt(CCR0, L_done);
2128   if (method_data != NULL) {
2129     // Set rtm_state to "always rtm" in MDO.
2130     // Not using a metadata relocation. See above.
2131     load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg);
2132     atomic_ori_int(R0, tmpReg, UseRTM);
2133   }
2134   bind(L_done);
2135 }
2136 
2137 // Update counters and perform abort ratio calculation.
2138 // input: abort_status_Reg
2139 void MacroAssembler::rtm_profiling(Register abort_status_Reg, Register temp_Reg,
2140                                    RTMLockingCounters* rtm_counters,
2141                                    Metadata* method_data,
2142                                    bool profile_rtm) {
2143 
2144   assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2145   // Update rtm counters based on state at abort.
2146   // Reads abort_status_Reg, updates flags.
2147   assert_different_registers(abort_status_Reg, temp_Reg);
2148   load_const_optimized(temp_Reg, (address)rtm_counters, R0);
2149   rtm_counters_update(abort_status_Reg, temp_Reg);
2150   if (profile_rtm) {
2151     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2152     rtm_abort_ratio_calculation(temp_Reg, rtm_counters, method_data);
2153   }
2154 }
2155 
2156 // Retry on abort if abort's status indicates non-persistent failure.
2157 // inputs: retry_count_Reg
2158 //       : abort_status_Reg
2159 // output: retry_count_Reg decremented by 1
2160 void MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg,
2161                                              Label& retryLabel, Label* checkRetry) {
2162   Label doneRetry;
2163   rldicr_(R0, abort_status_Reg, tm_failure_persistent, 0);
2164   bne(CCR0, doneRetry);
2165   if (checkRetry) { bind(*checkRetry); }
2166   addic_(retry_count_Reg, retry_count_Reg, -1);
2167   blt(CCR0, doneRetry);
2168   smt_yield(); // Can't use wait(). No permission (SIGILL).
2169   b(retryLabel);
2170   bind(doneRetry);
2171 }
2172 
2173 // Spin and retry if lock is busy.
2174 // inputs: box_Reg (monitor address)
2175 //       : retry_count_Reg
2176 // output: retry_count_Reg decremented by 1
2177 // CTR is killed
2178 void MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register owner_addr_Reg, Label& retryLabel) {
2179   Label SpinLoop, doneRetry;
2180   addic_(retry_count_Reg, retry_count_Reg, -1);
2181   blt(CCR0, doneRetry);
2182   li(R0, RTMSpinLoopCount);
2183   mtctr(R0);
2184 
2185   bind(SpinLoop);
2186   smt_yield(); // Can't use waitrsv(). No permission (SIGILL).
2187   bdz(retryLabel);
2188   ld(R0, 0, owner_addr_Reg);
2189   cmpdi(CCR0, R0, 0);
2190   bne(CCR0, SpinLoop);
2191   b(retryLabel);
2192 
2193   bind(doneRetry);
2194 }
2195 
2196 // Use RTM for normal stack locks.
2197 // Input: objReg (object to lock)
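// Illustrative sketch of the speculative path emitted below (HTM via tbegin_/tend_/tabort_):
//   retry:
//     if (mark_word & monitor_value) goto IsInflated;                        // inflated, not handled here
//     if (!tbegin()) goto on_abort;                                          // start transaction
//     if ((obj->mark() & biased_lock_mask) == unlocked_value) goto DONE;     // speculatively "locked"
//     tend()/tabort(), profile the abort, maybe retry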
2198 void MacroAssembler::rtm_stack_locking(ConditionRegister flag,
2199                                        Register obj, Register mark_word, Register tmp,
2200                                        Register retry_on_abort_count_Reg,
2201                                        RTMLockingCounters* stack_rtm_counters,
2202                                        Metadata* method_data, bool profile_rtm,
2203                                        Label& DONE_LABEL, Label& IsInflated) {
2204   assert(UseRTMForStackLocks, "why call this otherwise?");
2205   assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
2206   Label L_rtm_retry, L_decrement_retry, L_on_abort;
2207 
2208   if (RTMRetryCount > 0) {
2209     load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
2210     bind(L_rtm_retry);
2211   }
2212   andi_(R0, mark_word, markOopDesc::monitor_value);  // inflated vs stack-locked|neutral|biased
2213   bne(CCR0, IsInflated);
2214 
2215   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2216     Label L_noincrement;
2217     if (RTMTotalCountIncrRate > 1) {
2218       branch_on_random_using_tb(tmp, (int)RTMTotalCountIncrRate, L_noincrement);
2219     }
2220     assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
2221     load_const_optimized(tmp, (address)stack_rtm_counters->total_count_addr(), R0);
2222     //atomic_inc_ptr(tmp, /*temp, will be reloaded*/mark_word); We don't increment atomically
2223     ldx(mark_word, tmp);
2224     addi(mark_word, mark_word, 1);
2225     stdx(mark_word, tmp);
2226     bind(L_noincrement);
2227   }
2228   tbegin_();
2229   beq(CCR0, L_on_abort);
2230   ld(mark_word, oopDesc::mark_offset_in_bytes(), obj);         // Reload in transaction, conflicts need to be tracked.
2231   andi(R0, mark_word, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
2232   cmpwi(flag, R0, markOopDesc::unlocked_value);                // bits = 001 unlocked
2233   beq(flag, DONE_LABEL);                                       // all done if unlocked
2234 
2235   if (UseRTMXendForLockBusy) {
2236     tend_();
2237     b(L_decrement_retry);
2238   } else {
2239     tabort_();
2240   }
2241   bind(L_on_abort);
2242   const Register abort_status_Reg = tmp;
2243   mftexasr(abort_status_Reg);
2244   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2245     rtm_profiling(abort_status_Reg, /*temp*/mark_word, stack_rtm_counters, method_data, profile_rtm);
2246   }
2247   ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // reload
2248   if (RTMRetryCount > 0) {
2249     // Retry on lock abort if abort status is not permanent.
2250     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry, &L_decrement_retry);
2251   } else {
2252     bind(L_decrement_retry);
2253   }
2254 }
2255 
2256 // Use RTM for inflating locks
2257 // inputs: obj       (object to lock)
2258 //         mark_word (current header - KILLED)
2259 //         boxReg    (on-stack box address (displaced header location) - KILLED)
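// Illustrative sketch of the speculative path emitted below:
//   box->displaced_header = box;             // any non-null value
//   retry:
//     if (!tbegin()) goto on_abort;
//     if (monitor->owner == NULL) goto DONE; // speculate on an unowned monitor
//     tend()/tabort(), profile the abort, maybe retry
//   fallback: CAS(&monitor->owner, NULL, current thread), see below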
2260 void MacroAssembler::rtm_inflated_locking(ConditionRegister flag,
2261                                           Register obj, Register mark_word, Register boxReg,
2262                                           Register retry_on_busy_count_Reg, Register retry_on_abort_count_Reg,
2263                                           RTMLockingCounters* rtm_counters,
2264                                           Metadata* method_data, bool profile_rtm,
2265                                           Label& DONE_LABEL) {
2266   assert(UseRTMLocking, "why call this otherwise?");
2267   Label L_rtm_retry, L_decrement_retry, L_on_abort;
2268   // Clean monitor_value bit to get valid pointer.
2269   int owner_offset = ObjectMonitor::owner_offset_in_bytes() - markOopDesc::monitor_value;
2270 
2271   // Store non-null, using boxReg instead of (intptr_t)markOopDesc::unused_mark().
2272   std(boxReg, BasicLock::displaced_header_offset_in_bytes(), boxReg);
2273   const Register tmpReg = boxReg;
2274   const Register owner_addr_Reg = mark_word;
2275   addi(owner_addr_Reg, mark_word, owner_offset);
2276 
2277   if (RTMRetryCount > 0) {
2278     load_const_optimized(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy.
2279     load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort.
2280     bind(L_rtm_retry);
2281   }
2282   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2283     Label L_noincrement;
2284     if (RTMTotalCountIncrRate > 1) {
2285       branch_on_random_using_tb(R0, (int)RTMTotalCountIncrRate, L_noincrement);
2286     }
2287     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2288     load_const(R0, (address)rtm_counters->total_count_addr(), tmpReg);
2289     //atomic_inc_ptr(R0, tmpReg); We don't increment atomically
2290     ldx(tmpReg, R0);
2291     addi(tmpReg, tmpReg, 1);
2292     stdx(tmpReg, R0);
2293     bind(L_noincrement);
2294   }
2295   tbegin_();
2296   beq(CCR0, L_on_abort);
2297   // We don't reload mark word. Will only be reset at safepoint.
2298   ld(R0, 0, owner_addr_Reg); // Load in transaction, conflicts need to be tracked.
2299   cmpdi(flag, R0, 0);
2300   beq(flag, DONE_LABEL);
2301 
2302   if (UseRTMXendForLockBusy) {
2303     tend_();
2304     b(L_decrement_retry);
2305   } else {
2306     tabort_();
2307   }
2308   bind(L_on_abort);
2309   const Register abort_status_Reg = tmpReg;
2310   mftexasr(abort_status_Reg);
2311   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2312     rtm_profiling(abort_status_Reg, /*temp*/ owner_addr_Reg, rtm_counters, method_data, profile_rtm);
2313     // Restore owner_addr_Reg
2314     ld(mark_word, oopDesc::mark_offset_in_bytes(), obj);
2315 #ifdef ASSERT
2316     andi_(R0, mark_word, markOopDesc::monitor_value);
2317     asm_assert_ne("must be inflated", 0xa754); // Deflating only allowed at safepoint.
2318 #endif
2319     addi(owner_addr_Reg, mark_word, owner_offset);
2320   }
2321   if (RTMRetryCount > 0) {
2322     // Retry on lock abort if abort status is not permanent.
2323     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
2324   }
2325 
2326   // Appears unlocked - try to swing _owner from null to non-null.
2327   cmpxchgd(flag, /*current val*/ R0, (intptr_t)0, /*new val*/ R16_thread, owner_addr_Reg,
2328            MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2329            MacroAssembler::cmpxchgx_hint_acquire_lock(), noreg, &L_decrement_retry, true);
2330 
2331   if (RTMRetryCount > 0) {
2332     // success done else retry
2333     b(DONE_LABEL);
2334     bind(L_decrement_retry);
2335     // Spin and retry if lock is busy.
2336     rtm_retry_lock_on_busy(retry_on_busy_count_Reg, owner_addr_Reg, L_rtm_retry);
2337   } else {
2338     bind(L_decrement_retry);
2339   }
2340 }
2341 
2342 #endif //  INCLUDE_RTM_OPT
2343 
2344 // "The box" is the space on the stack where we copy the object mark.
2345 void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register oop, Register box,
2346                                                Register temp, Register displaced_header, Register current_header,
2347                                                bool try_bias,
2348                                                RTMLockingCounters* rtm_counters,
2349                                                RTMLockingCounters* stack_rtm_counters,
2350                                                Metadata* method_data,
2351                                                bool use_rtm, bool profile_rtm) {
2352   assert_different_registers(oop, box, temp, displaced_header, current_header);
2353   assert(flag != CCR0, "bad condition register");
2354   Label cont;
2355   Label object_has_monitor;
2356   Label cas_failed;
2357 
2358   // Load markOop from object into displaced_header.
2359   ld(displaced_header, oopDesc::mark_offset_in_bytes(), oop);
2360 
2361 
2362   // Always do locking in runtime.
2363   if (EmitSync & 0x01) {
2364     cmpdi(flag, oop, 0); // Oop can't be 0 here => always false.
2365     return;
2366   }
2367 
2368   if (try_bias) {
2369     biased_locking_enter(flag, oop, displaced_header, temp, current_header, cont);
2370   }
2371 
2372 #if INCLUDE_RTM_OPT
2373   if (UseRTMForStackLocks && use_rtm) {
2374     rtm_stack_locking(flag, oop, displaced_header, temp, /*temp*/ current_header,
2375                       stack_rtm_counters, method_data, profile_rtm,
2376                       cont, object_has_monitor);
2377   }
2378 #endif // INCLUDE_RTM_OPT
2379 
2380   // Handle existing monitor.
2381   if ((EmitSync & 0x02) == 0) {
2382     // The object has an existing monitor iff (mark & monitor_value) != 0.
2383     andi_(temp, displaced_header, markOopDesc::monitor_value);
2384     bne(CCR0, object_has_monitor);
2385   }
2386 
2387   // Set displaced_header to be (markOop of object | UNLOCK_VALUE).
2388   ori(displaced_header, displaced_header, markOopDesc::unlocked_value);
2389 
2390   // Load Compare Value application register.
2391 
2392   // Initialize the box. (Must happen before we update the object mark!)
2393   std(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2394 
2395   // Must fence, otherwise, preceding store(s) may float below cmpxchg.
2396   // Compare object markOop with mark and if equal exchange scratch1 with object markOop.
2397   // CmpxchgX sets cr_reg to cmpX(current, displaced).
2398   membar(Assembler::StoreStore);
2399   cmpxchgd(/*flag=*/flag,
2400            /*current_value=*/current_header,
2401            /*compare_value=*/displaced_header,
2402            /*exchange_value=*/box,
2403            /*where=*/oop,
2404            MacroAssembler::MemBarAcq,
2405            MacroAssembler::cmpxchgx_hint_acquire_lock(),
2406            noreg,
2407            &cas_failed);
2408   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2409 
2410   // If the compare-and-exchange succeeded, then we found an unlocked
2411   // object and we have now locked it.
2412   b(cont);
2413 
2414   bind(cas_failed);
2415   // We did not see an unlocked object so try the fast recursive case.
2416 
2417   // Check if the owner is self by comparing the value in the markOop of object
2418   // (current_header) with the stack pointer.
2419   sub(current_header, current_header, R1_SP);
2420   load_const_optimized(temp, (address) (~(os::vm_page_size()-1) |
2421                                         markOopDesc::lock_mask_in_place));
2422 
2423   and_(R0/*==0?*/, current_header, temp);
2424   // If the condition is true we are done (cont) and hence we can store 0 as the
2425   // displaced header in the box, which indicates that it is a recursive lock.
2426   mcrf(flag,CCR0);
2427   std(R0/*==0, perhaps*/, BasicLock::displaced_header_offset_in_bytes(), box);
2428 
2429   // Handle existing monitor.
2430   if ((EmitSync & 0x02) == 0) {
2431     b(cont);
2432 
2433     bind(object_has_monitor);
2434     // The object's monitor m is unlocked iff m->owner == NULL,
2435     // otherwise m->owner may contain a thread or a stack address.
2436 
2437 #if INCLUDE_RTM_OPT
2438     // Use the same RTM locking code in 32- and 64-bit VM.
2439     if (use_rtm) {
2440       rtm_inflated_locking(flag, oop, displaced_header, box, temp, /*temp*/ current_header,
2441                            rtm_counters, method_data, profile_rtm, cont);
2442     } else {
2443 #endif // INCLUDE_RTM_OPT
2444 
2445     // Try to CAS m->owner from NULL to current thread.
2446     addi(temp, displaced_header, ObjectMonitor::owner_offset_in_bytes()-markOopDesc::monitor_value);
2447     li(displaced_header, 0);
2448     // CmpxchgX sets flag to cmpX(current, displaced).
2449     cmpxchgd(/*flag=*/flag,
2450              /*current_value=*/current_header,
2451              /*compare_value=*/(intptr_t)0,
2452              /*exchange_value=*/R16_thread,
2453              /*where=*/temp,
2454              MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2455              MacroAssembler::cmpxchgx_hint_acquire_lock());
2456 
2457     // Store a non-null value into the box.
2458     std(box, BasicLock::displaced_header_offset_in_bytes(), box);
2459 
2460 #   ifdef ASSERT
2461     bne(flag, cont);
2462     // We have acquired the monitor, check some invariants.
2463     addi(/*monitor=*/temp, temp, -ObjectMonitor::owner_offset_in_bytes());
2464     // Invariant 1: _recursions should be 0.
2465     //assert(ObjectMonitor::recursions_size_in_bytes() == 8, "unexpected size");
2466     asm_assert_mem8_is_zero(ObjectMonitor::recursions_offset_in_bytes(), temp,
2467                             "monitor->_recursions should be 0", -1);
2468     // Invariant 2: OwnerIsThread shouldn't be 0.
2469     //assert(ObjectMonitor::OwnerIsThread_size_in_bytes() == 4, "unexpected size");
2470     //asm_assert_mem4_isnot_zero(ObjectMonitor::OwnerIsThread_offset_in_bytes(), temp,
2471     //                           "monitor->OwnerIsThread shouldn't be 0", -1);
2472 #   endif
2473 
2474 #if INCLUDE_RTM_OPT
2475     } // use_rtm()
2476 #endif
2477   }
2478 
2479   bind(cont);
2480   // flag == EQ indicates success
2481   // flag == NE indicates failure
2482 }
2483 
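// Fast-unlock sketch (illustrative C-like pseudocode, ignoring biased locking and RTM;
// flag := EQ on success, NE means take the slow path):
//   dh = box->displaced_header;
//   if (dh == 0) goto done;                          // recursive unlock, nothing to restore
//   if (CAS(&obj->mark(), box, dh)) goto done;       // restore the displaced mark word
//   // inflated: if owner == self && recursions == 0 && EntryList == cxq == NULL,
//   //           release-store owner = NULL; otherwise leave flag == NE for the slow path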
2484 void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register oop, Register box,
2485                                                  Register temp, Register displaced_header, Register current_header,
2486                                                  bool try_bias, bool use_rtm) {
2487   assert_different_registers(oop, box, temp, displaced_header, current_header);
2488   assert(flag != CCR0, "bad condition register");
2489   Label cont;
2490   Label object_has_monitor;
2491 
2492   // Always do locking in runtime.
2493   if (EmitSync & 0x01) {
2494     cmpdi(flag, oop, 0); // Oop can't be 0 here => always false.
2495     return;
2496   }
2497 
2498   if (try_bias) {
2499     biased_locking_exit(flag, oop, current_header, cont);
2500   }
2501 
2502 #if INCLUDE_RTM_OPT
2503   if (UseRTMForStackLocks && use_rtm) {
2504     assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
2505     Label L_regular_unlock;
2506     ld(current_header, oopDesc::mark_offset_in_bytes(), oop);         // fetch markword
2507     andi(R0, current_header, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
2508     cmpwi(flag, R0, markOopDesc::unlocked_value);                     // bits = 001 unlocked
2509     bne(flag, L_regular_unlock);                                      // else RegularLock
2510     tend_();                                                          // otherwise end...
2511     b(cont);                                                          // ... and we're done
2512     bind(L_regular_unlock);
2513   }
2514 #endif
2515 
2516   // Find the lock address and load the displaced header from the stack.
2517   ld(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2518 
2519   // If the displaced header is 0, we have a recursive unlock.
2520   cmpdi(flag, displaced_header, 0);
2521   beq(flag, cont);
2522 
2523   // Handle existing monitor.
2524   if ((EmitSync & 0x02) == 0) {
2525     // The object has an existing monitor iff (mark & monitor_value) != 0.
2526     RTM_OPT_ONLY( if (!(UseRTMForStackLocks && use_rtm)) ) // skip load if already done
2527     ld(current_header, oopDesc::mark_offset_in_bytes(), oop);
2528     andi_(R0, current_header, markOopDesc::monitor_value);
2529     bne(CCR0, object_has_monitor);
2530   }
2531 
2532   // Check if it is still a lightweight lock; this is true if we see
2533   // the stack address of the basicLock in the markOop of the object.
2534   // Cmpxchg sets flag to cmpd(current_header, box).
2535   cmpxchgd(/*flag=*/flag,
2536            /*current_value=*/current_header,
2537            /*compare_value=*/box,
2538            /*exchange_value=*/displaced_header,
2539            /*where=*/oop,
2540            MacroAssembler::MemBarRel,
2541            MacroAssembler::cmpxchgx_hint_release_lock(),
2542            noreg,
2543            &cont);
2544 
2545   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2546 
2547   // Handle existing monitor.
2548   if ((EmitSync & 0x02) == 0) {
2549     b(cont);
2550 
2551     bind(object_has_monitor);
2552     addi(current_header, current_header, -markOopDesc::monitor_value); // monitor
2553     ld(temp,             ObjectMonitor::owner_offset_in_bytes(), current_header);
2554 
2555     // It's inflated.
2556 #if INCLUDE_RTM_OPT
2557     if (use_rtm) {
2558       Label L_regular_inflated_unlock;
2559       // Clean monitor_value bit to get valid pointer
2560       cmpdi(flag, temp, 0);
2561       bne(flag, L_regular_inflated_unlock);
2562       tend_();
2563       b(cont);
2564       bind(L_regular_inflated_unlock);
2565     }
2566 #endif
2567 
2568     ld(displaced_header, ObjectMonitor::recursions_offset_in_bytes(), current_header);
2569     xorr(temp, R16_thread, temp);      // Will be 0 if we are the owner.
2570     orr(temp, temp, displaced_header); // Will be 0 if there are 0 recursions.
2571     cmpdi(flag, temp, 0);
2572     bne(flag, cont);
2573 
2574     ld(temp,             ObjectMonitor::EntryList_offset_in_bytes(), current_header);
2575     ld(displaced_header, ObjectMonitor::cxq_offset_in_bytes(), current_header);
2576     orr(temp, temp, displaced_header); // Will be 0 if both are 0.
2577     cmpdi(flag, temp, 0);
2578     bne(flag, cont);
2579     release();
2580     std(temp, ObjectMonitor::owner_offset_in_bytes(), current_header);
2581   }
2582 
2583   bind(cont);
2584   // flag == EQ indicates success
2585   // flag == NE indicates failure
2586 }
2587 
2588 // Write serialization page so VM thread can do a pseudo remote membar.
2589 // We use the current thread pointer to calculate a thread specific
2590 // offset to write to within the page. This minimizes bus traffic
2591 // due to cache line collision.
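// In effect (illustrative): release-store one word at
//   serialize_page + ((thread >> serialize_page_shift_count) & (page_size - sizeof(int)))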
2592 void MacroAssembler::serialize_memory(Register thread, Register tmp1, Register tmp2) {
2593   srdi(tmp2, thread, os::get_serialize_page_shift_count());
2594 
2595   int mask = os::vm_page_size() - sizeof(int);
2596   if (Assembler::is_simm(mask, 16)) {
2597     andi(tmp2, tmp2, mask);
2598   } else {
2599     lis(tmp1, (int)((signed short) (mask >> 16)));
2600     ori(tmp1, tmp1, mask & 0x0000ffff);
2601     andr(tmp2, tmp2, tmp1);
2602   }
2603 
2604   load_const(tmp1, (long) os::get_memory_serialize_page());
2605   release();
2606   stwx(R0, tmp1, tmp2);
2607 }
2608 
2609 
2610 // GC barrier helper macros
2611 
2612 // Write the card table byte if needed.
2613 void MacroAssembler::card_write_barrier_post(Register Rstore_addr, Register Rnew_val, Register Rtmp) {
2614   CardTableModRefBS* bs =
2615     barrier_set_cast<CardTableModRefBS>(Universe::heap()->barrier_set());
2616   assert(bs->kind() == BarrierSet::CardTableModRef ||
2617          bs->kind() == BarrierSet::CardTableExtension, "wrong barrier");
2618 #ifdef ASSERT
2619   cmpdi(CCR0, Rnew_val, 0);
2620   asm_assert_ne("null oop not allowed", 0x321);
2621 #endif
2622   card_table_write(bs->byte_map_base, Rtmp, Rstore_addr);
2623 }
2624 
2625 // Write the card table byte.
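// In effect (illustrative): byte_map_base[Robj >> card_shift] = 0 /* dirty */;
// (preceded by a StoreStore barrier when running CMS).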
2626 void MacroAssembler::card_table_write(jbyte* byte_map_base, Register Rtmp, Register Robj) {
2627   assert_different_registers(Robj, Rtmp, R0);
2628   load_const_optimized(Rtmp, (address)byte_map_base, R0);
2629   srdi(Robj, Robj, CardTableModRefBS::card_shift);
2630   li(R0, 0); // dirty
2631   if (UseConcMarkSweepGC) membar(Assembler::StoreStore);
2632   stbx(R0, Rtmp, Robj);
2633 }
2634 
2635 #if INCLUDE_ALL_GCS
2636 // General G1 pre-barrier generator.
2637 // Goal: record the previous value if it is not null.
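// Illustrative C-like sketch of the SATB pre-barrier emitted below:
//   if (!thread->satb_mark_queue.active) return;
//   pre_val = (Robj != noreg) ? *(Robj + offset) : Rpre_val;                  // previous field value
//   if (pre_val == NULL) return;
//   if (queue.index != 0) { queue.index -= wordSize; *(queue.buf + queue.index) = pre_val; }
//   else                  { call g1_wb_pre(pre_val, thread); }                // runtime enqueue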
2638 void MacroAssembler::g1_write_barrier_pre(Register Robj, RegisterOrConstant offset, Register Rpre_val,
2639                                           Register Rtmp1, Register Rtmp2, bool needs_frame) {
2640   Label runtime, filtered;
2641 
2642   // Is marking active?
2643   if (in_bytes(PtrQueue::byte_width_of_active()) == 4) {
2644     lwz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + PtrQueue::byte_offset_of_active()), R16_thread);
2645   } else {
2646     guarantee(in_bytes(PtrQueue::byte_width_of_active()) == 1, "Assumption");
2647     lbz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + PtrQueue::byte_offset_of_active()), R16_thread);
2648   }
2649   cmpdi(CCR0, Rtmp1, 0);
2650   beq(CCR0, filtered);
2651 
2652   // Do we need to load the previous value?
2653   if (Robj != noreg) {
2654     // Load the previous value...
2655     if (UseCompressedOops) {
2656       lwz(Rpre_val, offset, Robj);
2657     } else {
2658       ld(Rpre_val, offset, Robj);
2659     }
2660     // Previous value has been loaded into Rpre_val.
2661   }
2662   assert(Rpre_val != noreg, "must have a real register");
2663 
2664   // Is the previous value null?
2665   cmpdi(CCR0, Rpre_val, 0);
2666   beq(CCR0, filtered);
2667 
2668   if (Robj != noreg && UseCompressedOops) {
2669     decode_heap_oop_not_null(Rpre_val);
2670   }
2671 
2672   // OK, it's not filtered, so we'll need to enqueue the previous value.
2673   // Try the thread-local SATB buffer first; only if that buffer is full
2674   // (index == 0, checked below) do we have to fall back to the runtime
2675   // call.
2676 
2677   // Can we store original value in the thread's buffer?
2678   // Is index == 0?
2679   // (The index field is typed as size_t.)
2680   const Register Rbuffer = Rtmp1, Rindex = Rtmp2;
2681 
2682   ld(Rindex, in_bytes(JavaThread::satb_mark_queue_offset() + PtrQueue::byte_offset_of_index()), R16_thread);
2683   cmpdi(CCR0, Rindex, 0);
2684   beq(CCR0, runtime); // If index == 0, goto runtime.
2685   ld(Rbuffer, in_bytes(JavaThread::satb_mark_queue_offset() + PtrQueue::byte_offset_of_buf()), R16_thread);
2686 
2687   addi(Rindex, Rindex, -wordSize); // Decrement index.
2688   std(Rindex, in_bytes(JavaThread::satb_mark_queue_offset() + PtrQueue::byte_offset_of_index()), R16_thread);
2689 
2690   // Record the previous value.
2691   stdx(Rpre_val, Rbuffer, Rindex);
2692   b(filtered);
2693 
2694   bind(runtime);
2695 
2696   // The VM call needs an ABI frame: save LR/CR and push a frame with a register argument save area.
2697   if (needs_frame) {
2698     save_LR_CR(Rtmp1);
2699     push_frame_reg_args(0, Rtmp2);
2700   }
2701 
2702   if (Rpre_val->is_volatile() && Robj == noreg) mr(R31, Rpre_val); // Save pre_val across C call if it was preloaded.
2703   call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), Rpre_val, R16_thread);
2704   if (Rpre_val->is_volatile() && Robj == noreg) mr(Rpre_val, R31); // restore
2705 
2706   if (needs_frame) {
2707     pop_frame();
2708     restore_LR_CR(Rtmp1);
2709   }
2710 
2711   bind(filtered);
2712 }
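
// Summary of the pre-barrier above in illustrative pseudocode (accessor names
// like index()/buf() are shorthand for the byte offsets used in the code):
//
//   if (thread->satb_mark_queue().active()) {
//     pre_val = (Robj != noreg) ? *(Robj + offset) : Rpre_val;   // decoded if compressed
//     if (pre_val != NULL) {
//       if (queue.index() != 0) {                 // room in the thread-local SATB buffer
//         queue.index() -= wordSize;
//         *(queue.buf() + queue.index()) = pre_val;
//       } else {
//         SharedRuntime::g1_wb_pre(pre_val, thread);   // slow path
//       }
//     }
//   }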
2713 
2714 // General G1 post-barrier generator
2715 // Store cross-region card.
2716 void MacroAssembler::g1_write_barrier_post(Register Rstore_addr, Register Rnew_val, Register Rtmp1, Register Rtmp2, Register Rtmp3, Label *filtered_ext) {
2717   Label runtime, filtered_int;
2718   Label& filtered = (filtered_ext != NULL) ? *filtered_ext : filtered_int;
2719   assert_different_registers(Rstore_addr, Rnew_val, Rtmp1, Rtmp2);
2720 
2721   G1SATBCardTableLoggingModRefBS* bs =
2722     barrier_set_cast<G1SATBCardTableLoggingModRefBS>(Universe::heap()->barrier_set());
2723 
2724   // Does store cross heap regions?
2725   if (G1RSBarrierRegionFilter) {
2726     xorr(Rtmp1, Rstore_addr, Rnew_val);
2727     srdi_(Rtmp1, Rtmp1, HeapRegion::LogOfHRGrainBytes);
2728     beq(CCR0, filtered);
2729   }
2730 
2731   // Crosses regions, storing NULL?
2732 #ifdef ASSERT
2733   cmpdi(CCR0, Rnew_val, 0);
2734   asm_assert_ne("null oop not allowed (G1)", 0x322); // Checked by caller on PPC64, so following branch is obsolete:
2735   //beq(CCR0, filtered);
2736 #endif
2737 
2738   // Storing region crossing non-NULL, is card already dirty?
2739   assert(sizeof(*bs->byte_map_base) == sizeof(jbyte), "adjust this code");
2740   const Register Rcard_addr = Rtmp1;
2741   Register Rbase = Rtmp2;
2742   load_const_optimized(Rbase, (address)bs->byte_map_base, /*temp*/ Rtmp3);
2743 
2744   srdi(Rcard_addr, Rstore_addr, CardTableModRefBS::card_shift);
2745 
2746   // Load the card value at address Rbase + Rcard_addr.
2747   lbzx(/*card value*/ Rtmp3, Rbase, Rcard_addr);
2748   cmpwi(CCR0, Rtmp3, (int)G1SATBCardTableModRefBS::g1_young_card_val());
2749   beq(CCR0, filtered);
2750 
2751   membar(Assembler::StoreLoad);
2752   lbzx(/*card value*/ Rtmp3, Rbase, Rcard_addr);  // Reload after membar.
2753   cmpwi(CCR0, Rtmp3 /* card value */, CardTableModRefBS::dirty_card_val());
2754   beq(CCR0, filtered);
2755 
2756   // Storing a region crossing, non-NULL oop, card is clean.
2757   // Dirty card and log.
2758   li(Rtmp3, CardTableModRefBS::dirty_card_val());
2759   //release(); // G1: oops are allowed to get visible after dirty marking.
2760   stbx(Rtmp3, Rbase, Rcard_addr);
2761 
2762   add(Rcard_addr, Rbase, Rcard_addr); // This is the address which needs to get enqueued.
2763   Rbase = noreg; // end of lifetime
2764 
2765   const Register Rqueue_index = Rtmp2,
2766                  Rqueue_buf   = Rtmp3;
2767   ld(Rqueue_index, in_bytes(JavaThread::dirty_card_queue_offset() + PtrQueue::byte_offset_of_index()), R16_thread);
2768   cmpdi(CCR0, Rqueue_index, 0);
2769   beq(CCR0, runtime); // If index == 0, go to runtime.
2770   ld(Rqueue_buf, in_bytes(JavaThread::dirty_card_queue_offset() + PtrQueue::byte_offset_of_buf()), R16_thread);
2771 
2772   addi(Rqueue_index, Rqueue_index, -wordSize); // decrement index
2773   std(Rqueue_index, in_bytes(JavaThread::dirty_card_queue_offset() + PtrQueue::byte_offset_of_index()), R16_thread);
2774 
2775   stdx(Rcard_addr, Rqueue_buf, Rqueue_index); // store card
2776   b(filtered);
2777 
2778   bind(runtime);
2779 
2780   // Save the live input values.
2781   call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), Rcard_addr, R16_thread);
2782 
2783   bind(filtered_int);
2784 }
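
// Summary of the post-barrier above in illustrative pseudocode (queue accessors
// are shorthand for the byte offsets used in the code):
//
//   if (((store_addr ^ new_val) >> LogOfHRGrainBytes) == 0) goto filtered;  // same region
//   jbyte* card = byte_map_base + (store_addr >> card_shift);
//   if (*card == g1_young_card_val()) goto filtered;
//   StoreLoad_barrier();
//   if (*card == dirty_card_val()) goto filtered;
//   *card = dirty_card_val();
//   if (queue.index() != 0) { queue.index() -= wordSize; *(queue.buf() + queue.index()) = card; }
//   else                    { SharedRuntime::g1_wb_post(card, thread); }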
2785 #endif // INCLUDE_ALL_GCS
2786 
2787 // Values for last_Java_pc and last_Java_sp must comply with the rules
2788 // in frame_ppc.hpp.
2789 void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) {
2790   // Always set last_Java_pc and flags first because once last_Java_sp
2791   // is visible, has_last_Java_frame is true and users will look at the
2792   // rest of the fields. (Note: flags should always be zero before we
2793   // get here, so they don't need to be set.)
2794 
2795   // Verify that last_Java_pc was zeroed on return to Java
2796   asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread,
2797                           "last_Java_pc not zeroed before leaving Java", 0x200);
2798 
2799   // When returning from calling out from Java mode, the frame anchor's
2800   // last_Java_pc will always be set to NULL. It is set here so that,
2801   // if we are doing a call to native (not VM) code, we capture the
2802   // known pc and don't have to rely on the native call having a
2803   // standard frame linkage where we can find the pc.
2804   if (last_Java_pc != noreg)
2805     std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
2806 
2807   // Set last_Java_sp last.
2808   std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
2809 }
2810 
2811 void MacroAssembler::reset_last_Java_frame(void) {
2812   asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()),
2813                              R16_thread, "SP was not set, still zero", 0x202);
2814 
2815   BLOCK_COMMENT("reset_last_Java_frame {");
2816   li(R0, 0);
2817 
2818   // _last_Java_sp = 0
2819   std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
2820 
2821   // _last_Java_pc = 0
2822   std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
2823   BLOCK_COMMENT("} reset_last_Java_frame");
2824 }
2825 
2826 void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1) {
2827   assert_different_registers(sp, tmp1);
2828 
2829   // sp points to a TOP_IJAVA_FRAME, retrieve frame's PC via
2830   // TOP_IJAVA_FRAME_ABI.
2831   // FIXME: assert that we really have a TOP_IJAVA_FRAME here!
2832 #ifdef CC_INTERP
2833   ld(tmp1/*pc*/, _top_ijava_frame_abi(frame_manager_lr), sp);
2834 #else
2835   address entry = pc();
2836   load_const_optimized(tmp1, entry);
2837 #endif
2838 
2839   set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1);
2840 }
2841 
2842 void MacroAssembler::get_vm_result(Register oop_result) {
2843   // Read:
2844   //   R16_thread
2845   //   R16_thread->in_bytes(JavaThread::vm_result_offset())
2846   //
2847   // Updated:
2848   //   oop_result
2849   //   R16_thread->in_bytes(JavaThread::vm_result_offset())
2850 
2851   verify_thread();
2852 
2853   ld(oop_result, in_bytes(JavaThread::vm_result_offset()), R16_thread);
2854   li(R0, 0);
2855   std(R0, in_bytes(JavaThread::vm_result_offset()), R16_thread);
2856 
2857   verify_oop(oop_result);
2858 }
2859 
2860 void MacroAssembler::get_vm_result_2(Register metadata_result) {
2861   // Read:
2862   //   R16_thread
2863   //   R16_thread->in_bytes(JavaThread::vm_result_2_offset())
2864   //
2865   // Updated:
2866   //   metadata_result
2867   //   R16_thread->in_bytes(JavaThread::vm_result_2_offset())
2868 
2869   ld(metadata_result, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
2870   li(R0, 0);
2871   std(R0, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
2872 }
2873 
2874 Register MacroAssembler::encode_klass_not_null(Register dst, Register src) {
2875   Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided.
2876   if (Universe::narrow_klass_base() != 0) {
2877     // Use dst as temp if it is free.
2878     sub_const_optimized(dst, current, Universe::narrow_klass_base(), R0);
2879     current = dst;
2880   }
2881   if (Universe::narrow_klass_shift() != 0) {
2882     srdi(dst, current, Universe::narrow_klass_shift());
2883     current = dst;
2884   }
2885   return current;
2886 }
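
// Illustratively, the encoding computed above is
//   narrow_klass = (klass - narrow_klass_base()) >> narrow_klass_shift(),
// where the subtraction and the shift are each omitted if base or shift is zero.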
2887 
2888 void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) {
2889   if (UseCompressedClassPointers) {
2890     Register compressedKlass = encode_klass_not_null(ck, klass);
2891     stw(compressedKlass, oopDesc::klass_offset_in_bytes(), dst_oop);
2892   } else {
2893     std(klass, oopDesc::klass_offset_in_bytes(), dst_oop);
2894   }
2895 }
2896 
2897 void MacroAssembler::store_klass_gap(Register dst_oop, Register val) {
2898   if (UseCompressedClassPointers) {
2899     if (val == noreg) {
2900       val = R0;
2901       li(val, 0);
2902     }
2903     stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop); // klass gap if compressed
2904   }
2905 }
2906 
2907 int MacroAssembler::instr_size_for_decode_klass_not_null() {
2908   if (!UseCompressedClassPointers) return 0;
2909   int num_instrs = 1;  // shift or move
2910   if (Universe::narrow_klass_base() != 0) num_instrs = 7;  // shift + load const + add
2911   return num_instrs * BytesPerInstWord;
2912 }
2913 
2914 void MacroAssembler::decode_klass_not_null(Register dst, Register src) {
2915   assert(dst != R0, "Dst reg may not be R0, as R0 is used here.");
2916   if (src == noreg) src = dst;
2917   Register shifted_src = src;
2918   if (Universe::narrow_klass_shift() != 0 ||
2919       (Universe::narrow_klass_base() == 0 && src != dst)) {  // Move required.
2920     shifted_src = dst;
2921     sldi(shifted_src, src, Universe::narrow_klass_shift());
2922   }
2923   if (Universe::narrow_klass_base() != 0) {
2924     add_const_optimized(dst, shifted_src, Universe::narrow_klass_base(), R0);
2925   }
2926 }
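
// Illustratively, the decoding computed above is the inverse of the encoding:
//   klass = (narrow_klass << narrow_klass_shift()) + narrow_klass_base(),
// again with the shift and the addition omitted when shift or base is zero.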
2927 
2928 void MacroAssembler::load_klass(Register dst, Register src) {
2929   if (UseCompressedClassPointers) {
2930     lwz(dst, oopDesc::klass_offset_in_bytes(), src);
2931     // Attention: no null check here!
2932     decode_klass_not_null(dst, dst);
2933   } else {
2934     ld(dst, oopDesc::klass_offset_in_bytes(), src);
2935   }
2936 }
2937 
2938 void MacroAssembler::load_klass_with_trap_null_check(Register dst, Register src) {
2939   if (!os::zero_page_read_protected()) {
2940     if (TrapBasedNullChecks) {
2941       trap_null_check(src);
2942     }
2943   }
2944   load_klass(dst, src);
2945 }
2946 
2947 void MacroAssembler::reinit_heapbase(Register d, Register tmp) {
2948   if (Universe::heap() != NULL) {
2949     load_const_optimized(R30, Universe::narrow_ptrs_base(), tmp);
2950   } else {
2951     // Heap not yet allocated. Load indirectly.
2952     int simm16_offset = load_const_optimized(R30, Universe::narrow_ptrs_base_addr(), tmp, true);
2953     ld(R30, simm16_offset, R30);
2954   }
2955 }
2956 
2957 // Clear Array
2958 // Kills both input registers. tmp == R0 is allowed.
2959 void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp) {
2960   // Procedure for large arrays (uses data cache block zero instruction).
2961     Label startloop, fast, fastloop, small_rest, restloop, done;
2962     const int cl_size         = VM_Version::get_cache_line_size(),
2963               cl_dwords       = cl_size>>3,
2964               cl_dw_addr_bits = exact_log2(cl_dwords),
2965               dcbz_min        = 1;                     // Min count of dcbz executions, needs to be >0.
2966 
2967 //2:
2968     cmpdi(CCR1, cnt_dwords, ((dcbz_min+1)<<cl_dw_addr_bits)-1); // Big enough? (ensure >=dcbz_min lines included).
2969     blt(CCR1, small_rest);                                      // Too small.
2970     rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits);           // Extract dword offset within first cache line.
2971     beq(CCR0, fast);                                            // Already 128byte aligned.
2972 
2973     subfic(tmp, tmp, cl_dwords);
2974     mtctr(tmp);                        // Set ctr to hit 128byte boundary (0<ctr<cl_dwords).
2975     subf(cnt_dwords, tmp, cnt_dwords); // rest.
2976     li(tmp, 0);
2977 //10:
2978   bind(startloop);                     // Clear at the beginning to reach 128byte boundary.
2979     std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
2980     addi(base_ptr, base_ptr, 8);
2981     bdnz(startloop);
2982 //13:
2983   bind(fast);                                  // Clear 128byte blocks.
2984     srdi(tmp, cnt_dwords, cl_dw_addr_bits);    // Loop count for 128byte loop (>0).
2985     andi(cnt_dwords, cnt_dwords, cl_dwords-1); // Rest in dwords.
2986     mtctr(tmp);                                // Load counter.
2987 //16:
2988   bind(fastloop);
2989     dcbz(base_ptr);                    // Clear 128byte aligned block.
2990     addi(base_ptr, base_ptr, cl_size);
2991     bdnz(fastloop);
2992     if (InsertEndGroupPPC64) { endgroup(); } else { nop(); }
2993 //20:
2994   bind(small_rest);
2995     cmpdi(CCR0, cnt_dwords, 0);        // size 0?
2996     beq(CCR0, done);                   // rest == 0
2997     li(tmp, 0);
2998     mtctr(cnt_dwords);                 // Load counter.
2999 //24:
3000   bind(restloop);                      // Clear rest.
3001     std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
3002     addi(base_ptr, base_ptr, 8);
3003     bdnz(restloop);
3004 //27:
3005   bind(done);
3006 }
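
// Overall structure of the routine above (illustrative pseudocode only):
//
//   if (cnt_dwords covers at least dcbz_min cache lines) {
//     store 8-byte zeros until base_ptr is cache-line aligned;   // startloop
//     zero one full cache line per iteration with dcbz;          // fastloop
//   }
//   store 8-byte zeros for the remaining dwords;                 // restloop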
3007 
3008 /////////////////////////////////////////// String intrinsics ////////////////////////////////////////////
3009 
3010 // Search for a single jchar in a jchar[].
3011 //
3012 // Assumes that result differs from all other registers.
3013 //
3014 // Haystack, needle are the addresses of jchar-arrays.
3015 // NeedleChar is needle[0] if it is known at compile time.
3016 // Haycnt is the length of the haystack. We assume haycnt >=1.
3017 //
3018 // Preserves haystack, haycnt, kills all other registers.
3019 //
3020 // If needle == R0, we search for the constant needleChar.
3021 void MacroAssembler::string_indexof_1(Register result, Register haystack, Register haycnt,
3022                                       Register needle, jchar needleChar,
3023                                       Register tmp1, Register tmp2) {
3024 
3025   assert_different_registers(result, haystack, haycnt, needle, tmp1, tmp2);
3026 
3027   Label L_InnerLoop, L_FinalCheck, L_Found1, L_Found2, L_Found3, L_NotFound, L_End;
3028   Register needle0 = needle, // Contains needle[0].
3029            addr = tmp1,
3030            ch1 = tmp2,
3031            ch2 = R0;
3032 
3033 //2 (variable) or 3 (const):
3034    if (needle != R0) lhz(needle0, 0, needle); // Preload needle character, needle has len==1.
3035    dcbtct(haystack, 0x00);                        // Indicate R/O access to haystack.
3036 
3037    srwi_(tmp2, haycnt, 1);   // Shift right by exact_log2(UNROLL_FACTOR).
3038    mr(addr, haystack);
3039    beq(CCR0, L_FinalCheck);
3040    mtctr(tmp2);              // Move to count register.
3041 //8:
3042   bind(L_InnerLoop);             // Main work horse (2x unrolled search loop).
3043    lhz(ch1, 0, addr);        // Load characters from haystack.
3044    lhz(ch2, 2, addr);
3045    (needle != R0) ? cmpw(CCR0, ch1, needle0) : cmplwi(CCR0, ch1, needleChar);
3046    (needle != R0) ? cmpw(CCR1, ch2, needle0) : cmplwi(CCR1, ch2, needleChar);
3047    beq(CCR0, L_Found1);   // Did we find the needle?
3048    beq(CCR1, L_Found2);
3049    addi(addr, addr, 4);
3050    bdnz(L_InnerLoop);
3051 //16:
3052   bind(L_FinalCheck);
3053    andi_(R0, haycnt, 1);
3054    beq(CCR0, L_NotFound);
3055    lhz(ch1, 0, addr);        // One position left at which we have to compare.
3056    (needle != R0) ? cmpw(CCR1, ch1, needle0) : cmplwi(CCR1, ch1, needleChar);
3057    beq(CCR1, L_Found3);
3058 //21:
3059   bind(L_NotFound);
3060    li(result, -1);           // Not found.
3061    b(L_End);
3062 
3063   bind(L_Found2);
3064    addi(addr, addr, 2);
3065 //24:
3066   bind(L_Found1);
3067   bind(L_Found3);                  // Return index ...
3068    subf(addr, haystack, addr); // relative to haystack,
3069    srdi(result, addr, 1);      // in characters.
3070   bind(L_End);
3071 }
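
// Semantics of the routine above (illustrative pseudocode): return the smallest
// i in [0, haycnt) with haystack[i] == needle character, or -1 if there is none.
// The main loop checks two characters per iteration; an odd trailing character
// is handled in L_FinalCheck.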
3072 
3073 
3074 // Implementation of IndexOf for jchar arrays.
3075 //
3076 // The lengths of haystack and needle are not constant, i.e. they are passed in registers.
3077 //
3078 // Preserves registers haystack, needle.
3079 // Kills registers haycnt, needlecnt.
3080 // Assumes that result differs from all other registers.
3081 // Haystack, needle are the addresses of jchar-arrays.
3082 // Haycnt, needlecnt are the lengths of them, respectively.
3083 //
3084 // Needlecntval must be zero or a 15-bit unsigned immediate greater than 1.
3085 void MacroAssembler::string_indexof(Register result, Register haystack, Register haycnt,
3086                                     Register needle, ciTypeArray* needle_values, Register needlecnt, int needlecntval,
3087                                     Register tmp1, Register tmp2, Register tmp3, Register tmp4) {
3088 
3089   // Ensure 0<needlecnt<=haycnt in ideal graph as prerequisite!
3090   Label L_TooShort, L_Found, L_NotFound, L_End;
3091   Register last_addr = haycnt, // Kill haycnt at the beginning.
3092            addr      = tmp1,
3093            n_start   = tmp2,
3094            ch1       = tmp3,
3095            ch2       = R0;
3096 
3097   // **************************************************************************************************
3098   // Prepare for main loop: optimized for needle count >=2, bail out otherwise.
3099   // **************************************************************************************************
3100 
3101 //1 (variable) or 3 (const):
3102    dcbtct(needle, 0x00);    // Indicate R/O access to needle.
3103    dcbtct(haystack, 0x00);  // Indicate R/O access to haystack.
3104 
3105   // Compute last haystack addr to use if no match gets found.
3106   if (needlecntval == 0) { // variable needlecnt
3107 //3:
3108    subf(ch1, needlecnt, haycnt);      // Last character index to compare is haycnt-needlecnt.
3109    addi(addr, haystack, -2);          // Accesses use pre-increment.
3110    cmpwi(CCR6, needlecnt, 2);
3111    blt(CCR6, L_TooShort);          // Variable needlecnt: handle short needle separately.
3112    slwi(ch1, ch1, 1);                 // Scale to number of bytes.
3113    lwz(n_start, 0, needle);           // Load first 2 characters of needle.
3114    add(last_addr, haystack, ch1);     // Point to last address to compare (haystack+2*(haycnt-needlecnt)).
3115    addi(needlecnt, needlecnt, -2);    // Rest of needle.
3116   } else { // constant needlecnt
3117    guarantee(needlecntval != 1, "IndexOf with single-character needle must be handled separately");
3118    assert((needlecntval & 0x7fff) == needlecntval, "wrong immediate");
3119 //5:
3120    addi(ch1, haycnt, -needlecntval);  // Last character index to compare is haycnt-needlecnt.
3121    lwz(n_start, 0, needle);           // Load first 2 characters of needle.
3122    addi(addr, haystack, -2);          // Accesses use pre-increment.
3123    slwi(ch1, ch1, 1);                 // Scale to number of bytes.
3124    add(last_addr, haystack, ch1);     // Point to last address to compare (haystack+2*(haycnt-needlecnt)).
3125    li(needlecnt, needlecntval-2);     // Rest of needle.
3126   }
3127 
3128   // Main Loop (now we have at least 3 characters).
3129 //11:
3130   Label L_OuterLoop, L_InnerLoop, L_FinalCheck, L_Comp1, L_Comp2, L_Comp3;
3131   bind(L_OuterLoop); // Search for 1st 2 characters.
3132   Register addr_diff = tmp4;
3133    subf(addr_diff, addr, last_addr); // Difference between already checked address and last address to check.
3134    addi(addr, addr, 2);              // This is the new address we want to use for comparing.
3135    srdi_(ch2, addr_diff, 2);
3136    beq(CCR0, L_FinalCheck);       // 2 characters left?
3137    mtctr(ch2);                       // addr_diff/4
3138 //16:
3139   bind(L_InnerLoop);                // Main work horse (2x unrolled search loop)
3140    lwz(ch1, 0, addr);           // Load 2 characters of haystack (ignore alignment).
3141    lwz(ch2, 2, addr);
3142    cmpw(CCR0, ch1, n_start); // Compare 2 characters (1 would be sufficient but try to reduce branches to CompLoop).
3143    cmpw(CCR1, ch2, n_start);
3144    beq(CCR0, L_Comp1);       // Did we find the needle start?
3145    beq(CCR1, L_Comp2);
3146    addi(addr, addr, 4);
3147    bdnz(L_InnerLoop);
3148 //24:
3149   bind(L_FinalCheck);
3150    rldicl_(addr_diff, addr_diff, 64-1, 63); // Remaining characters not covered by InnerLoop: (addr_diff>>1)&1.
3151    beq(CCR0, L_NotFound);
3152    lwz(ch1, 0, addr);                       // One position left at which we have to compare.
3153    cmpw(CCR1, ch1, n_start);
3154    beq(CCR1, L_Comp3);
3155 //29:
3156   bind(L_NotFound);
3157    li(result, -1); // not found
3158    b(L_End);
3159 
3160 
3161    // **************************************************************************************************
3162    // Special Case: unfortunately, the variable needle case can be called with needlecnt<2
3163    // **************************************************************************************************
3164 //31:
3165  if ((needlecntval >> 1) != 1) { // Skip this block for const needlecnt 2 or 3 to reduce code size.
3166   int nopcnt = 5;
3167   if (needlecntval != 0) ++nopcnt; // Balance alignment (other case: see below).
3168   if (needlecntval == 0) {         // We have to handle these cases separately.
3169   Label L_OneCharLoop;
3170   bind(L_TooShort);
3171    mtctr(haycnt);
3172    lhz(n_start, 0, needle);    // First character of needle
3173   bind(L_OneCharLoop);
3174    lhzu(ch1, 2, addr);
3175    cmpw(CCR1, ch1, n_start);
3176    beq(CCR1, L_Found);      // Did we find the one character needle?
3177    bdnz(L_OneCharLoop);
3178    li(result, -1);             // Not found.
3179    b(L_End);
3180   } // 8 instructions, so no impact on alignment.
3181   for (int x = 0; x < nopcnt; ++x) nop();
3182  }
3183 
3184   // **************************************************************************************************
3185   // Regular Case Part II: compare rest of needle (first 2 characters have been compared already)
3186   // **************************************************************************************************
3187 
3188   // Compare the rest
3189 //36 if needlecntval==0, else 37:
3190   bind(L_Comp2);
3191    addi(addr, addr, 2); // First comparison has failed, 2nd one hit.
3192   bind(L_Comp1);            // Addr points to possible needle start.
3193   bind(L_Comp3);            // Could have created a copy and used a different return address, but saving code size here.
3194   if (needlecntval != 2) {  // Const needlecnt==2?
3195    if (needlecntval != 3) {
3196     if (needlecntval == 0) beq(CCR6, L_Found); // Variable needlecnt==2?
3197     Register ind_reg = tmp4;
3198     li(ind_reg, 2*2);   // First 2 characters are already compared, use index 2.
3199     mtctr(needlecnt);   // Decremented by 2, still > 0.
3200 //40:
3201    Label L_CompLoop;
3202    bind(L_CompLoop);
3203     lhzx(ch2, needle, ind_reg);
3204     lhzx(ch1, addr, ind_reg);
3205     cmpw(CCR1, ch1, ch2);
3206     bne(CCR1, L_OuterLoop);
3207     addi(ind_reg, ind_reg, 2);
3208     bdnz(L_CompLoop);
3209    } else { // No loop required if there's only one needle character left.
3210     lhz(ch2, 2*2, needle);
3211     lhz(ch1, 2*2, addr);
3212     cmpw(CCR1, ch1, ch2);
3213     bne(CCR1, L_OuterLoop);
3214    }
3215   }
3216   // Return index ...
3217 //46:
3218   bind(L_Found);
3219    subf(addr, haystack, addr); // relative to haystack, ...
3220    srdi(result, addr, 1);      // in characters.
3221 //48:
3222   bind(L_End);
3223 }
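
// Semantics of the routine above, ignoring the unrolling and the separate
// single-character path (illustrative pseudocode only):
//
//   for (int i = 0; i + needlecnt <= haycnt; i++) {
//     int j = 0;
//     while (j < needlecnt && haystack[i + j] == needle[j]) j++;
//     if (j == needlecnt) return i;
//   }
//   return -1;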
3224 
3225 // Implementation of Compare for jchar arrays.
3226 //
3227 // Kills the registers str1, str2, cnt1, cnt2.
3228 // Kills cr0, ctr.
3229 // Assumes that result differs from the input registers.
3230 void MacroAssembler::string_compare(Register str1_reg, Register str2_reg, Register cnt1_reg, Register cnt2_reg,
3231                                     Register result_reg, Register tmp_reg) {
3232    assert_different_registers(result_reg, str1_reg, str2_reg, cnt1_reg, cnt2_reg, tmp_reg);
3233 
3234    Label Ldone, Lslow_case, Lslow_loop, Lfast_loop;
3235    Register cnt_diff = R0,
3236             limit_reg = cnt1_reg,
3237             chr1_reg = result_reg,
3238             chr2_reg = cnt2_reg,
3239             addr_diff = str2_reg;
3240 
3241    // Offset 0 should be 32 byte aligned.
3242 //-4:
3243     dcbtct(str1_reg, 0x00);  // Indicate R/O access to str1.
3244     dcbtct(str2_reg, 0x00);  // Indicate R/O access to str2.
3245 //-2:
3246    // Compute min(cnt1, cnt2) and check if 0 (bail out if we don't need to compare characters).
3247     subf(result_reg, cnt2_reg, cnt1_reg);  // difference between cnt1/2
3248     subf_(addr_diff, str1_reg, str2_reg);  // alias?
3249     beq(CCR0, Ldone);                   // return cnt difference if both strings are at the same address
3250     srawi(limit_reg, result_reg, 31);      // generate signmask (cnt1/2 must be non-negative so cnt_diff can't overflow)
3251     mr(cnt_diff, result_reg);
3252     andr(limit_reg, result_reg, limit_reg); // difference or zero (negative): cnt1<cnt2 ? cnt1-cnt2 : 0
3253     add_(limit_reg, cnt2_reg, limit_reg);  // min(cnt1, cnt2)==0?
3254     beq(CCR0, Ldone);                   // return cnt difference if one has 0 length
3255 
3256     lhz(chr1_reg, 0, str1_reg);            // optional: early out if first characters mismatch
3257     lhzx(chr2_reg, str1_reg, addr_diff);   // optional: early out if first characters mismatch
3258     addi(tmp_reg, limit_reg, -1);          // min(cnt1, cnt2)-1
3259     subf_(result_reg, chr2_reg, chr1_reg); // optional: early out if first characters mismatch
3260     bne(CCR0, Ldone);                   // optional: early out if first characters mismatch
3261 
3262    // Set loop counter by scaling down tmp_reg
3263     srawi_(chr2_reg, tmp_reg, exact_log2(4)); // (min(cnt1, cnt2)-1)/4
3264     ble(CCR0, Lslow_case);                 // need >4 characters for fast loop
3265     andi(limit_reg, tmp_reg, 4-1);            // remaining characters
3266 
3267    // Adapt str1_reg str2_reg for the first loop iteration
3268     mtctr(chr2_reg);                 // (min(cnt1, cnt2)-1)/4
3269     addi(limit_reg, limit_reg, 4+1); // compare last 5-8 characters in slow_case if mismatch found in fast_loop
3270 //16:
3271    // Compare the rest of the characters
3272    bind(Lfast_loop);
3273     ld(chr1_reg, 0, str1_reg);
3274     ldx(chr2_reg, str1_reg, addr_diff);
3275     cmpd(CCR0, chr2_reg, chr1_reg);
3276     bne(CCR0, Lslow_case); // return chr1_reg
3277     addi(str1_reg, str1_reg, 4*2);
3278     bdnz(Lfast_loop);
3279     addi(limit_reg, limit_reg, -4); // no mismatch found in fast_loop, only 1-4 characters missing
3280 //23:
3281    bind(Lslow_case);
3282     mtctr(limit_reg);
3283 //24:
3284    bind(Lslow_loop);
3285     lhz(chr1_reg, 0, str1_reg);
3286     lhzx(chr2_reg, str1_reg, addr_diff);
3287     subf_(result_reg, chr2_reg, chr1_reg);
3288     bne(CCR0, Ldone); // return chr1_reg
3289     addi(str1_reg, str1_reg, 1*2);
3290     bdnz(Lslow_loop);
3291 //30:
3292    // If strings are equal up to min length, return the length difference.
3293     mr(result_reg, cnt_diff);
3294     nop(); // alignment
3295 //32:
3296    // Otherwise, return the difference between the first mismatched chars.
3297    bind(Ldone);
3298 }
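
// Semantics of the routine above (illustrative pseudocode):
//
//   int lim = min(cnt1, cnt2);
//   for (int i = 0; i < lim; i++) {
//     if (str1[i] != str2[i]) return str1[i] - str2[i];
//   }
//   return cnt1 - cnt2;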
3299 
3300 
3301 // Compare char[] arrays.
3302 //
3303 // str1_reg   USE only
3304 // str2_reg   USE only
3305 // cnt_reg    USE_DEF, due to tmp reg shortage
3306 // result_reg DEF only, might compromise USE only registers
3307 void MacroAssembler::char_arrays_equals(Register str1_reg, Register str2_reg, Register cnt_reg, Register result_reg,
3308                                         Register tmp1_reg, Register tmp2_reg, Register tmp3_reg, Register tmp4_reg,
3309                                         Register tmp5_reg) {
3310 
3311   // Str1 may be the same register as str2 which can occur e.g. after scalar replacement.
3312   assert_different_registers(result_reg, str1_reg, cnt_reg, tmp1_reg, tmp2_reg, tmp3_reg, tmp4_reg, tmp5_reg);
3313   assert_different_registers(result_reg, str2_reg, cnt_reg, tmp1_reg, tmp2_reg, tmp3_reg, tmp4_reg, tmp5_reg);
3314 
3315   // Offset 0 should be 32 byte aligned.
3316   Label Linit_cbc, Lcbc, Lloop, Ldone_true, Ldone_false;
3317   Register index_reg = tmp5_reg;
3318   Register cbc_iter  = tmp4_reg;
3319 
3320 //-1:
3321   dcbtct(str1_reg, 0x00);  // Indicate R/O access to str1.
3322   dcbtct(str2_reg, 0x00);  // Indicate R/O access to str2.
3323 //1:
3324   andi(cbc_iter, cnt_reg, 4-1);            // Remaining iterations after 4 java characters per iteration loop.
3325   li(index_reg, 0); // init
3326   li(result_reg, 0); // assume false
3327   srwi_(tmp2_reg, cnt_reg, exact_log2(4)); // Div: 4 java characters per iteration (main loop).
3328 
3329   cmpwi(CCR1, cbc_iter, 0);             // CCR1 = (cbc_iter==0)
3330   beq(CCR0, Linit_cbc);                 // too short
3331     mtctr(tmp2_reg);
3332 //8:
3333     bind(Lloop);
3334       ldx(tmp1_reg, str1_reg, index_reg);
3335       ldx(tmp2_reg, str2_reg, index_reg);
3336       cmpd(CCR0, tmp1_reg, tmp2_reg);
3337       bne(CCR0, Ldone_false);  // Unequal char pair found -> done.
3338       addi(index_reg, index_reg, 4*sizeof(jchar));
3339       bdnz(Lloop);
3340 //14:
3341   bind(Linit_cbc);
3342   beq(CCR1, Ldone_true);
3343     mtctr(cbc_iter);
3344 //16:
3345     bind(Lcbc);
3346       lhzx(tmp1_reg, str1_reg, index_reg);
3347       lhzx(tmp2_reg, str2_reg, index_reg);
3348       cmpw(CCR0, tmp1_reg, tmp2_reg);
3349       bne(CCR0, Ldone_false);  // Unequal char pair found -> done.
3350       addi(index_reg, index_reg, 1*sizeof(jchar));
3351       bdnz(Lcbc);
3352     nop();
3353   bind(Ldone_true);
3354   li(result_reg, 1);
3355 //24:
3356   bind(Ldone_false);
3357 }
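
// Semantics of the routine above (illustrative pseudocode): result_reg = 1 if the
// first cnt_reg jchars of str1 and str2 are pairwise equal, else 0. The main loop
// compares four jchars (one 8-byte word) per iteration, the tail loop one at a time.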
3358 
3359 
3360 void MacroAssembler::char_arrays_equalsImm(Register str1_reg, Register str2_reg, int cntval, Register result_reg,
3361                                            Register tmp1_reg, Register tmp2_reg) {
3362   // Str1 may be the same register as str2 which can occur e.g. after scalar replacement.
3363   assert_different_registers(result_reg, str1_reg, tmp1_reg, tmp2_reg);
3364   assert_different_registers(result_reg, str2_reg, tmp1_reg, tmp2_reg);
3365   assert(sizeof(jchar) == 2, "must be");
3366   assert(cntval >= 0 && ((cntval & 0x7fff) == cntval), "wrong immediate");
3367 
3368   Label Ldone_false;
3369 
3370   if (cntval < 16) { // short case
3371     if (cntval != 0) li(result_reg, 0); // assume false
3372 
3373     const int num_bytes = cntval*sizeof(jchar);
3374     int index = 0;
3375     for (int next_index; (next_index = index + 8) <= num_bytes; index = next_index) {
3376       ld(tmp1_reg, index, str1_reg);
3377       ld(tmp2_reg, index, str2_reg);
3378       cmpd(CCR0, tmp1_reg, tmp2_reg);
3379       bne(CCR0, Ldone_false);
3380     }
3381     if (cntval & 2) {
3382       lwz(tmp1_reg, index, str1_reg);
3383       lwz(tmp2_reg, index, str2_reg);
3384       cmpw(CCR0, tmp1_reg, tmp2_reg);
3385       bne(CCR0, Ldone_false);
3386       index += 4;
3387     }
3388     if (cntval & 1) {
3389       lhz(tmp1_reg, index, str1_reg);
3390       lhz(tmp2_reg, index, str2_reg);
3391       cmpw(CCR0, tmp1_reg, tmp2_reg);
3392       bne(CCR0, Ldone_false);
3393     }
3394     // fallthrough: true
3395   } else {
3396     Label Lloop;
3397     Register index_reg = tmp1_reg;
3398     const int loopcnt = cntval/4;
3399     assert(loopcnt > 0, "must be");
3400     // Offset 0 should be 32 byte aligned.
3401     //2:
3402     dcbtct(str1_reg, 0x00);  // Indicate R/O access to str1.
3403     dcbtct(str2_reg, 0x00);  // Indicate R/O access to str2.
3404     li(tmp2_reg, loopcnt);
3405     li(index_reg, 0); // init
3406     li(result_reg, 0); // assume false
3407     mtctr(tmp2_reg);
3408     //8:
3409     bind(Lloop);
3410     ldx(R0, str1_reg, index_reg);
3411     ldx(tmp2_reg, str2_reg, index_reg);
3412     cmpd(CCR0, R0, tmp2_reg);
3413     bne(CCR0, Ldone_false);  // Unequal char pair found -> done.
3414     addi(index_reg, index_reg, 4*sizeof(jchar));
3415     bdnz(Lloop);
3416     //14:
3417     if (cntval & 2) {
3418       lwzx(R0, str1_reg, index_reg);
3419       lwzx(tmp2_reg, str2_reg, index_reg);
3420       cmpw(CCR0, R0, tmp2_reg);
3421       bne(CCR0, Ldone_false);
3422       if (cntval & 1) addi(index_reg, index_reg, 2*sizeof(jchar));
3423     }
3424     if (cntval & 1) {
3425       lhzx(R0, str1_reg, index_reg);
3426       lhzx(tmp2_reg, str2_reg, index_reg);
3427       cmpw(CCR0, R0, tmp2_reg);
3428       bne(CCR0, Ldone_false);
3429     }
3430     // fallthrough: true
3431   }
3432   li(result_reg, 1);
3433   bind(Ldone_false);
3434 }
3435 
3436 // dest_lo += src1 + src2
3437 // dest_hi += carry from (dest_lo + src1) + carry from (dest_lo + src2)
3438 void MacroAssembler::add2_with_carry(Register dest_hi,
3439                                      Register dest_lo,
3440                                      Register src1, Register src2) {
3441   li(R0, 0);
3442   addc(dest_lo, dest_lo, src1);
3443   adde(dest_hi, dest_hi, R0);
3444   addc(dest_lo, dest_lo, src2);
3445   adde(dest_hi, dest_hi, R0);
3446 }
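
// Viewing (dest_hi:dest_lo) as an unsigned 128-bit accumulator, the sequence above
// is, illustratively:
//   (dest_hi:dest_lo) += (unsigned __int128)src1;
//   (dest_hi:dest_lo) += (unsigned __int128)src2;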
3447 
3448 // Multiply 64 bit by 64 bit first loop.
3449 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart,
3450                                            Register x_xstart,
3451                                            Register y, Register y_idx,
3452                                            Register z,
3453                                            Register carry,
3454                                            Register product_high, Register product,
3455                                            Register idx, Register kdx,
3456                                            Register tmp) {
3457   //  jlong carry, x[], y[], z[];
3458   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
3459   //    huge_128 product = y[idx] * x[xstart] + carry;
3460   //    z[kdx] = (jlong)product;
3461   //    carry  = (jlong)(product >>> 64);
3462   //  }
3463   //  z[xstart] = carry;
3464 
3465   Label L_first_loop, L_first_loop_exit;
3466   Label L_one_x, L_one_y, L_multiply;
3467 
3468   addic_(xstart, xstart, -1);
3469   blt(CCR0, L_one_x);   // Special case: length of x is 1.
3470 
3471   // Load next two integers of x.
3472   sldi(tmp, xstart, LogBytesPerInt);
3473   ldx(x_xstart, x, tmp);
3474 #ifdef VM_LITTLE_ENDIAN
3475   rldicl(x_xstart, x_xstart, 32, 0);
3476 #endif
3477 
3478   align(32, 16);
3479   bind(L_first_loop);
3480 
3481   cmpdi(CCR0, idx, 1);
3482   blt(CCR0, L_first_loop_exit);
3483   addi(idx, idx, -2);
3484   beq(CCR0, L_one_y);
3485 
3486   // Load next two integers of y.
3487   sldi(tmp, idx, LogBytesPerInt);
3488   ldx(y_idx, y, tmp);
3489 #ifdef VM_LITTLE_ENDIAN
3490   rldicl(y_idx, y_idx, 32, 0);
3491 #endif
3492 
3493 
3494   bind(L_multiply);
3495   multiply64(product_high, product, x_xstart, y_idx);
3496 
3497   li(tmp, 0);
3498   addc(product, product, carry);         // Add carry to result.
3499   adde(product_high, product_high, tmp); // Add carry of the last addition.
3500   addi(kdx, kdx, -2);
3501 
3502   // Store result.
3503 #ifdef VM_LITTLE_ENDIAN
3504   rldicl(product, product, 32, 0);
3505 #endif
3506   sldi(tmp, kdx, LogBytesPerInt);
3507   stdx(product, z, tmp);
3508   mr_if_needed(carry, product_high);
3509   b(L_first_loop);
3510 
3511 
3512   bind(L_one_y); // Load one 32 bit portion of y as (0,value).
3513 
3514   lwz(y_idx, 0, y);
3515   b(L_multiply);
3516 
3517 
3518   bind( L_one_x ); // Load one 32 bit portion of x as (0,value).
3519 
3520   lwz(x_xstart, 0, x);
3521   b(L_first_loop);
3522 
3523   bind(L_first_loop_exit);
3524 }
3525 
3526 // Multiply 64 bit by 64 bit and add 128 bit.
3527 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y,
3528                                             Register z, Register yz_idx,
3529                                             Register idx, Register carry,
3530                                             Register product_high, Register product,
3531                                             Register tmp, int offset) {
3532 
3533   //  huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
3534   //  z[kdx] = (jlong)product;
3535 
3536   sldi(tmp, idx, LogBytesPerInt);
3537   if ( offset ) {
3538     addi(tmp, tmp, offset);
3539   }
3540   ldx(yz_idx, y, tmp);
3541 #ifdef VM_LITTLE_ENDIAN
3542   rldicl(yz_idx, yz_idx, 32, 0);
3543 #endif
3544 
3545   multiply64(product_high, product, x_xstart, yz_idx);
3546   ldx(yz_idx, z, tmp);
3547 #ifdef VM_LITTLE_ENDIAN
3548   rldicl(yz_idx, yz_idx, 32, 0);
3549 #endif
3550 
3551   add2_with_carry(product_high, product, carry, yz_idx);
3552 
3553   sldi(tmp, idx, LogBytesPerInt);
3554   if ( offset ) {
3555     addi(tmp, tmp, offset);
3556   }
3557 #ifdef VM_LITTLE_ENDIAN
3558   rldicl(product, product, 32, 0);
3559 #endif
3560   stdx(product, z, tmp);
3561 }
3562 
3563 // Multiply 128 bit by 128 bit. Unrolled inner loop.
3564 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart,
3565                                              Register y, Register z,
3566                                              Register yz_idx, Register idx, Register carry,
3567                                              Register product_high, Register product,
3568                                              Register carry2, Register tmp) {
3569 
3570   //  jlong carry, x[], y[], z[];
3571   //  int kdx = ystart+1;
3572   //  for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
3573   //    huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
3574   //    z[kdx+idx+1] = (jlong)product;
3575   //    jlong carry2 = (jlong)(product >>> 64);
3576   //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
3577   //    z[kdx+idx] = (jlong)product;
3578   //    carry = (jlong)(product >>> 64);
3579   //  }
3580   //  idx += 2;
3581   //  if (idx > 0) {
3582   //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
3583   //    z[kdx+idx] = (jlong)product;
3584   //    carry = (jlong)(product >>> 64);
3585   //  }
3586 
3587   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
3588   const Register jdx = R0;
3589 
3590   // Scale the index.
3591   srdi_(jdx, idx, 2);
3592   beq(CCR0, L_third_loop_exit);
3593   mtctr(jdx);
3594 
3595   align(32, 16);
3596   bind(L_third_loop);
3597 
3598   addi(idx, idx, -4);
3599 
3600   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8);
3601   mr_if_needed(carry2, product_high);
3602 
3603   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0);
3604   mr_if_needed(carry, product_high);
3605   bdnz(L_third_loop);
3606 
3607   bind(L_third_loop_exit);  // Handle any left-over operand parts.
3608 
3609   andi_(idx, idx, 0x3);
3610   beq(CCR0, L_post_third_loop_done);
3611 
3612   Label L_check_1;
3613 
3614   addic_(idx, idx, -2);
3615   blt(CCR0, L_check_1);
3616 
3617   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0);
3618   mr_if_needed(carry, product_high);
3619 
3620   bind(L_check_1);
3621 
3622   addi(idx, idx, 0x2);
3623   andi_(idx, idx, 0x1) ;
3624   addic_(idx, idx, -1);
3625   blt(CCR0, L_post_third_loop_done);
3626 
3627   sldi(tmp, idx, LogBytesPerInt);
3628   lwzx(yz_idx, y, tmp);
3629   multiply64(product_high, product, x_xstart, yz_idx);
3630   lwzx(yz_idx, z, tmp);
3631 
3632   add2_with_carry(product_high, product, yz_idx, carry);
3633 
3634   sldi(tmp, idx, LogBytesPerInt);
3635   stwx(product, z, tmp);
3636   srdi(product, product, 32);
3637 
3638   sldi(product_high, product_high, 32);
3639   orr(product, product, product_high);
3640   mr_if_needed(carry, product);
3641 
3642   bind(L_post_third_loop_done);
3643 }   // multiply_128_x_128_loop
3644 
3645 void MacroAssembler::multiply_to_len(Register x, Register xlen,
3646                                      Register y, Register ylen,
3647                                      Register z, Register zlen,
3648                                      Register tmp1, Register tmp2,
3649                                      Register tmp3, Register tmp4,
3650                                      Register tmp5, Register tmp6,
3651                                      Register tmp7, Register tmp8,
3652                                      Register tmp9, Register tmp10,
3653                                      Register tmp11, Register tmp12,
3654                                      Register tmp13) {
3655 
3656   ShortBranchVerifier sbv(this);
3657 
3658   assert_different_registers(x, xlen, y, ylen, z, zlen,
3659                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
3660   assert_different_registers(x, xlen, y, ylen, z, zlen,
3661                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp7);
3662   assert_different_registers(x, xlen, y, ylen, z, zlen,
3663                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp8);
3664 
3665   const Register idx = tmp1;
3666   const Register kdx = tmp2;
3667   const Register xstart = tmp3;
3668 
3669   const Register y_idx = tmp4;
3670   const Register carry = tmp5;
3671   const Register product = tmp6;
3672   const Register product_high = tmp7;
3673   const Register x_xstart = tmp8;
3674   const Register tmp = tmp9;
3675 
3676   // First Loop.
3677   //
3678   //  final static long LONG_MASK = 0xffffffffL;
3679   //  int xstart = xlen - 1;
3680   //  int ystart = ylen - 1;
3681   //  long carry = 0;
3682   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
3683   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
3684   //    z[kdx] = (int)product;
3685   //    carry = product >>> 32;
3686   //  }
3687   //  z[xstart] = (int)carry;
3688 
3689   mr_if_needed(idx, ylen);        // idx = ylen
3690   mr_if_needed(kdx, zlen);        // kdx = xlen + ylen
3691   li(carry, 0);                   // carry = 0
3692 
3693   Label L_done;
3694 
3695   addic_(xstart, xlen, -1);
3696   blt(CCR0, L_done);
3697 
3698   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z,
3699                         carry, product_high, product, idx, kdx, tmp);
3700 
3701   Label L_second_loop;
3702 
3703   cmpdi(CCR0, kdx, 0);
3704   beq(CCR0, L_second_loop);
3705 
3706   Label L_carry;
3707 
3708   addic_(kdx, kdx, -1);
3709   beq(CCR0, L_carry);
3710 
3711   // Store lower 32 bits of carry.
3712   sldi(tmp, kdx, LogBytesPerInt);
3713   stwx(carry, z, tmp);
3714   srdi(carry, carry, 32);
3715   addi(kdx, kdx, -1);
3716 
3717 
3718   bind(L_carry);
3719 
3720   // Store upper 32 bits of carry.
3721   sldi(tmp, kdx, LogBytesPerInt);
3722   stwx(carry, z, tmp);
3723 
3724   // Second and third (nested) loops.
3725   //
3726   //  for (int i = xstart-1; i >= 0; i--) { // Second loop
3727   //    carry = 0;
3728   //    for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
3729   //      long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
3730   //                     (z[k] & LONG_MASK) + carry;
3731   //      z[k] = (int)product;
3732   //      carry = product >>> 32;
3733   //    }
3734   //    z[i] = (int)carry;
3735   //  }
3736   //
3737   //  i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = x_xstart
3738 
3739   bind(L_second_loop);
3740 
3741   li(carry, 0);                   // carry = 0;
3742 
3743   addic_(xstart, xstart, -1);     // i = xstart-1;
3744   blt(CCR0, L_done);
3745 
3746   Register zsave = tmp10;
3747 
3748   mr(zsave, z);
3749 
3750 
3751   Label L_last_x;
3752 
3753   sldi(tmp, xstart, LogBytesPerInt);
3754   add(z, z, tmp);                 // z = z + k - j
3755   addi(z, z, 4);
3756   addic_(xstart, xstart, -1);     // i = xstart-1;
3757   blt(CCR0, L_last_x);
3758 
3759   sldi(tmp, xstart, LogBytesPerInt);
3760   ldx(x_xstart, x, tmp);
3761 #ifdef VM_LITTLE_ENDIAN
3762   rldicl(x_xstart, x_xstart, 32, 0);
3763 #endif
3764 
3765 
3766   Label L_third_loop_prologue;
3767 
3768   bind(L_third_loop_prologue);
3769 
3770   Register xsave = tmp11;
3771   Register xlensave = tmp12;
3772   Register ylensave = tmp13;
3773 
3774   mr(xsave, x);
3775   mr(xlensave, xstart);
3776   mr(ylensave, ylen);
3777 
3778 
3779   multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen,
3780                           carry, product_high, product, x, tmp);
3781 
3782   mr(z, zsave);
3783   mr(x, xsave);
3784   mr(xlen, xlensave);   // This is the decrement of the loop counter!
3785   mr(ylen, ylensave);
3786 
3787   addi(tmp3, xlen, 1);
3788   sldi(tmp, tmp3, LogBytesPerInt);
3789   stwx(carry, z, tmp);
3790   addic_(tmp3, tmp3, -1);
3791   blt(CCR0, L_done);
3792 
3793   srdi(carry, carry, 32);
3794   sldi(tmp, tmp3, LogBytesPerInt);
3795   stwx(carry, z, tmp);
3796   b(L_second_loop);
3797 
3798   // Next infrequent code is moved outside loops.
3799   bind(L_last_x);
3800 
3801   lwz(x_xstart, 0, x);
3802   b(L_third_loop_prologue);
3803 
3804   bind(L_done);
3805 }   // multiply_to_len
3806 
3807 void MacroAssembler::asm_assert(bool check_equal, const char *msg, int id) {
3808 #ifdef ASSERT
3809   Label ok;
3810   if (check_equal) {
3811     beq(CCR0, ok);
3812   } else {
3813     bne(CCR0, ok);
3814   }
3815   stop(msg, id);
3816   bind(ok);
3817 #endif
3818 }
3819 
3820 void MacroAssembler::asm_assert_mems_zero(bool check_equal, int size, int mem_offset,
3821                                           Register mem_base, const char* msg, int id) {
3822 #ifdef ASSERT
3823   switch (size) {
3824     case 4:
3825       lwz(R0, mem_offset, mem_base);
3826       cmpwi(CCR0, R0, 0);
3827       break;
3828     case 8:
3829       ld(R0, mem_offset, mem_base);
3830       cmpdi(CCR0, R0, 0);
3831       break;
3832     default:
3833       ShouldNotReachHere();
3834   }
3835   asm_assert(check_equal, msg, id);
3836 #endif // ASSERT
3837 }
3838 
3839 void MacroAssembler::verify_thread() {
3840   if (VerifyThread) {
3841     unimplemented("'VerifyThread' currently not implemented on PPC");
3842   }
3843 }
3844 
3845 // READ: oop. KILL: R0 and possibly the volatile float registers.
3846 void MacroAssembler::verify_oop(Register oop, const char* msg) {
3847   if (!VerifyOops) {
3848     return;
3849   }
3850 
3851   address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
3852   const Register tmp = R11; // Will be preserved.
3853   const int nbytes_save = 11*8; // Volatile gprs except R0.
3854   save_volatile_gprs(R1_SP, -nbytes_save); // except R0
3855 
3856   if (oop == tmp) mr(R4_ARG2, oop);
3857   save_LR_CR(tmp); // save in old frame
3858   push_frame_reg_args(nbytes_save, tmp);
3859   // load FunctionDescriptor** / entry_address *
3860   load_const_optimized(tmp, fd, R0);
3861   // load FunctionDescriptor* / entry_address
3862   ld(tmp, 0, tmp);
3863   if (oop != tmp) mr_if_needed(R4_ARG2, oop);
3864   load_const_optimized(R3_ARG1, (address)msg, R0);
3865   // Call destination for its side effect.
3866   call_c(tmp);
3867 
3868   pop_frame();
3869   restore_LR_CR(tmp);
3870   restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
3871 }
3872 
3873 const char* stop_types[] = {
3874   "stop",
3875   "untested",
3876   "unimplemented",
3877   "shouldnotreachhere"
3878 };
3879 
3880 static void stop_on_request(int tp, const char* msg) {
3881   tty->print("PPC assembly code requires stop: (%s) %s\n", stop_types[tp%/*stop_end*/4], msg);
3882   guarantee(false, err_msg("PPC assembly code requires stop: %s", msg));
3883 }
3884 
3885 // Call a C-function that prints output.
3886 void MacroAssembler::stop(int type, const char* msg, int id) {
3887 #ifndef PRODUCT
3888   block_comment(err_msg("stop: %s %s {", stop_types[type%stop_end], msg));
3889 #else
3890   block_comment("stop {");
3891 #endif
3892 
3893   // setup arguments
3894   load_const_optimized(R3_ARG1, type);
3895   load_const_optimized(R4_ARG2, (void *)msg, /*tmp=*/R0);
3896   call_VM_leaf(CAST_FROM_FN_PTR(address, stop_on_request), R3_ARG1, R4_ARG2);
3897   illtrap();
3898   emit_int32(id);
3899   block_comment("} stop;");
3900 }
3901 
3902 #ifndef PRODUCT
3903 // Write pattern 0x0101010101010101 in memory region [low-before, high+after].
3904 // Val, addr are temp registers.
3905 // If low == addr, addr is killed.
3906 // High is preserved.
3907 void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) {
3908   if (!ZapMemory) return;
3909 
3910   assert_different_registers(low, val);
3911 
3912   BLOCK_COMMENT("zap memory region {");
3913   load_const_optimized(val, 0x0101010101010101);
3914   int size = before + after;
3915   if (low == high && size < 5 && size > 0) {
3916     int offset = -before*BytesPerWord;
3917     for (int i = 0; i < size; ++i) {
3918       std(val, offset, low);
3919       offset += (1*BytesPerWord);
3920     }
3921   } else {
3922     addi(addr, low, -before*BytesPerWord);
3923     assert_different_registers(high, val);
3924     if (after) addi(high, high, after * BytesPerWord);
3925     Label loop;
3926     bind(loop);
3927     std(val, 0, addr);
3928     addi(addr, addr, 8);
3929     cmpd(CCR6, addr, high);
3930     ble(CCR6, loop);
3931     if (after) addi(high, high, -after * BytesPerWord);  // Correct back to old value.
3932   }
3933   BLOCK_COMMENT("} zap memory region");
3934 }
3935 
3936 #endif // !PRODUCT
3937 
3938 SkipIfEqualZero::SkipIfEqualZero(MacroAssembler* masm, Register temp, const bool* flag_addr) : _masm(masm), _label() {
3939   int simm16_offset = masm->load_const_optimized(temp, (address)flag_addr, R0, true);
3940   assert(sizeof(bool) == 1, "PowerPC ABI");
3941   masm->lbz(temp, simm16_offset, temp);
3942   masm->cmpwi(CCR0, temp, 0);
3943   masm->beq(CCR0, _label);
3944 }
3945 
3946 SkipIfEqualZero::~SkipIfEqualZero() {
3947   _masm->bind(_label);
3948 }