1 /*
   2  * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2012, 2018, SAP SE. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.inline.hpp"
  28 #include "compiler/disassembler.hpp"
  29 #include "gc/shared/cardTable.hpp"
  30 #include "gc/shared/cardTableBarrierSet.hpp"
  31 #include "gc/shared/collectedHeap.inline.hpp"
  32 #include "interpreter/interpreter.hpp"
  33 #include "memory/resourceArea.hpp"
  34 #include "nativeInst_ppc.hpp"
  35 #include "prims/methodHandles.hpp"
  36 #include "runtime/biasedLocking.hpp"
  37 #include "runtime/icache.hpp"
  38 #include "runtime/interfaceSupport.inline.hpp"
  39 #include "runtime/objectMonitor.hpp"
  40 #include "runtime/os.hpp"
  41 #include "runtime/safepoint.hpp"
  42 #include "runtime/safepointMechanism.hpp"
  43 #include "runtime/sharedRuntime.hpp"
  44 #include "runtime/stubRoutines.hpp"
  45 #include "utilities/macros.hpp"
  46 #if INCLUDE_ALL_GCS
  47 #include "gc/g1/g1BarrierSet.hpp"
  48 #include "gc/g1/g1CardTable.hpp"
  49 #include "gc/g1/g1ThreadLocalData.hpp"
  50 #include "gc/g1/heapRegion.hpp"
  51 #endif // INCLUDE_ALL_GCS
  52 #ifdef COMPILER2
  53 #include "opto/intrinsicnode.hpp"
  54 #endif
  55 
  56 #ifdef PRODUCT
  57 #define BLOCK_COMMENT(str) // nothing
  58 #else
  59 #define BLOCK_COMMENT(str) block_comment(str)
  60 #endif
  61 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  62 
  63 #ifdef ASSERT
  64 // On RISC, there's no benefit to verifying instruction boundaries.
  65 bool AbstractAssembler::pd_check_instruction_mark() { return false; }
  66 #endif
  67 
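     // Load a doubleword from [a + si31], where si31 may exceed the signed
     // 16-bit displacement range of a single ld. Large offsets are split into
     // two signed 16-bit halves (largeoffset_si16_si16_hi/_lo) and emitted as
     //   addis d, a, hi
     //   ld    d, lo(d)
     // Small offsets use a single ld, optionally followed by a filler nop so
     // that callers can rely on a fixed-size sequence.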
  68 void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) {
  69   assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range");
  70   if (Assembler::is_simm(si31, 16)) {
  71     ld(d, si31, a);
  72     if (emit_filler_nop) nop();
  73   } else {
  74     const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31);
  75     const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31);
  76     addis(d, a, hi);
  77     ld(d, lo, d);
  78   }
  79 }
  80 
  81 void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) {
  82   assert_different_registers(d, a);
  83   ld_largeoffset_unchecked(d, si31, a, emit_filler_nop);
  84 }
  85 
  86 void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base,
  87                                       size_t size_in_bytes, bool is_signed) {
  88   switch (size_in_bytes) {
  89   case  8:              ld(dst, offs, base);                         break;
  90   case  4:  is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break;
  91   case  2:  is_signed ? lha(dst, offs, base) : lhz(dst, offs, base); break;
  92   case  1:  lbz(dst, offs, base); if (is_signed) extsb(dst, dst);    break; // lba doesn't exist :(
  93   default:  ShouldNotReachHere();
  94   }
  95 }
  96 
  97 void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base,
  98                                        size_t size_in_bytes) {
  99   switch (size_in_bytes) {
 100   case  8:  std(dst, offs, base); break;
 101   case  4:  stw(dst, offs, base); break;
 102   case  2:  sth(dst, offs, base); break;
 103   case  1:  stb(dst, offs, base); break;
 104   default:  ShouldNotReachHere();
 105   }
 106 }
 107 
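     // Pad with nops until offset() % modulus == rem, but only if at most
     // `max' bytes of padding would be needed; otherwise emit nothing.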
 108 void MacroAssembler::align(int modulus, int max, int rem) {
 109   int padding = (rem + modulus - (offset() % modulus)) % modulus;
 110   if (padding > max) return;
 111   for (int c = (padding >> 2); c > 0; --c) { nop(); }
 112 }
 113 
 114 // Issue instructions that calculate the given address from the global TOC.
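     //
     // The emitted sequence is (each half optional via hi16/lo16):
     //   addis dst, R29_TOC, offset_hi   // hi16
     //   addi  dst, dst,     offset_lo   // lo16, carries the relocation if requested
     // where offset = addr - global TOC, split by largeoffset_si16_si16_hi/_lo.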
 115 void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16,
 116                                                        bool add_relocation, bool emit_dummy_addr) {
 117   int offset = -1;
 118   if (emit_dummy_addr) {
 119     offset = -128; // dummy address
 120   } else if (addr != (address)(intptr_t)-1) {
 121     offset = MacroAssembler::offset_to_global_toc(addr);
 122   }
 123 
 124   if (hi16) {
 125     addis(dst, R29_TOC, MacroAssembler::largeoffset_si16_si16_hi(offset));
 126   }
 127   if (lo16) {
 128     if (add_relocation) {
 129       // Relocate at the addi to avoid confusion with a load from the method's TOC.
 130       relocate(internal_word_Relocation::spec(addr));
 131     }
 132     addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset));
 133   }
 134 }
 135 
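     // Patch a calculate_address_from_global_toc sequence so that it computes `addr'.
     // `a' points to the relocated addi; the matching addis is found by scanning
     // backwards from there, but never beyond `bound'.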
 136 address MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) {
 137   const int offset = MacroAssembler::offset_to_global_toc(addr);
 138 
 139   const address inst2_addr = a;
 140   const int inst2 = *(int *)inst2_addr;
 141 
 142   // The relocation points to the second instruction, the addi,
 143   // and the addi reads and writes the same register dst.
 144   const int dst = inv_rt_field(inst2);
 145   assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");
 146 
 147   // Now, find the preceding addis which writes to dst.
 148   int inst1 = 0;
 149   address inst1_addr = inst2_addr - BytesPerInstWord;
 150   while (inst1_addr >= bound) {
 151     inst1 = *(int *) inst1_addr;
 152     if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
 153       // Stop, found the addis which writes dst.
 154       break;
 155     }
 156     inst1_addr -= BytesPerInstWord;
 157   }
 158 
 159   assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
 160   set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset));
 161   set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset));
 162   return inst1_addr;
 163 }
 164 
 165 address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) {
 166   const address inst2_addr = a;
 167   const int inst2 = *(int *)inst2_addr;
 168 
 169   // The relocation points to the second instruction, the addi,
 170   // and the addi reads and writes the same register dst.
 171   const int dst = inv_rt_field(inst2);
 172   assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");
 173 
 174   // Now, find the preceding addis which writes to dst.
 175   int inst1 = 0;
 176   address inst1_addr = inst2_addr - BytesPerInstWord;
 177   while (inst1_addr >= bound) {
 178     inst1 = *(int *) inst1_addr;
 179     if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
 180       // stop, found the addis which writes dst
 181       break;
 182     }
 183     inst1_addr -= BytesPerInstWord;
 184   }
 185 
 186   assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
 187 
 188   int offset = (get_imm(inst1_addr, 0) << 16) + get_imm(inst2_addr, 0);
 189   // -1 is a special case
 190   if (offset == -1) {
 191     return (address)(intptr_t)-1;
 192   } else {
 193     return global_toc() + offset;
 194   }
 195 }
 196 
 197 #ifdef _LP64
 198 // Patch compressed oops or klass constants.
 199 // Assembler sequence is
 200 // 1) compressed oops:
 201 //    lis  rx = const.hi
 202 //    ori rx = rx | const.lo
 203 // 2) compressed klass:
 204 //    lis  rx = const.hi
 205 //    clrldi rx = rx & 0xFFFFffff // clearMS32b, optional
 206 //    ori rx = rx | const.lo
 207 // The clrldi is optional; it is skipped over when patching.
 208 address MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) {
 209   assert(UseCompressedOops, "Should only patch compressed oops");
 210 
 211   const address inst2_addr = a;
 212   const int inst2 = *(int *)inst2_addr;
 213 
 214   // The relocation points to the second instruction, the ori,
 215   // and the ori reads and writes the same register dst.
 216   const int dst = inv_rta_field(inst2);
 217   assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
 218   // Now, find the preceding addis which writes to dst.
 219   int inst1 = 0;
 220   address inst1_addr = inst2_addr - BytesPerInstWord;
 221   bool inst1_found = false;
 222   while (inst1_addr >= bound) {
 223     inst1 = *(int *)inst1_addr;
 224     if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
 225     inst1_addr -= BytesPerInstWord;
 226   }
 227   assert(inst1_found, "inst is not lis");
 228 
 229   int xc = (data >> 16) & 0xffff;
 230   int xd = (data >>  0) & 0xffff;
 231 
 232   set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo
 233   set_imm((int *)inst2_addr,        (xd)); // unsigned int
 234   return inst1_addr;
 235 }
 236 
 237 // Get compressed oop or klass constant.
 238 narrowOop MacroAssembler::get_narrow_oop(address a, address bound) {
 239   assert(UseCompressedOops, "Should only patch compressed oops");
 240 
 241   const address inst2_addr = a;
 242   const int inst2 = *(int *)inst2_addr;
 243 
 244   // The relocation points to the second instruction, the ori,
 245   // and the ori reads and writes the same register dst.
 246   const int dst = inv_rta_field(inst2);
 247   assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
 248   // Now, find the preceding lis which writes to dst.
 249   int inst1 = 0;
 250   address inst1_addr = inst2_addr - BytesPerInstWord;
 251   bool inst1_found = false;
 252 
 253   while (inst1_addr >= bound) {
 254     inst1 = *(int *) inst1_addr;
 255     if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break;}
 256     inst1_addr -= BytesPerInstWord;
 257   }
 258   assert(inst1_found, "inst is not lis");
 259 
 260   uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff));
 261   uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16);
 262 
 263   return (int) (xl | xh);
 264 }
 265 #endif // _LP64
 266 
 267 // Returns true if successful.
 268 bool MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a,
 269                                                 Register toc, bool fixed_size) {
 270   int toc_offset = 0;
 271   // Use RelocationHolder::none for the constant pool entry, otherwise
 272   // we will end up with a failing NativeCall::verify(x) where x is
 273   // the address of the constant pool entry.
 274   // FIXME: We should insert relocation information for oops at the constant
 275   // pool entries instead of inserting it at the loads; patching of a constant
 276   // pool entry should be less expensive.
 277   address const_address = address_constant((address)a.value(), RelocationHolder::none);
 278   if (const_address == NULL) { return false; } // allocation failure
 279   // Relocate at the pc of the load.
 280   relocate(a.rspec());
 281   toc_offset = (int)(const_address - code()->consts()->start());
 282   ld_largeoffset_unchecked(dst, toc_offset, toc, fixed_size);
 283   return true;
 284 }
 285 
 286 bool MacroAssembler::is_load_const_from_method_toc_at(address a) {
 287   const address inst1_addr = a;
 288   const int inst1 = *(int *)inst1_addr;
 289 
 290   // The relocation points to the ld or the addis.
 291   return (is_ld(inst1)) ||
 292          (is_addis(inst1) && inv_ra_field(inst1) != 0);
 293 }
 294 
 295 int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) {
 296   assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc");
 297 
 298   const address inst1_addr = a;
 299   const int inst1 = *(int *)inst1_addr;
 300 
 301   if (is_ld(inst1)) {
 302     return inv_d1_field(inst1);
 303   } else if (is_addis(inst1)) {
 304     const int dst = inv_rt_field(inst1);
 305 
 306     // Now, find the succeeding ld which reads and writes to dst.
 307     address inst2_addr = inst1_addr + BytesPerInstWord;
 308     int inst2 = 0;
 309     while (true) {
 310       inst2 = *(int *) inst2_addr;
 311       if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) {
 312         // Stop, found the ld which reads and writes dst.
 313         break;
 314       }
 315       inst2_addr += BytesPerInstWord;
 316     }
 317     return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2);
 318   }
 319   ShouldNotReachHere();
 320   return 0;
 321 }
 322 
 323 // Get the constant from a `load_const' sequence.
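     //
     // Two five-instruction sequences are recognized, distinguished by the second
     // instruction: the single-register form (typically lis/ori/sldi/oris/ori) and
     // a two-register form whose second instruction is another lis, which builds
     // the upper and lower 32-bit halves separately before combining them.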
 324 long MacroAssembler::get_const(address a) {
 325   assert(is_load_const_at(a), "not a load of a constant");
 326   const int *p = (const int*) a;
 327   unsigned long x = (((unsigned long) (get_imm(a,0) & 0xffff)) << 48);
 328   if (is_ori(*(p+1))) {
 329     x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 32);
 330     x |= (((unsigned long) (get_imm(a,3) & 0xffff)) << 16);
 331     x |= (((unsigned long) (get_imm(a,4) & 0xffff)));
 332   } else if (is_lis(*(p+1))) {
 333     x |= (((unsigned long) (get_imm(a,2) & 0xffff)) << 32);
 334     x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 16);
 335     x |= (((unsigned long) (get_imm(a,3) & 0xffff)));
 336   } else {
 337     ShouldNotReachHere();
 338     return (long) 0;
 339   }
 340   return (long) x;
 341 }
 342 
 343 // Patch the 64-bit constant of a `load_const' sequence. This is a
 344 // low-level procedure; it neither flushes the instruction cache nor
 345 // is it MT-safe.
 346 void MacroAssembler::patch_const(address a, long x) {
 347   assert(is_load_const_at(a), "not a load of a constant");
 348   int *p = (int*) a;
 349   if (is_ori(*(p+1))) {
 350     set_imm(0 + p, (x >> 48) & 0xffff);
 351     set_imm(1 + p, (x >> 32) & 0xffff);
 352     set_imm(3 + p, (x >> 16) & 0xffff);
 353     set_imm(4 + p, x & 0xffff);
 354   } else if (is_lis(*(p+1))) {
 355     set_imm(0 + p, (x >> 48) & 0xffff);
 356     set_imm(2 + p, (x >> 32) & 0xffff);
 357     set_imm(1 + p, (x >> 16) & 0xffff);
 358     set_imm(3 + p, x & 0xffff);
 359   } else {
 360     ShouldNotReachHere();
 361   }
 362 }
 363 
 364 AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) {
 365   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
 366   int index = oop_recorder()->allocate_metadata_index(obj);
 367   RelocationHolder rspec = metadata_Relocation::spec(index);
 368   return AddressLiteral((address)obj, rspec);
 369 }
 370 
 371 AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) {
 372   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
 373   int index = oop_recorder()->find_index(obj);
 374   RelocationHolder rspec = metadata_Relocation::spec(index);
 375   return AddressLiteral((address)obj, rspec);
 376 }
 377 
 378 AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) {
 379   assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
 380   int oop_index = oop_recorder()->allocate_oop_index(obj);
 381   return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
 382 }
 383 
 384 AddressLiteral MacroAssembler::constant_oop_address(jobject obj) {
 385   assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
 386   int oop_index = oop_recorder()->find_index(obj);
 387   return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
 388 }
 389 
 390 RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
 391                                                       Register tmp, int offset) {
 392   intptr_t value = *delayed_value_addr;
 393   if (value != 0) {
 394     return RegisterOrConstant(value + offset);
 395   }
 396 
 397   // Load indirectly to solve generation ordering problem.
 398   // static address, no relocation
 399   int simm16_offset = load_const_optimized(tmp, delayed_value_addr, noreg, true);
 400   ld(tmp, simm16_offset, tmp); // must be aligned ((xa & 3) == 0)
 401 
 402   if (offset != 0) {
 403     addi(tmp, tmp, offset);
 404   }
 405 
 406   return RegisterOrConstant(tmp);
 407 }
 408 
 409 #ifndef PRODUCT
 410 void MacroAssembler::pd_print_patched_instruction(address branch) {
 411   Unimplemented(); // TODO: PPC port
 412 }
 413 #endif // ndef PRODUCT
 414 
 415 // Conditional far branch for destinations encodable in 24+2 bits.
 416 void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) {
 417 
 418   // If requested by flag optimize, relocate the bc_far as a
 419   // runtime_call and prepare for optimizing it when the code gets
 420   // relocated.
 421   if (optimize == bc_far_optimize_on_relocate) {
 422     relocate(relocInfo::runtime_call_type);
 423   }
 424 
 425   // variant 2:
 426   //
 427   //    b!cxx SKIP
 428   //    bxx   DEST
 429   //  SKIP:
 430   //
 431 
 432   const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
 433                                                 opposite_bcond(inv_boint_bcond(boint)));
 434 
 435   // We emit two branches.
 436   // First, a conditional branch which jumps around the far branch.
 437   const address not_taken_pc = pc() + 2 * BytesPerInstWord;
 438   const address bc_pc        = pc();
 439   bc(opposite_boint, biint, not_taken_pc);
 440 
 441   const int bc_instr = *(int*)bc_pc;
 442   assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition");
 443   assert(opposite_boint == inv_bo_field(bc_instr), "postcondition");
 444   assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))),
 445                                      opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))),
 446          "postcondition");
 447   assert(biint == inv_bi_field(bc_instr), "postcondition");
 448 
 449   // Second, an unconditional far branch which jumps to dest.
 450   // Note: target(dest) remembers the current pc (see CodeSection::target)
 451   //       and returns the current pc if the label is not bound yet; when
 452   //       the label gets bound, the unconditional far branch will be patched.
 453   const address target_pc = target(dest);
 454   const address b_pc  = pc();
 455   b(target_pc);
 456 
 457   assert(not_taken_pc == pc(),                     "postcondition");
 458   assert(dest.is_bound() || target_pc == b_pc, "postcondition");
 459 }
 460 
 461 // 1 or 2 instructions
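     // Emits a plain bc if the target is already bound and within the conditional-
     // branch displacement range (signed 16-bit byte offset, i.e. +/- 32 KB);
     // otherwise falls back to the two-instruction bc_far sequence.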
 462 void MacroAssembler::bc_far_optimized(int boint, int biint, Label& dest) {
 463   if (dest.is_bound() && is_within_range_of_bcxx(target(dest), pc())) {
 464     bc(boint, biint, dest);
 465   } else {
 466     bc_far(boint, biint, dest, MacroAssembler::bc_far_optimize_on_relocate);
 467   }
 468 }
 469 
 470 bool MacroAssembler::is_bc_far_at(address instruction_addr) {
 471   return is_bc_far_variant1_at(instruction_addr) ||
 472          is_bc_far_variant2_at(instruction_addr) ||
 473          is_bc_far_variant3_at(instruction_addr);
 474 }
 475 
 476 address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) {
 477   if (is_bc_far_variant1_at(instruction_addr)) {
 478     const address instruction_1_addr = instruction_addr;
 479     const int instruction_1 = *(int*)instruction_1_addr;
 480     return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr);
 481   } else if (is_bc_far_variant2_at(instruction_addr)) {
 482     const address instruction_2_addr = instruction_addr + 4;
 483     return bxx_destination(instruction_2_addr);
 484   } else if (is_bc_far_variant3_at(instruction_addr)) {
 485     return instruction_addr + 8;
 486   }
 487   // variant 4 ???
 488   ShouldNotReachHere();
 489   return NULL;
 490 }
 491 void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) {
 492 
 493   if (is_bc_far_variant3_at(instruction_addr)) {
 494     // variant 3, far cond branch to the next instruction, already patched to nops:
 495     //
 496     //    nop
 497     //    endgroup
 498     //  SKIP/DEST:
 499     //
 500     return;
 501   }
 502 
 503   // first, extract boint and biint from the current branch
 504   int boint = 0;
 505   int biint = 0;
 506 
 507   ResourceMark rm;
 508   const int code_size = 2 * BytesPerInstWord;
 509   CodeBuffer buf(instruction_addr, code_size);
 510   MacroAssembler masm(&buf);
 511   if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) {
 512     // Far branch to next instruction: Optimize it by patching nops (produce variant 3).
 513     masm.nop();
 514     masm.endgroup();
 515   } else {
 516     if (is_bc_far_variant1_at(instruction_addr)) {
 517       // variant 1, the 1st instruction contains the destination address:
 518       //
 519       //    bcxx  DEST
 520       //    nop
 521       //
 522       const int instruction_1 = *(int*)(instruction_addr);
 523       boint = inv_bo_field(instruction_1);
 524       biint = inv_bi_field(instruction_1);
 525     } else if (is_bc_far_variant2_at(instruction_addr)) {
 526       // variant 2, the 2nd instruction contains the destination address:
 527       //
 528       //    b!cxx SKIP
 529       //    bxx   DEST
 530       //  SKIP:
 531       //
 532       const int instruction_1 = *(int*)(instruction_addr);
 533       boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))),
 534           opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1))));
 535       biint = inv_bi_field(instruction_1);
 536     } else {
 537       // variant 4???
 538       ShouldNotReachHere();
 539     }
 540 
 541     // second, set the new branch destination and optimize the code
 542     if (dest != instruction_addr + 4 && // the bc_far is still unbound!
 543         masm.is_within_range_of_bcxx(dest, instruction_addr)) {
 544       // variant 1:
 545       //
 546       //    bcxx  DEST
 547       //    nop
 548       //
 549       masm.bc(boint, biint, dest);
 550       masm.nop();
 551     } else {
 552       // variant 2:
 553       //
 554       //    b!cxx SKIP
 555       //    bxx   DEST
 556       //  SKIP:
 557       //
 558       const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
 559                                                     opposite_bcond(inv_boint_bcond(boint)));
 560       const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord;
 561       masm.bc(opposite_boint, biint, not_taken_pc);
 562       masm.b(dest);
 563     }
 564   }
 565   ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
 566 }
 567 
 568 // Emit a patchable (NOT MT-safe) 64-bit absolute call or jump.
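     //
     // Both variants occupy bxx64_patchable_size (7 instructions):
     //   variant 2 (pc-relative, used if ReoptimizeCallSequences allows it and
     //   the target is within branch range):
     //     6 * nop ; bl dest     (call)    or    b dest ; 6 * nop     (jump)
     //   variant 1b (absolute, address computed relative to the global TOC):
     //     mr R0,R11 ; addis/addi R11,... ; mtctr R11 ; mr R11,R0 ; nop ; bctr[l]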
 569 void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) {
 570   // get current pc
 571   uint64_t start_pc = (uint64_t) pc();
 572 
 573   const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last
 574   const address pc_of_b  = (address) (start_pc + (0*BytesPerInstWord)); // b is first
 575 
 576   // relocate here
 577   if (rt != relocInfo::none) {
 578     relocate(rt);
 579   }
 580 
 581   if ( ReoptimizeCallSequences &&
 582        (( link && is_within_range_of_b(dest, pc_of_bl)) ||
 583         (!link && is_within_range_of_b(dest, pc_of_b)))) {
 584     // variant 2:
 585     // Emit an optimized, pc-relative call/jump.
 586 
 587     if (link) {
 588       // some padding
 589       nop();
 590       nop();
 591       nop();
 592       nop();
 593       nop();
 594       nop();
 595 
 596       // do the call
 597       assert(pc() == pc_of_bl, "just checking");
 598       bl(dest, relocInfo::none);
 599     } else {
 600       // do the jump
 601       assert(pc() == pc_of_b, "just checking");
 602       b(dest, relocInfo::none);
 603 
 604       // some padding
 605       nop();
 606       nop();
 607       nop();
 608       nop();
 609       nop();
 610       nop();
 611     }
 612 
 613     // Assert that we can identify the emitted call/jump.
 614     assert(is_bxx64_patchable_variant2_at((address)start_pc, link),
 615            "can't identify emitted call");
 616   } else {
 617     // variant 1:
 618     mr(R0, R11);  // spill R11 -> R0.
 619 
 620     // Load the destination address into CTR,
 621     // calculate destination relative to global toc.
 622     calculate_address_from_global_toc(R11, dest, true, true, false);
 623 
 624     mtctr(R11);
 625     mr(R11, R0);  // spill R11 <- R0.
 626     nop();
 627 
 628     // do the call/jump
 629     if (link) {
 630       bctrl();
 631     } else {
 632       bctr();
 633     }
 634     // Assert that we can identify the emitted call/jump.
 635     assert(is_bxx64_patchable_variant1b_at((address)start_pc, link),
 636            "can't identify emitted call");
 637   }
 638 
 639   // Assert that we can identify the emitted call/jump.
 640   assert(is_bxx64_patchable_at((address)start_pc, link),
 641          "can't identify emitted call");
 642   assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest,
 643          "wrong encoding of dest address");
 644 }
 645 
 646 // Identify a bxx64_patchable instruction.
 647 bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) {
 648   return is_bxx64_patchable_variant1b_at(instruction_addr, link)
 649     //|| is_bxx64_patchable_variant1_at(instruction_addr, link)
 650       || is_bxx64_patchable_variant2_at(instruction_addr, link);
 651 }
 652 
 653 // Does the bxx64_patchable instruction use a pc-relative encoding of
 654 // the call destination?
 655 bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) {
 656   // variant 2 is pc-relative
 657   return is_bxx64_patchable_variant2_at(instruction_addr, link);
 658 }
 659 
 660 // Identify variant 1.
 661 bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) {
 662   unsigned int* instr = (unsigned int*) instruction_addr;
 663   return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
 664     && is_mtctr(instr[5]) // mtctr
 665     && is_load_const_at(instruction_addr);
 666 }
 667 
 668 // Identify variant 1b: load destination relative to global toc.
 669 bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) {
 670   unsigned int* instr = (unsigned int*) instruction_addr;
 671   return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
 672     && is_mtctr(instr[3]) // mtctr
 673     && is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr);
 674 }
 675 
 676 // Identify variant 2.
 677 bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) {
 678   unsigned int* instr = (unsigned int*) instruction_addr;
 679   if (link) {
 680     return is_bl (instr[6])  // bl dest is last
 681       && is_nop(instr[0])  // nop
 682       && is_nop(instr[1])  // nop
 683       && is_nop(instr[2])  // nop
 684       && is_nop(instr[3])  // nop
 685       && is_nop(instr[4])  // nop
 686       && is_nop(instr[5]); // nop
 687   } else {
 688     return is_b  (instr[0])  // b  dest is first
 689       && is_nop(instr[1])  // nop
 690       && is_nop(instr[2])  // nop
 691       && is_nop(instr[3])  // nop
 692       && is_nop(instr[4])  // nop
 693       && is_nop(instr[5])  // nop
 694       && is_nop(instr[6]); // nop
 695   }
 696 }
 697 
 698 // Set dest address of a bxx64_patchable instruction.
 699 void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) {
 700   ResourceMark rm;
 701   int code_size = MacroAssembler::bxx64_patchable_size;
 702   CodeBuffer buf(instruction_addr, code_size);
 703   MacroAssembler masm(&buf);
 704   masm.bxx64_patchable(dest, relocInfo::none, link);
 705   ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
 706 }
 707 
 708 // Get dest address of a bxx64_patchable instruction.
 709 address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) {
 710   if (is_bxx64_patchable_variant1_at(instruction_addr, link)) {
 711     return (address) (unsigned long) get_const(instruction_addr);
 712   } else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) {
 713     unsigned int* instr = (unsigned int*) instruction_addr;
 714     if (link) {
 715       const int instr_idx = 6; // bl is last
 716       int branchoffset = branch_destination(instr[instr_idx], 0);
 717       return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
 718     } else {
 719       const int instr_idx = 0; // b is first
 720       int branchoffset = branch_destination(instr[instr_idx], 0);
 721       return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
 722     }
 723   // Load dest relative to global toc.
 724   } else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) {
 725     return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord,
 726                                                                instruction_addr);
 727   } else {
 728     ShouldNotReachHere();
 729     return NULL;
 730   }
 731 }
 732 
 733 // Uses ordering which corresponds to ABI:
 734 //    _savegpr0_14:  std  r14,-144(r1)
 735 //    _savegpr0_15:  std  r15,-136(r1)
 736 //    _savegpr0_16:  std  r16,-128(r1)
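     //
     // Saves the 18 non-volatile GPRs (R14-R31) followed by the 18 non-volatile
     // FPRs (F14-F31): 36 * 8 = 288 bytes starting at dst + offset.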
 737 void MacroAssembler::save_nonvolatile_gprs(Register dst, int offset) {
 738   std(R14, offset, dst);   offset += 8;
 739   std(R15, offset, dst);   offset += 8;
 740   std(R16, offset, dst);   offset += 8;
 741   std(R17, offset, dst);   offset += 8;
 742   std(R18, offset, dst);   offset += 8;
 743   std(R19, offset, dst);   offset += 8;
 744   std(R20, offset, dst);   offset += 8;
 745   std(R21, offset, dst);   offset += 8;
 746   std(R22, offset, dst);   offset += 8;
 747   std(R23, offset, dst);   offset += 8;
 748   std(R24, offset, dst);   offset += 8;
 749   std(R25, offset, dst);   offset += 8;
 750   std(R26, offset, dst);   offset += 8;
 751   std(R27, offset, dst);   offset += 8;
 752   std(R28, offset, dst);   offset += 8;
 753   std(R29, offset, dst);   offset += 8;
 754   std(R30, offset, dst);   offset += 8;
 755   std(R31, offset, dst);   offset += 8;
 756 
 757   stfd(F14, offset, dst);   offset += 8;
 758   stfd(F15, offset, dst);   offset += 8;
 759   stfd(F16, offset, dst);   offset += 8;
 760   stfd(F17, offset, dst);   offset += 8;
 761   stfd(F18, offset, dst);   offset += 8;
 762   stfd(F19, offset, dst);   offset += 8;
 763   stfd(F20, offset, dst);   offset += 8;
 764   stfd(F21, offset, dst);   offset += 8;
 765   stfd(F22, offset, dst);   offset += 8;
 766   stfd(F23, offset, dst);   offset += 8;
 767   stfd(F24, offset, dst);   offset += 8;
 768   stfd(F25, offset, dst);   offset += 8;
 769   stfd(F26, offset, dst);   offset += 8;
 770   stfd(F27, offset, dst);   offset += 8;
 771   stfd(F28, offset, dst);   offset += 8;
 772   stfd(F29, offset, dst);   offset += 8;
 773   stfd(F30, offset, dst);   offset += 8;
 774   stfd(F31, offset, dst);
 775 }
 776 
 777 // Uses ordering which corresponds to ABI:
 778 //    _restgpr0_14:  ld   r14,-144(r1)
 779 //    _restgpr0_15:  ld   r15,-136(r1)
 780 //    _restgpr0_16:  ld   r16,-128(r1)
 781 void MacroAssembler::restore_nonvolatile_gprs(Register src, int offset) {
 782   ld(R14, offset, src);   offset += 8;
 783   ld(R15, offset, src);   offset += 8;
 784   ld(R16, offset, src);   offset += 8;
 785   ld(R17, offset, src);   offset += 8;
 786   ld(R18, offset, src);   offset += 8;
 787   ld(R19, offset, src);   offset += 8;
 788   ld(R20, offset, src);   offset += 8;
 789   ld(R21, offset, src);   offset += 8;
 790   ld(R22, offset, src);   offset += 8;
 791   ld(R23, offset, src);   offset += 8;
 792   ld(R24, offset, src);   offset += 8;
 793   ld(R25, offset, src);   offset += 8;
 794   ld(R26, offset, src);   offset += 8;
 795   ld(R27, offset, src);   offset += 8;
 796   ld(R28, offset, src);   offset += 8;
 797   ld(R29, offset, src);   offset += 8;
 798   ld(R30, offset, src);   offset += 8;
 799   ld(R31, offset, src);   offset += 8;
 800 
 801   // FP registers
 802   lfd(F14, offset, src);   offset += 8;
 803   lfd(F15, offset, src);   offset += 8;
 804   lfd(F16, offset, src);   offset += 8;
 805   lfd(F17, offset, src);   offset += 8;
 806   lfd(F18, offset, src);   offset += 8;
 807   lfd(F19, offset, src);   offset += 8;
 808   lfd(F20, offset, src);   offset += 8;
 809   lfd(F21, offset, src);   offset += 8;
 810   lfd(F22, offset, src);   offset += 8;
 811   lfd(F23, offset, src);   offset += 8;
 812   lfd(F24, offset, src);   offset += 8;
 813   lfd(F25, offset, src);   offset += 8;
 814   lfd(F26, offset, src);   offset += 8;
 815   lfd(F27, offset, src);   offset += 8;
 816   lfd(F28, offset, src);   offset += 8;
 817   lfd(F29, offset, src);   offset += 8;
 818   lfd(F30, offset, src);   offset += 8;
 819   lfd(F31, offset, src);
 820 }
 821 
 822 // For verify_oops.
 823 void MacroAssembler::save_volatile_gprs(Register dst, int offset) {
 824   std(R2,  offset, dst);   offset += 8;
 825   std(R3,  offset, dst);   offset += 8;
 826   std(R4,  offset, dst);   offset += 8;
 827   std(R5,  offset, dst);   offset += 8;
 828   std(R6,  offset, dst);   offset += 8;
 829   std(R7,  offset, dst);   offset += 8;
 830   std(R8,  offset, dst);   offset += 8;
 831   std(R9,  offset, dst);   offset += 8;
 832   std(R10, offset, dst);   offset += 8;
 833   std(R11, offset, dst);   offset += 8;
 834   std(R12, offset, dst);   offset += 8;
 835 
 836   stfd(F0, offset, dst);   offset += 8;
 837   stfd(F1, offset, dst);   offset += 8;
 838   stfd(F2, offset, dst);   offset += 8;
 839   stfd(F3, offset, dst);   offset += 8;
 840   stfd(F4, offset, dst);   offset += 8;
 841   stfd(F5, offset, dst);   offset += 8;
 842   stfd(F6, offset, dst);   offset += 8;
 843   stfd(F7, offset, dst);   offset += 8;
 844   stfd(F8, offset, dst);   offset += 8;
 845   stfd(F9, offset, dst);   offset += 8;
 846   stfd(F10, offset, dst);  offset += 8;
 847   stfd(F11, offset, dst);  offset += 8;
 848   stfd(F12, offset, dst);  offset += 8;
 849   stfd(F13, offset, dst);
 850 }
 851 
 852 // For verify_oops.
 853 void MacroAssembler::restore_volatile_gprs(Register src, int offset) {
 854   ld(R2,  offset, src);   offset += 8;
 855   ld(R3,  offset, src);   offset += 8;
 856   ld(R4,  offset, src);   offset += 8;
 857   ld(R5,  offset, src);   offset += 8;
 858   ld(R6,  offset, src);   offset += 8;
 859   ld(R7,  offset, src);   offset += 8;
 860   ld(R8,  offset, src);   offset += 8;
 861   ld(R9,  offset, src);   offset += 8;
 862   ld(R10, offset, src);   offset += 8;
 863   ld(R11, offset, src);   offset += 8;
 864   ld(R12, offset, src);   offset += 8;
 865 
 866   lfd(F0, offset, src);   offset += 8;
 867   lfd(F1, offset, src);   offset += 8;
 868   lfd(F2, offset, src);   offset += 8;
 869   lfd(F3, offset, src);   offset += 8;
 870   lfd(F4, offset, src);   offset += 8;
 871   lfd(F5, offset, src);   offset += 8;
 872   lfd(F6, offset, src);   offset += 8;
 873   lfd(F7, offset, src);   offset += 8;
 874   lfd(F8, offset, src);   offset += 8;
 875   lfd(F9, offset, src);   offset += 8;
 876   lfd(F10, offset, src);  offset += 8;
 877   lfd(F11, offset, src);  offset += 8;
 878   lfd(F12, offset, src);  offset += 8;
 879   lfd(F13, offset, src);
 880 }
 881 
 882 void MacroAssembler::save_LR_CR(Register tmp) {
 883   mfcr(tmp);
 884   std(tmp, _abi(cr), R1_SP);
 885   mflr(tmp);
 886   std(tmp, _abi(lr), R1_SP);
 887   // Tmp must contain lr on exit! (see return_addr and prolog in ppc64.ad)
 888 }
 889 
 890 void MacroAssembler::restore_LR_CR(Register tmp) {
 891   assert(tmp != R1_SP, "must be distinct");
 892   ld(tmp, _abi(lr), R1_SP);
 893   mtlr(tmp);
 894   ld(tmp, _abi(cr), R1_SP);
 895   mtcr(tmp);
 896 }
 897 
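     // Get the current PC by branching-and-linking to the immediately following
     // instruction and reading LR into `result'. Clobbers LR (hence "trash_LR");
     // returns the address that was read.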
 898 address MacroAssembler::get_PC_trash_LR(Register result) {
 899   Label L;
 900   bl(L);
 901   bind(L);
 902   address lr_pc = pc();
 903   mflr(result);
 904   return lr_pc;
 905 }
 906 
 907 void MacroAssembler::resize_frame(Register offset, Register tmp) {
 908 #ifdef ASSERT
 909   assert_different_registers(offset, tmp, R1_SP);
 910   andi_(tmp, offset, frame::alignment_in_bytes-1);
 911   asm_assert_eq("resize_frame: unaligned", 0x204);
 912 #endif
 913 
 914   // tmp <- *(SP)
 915   ld(tmp, _abi(callers_sp), R1_SP);
 916   // addr <- SP + offset;
 917   // *(addr) <- tmp;
 918   // SP <- addr
 919   stdux(tmp, R1_SP, offset);
 920 }
 921 
 922 void MacroAssembler::resize_frame(int offset, Register tmp) {
 923   assert(is_simm(offset, 16), "too big an offset");
 924   assert_different_registers(tmp, R1_SP);
 925   assert((offset & (frame::alignment_in_bytes-1))==0, "resize_frame: unaligned");
 926   // tmp <- *(SP)
 927   ld(tmp, _abi(callers_sp), R1_SP);
 928   // addr <- SP + offset;
 929   // *(addr) <- tmp;
 930   // SP <- addr
 931   stdu(tmp, offset, R1_SP);
 932 }
 933 
 934 void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) {
 935   // (addr == tmp1) || (addr == tmp2) is allowed here!
 936   assert(tmp1 != tmp2, "must be distinct");
 937 
 938   // compute offset w.r.t. current stack pointer
 939   // tmp_1 <- addr - SP (!)
 940   subf(tmp1, R1_SP, addr);
 941 
 942   // atomically update SP keeping back link.
 943   resize_frame(tmp1/* offset */, tmp2/* tmp */);
 944 }
 945 
 946 void MacroAssembler::push_frame(Register bytes, Register tmp) {
 947 #ifdef ASSERT
 948   assert(bytes != R0, "r0 not allowed here");
 949   andi_(R0, bytes, frame::alignment_in_bytes-1);
 950   asm_assert_eq("push_frame(Reg, Reg): unaligned", 0x203);
 951 #endif
 952   neg(tmp, bytes);
 953   stdux(R1_SP, R1_SP, tmp);
 954 }
 955 
 956 // Push a frame of size `bytes'.
 957 void MacroAssembler::push_frame(unsigned int bytes, Register tmp) {
 958   long offset = align_addr(bytes, frame::alignment_in_bytes);
 959   if (is_simm(-offset, 16)) {
 960     stdu(R1_SP, -offset, R1_SP);
 961   } else {
 962     load_const_optimized(tmp, -offset);
 963     stdux(R1_SP, R1_SP, tmp);
 964   }
 965 }
 966 
 967 // Push a frame of size `bytes' plus abi_reg_args on top.
 968 void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) {
 969   push_frame(bytes + frame::abi_reg_args_size, tmp);
 970 }
 971 
 972 // Set up a new C frame with a spill area for non-volatile GPRs and
 973 // additional space for local variables.
 974 void MacroAssembler::push_frame_reg_args_nonvolatiles(unsigned int bytes,
 975                                                       Register tmp) {
 976   push_frame(bytes + frame::abi_reg_args_size + frame::spill_nonvolatiles_size, tmp);
 977 }
 978 
 979 // Pop current C frame.
 980 void MacroAssembler::pop_frame() {
 981   ld(R1_SP, _abi(callers_sp), R1_SP);
 982 }
 983 
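     // C calls differ by ABI: with ELFv2 the branch target is the function's entry
     // address (expected in R12), whereas the older ELFv1 ABI calls through a
     // function descriptor holding the entry point, TOC and environment pointer.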
 984 #if defined(ABI_ELFv2)
 985 address MacroAssembler::branch_to(Register r_function_entry, bool and_link) {
 986   // TODO(asmundak): make sure the caller uses R12 as function descriptor
 987   // most of the time.
 988   if (R12 != r_function_entry) {
 989     mr(R12, r_function_entry);
 990   }
 991   mtctr(R12);
 992   // Do a call or a branch.
 993   if (and_link) {
 994     bctrl();
 995   } else {
 996     bctr();
 997   }
 998   _last_calls_return_pc = pc();
 999 
1000   return _last_calls_return_pc;
1001 }
1002 
1003 // Call a C function via a function descriptor and use full C
1004 // calling conventions. Updates and returns _last_calls_return_pc.
1005 address MacroAssembler::call_c(Register r_function_entry) {
1006   return branch_to(r_function_entry, /*and_link=*/true);
1007 }
1008 
1009 // For tail calls: only branch, don't link, so callee returns to caller of this function.
1010 address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) {
1011   return branch_to(r_function_entry, /*and_link=*/false);
1012 }
1013 
1014 address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) {
1015   load_const(R12, function_entry, R0);
1016   return branch_to(R12,  /*and_link=*/true);
1017 }
1018 
1019 #else
1020 // Generic version of a call to C function via a function descriptor
1021 // with variable support for C calling conventions (TOC, ENV, etc.).
1022 // Updates and returns _last_calls_return_pc.
1023 address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call,
1024                                   bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) {
1025   // we emit standard ptrgl glue code here
1026   assert((function_descriptor != R0), "function_descriptor cannot be R0");
1027 
1028   // retrieve necessary entries from the function descriptor
1029   ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor);
1030   mtctr(R0);
1031 
1032   if (load_toc_of_callee) {
1033     ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor);
1034   }
1035   if (load_env_of_callee) {
1036     ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor);
1037   } else if (load_toc_of_callee) {
1038     li(R11, 0);
1039   }
1040 
1041   // do a call or a branch
1042   if (and_link) {
1043     bctrl();
1044   } else {
1045     bctr();
1046   }
1047   _last_calls_return_pc = pc();
1048 
1049   return _last_calls_return_pc;
1050 }
1051 
1052 // Call a C function via a function descriptor and use full C calling
1053 // conventions.
1054 // We don't use the TOC in generated code, so there is no need to save
1055 // and restore its value.
1056 address MacroAssembler::call_c(Register fd) {
1057   return branch_to(fd, /*and_link=*/true,
1058                        /*save toc=*/false,
1059                        /*restore toc=*/false,
1060                        /*load toc=*/true,
1061                        /*load env=*/true);
1062 }
1063 
1064 address MacroAssembler::call_c_and_return_to_caller(Register fd) {
1065   return branch_to(fd, /*and_link=*/false,
1066                        /*save toc=*/false,
1067                        /*restore toc=*/false,
1068                        /*load toc=*/true,
1069                        /*load env=*/true);
1070 }
1071 
1072 address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) {
1073   if (rt != relocInfo::none) {
1074     // this call needs to be relocatable
1075     if (!ReoptimizeCallSequences
1076         || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
1077         || fd == NULL   // support code-size estimation
1078         || !fd->is_friend_function()
1079         || fd->entry() == NULL) {
1080       // it's not a friend function as defined by class FunctionDescriptor,
1081       // so do a full call-c here.
1082       load_const(R11, (address)fd, R0);
1083 
1084       bool has_env = (fd != NULL && fd->env() != NULL);
1085       return branch_to(R11, /*and_link=*/true,
1086                             /*save toc=*/false,
1087                             /*restore toc=*/false,
1088                             /*load toc=*/true,
1089                             /*load env=*/has_env);
1090     } else {
1091       // It's a friend function. Load the entry point and don't care about
1092       // toc and env. Use an optimizable call instruction, but ensure the
1093       // same code-size as in the case of a non-friend function.
1094       nop();
1095       nop();
1096       nop();
1097       bl64_patchable(fd->entry(), rt);
1098       _last_calls_return_pc = pc();
1099       return _last_calls_return_pc;
1100     }
1101   } else {
1102     // This call does not need to be relocatable, do more aggressive
1103     // optimizations.
1104     if (!ReoptimizeCallSequences
1105       || !fd->is_friend_function()) {
1106       // It's not a friend function as defined by class FunctionDescriptor,
1107       // so do a full call-c here.
1108       load_const(R11, (address)fd, R0);
1109       return branch_to(R11, /*and_link=*/true,
1110                             /*save toc=*/false,
1111                             /*restore toc=*/false,
1112                             /*load toc=*/true,
1113                             /*load env=*/true);
1114     } else {
1115       // it's a friend function, load the entry point and don't care about
1116       // toc and env.
1117       address dest = fd->entry();
1118       if (is_within_range_of_b(dest, pc())) {
1119         bl(dest);
1120       } else {
1121         bl64_patchable(dest, rt);
1122       }
1123       _last_calls_return_pc = pc();
1124       return _last_calls_return_pc;
1125     }
1126   }
1127 }
1128 
1129 // Call a C function.  All constants needed reside in TOC.
1130 //
1131 // Read the address to call from the TOC.
1132 // Read env from TOC, if fd specifies an env.
1133 // Read new TOC from TOC.
1134 address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd,
1135                                          relocInfo::relocType rt, Register toc) {
1136   if (!ReoptimizeCallSequences
1137     || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
1138     || !fd->is_friend_function()) {
1139     // It's not a friend function as defined by class FunctionDescriptor,
1140     // so do a full call-c here.
1141     assert(fd->entry() != NULL, "function must be linked");
1142 
1143     AddressLiteral fd_entry(fd->entry());
1144     bool success = load_const_from_method_toc(R11, fd_entry, toc, /*fixed_size*/ true);
1145     mtctr(R11);
1146     if (fd->env() == NULL) {
1147       li(R11, 0);
1148       nop();
1149     } else {
1150       AddressLiteral fd_env(fd->env());
1151       success = success && load_const_from_method_toc(R11, fd_env, toc, /*fixed_size*/ true);
1152     }
1153     AddressLiteral fd_toc(fd->toc());
1154     // Set R2_TOC (load from toc)
1155     success = success && load_const_from_method_toc(R2_TOC, fd_toc, toc, /*fixed_size*/ true);
1156     bctrl();
1157     _last_calls_return_pc = pc();
1158     if (!success) { return NULL; }
1159   } else {
1160     // It's a friend function, load the entry point and don't care about
1161     // toc and env. Use an optimizable call instruction, but ensure the
1162     // same code-size as in the case of a non-friend function.
1163     nop();
1164     bl64_patchable(fd->entry(), rt);
1165     _last_calls_return_pc = pc();
1166   }
1167   return _last_calls_return_pc;
1168 }
1169 #endif // ABI_ELFv2
1170 
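     // Common code for the call_VM variants: record the top frame as the last Java
     // frame, pass the current thread in R3_ARG1, call the VM entry point using the
     // C ABI, then reset the last Java frame and fetch a possible oop result.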
1171 void MacroAssembler::call_VM_base(Register oop_result,
1172                                   Register last_java_sp,
1173                                   address  entry_point,
1174                                   bool     check_exceptions) {
1175   BLOCK_COMMENT("call_VM {");
1176   // Determine last_java_sp register.
1177   if (!last_java_sp->is_valid()) {
1178     last_java_sp = R1_SP;
1179   }
1180   set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1);
1181 
1182   // ARG1 must hold thread address.
1183   mr(R3_ARG1, R16_thread);
1184 #if defined(ABI_ELFv2)
1185   address return_pc = call_c(entry_point, relocInfo::none);
1186 #else
1187   address return_pc = call_c((FunctionDescriptor*)entry_point, relocInfo::none);
1188 #endif
1189 
1190   reset_last_Java_frame();
1191 
1192   // Check for pending exceptions.
1193   if (check_exceptions) {
1194     // We don't check for exceptions here.
1195     ShouldNotReachHere();
1196   }
1197 
1198   // Get oop result if there is one and reset the value in the thread.
1199   if (oop_result->is_valid()) {
1200     get_vm_result(oop_result);
1201   }
1202 
1203   _last_calls_return_pc = return_pc;
1204   BLOCK_COMMENT("} call_VM");
1205 }
1206 
1207 void MacroAssembler::call_VM_leaf_base(address entry_point) {
1208   BLOCK_COMMENT("call_VM_leaf {");
1209 #if defined(ABI_ELFv2)
1210   call_c(entry_point, relocInfo::none);
1211 #else
1212   call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, entry_point), relocInfo::none);
1213 #endif
1214   BLOCK_COMMENT("} call_VM_leaf");
1215 }
1216 
1217 void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions) {
1218   call_VM_base(oop_result, noreg, entry_point, check_exceptions);
1219 }
1220 
1221 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1,
1222                              bool check_exceptions) {
1223   // R3_ARG1 is reserved for the thread.
1224   mr_if_needed(R4_ARG2, arg_1);
1225   call_VM(oop_result, entry_point, check_exceptions);
1226 }
1227 
1228 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2,
1229                              bool check_exceptions) {
1230   // R3_ARG1 is reserved for the thread
1231   mr_if_needed(R4_ARG2, arg_1);
1232   assert(arg_2 != R4_ARG2, "smashed argument");
1233   mr_if_needed(R5_ARG3, arg_2);
1234   call_VM(oop_result, entry_point, check_exceptions);
1235 }
1236 
1237 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3,
1238                              bool check_exceptions) {
1239   // R3_ARG1 is reserved for the thread
1240   mr_if_needed(R4_ARG2, arg_1);
1241   assert(arg_2 != R4_ARG2, "smashed argument");
1242   mr_if_needed(R5_ARG3, arg_2);
1243   mr_if_needed(R6_ARG4, arg_3);
1244   call_VM(oop_result, entry_point, check_exceptions);
1245 }
1246 
1247 void MacroAssembler::call_VM_leaf(address entry_point) {
1248   call_VM_leaf_base(entry_point);
1249 }
1250 
1251 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) {
1252   mr_if_needed(R3_ARG1, arg_1);
1253   call_VM_leaf(entry_point);
1254 }
1255 
1256 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) {
1257   mr_if_needed(R3_ARG1, arg_1);
1258   assert(arg_2 != R3_ARG1, "smashed argument");
1259   mr_if_needed(R4_ARG2, arg_2);
1260   call_VM_leaf(entry_point);
1261 }
1262 
1263 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) {
1264   mr_if_needed(R3_ARG1, arg_1);
1265   assert(arg_2 != R3_ARG1, "smashed argument");
1266   mr_if_needed(R4_ARG2, arg_2);
1267   assert(arg_3 != R3_ARG1 && arg_3 != R4_ARG2, "smashed argument");
1268   mr_if_needed(R5_ARG3, arg_3);
1269   call_VM_leaf(entry_point);
1270 }
1271 
1272 // Check whether instruction is a read access to the polling page
1273 // which was emitted by load_from_polling_page(..).
1274 bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext,
1275                                                address* polling_address_ptr) {
1276   if (!is_ld(instruction))
1277     return false; // It's not a ld. Fail.
1278 
1279   int rt = inv_rt_field(instruction);
1280   int ra = inv_ra_field(instruction);
1281   int ds = inv_ds_field(instruction);
1282   if (!(ds == 0 && ra != 0 && rt == 0)) {
1283     return false; // It's not a ld(r0, X, ra). Fail.
1284   }
1285 
1286   if (!ucontext) {
1287     // Set polling address.
1288     if (polling_address_ptr != NULL) {
1289       *polling_address_ptr = NULL;
1290     }
1291     return true; // No ucontext given. Can't check value of ra. Assume true.
1292   }
1293 
1294 #ifdef LINUX
1295   // Ucontext given. Check that register ra contains the address of
1296   // the safepoint polling page.
1297   ucontext_t* uc = (ucontext_t*) ucontext;
1298   // Set polling address.
1299   address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds;
1300   if (polling_address_ptr != NULL) {
1301     *polling_address_ptr = addr;
1302   }
1303   return os::is_poll_address(addr);
1304 #else
1305   // Not on Linux, ucontext must be NULL.
1306   ShouldNotReachHere();
1307   return false;
1308 #endif
1309 }
1310 
1311 bool MacroAssembler::is_memory_serialization(int instruction, JavaThread* thread, void* ucontext) {
1312 #ifdef LINUX
1313   ucontext_t* uc = (ucontext_t*) ucontext;
1314 
1315   if (is_stwx(instruction) || is_stwux(instruction)) {
1316     int ra = inv_ra_field(instruction);
1317     int rb = inv_rb_field(instruction);
1318 
1319     // look up content of ra and rb in ucontext
1320     address ra_val = (address)uc->uc_mcontext.regs->gpr[ra];
1321     long rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
1322     return os::is_memory_serialize_page(thread, ra_val+rb_val);
1323   } else if (is_stw(instruction) || is_stwu(instruction)) {
1324     int ra = inv_ra_field(instruction);
1325     int d1 = inv_d1_field(instruction);
1326 
1327     // look up content of ra in ucontext
1328     address ra_val = (address)uc->uc_mcontext.regs->gpr[ra];
1329     return os::is_memory_serialize_page(thread, ra_val+d1);
1330   } else {
1331     return false;
1332   }
1333 #else
1334   // workaround not needed on !LINUX :-)
1335   ShouldNotCallThis();
1336   return false;
1337 #endif
1338 }
1339 
1340 void MacroAssembler::bang_stack_with_offset(int offset) {
1341   // When increasing the stack, the old stack pointer will be written
1342   // to the new top of stack according to the PPC64 ABI.
1343   // Therefore, stack banging is not necessary when increasing
1344   // the stack by <= os::vm_page_size() bytes.
1345   // When increasing the stack by a larger amount, this method is
1346   // called repeatedly to bang the intermediate pages.
1347 
1348   // Stack grows down, caller passes positive offset.
1349   assert(offset > 0, "must bang with positive offset");
1350 
1351   long stdoffset = -offset;
1352 
1353   if (is_simm(stdoffset, 16)) {
1354     // Signed 16 bit offset, a simple std is ok.
1355     if (UseLoadInstructionsForStackBangingPPC64) {
1356       ld(R0, (int)(signed short)stdoffset, R1_SP);
1357     } else {
1358       std(R0, (int)(signed short)stdoffset, R1_SP);
1359     }
1360   } else if (is_simm(stdoffset, 31)) {
1361     const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset);
1362     const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset);
1363 
1364     Register tmp = R11;
1365     addis(tmp, R1_SP, hi);
1366     if (UseLoadInstructionsForStackBangingPPC64) {
1367       ld(R0,  lo, tmp);
1368     } else {
1369       std(R0, lo, tmp);
1370     }
1371   } else {
1372     ShouldNotReachHere();
1373   }
1374 }
1375 
1376 // If instruction is a stack bang of the form
1377 //    std    R0,    x(Ry),       (see bang_stack_with_offset())
1378 //    stdu   R1_SP, x(R1_SP),    (see push_frame(), resize_frame())
1379 // or stdux  R1_SP, Rx, R1_SP    (see push_frame(), resize_frame())
1380 // return the banged address. Otherwise, return 0.
1381 address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) {
1382 #ifdef LINUX
1383   ucontext_t* uc = (ucontext_t*) ucontext;
1384   int rs = inv_rs_field(instruction);
1385   int ra = inv_ra_field(instruction);
1386   if (   (is_ld(instruction)   && rs == 0 &&  UseLoadInstructionsForStackBangingPPC64)
1387       || (is_std(instruction)  && rs == 0 && !UseLoadInstructionsForStackBangingPPC64)
1388       || (is_stdu(instruction) && rs == 1)) {
1389     int ds = inv_ds_field(instruction);
1390     // return banged address
1391     return ds+(address)uc->uc_mcontext.regs->gpr[ra];
1392   } else if (is_stdux(instruction) && rs == 1) {
1393     int rb = inv_rb_field(instruction);
1394     address sp = (address)uc->uc_mcontext.regs->gpr[1];
1395     long rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
1396     return ra != 1 || rb_val >= 0 ? NULL         // not a stack bang
1397                                   : sp + rb_val; // banged address
1398   }
1399   return NULL; // not a stack bang
1400 #else
1401   // workaround not needed on !LINUX :-)
1402   ShouldNotCallThis();
1403   return NULL;
1404 #endif
1405 }
1406 
1407 void MacroAssembler::reserved_stack_check(Register return_pc) {
1408   // Test if reserved zone needs to be enabled.
1409   Label no_reserved_zone_enabling;
1410 
1411   ld_ptr(R0, JavaThread::reserved_stack_activation_offset(), R16_thread);
1412   cmpld(CCR0, R1_SP, R0);
1413   blt_predict_taken(CCR0, no_reserved_zone_enabling);
1414 
1415   // Enable reserved zone again, throw stack overflow exception.
1416   push_frame_reg_args(0, R0);
1417   call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), R16_thread);
1418   pop_frame();
1419   mtlr(return_pc);
1420   load_const_optimized(R0, StubRoutines::throw_delayed_StackOverflowError_entry());
1421   mtctr(R0);
1422   bctr();
1423 
1424   should_not_reach_here();
1425 
1426   bind(no_reserved_zone_enabling);
1427 }
1428 
1429 void MacroAssembler::getandsetd(Register dest_current_value, Register exchange_value, Register addr_base,
1430                                 bool cmpxchgx_hint) {
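  // Classic load-reserve/store-conditional loop: ldarx places a reservation
  // on the doubleword, stdcx_ succeeds only if the reservation is still
  // intact (CCR0.eq); otherwise we retry.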
1431   Label retry;
1432   bind(retry);
1433   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1434   stdcx_(exchange_value, addr_base);
1435   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1436     bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1437   } else {
1438     bne(                  CCR0, retry); // StXcx_ sets CCR0.
1439   }
1440 }
1441 
1442 void MacroAssembler::getandaddd(Register dest_current_value, Register inc_value, Register addr_base,
1443                                 Register tmp, bool cmpxchgx_hint) {
1444   Label retry;
1445   bind(retry);
1446   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1447   add(tmp, dest_current_value, inc_value);
1448   stdcx_(tmp, addr_base);
1449   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1450     bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1451   } else {
1452     bne(                  CCR0, retry); // StXcx_ sets CCR0.
1453   }
1454 }
1455 
1456 // Word/sub-word atomic helper functions
1457 
1458 // Temps and addr_base are killed if size < 4 and processor does not support respective instructions.
1459 // Only signed types are supported with size < 4.
1460 // Atomic add always kills tmp1.
1461 void MacroAssembler::atomic_get_and_modify_generic(Register dest_current_value, Register exchange_value,
1462                                                    Register addr_base, Register tmp1, Register tmp2, Register tmp3,
1463                                                    bool cmpxchgx_hint, bool is_add, int size) {
1464   // Sub-word instructions are available since Power 8.
1465   // For older processors, instruction_type != size holds, and we
1466   // emulate the sub-word instructions by constructing a 4-byte value
1467   // that leaves the other bytes unchanged.
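  // E.g., on the emulation path (little-endian, size == 1, addr_base & 3 == 2):
  // shift_amount becomes 16, addr_base is aligned down to the containing word,
  // lwarx loads that word, srw moves the target byte into the low 8 bits, and
  // the xor/clrldi/slw/xor sequence merges the new byte back into bits 16..23
  // before stwcx_ stores the whole word.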
1468   const int instruction_type = VM_Version::has_lqarx() ? size : 4;
1469 
1470   Label retry;
1471   Register shift_amount = noreg,
1472            val32 = dest_current_value,
1473            modval = is_add ? tmp1 : exchange_value;
1474 
1475   if (instruction_type != size) {
1476     assert_different_registers(tmp1, tmp2, tmp3, dest_current_value, exchange_value, addr_base);
1477     modval = tmp1;
1478     shift_amount = tmp2;
1479     val32 = tmp3;
    // Need some preparation: compute shift amount, align address. Note: shorts must be 2-byte aligned.
1481 #ifdef VM_LITTLE_ENDIAN
1482     rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
1483     clrrdi(addr_base, addr_base, 2);
1484 #else
1485     xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
1486     clrrdi(addr_base, addr_base, 2);
1487     rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
1488 #endif
1489   }
1490 
1491   // atomic emulation loop
1492   bind(retry);
1493 
1494   switch (instruction_type) {
1495     case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
1496     case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
1497     case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
1498     default: ShouldNotReachHere();
1499   }
1500 
1501   if (instruction_type != size) {
1502     srw(dest_current_value, val32, shift_amount);
1503   }
1504 
1505   if (is_add) { add(modval, dest_current_value, exchange_value); }
1506 
1507   if (instruction_type != size) {
1508     // Transform exchange value such that the replacement can be done by one xor instruction.
1509     xorr(modval, dest_current_value, is_add ? modval : exchange_value);
1510     clrldi(modval, modval, (size == 1) ? 56 : 48);
1511     slw(modval, modval, shift_amount);
1512     xorr(modval, val32, modval);
1513   }
1514 
1515   switch (instruction_type) {
1516     case 4: stwcx_(modval, addr_base); break;
1517     case 2: sthcx_(modval, addr_base); break;
1518     case 1: stbcx_(modval, addr_base); break;
1519     default: ShouldNotReachHere();
1520   }
1521 
1522   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1523     bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1524   } else {
1525     bne(                  CCR0, retry); // StXcx_ sets CCR0.
1526   }
1527 
1528   // l?arx zero-extends, but Java wants byte/short values sign-extended.
1529   if (size == 1) {
1530     extsb(dest_current_value, dest_current_value);
1531   } else if (size == 2) {
1532     extsh(dest_current_value, dest_current_value);
1533   };
1534 }
1535 
1536 // Temps, addr_base and exchange_value are killed if size < 4 and processor does not support respective instructions.
1537 // Only signed types are supported with size < 4.
1538 void MacroAssembler::cmpxchg_loop_body(ConditionRegister flag, Register dest_current_value,
1539                                        Register compare_value, Register exchange_value,
1540                                        Register addr_base, Register tmp1, Register tmp2,
1541                                        Label &retry, Label &failed, bool cmpxchgx_hint, int size) {
1542   // Sub-word instructions are available since Power 8.
1543   // For older processors, instruction_type != size holds, and we
1544   // emulate the sub-word instructions by constructing a 4-byte value
1545   // that leaves the other bytes unchanged.
1546   const int instruction_type = VM_Version::has_lqarx() ? size : 4;
1547 
1548   Register shift_amount = noreg,
1549            val32 = dest_current_value,
1550            modval = exchange_value;
1551 
1552   if (instruction_type != size) {
1553     assert_different_registers(tmp1, tmp2, dest_current_value, compare_value, exchange_value, addr_base);
1554     shift_amount = tmp1;
1555     val32 = tmp2;
1556     modval = tmp2;
    // Need some preparation: compute shift amount, align address. Note: shorts must be 2-byte aligned.
1558 #ifdef VM_LITTLE_ENDIAN
1559     rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
1560     clrrdi(addr_base, addr_base, 2);
1561 #else
1562     xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
1563     clrrdi(addr_base, addr_base, 2);
1564     rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
1565 #endif
1566     // Transform exchange value such that the replacement can be done by one xor instruction.
1567     xorr(exchange_value, compare_value, exchange_value);
1568     clrldi(exchange_value, exchange_value, (size == 1) ? 56 : 48);
1569     slw(exchange_value, exchange_value, shift_amount);
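    // exchange_value now holds (compare_value ^ exchange_value) shifted into
    // the target position; on a successful compare, xor-ing it with the loaded
    // word (val32) below yields exchange_value in exactly that byte/halfword
    // while leaving the other bytes unchanged.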
1570   }
1571 
1572   // atomic emulation loop
1573   bind(retry);
1574 
1575   switch (instruction_type) {
1576     case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
1577     case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
1578     case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
1579     default: ShouldNotReachHere();
1580   }
1581 
1582   if (instruction_type != size) {
1583     srw(dest_current_value, val32, shift_amount);
1584   }
1585   if (size == 1) {
1586     extsb(dest_current_value, dest_current_value);
1587   } else if (size == 2) {
1588     extsh(dest_current_value, dest_current_value);
1589   };
1590 
1591   cmpw(flag, dest_current_value, compare_value);
1592   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1593     bne_predict_not_taken(flag, failed);
1594   } else {
1595     bne(                  flag, failed);
1596   }
1597   // branch to done  => (flag == ne), (dest_current_value != compare_value)
1598   // fall through    => (flag == eq), (dest_current_value == compare_value)
1599 
1600   if (instruction_type != size) {
1601     xorr(modval, val32, exchange_value);
1602   }
1603 
1604   switch (instruction_type) {
1605     case 4: stwcx_(modval, addr_base); break;
1606     case 2: sthcx_(modval, addr_base); break;
1607     case 1: stbcx_(modval, addr_base); break;
1608     default: ShouldNotReachHere();
1609   }
1610 }
1611 
1612 // CmpxchgX sets condition register to cmpX(current, compare).
1613 void MacroAssembler::cmpxchg_generic(ConditionRegister flag, Register dest_current_value,
1614                                      Register compare_value, Register exchange_value,
1615                                      Register addr_base, Register tmp1, Register tmp2,
1616                                      int semantics, bool cmpxchgx_hint,
1617                                      Register int_flag_success, bool contention_hint, bool weak, int size) {
1618   Label retry;
1619   Label failed;
1620   Label done;
1621 
1622   // Save one branch if result is returned via register and
1623   // result register is different from the other ones.
1624   bool use_result_reg    = (int_flag_success != noreg);
1625   bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value &&
1626                             int_flag_success != exchange_value && int_flag_success != addr_base &&
1627                             int_flag_success != tmp1 && int_flag_success != tmp2);
1628   assert(!weak || flag == CCR0, "weak only supported with CCR0");
1629   assert(size == 1 || size == 2 || size == 4, "unsupported");
1630 
1631   if (use_result_reg && preset_result_reg) {
1632     li(int_flag_success, 0); // preset (assume cas failed)
1633   }
1634 
1635   // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1636   if (contention_hint) { // Don't try to reserve if cmp fails.
1637     switch (size) {
1638       case 1: lbz(dest_current_value, 0, addr_base); extsb(dest_current_value, dest_current_value); break;
1639       case 2: lha(dest_current_value, 0, addr_base); break;
1640       case 4: lwz(dest_current_value, 0, addr_base); break;
1641       default: ShouldNotReachHere();
1642     }
1643     cmpw(flag, dest_current_value, compare_value);
1644     bne(flag, failed);
1645   }
1646 
1647   // release/fence semantics
1648   if (semantics & MemBarRel) {
1649     release();
1650   }
1651 
1652   cmpxchg_loop_body(flag, dest_current_value, compare_value, exchange_value, addr_base, tmp1, tmp2,
1653                     retry, failed, cmpxchgx_hint, size);
1654   if (!weak || use_result_reg) {
1655     if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1656       bne_predict_not_taken(CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
1657     } else {
1658       bne(                  CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
1659     }
1660   }
1661   // fall through    => (flag == eq), (dest_current_value == compare_value), (swapped)
1662 
1663   // Result in register (must do this at the end because int_flag_success can be the
1664   // same register as one above).
1665   if (use_result_reg) {
1666     li(int_flag_success, 1);
1667   }
1668 
1669   if (semantics & MemBarFenceAfter) {
1670     fence();
1671   } else if (semantics & MemBarAcq) {
1672     isync();
1673   }
1674 
1675   if (use_result_reg && !preset_result_reg) {
1676     b(done);
1677   }
1678 
1679   bind(failed);
1680   if (use_result_reg && !preset_result_reg) {
1681     li(int_flag_success, 0);
1682   }
1683 
1684   bind(done);
1685   // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1686   // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1687 }
1688 
// Performs an atomic compare exchange:
1690 //   if (compare_value == *addr_base)
1691 //     *addr_base = exchange_value
1692 //     int_flag_success = 1;
1693 //   else
1694 //     int_flag_success = 0;
1695 //
1696 // ConditionRegister flag       = cmp(compare_value, *addr_base)
1697 // Register dest_current_value  = *addr_base
1698 // Register compare_value       Used to compare with value in memory
1699 // Register exchange_value      Written to memory if compare_value == *addr_base
1700 // Register addr_base           The memory location to compareXChange
1701 // Register int_flag_success    Set to 1 if exchange_value was written to *addr_base
1702 //
// To avoid the costly compare-and-exchange, the value can be tested beforehand
// (contention_hint). Several special cases exist to avoid generating unnecessary code.
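// The 'semantics' bits select the memory ordering: MemBarRel emits a release
// barrier before the update, MemBarAcq an isync after it, and MemBarFenceAfter
// a full fence. At most one of int_flag_success and failed_ext may be supplied.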
1705 //
1706 void MacroAssembler::cmpxchgd(ConditionRegister flag,
1707                               Register dest_current_value, RegisterOrConstant compare_value, Register exchange_value,
1708                               Register addr_base, int semantics, bool cmpxchgx_hint,
1709                               Register int_flag_success, Label* failed_ext, bool contention_hint, bool weak) {
1710   Label retry;
1711   Label failed_int;
1712   Label& failed = (failed_ext != NULL) ? *failed_ext : failed_int;
1713   Label done;
1714 
1715   // Save one branch if result is returned via register and result register is different from the other ones.
1716   bool use_result_reg    = (int_flag_success!=noreg);
1717   bool preset_result_reg = (int_flag_success!=dest_current_value && int_flag_success!=compare_value.register_or_noreg() &&
1718                             int_flag_success!=exchange_value && int_flag_success!=addr_base);
1719   assert(!weak || flag == CCR0, "weak only supported with CCR0");
1720   assert(int_flag_success == noreg || failed_ext == NULL, "cannot have both");
1721 
1722   if (use_result_reg && preset_result_reg) {
1723     li(int_flag_success, 0); // preset (assume cas failed)
1724   }
1725 
1726   // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1727   if (contention_hint) { // Don't try to reserve if cmp fails.
1728     ld(dest_current_value, 0, addr_base);
1729     cmpd(flag, compare_value, dest_current_value);
1730     bne(flag, failed);
1731   }
1732 
1733   // release/fence semantics
1734   if (semantics & MemBarRel) {
1735     release();
1736   }
1737 
1738   // atomic emulation loop
1739   bind(retry);
1740 
1741   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1742   cmpd(flag, compare_value, dest_current_value);
1743   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1744     bne_predict_not_taken(flag, failed);
1745   } else {
1746     bne(                  flag, failed);
1747   }
1748 
1749   stdcx_(exchange_value, addr_base);
1750   if (!weak || use_result_reg || failed_ext) {
1751     if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1752       bne_predict_not_taken(CCR0, weak ? failed : retry); // stXcx_ sets CCR0
1753     } else {
1754       bne(                  CCR0, weak ? failed : retry); // stXcx_ sets CCR0
1755     }
1756   }
1757 
1758   // result in register (must do this at the end because int_flag_success can be the same register as one above)
1759   if (use_result_reg) {
1760     li(int_flag_success, 1);
1761   }
1762 
1763   if (semantics & MemBarFenceAfter) {
1764     fence();
1765   } else if (semantics & MemBarAcq) {
1766     isync();
1767   }
1768 
1769   if (use_result_reg && !preset_result_reg) {
1770     b(done);
1771   }
1772 
1773   bind(failed_int);
1774   if (use_result_reg && !preset_result_reg) {
1775     li(int_flag_success, 0);
1776   }
1777 
1778   bind(done);
1779   // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1780   // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1781 }
1782 
1783 // Look up the method for a megamorphic invokeinterface call.
1784 // The target method is determined by <intf_klass, itable_index>.
1785 // The receiver klass is in recv_klass.
1786 // On success, the result will be in method_result, and execution falls through.
1787 // On failure, execution transfers to the given label.
1788 void MacroAssembler::lookup_interface_method(Register recv_klass,
1789                                              Register intf_klass,
1790                                              RegisterOrConstant itable_index,
1791                                              Register method_result,
1792                                              Register scan_temp,
1793                                              Register temp2,
1794                                              Label& L_no_such_interface,
1795                                              bool return_method) {
1796   assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);
1797 
1798   // Compute start of first itableOffsetEntry (which is at the end of the vtable).
1799   int vtable_base = in_bytes(Klass::vtable_start_offset());
1800   int itentry_off = itableMethodEntry::method_offset_in_bytes();
1801   int logMEsize   = exact_log2(itableMethodEntry::size() * wordSize);
1802   int scan_step   = itableOffsetEntry::size() * wordSize;
1803   int log_vte_size= exact_log2(vtableEntry::size_in_bytes());
1804 
1805   lwz(scan_temp, in_bytes(Klass::vtable_length_offset()), recv_klass);
1806   // %%% We should store the aligned, prescaled offset in the klassoop.
1807   // Then the next several instructions would fold away.
1808 
1809   sldi(scan_temp, scan_temp, log_vte_size);
1810   addi(scan_temp, scan_temp, vtable_base);
1811   add(scan_temp, recv_klass, scan_temp);
1812 
1813   // Adjust recv_klass by scaled itable_index, so we can free itable_index.
1814   if (return_method) {
1815     if (itable_index.is_register()) {
1816       Register itable_offset = itable_index.as_register();
1817       sldi(method_result, itable_offset, logMEsize);
1818       if (itentry_off) { addi(method_result, method_result, itentry_off); }
1819       add(method_result, method_result, recv_klass);
1820     } else {
1821       long itable_offset = (long)itable_index.as_constant();
1822       // static address, no relocation
1823       add_const_optimized(method_result, recv_klass, (itable_offset << logMEsize) + itentry_off, temp2);
1824     }
1825   }
1826 
1827   // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
1828   //   if (scan->interface() == intf) {
1829   //     result = (klass + scan->offset() + itable_index);
1830   //   }
1831   // }
1832   Label search, found_method;
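  // The scan loop is peeled once: the first pass (peel == 1) branches directly
  // to found_method on a hit, while the second pass inverts the test so a hit
  // falls through to found_method and a miss loops back via search.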
1833 
1834   for (int peel = 1; peel >= 0; peel--) {
1835     // %%%% Could load both offset and interface in one ldx, if they were
1836     // in the opposite order. This would save a load.
1837     ld(temp2, itableOffsetEntry::interface_offset_in_bytes(), scan_temp);
1838 
    // Check that this entry is non-null. A null entry means that
    // the receiver class doesn't implement the interface, and therefore
    // isn't the same class as when the caller was compiled.
1842     cmpd(CCR0, temp2, intf_klass);
1843 
1844     if (peel) {
1845       beq(CCR0, found_method);
1846     } else {
1847       bne(CCR0, search);
1848       // (invert the test to fall through to found_method...)
1849     }
1850 
1851     if (!peel) break;
1852 
1853     bind(search);
1854 
1855     cmpdi(CCR0, temp2, 0);
1856     beq(CCR0, L_no_such_interface);
1857     addi(scan_temp, scan_temp, scan_step);
1858   }
1859 
1860   bind(found_method);
1861 
1862   // Got a hit.
1863   if (return_method) {
1864     int ito_offset = itableOffsetEntry::offset_offset_in_bytes();
1865     lwz(scan_temp, ito_offset, scan_temp);
1866     ldx(method_result, scan_temp, method_result);
1867   }
1868 }
1869 
1870 // virtual method calling
1871 void MacroAssembler::lookup_virtual_method(Register recv_klass,
1872                                            RegisterOrConstant vtable_index,
1873                                            Register method_result) {
1874 
1875   assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg());
1876 
1877   const int base = in_bytes(Klass::vtable_start_offset());
1878   assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
1879 
1880   if (vtable_index.is_register()) {
1881     sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord);
1882     add(recv_klass, vtable_index.as_register(), recv_klass);
1883   } else {
1884     addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord);
1885   }
1886   ld(R19_method, base + vtableEntry::method_offset_in_bytes(), recv_klass);
1887 }
1888 
1889 /////////////////////////////////////////// subtype checking ////////////////////////////////////////////
1890 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
1891                                                    Register super_klass,
1892                                                    Register temp1_reg,
1893                                                    Register temp2_reg,
1894                                                    Label* L_success,
1895                                                    Label* L_failure,
1896                                                    Label* L_slow_path,
1897                                                    RegisterOrConstant super_check_offset) {
1898 
1899   const Register check_cache_offset = temp1_reg;
1900   const Register cached_super       = temp2_reg;
1901 
1902   assert_different_registers(sub_klass, super_klass, check_cache_offset, cached_super);
1903 
1904   int sco_offset = in_bytes(Klass::super_check_offset_offset());
1905   int sc_offset  = in_bytes(Klass::secondary_super_cache_offset());
1906 
1907   bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
1908   bool need_slow_path = (must_load_sco || super_check_offset.constant_or_zero() == sco_offset);
1909 
1910   Label L_fallthrough;
1911   int label_nulls = 0;
1912   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
1913   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
1914   if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
1915   assert(label_nulls <= 1 ||
1916          (L_slow_path == &L_fallthrough && label_nulls <= 2 && !need_slow_path),
1917          "at most one NULL in the batch, usually");
1918 
1919   // If the pointers are equal, we are done (e.g., String[] elements).
1920   // This self-check enables sharing of secondary supertype arrays among
1921   // non-primary types such as array-of-interface. Otherwise, each such
1922   // type would need its own customized SSA.
1923   // We move this check to the front of the fast path because many
1924   // type checks are in fact trivially successful in this manner,
1925   // so we get a nicely predicted branch right at the start of the check.
1926   cmpd(CCR0, sub_klass, super_klass);
1927   beq(CCR0, *L_success);
1928 
1929   // Check the supertype display:
1930   if (must_load_sco) {
1931     // The super check offset is always positive...
1932     lwz(check_cache_offset, sco_offset, super_klass);
1933     super_check_offset = RegisterOrConstant(check_cache_offset);
1934     // super_check_offset is register.
1935     assert_different_registers(sub_klass, super_klass, cached_super, super_check_offset.as_register());
1936   }
1937   // The loaded value is the offset from KlassOopDesc.
1938 
1939   ld(cached_super, super_check_offset, sub_klass);
1940   cmpd(CCR0, cached_super, super_klass);
1941 
1942   // This check has worked decisively for primary supers.
1943   // Secondary supers are sought in the super_cache ('super_cache_addr').
1944   // (Secondary supers are interfaces and very deeply nested subtypes.)
1945   // This works in the same check above because of a tricky aliasing
1946   // between the super_cache and the primary super display elements.
1947   // (The 'super_check_addr' can address either, as the case requires.)
1948   // Note that the cache is updated below if it does not help us find
1949   // what we need immediately.
1950   // So if it was a primary super, we can just fail immediately.
1951   // Otherwise, it's the slow path for us (no success at this point).
1952 
1953 #define FINAL_JUMP(label) if (&(label) != &L_fallthrough) { b(label); }
1954 
1955   if (super_check_offset.is_register()) {
1956     beq(CCR0, *L_success);
1957     cmpwi(CCR0, super_check_offset.as_register(), sc_offset);
1958     if (L_failure == &L_fallthrough) {
1959       beq(CCR0, *L_slow_path);
1960     } else {
1961       bne(CCR0, *L_failure);
1962       FINAL_JUMP(*L_slow_path);
1963     }
1964   } else {
1965     if (super_check_offset.as_constant() == sc_offset) {
1966       // Need a slow path; fast failure is impossible.
1967       if (L_slow_path == &L_fallthrough) {
1968         beq(CCR0, *L_success);
1969       } else {
1970         bne(CCR0, *L_slow_path);
1971         FINAL_JUMP(*L_success);
1972       }
1973     } else {
1974       // No slow path; it's a fast decision.
1975       if (L_failure == &L_fallthrough) {
1976         beq(CCR0, *L_success);
1977       } else {
1978         bne(CCR0, *L_failure);
1979         FINAL_JUMP(*L_success);
1980       }
1981     }
1982   }
1983 
1984   bind(L_fallthrough);
1985 #undef FINAL_JUMP
1986 }
1987 
1988 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
1989                                                    Register super_klass,
1990                                                    Register temp1_reg,
1991                                                    Register temp2_reg,
1992                                                    Label* L_success,
1993                                                    Register result_reg) {
1994   const Register array_ptr = temp1_reg; // current value from cache array
1995   const Register temp      = temp2_reg;
1996 
1997   assert_different_registers(sub_klass, super_klass, array_ptr, temp);
1998 
1999   int source_offset = in_bytes(Klass::secondary_supers_offset());
2000   int target_offset = in_bytes(Klass::secondary_super_cache_offset());
2001 
2002   int length_offset = Array<Klass*>::length_offset_in_bytes();
2003   int base_offset   = Array<Klass*>::base_offset_in_bytes();
2004 
2005   Label hit, loop, failure, fallthru;
2006 
2007   ld(array_ptr, source_offset, sub_klass);
2008 
2009   // TODO: PPC port: assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated.");
2010   lwz(temp, length_offset, array_ptr);
2011   cmpwi(CCR0, temp, 0);
2012   beq(CCR0, result_reg!=noreg ? failure : fallthru); // length 0
2013 
2014   mtctr(temp); // load ctr
2015 
2016   bind(loop);
  // Oops in the table are no longer compressed.
2018   ld(temp, base_offset, array_ptr);
2019   cmpd(CCR0, temp, super_klass);
2020   beq(CCR0, hit);
2021   addi(array_ptr, array_ptr, BytesPerWord);
2022   bdnz(loop);
2023 
2024   bind(failure);
  if (result_reg != noreg) { li(result_reg, 1); } // load non-zero result (indicates a miss)
2026   b(fallthru);
2027 
2028   bind(hit);
2029   std(super_klass, target_offset, sub_klass); // save result to cache
2030   if (result_reg != noreg) { li(result_reg, 0); } // load zero result (indicates a hit)
2031   if (L_success != NULL) { b(*L_success); }
2032   else if (result_reg == noreg) { blr(); } // return with CR0.eq if neither label nor result reg provided
2033 
2034   bind(fallthru);
2035 }
2036 
2037 // Try fast path, then go to slow one if not successful
2038 void MacroAssembler::check_klass_subtype(Register sub_klass,
2039                          Register super_klass,
2040                          Register temp1_reg,
2041                          Register temp2_reg,
2042                          Label& L_success) {
2043   Label L_failure;
2044   check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success, &L_failure);
2045   check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success);
2046   bind(L_failure); // Fallthru if not successful.
2047 }
2048 
2049 void MacroAssembler::check_method_handle_type(Register mtype_reg, Register mh_reg,
2050                                               Register temp_reg,
2051                                               Label& wrong_method_type) {
2052   assert_different_registers(mtype_reg, mh_reg, temp_reg);
2053   // Compare method type against that of the receiver.
2054   load_heap_oop_not_null(temp_reg, delayed_value(java_lang_invoke_MethodHandle::type_offset_in_bytes, temp_reg), mh_reg);
2055   cmpd(CCR0, temp_reg, mtype_reg);
2056   bne(CCR0, wrong_method_type);
2057 }
2058 
2059 RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot,
2060                                                    Register temp_reg,
2061                                                    int extra_slot_offset) {
2062   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
2063   int stackElementSize = Interpreter::stackElementSize;
2064   int offset = extra_slot_offset * stackElementSize;
2065   if (arg_slot.is_constant()) {
2066     offset += arg_slot.as_constant() * stackElementSize;
2067     return offset;
2068   } else {
2069     assert(temp_reg != noreg, "must specify");
2070     sldi(temp_reg, arg_slot.as_register(), exact_log2(stackElementSize));
2071     if (offset != 0)
2072       addi(temp_reg, temp_reg, offset);
2073     return temp_reg;
2074   }
2075 }
2076 
2077 // Supports temp2_reg = R0.
2078 void MacroAssembler::biased_locking_enter(ConditionRegister cr_reg, Register obj_reg,
2079                                           Register mark_reg, Register temp_reg,
2080                                           Register temp2_reg, Label& done, Label* slow_case) {
2081   assert(UseBiasedLocking, "why call this otherwise?");
2082 
2083 #ifdef ASSERT
2084   assert_different_registers(obj_reg, mark_reg, temp_reg, temp2_reg);
2085 #endif
2086 
2087   Label cas_label;
2088 
2089   // Branch to done if fast path fails and no slow_case provided.
2090   Label *slow_case_int = (slow_case != NULL) ? slow_case : &done;
2091 
2092   // Biased locking
2093   // See whether the lock is currently biased toward our thread and
2094   // whether the epoch is still valid
2095   // Note that the runtime guarantees sufficient alignment of JavaThread
2096   // pointers to allow age to be placed into low bits
2097   assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits,
2098          "biased locking makes assumptions about bit layout");
2099 
2100   if (PrintBiasedLockingStatistics) {
2101     load_const(temp2_reg, (address) BiasedLocking::total_entry_count_addr(), temp_reg);
2102     lwzx(temp_reg, temp2_reg);
2103     addi(temp_reg, temp_reg, 1);
2104     stwx(temp_reg, temp2_reg);
2105   }
2106 
2107   andi(temp_reg, mark_reg, markOopDesc::biased_lock_mask_in_place);
2108   cmpwi(cr_reg, temp_reg, markOopDesc::biased_lock_pattern);
2109   bne(cr_reg, cas_label);
2110 
2111   load_klass(temp_reg, obj_reg);
2112 
2113   load_const_optimized(temp2_reg, ~((int) markOopDesc::age_mask_in_place));
2114   ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
2115   orr(temp_reg, R16_thread, temp_reg);
2116   xorr(temp_reg, mark_reg, temp_reg);
2117   andr(temp_reg, temp_reg, temp2_reg);
2118   cmpdi(cr_reg, temp_reg, 0);
2119   if (PrintBiasedLockingStatistics) {
2120     Label l;
2121     bne(cr_reg, l);
2122     load_const(temp2_reg, (address) BiasedLocking::biased_lock_entry_count_addr());
2123     lwzx(mark_reg, temp2_reg);
2124     addi(mark_reg, mark_reg, 1);
2125     stwx(mark_reg, temp2_reg);
2126     // restore mark_reg
2127     ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg);
2128     bind(l);
2129   }
2130   beq(cr_reg, done);
2131 
2132   Label try_revoke_bias;
2133   Label try_rebias;
2134 
2135   // At this point we know that the header has the bias pattern and
2136   // that we are not the bias owner in the current epoch. We need to
2137   // figure out more details about the state of the header in order to
2138   // know what operations can be legally performed on the object's
2139   // header.
2140 
2141   // If the low three bits in the xor result aren't clear, that means
2142   // the prototype header is no longer biased and we have to revoke
2143   // the bias on this object.
2144   andi(temp2_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
2145   cmpwi(cr_reg, temp2_reg, 0);
2146   bne(cr_reg, try_revoke_bias);
2147 
2148   // Biasing is still enabled for this data type. See whether the
2149   // epoch of the current bias is still valid, meaning that the epoch
2150   // bits of the mark word are equal to the epoch bits of the
2151   // prototype header. (Note that the prototype header's epoch bits
2152   // only change at a safepoint.) If not, attempt to rebias the object
2153   // toward the current thread. Note that we must be absolutely sure
2154   // that the current epoch is invalid in order to do this because
2155   // otherwise the manipulations it performs on the mark word are
2156   // illegal.
2157 
2158   int shift_amount = 64 - markOopDesc::epoch_shift;
2159   // rotate epoch bits to right (little) end and set other bits to 0
2160   // [ big part | epoch | little part ] -> [ 0..0 | epoch ]
2161   rldicl_(temp2_reg, temp_reg, shift_amount, 64 - markOopDesc::epoch_bits);
2162   // branch if epoch bits are != 0, i.e. they differ, because the epoch has been incremented
2163   bne(CCR0, try_rebias);
2164 
2165   // The epoch of the current bias is still valid but we know nothing
2166   // about the owner; it might be set or it might be clear. Try to
2167   // acquire the bias of the object using an atomic operation. If this
2168   // fails we will go in to the runtime to revoke the object's bias.
2169   // Note that we first construct the presumed unbiased header so we
2170   // don't accidentally blow away another thread's valid bias.
2171   andi(mark_reg, mark_reg, (markOopDesc::biased_lock_mask_in_place |
2172                                 markOopDesc::age_mask_in_place |
2173                                 markOopDesc::epoch_mask_in_place));
2174   orr(temp_reg, R16_thread, mark_reg);
2175 
2176   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2177 
2178   // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg).
2179   cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
2180            /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
2181            /*where=*/obj_reg,
2182            MacroAssembler::MemBarAcq,
2183            MacroAssembler::cmpxchgx_hint_acquire_lock(),
2184            noreg, slow_case_int); // bail out if failed
2185 
2186   // If the biasing toward our thread failed, this means that
2187   // another thread succeeded in biasing it toward itself and we
2188   // need to revoke that bias. The revocation will occur in the
2189   // interpreter runtime in the slow case.
2190   if (PrintBiasedLockingStatistics) {
2191     load_const(temp2_reg, (address) BiasedLocking::anonymously_biased_lock_entry_count_addr(), temp_reg);
2192     lwzx(temp_reg, temp2_reg);
2193     addi(temp_reg, temp_reg, 1);
2194     stwx(temp_reg, temp2_reg);
2195   }
2196   b(done);
2197 
2198   bind(try_rebias);
2199   // At this point we know the epoch has expired, meaning that the
2200   // current "bias owner", if any, is actually invalid. Under these
2201   // circumstances _only_, we are allowed to use the current header's
  // value as the comparison value when doing the CAS to acquire the
2203   // bias in the current epoch. In other words, we allow transfer of
2204   // the bias from one thread to another directly in this situation.
2205   load_klass(temp_reg, obj_reg);
2206   andi(temp2_reg, mark_reg, markOopDesc::age_mask_in_place);
2207   orr(temp2_reg, R16_thread, temp2_reg);
2208   ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
2209   orr(temp_reg, temp2_reg, temp_reg);
2210 
2211   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2212 
2213   cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
2214                  /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
2215                  /*where=*/obj_reg,
2216                  MacroAssembler::MemBarAcq,
2217                  MacroAssembler::cmpxchgx_hint_acquire_lock(),
2218                  noreg, slow_case_int); // bail out if failed
2219 
2220   // If the biasing toward our thread failed, this means that
2221   // another thread succeeded in biasing it toward itself and we
2222   // need to revoke that bias. The revocation will occur in the
2223   // interpreter runtime in the slow case.
2224   if (PrintBiasedLockingStatistics) {
2225     load_const(temp2_reg, (address) BiasedLocking::rebiased_lock_entry_count_addr(), temp_reg);
2226     lwzx(temp_reg, temp2_reg);
2227     addi(temp_reg, temp_reg, 1);
2228     stwx(temp_reg, temp2_reg);
2229   }
2230   b(done);
2231 
2232   bind(try_revoke_bias);
2233   // The prototype mark in the klass doesn't have the bias bit set any
2234   // more, indicating that objects of this data type are not supposed
2235   // to be biased any more. We are going to try to reset the mark of
2236   // this object to the prototype value and fall through to the
2237   // CAS-based locking scheme. Note that if our CAS fails, it means
2238   // that another thread raced us for the privilege of revoking the
2239   // bias of this particular object, so it's okay to continue in the
2240   // normal locking code.
2241   load_klass(temp_reg, obj_reg);
2242   ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
2243   andi(temp2_reg, mark_reg, markOopDesc::age_mask_in_place);
2244   orr(temp_reg, temp_reg, temp2_reg);
2245 
2246   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2247 
2248   // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg).
2249   cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
2250                  /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
2251                  /*where=*/obj_reg,
2252                  MacroAssembler::MemBarAcq,
2253                  MacroAssembler::cmpxchgx_hint_acquire_lock());
2254 
2255   // reload markOop in mark_reg before continuing with lightweight locking
2256   ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg);
2257 
2258   // Fall through to the normal CAS-based lock, because no matter what
2259   // the result of the above CAS, some thread must have succeeded in
2260   // removing the bias bit from the object's header.
2261   if (PrintBiasedLockingStatistics) {
2262     Label l;
2263     bne(cr_reg, l);
2264     load_const(temp2_reg, (address) BiasedLocking::revoked_lock_entry_count_addr(), temp_reg);
2265     lwzx(temp_reg, temp2_reg);
2266     addi(temp_reg, temp_reg, 1);
2267     stwx(temp_reg, temp2_reg);
2268     bind(l);
2269   }
2270 
2271   bind(cas_label);
2272 }
2273 
void MacroAssembler::biased_locking_exit(ConditionRegister cr_reg, Register mark_addr, Register temp_reg, Label& done) {
2275   // Check for biased locking unlock case, which is a no-op
2276   // Note: we do not have to check the thread ID for two reasons.
2277   // First, the interpreter checks for IllegalMonitorStateException at
2278   // a higher level. Second, if the bias was revoked while we held the
2279   // lock, the object could not be rebiased toward another thread, so
2280   // the bias bit would be clear.
2281 
2282   ld(temp_reg, 0, mark_addr);
2283   andi(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
2284 
2285   cmpwi(cr_reg, temp_reg, markOopDesc::biased_lock_pattern);
2286   beq(cr_reg, done);
2287 }
2288 
2289 // allocation (for C1)
2290 void MacroAssembler::eden_allocate(
2291   Register obj,                      // result: pointer to object after successful allocation
2292   Register var_size_in_bytes,        // object size in bytes if unknown at compile time; invalid otherwise
2293   int      con_size_in_bytes,        // object size in bytes if   known at compile time
2294   Register t1,                       // temp register
2295   Register t2,                       // temp register
2296   Label&   slow_case                 // continuation point if fast allocation fails
2297 ) {
2298   b(slow_case);
2299 }
2300 
2301 void MacroAssembler::tlab_allocate(
2302   Register obj,                      // result: pointer to object after successful allocation
2303   Register var_size_in_bytes,        // object size in bytes if unknown at compile time; invalid otherwise
2304   int      con_size_in_bytes,        // object size in bytes if   known at compile time
2305   Register t1,                       // temp register
2306   Label&   slow_case                 // continuation point if fast allocation fails
2307 ) {
2308   // make sure arguments make sense
2309   assert_different_registers(obj, var_size_in_bytes, t1);
2310   assert(0 <= con_size_in_bytes && is_simm13(con_size_in_bytes), "illegal object size");
2311   assert((con_size_in_bytes & MinObjAlignmentInBytesMask) == 0, "object size is not multiple of alignment");
2312 
2313   const Register new_top = t1;
2314   //verify_tlab(); not implemented
2315 
2316   ld(obj, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
2317   ld(R0, in_bytes(JavaThread::tlab_end_offset()), R16_thread);
2318   if (var_size_in_bytes == noreg) {
2319     addi(new_top, obj, con_size_in_bytes);
2320   } else {
2321     add(new_top, obj, var_size_in_bytes);
2322   }
2323   cmpld(CCR0, new_top, R0);
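  // Take the far branch to slow_case if the new top would exceed the TLAB end.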
2324   bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_case);
2325 
2326 #ifdef ASSERT
2327   // make sure new free pointer is properly aligned
2328   {
2329     Label L;
2330     andi_(R0, new_top, MinObjAlignmentInBytesMask);
2331     beq(CCR0, L);
2332     stop("updated TLAB free is not properly aligned", 0x934);
2333     bind(L);
2334   }
2335 #endif // ASSERT
2336 
2337   // update the tlab top pointer
2338   std(new_top, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
2339   //verify_tlab(); not implemented
2340 }
2341 void MacroAssembler::incr_allocated_bytes(RegisterOrConstant size_in_bytes, Register t1, Register t2) {
2342   unimplemented("incr_allocated_bytes");
2343 }
2344 
2345 address MacroAssembler::emit_trampoline_stub(int destination_toc_offset,
2346                                              int insts_call_instruction_offset, Register Rtoc) {
2347   // Start the stub.
2348   address stub = start_a_stub(64);
2349   if (stub == NULL) { return NULL; } // CodeCache full: bail out
2350 
2351   // Create a trampoline stub relocation which relates this trampoline stub
2352   // with the call instruction at insts_call_instruction_offset in the
2353   // instructions code-section.
2354   relocate(trampoline_stub_Relocation::spec(code()->insts()->start() + insts_call_instruction_offset));
2355   const int stub_start_offset = offset();
2356 
2357   // For java_to_interp stubs we use R11_scratch1 as scratch register
2358   // and in call trampoline stubs we use R12_scratch2. This way we
2359   // can distinguish them (see is_NativeCallTrampolineStub_at()).
2360   Register reg_scratch = R12_scratch2;
2361 
2362   // Now, create the trampoline stub's code:
2363   // - load the TOC
2364   // - load the call target from the constant pool
2365   // - call
2366   if (Rtoc == noreg) {
2367     calculate_address_from_global_toc(reg_scratch, method_toc());
2368     Rtoc = reg_scratch;
2369   }
2370 
2371   ld_largeoffset_unchecked(reg_scratch, destination_toc_offset, Rtoc, false);
2372   mtctr(reg_scratch);
2373   bctr();
2374 
2375   const address stub_start_addr = addr_at(stub_start_offset);
2376 
2377   // Assert that the encoded destination_toc_offset can be identified and that it is correct.
2378   assert(destination_toc_offset == NativeCallTrampolineStub_at(stub_start_addr)->destination_toc_offset(),
2379          "encoded offset into the constant pool must match");
2380   // Trampoline_stub_size should be good.
2381   assert((uint)(offset() - stub_start_offset) <= trampoline_stub_size, "should be good size");
2382   assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");
2383 
2384   // End the stub.
2385   end_a_stub();
2386   return stub;
2387 }
2388 
2389 // TM on PPC64.
2390 void MacroAssembler::atomic_inc_ptr(Register addr, Register result, int simm16) {
2391   Label retry;
2392   bind(retry);
2393   ldarx(result, addr, /*hint*/ false);
2394   addi(result, result, simm16);
2395   stdcx_(result, addr);
2396   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
2397     bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
2398   } else {
2399     bne(                  CCR0, retry); // stXcx_ sets CCR0
2400   }
2401 }
2402 
2403 void MacroAssembler::atomic_ori_int(Register addr, Register result, int uimm16) {
2404   Label retry;
2405   bind(retry);
2406   lwarx(result, addr, /*hint*/ false);
2407   ori(result, result, uimm16);
2408   stwcx_(result, addr);
2409   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
2410     bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
2411   } else {
2412     bne(                  CCR0, retry); // stXcx_ sets CCR0
2413   }
2414 }
2415 
2416 #if INCLUDE_RTM_OPT
2417 
2418 // Update rtm_counters based on abort status
2419 // input: abort_status
2420 //        rtm_counters (RTMLockingCounters*)
2421 void MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters_Reg) {
2422   // Mapping to keep PreciseRTMLockingStatistics similar to x86.
2423   // x86 ppc (! means inverted, ? means not the same)
2424   //  0   31  Set if abort caused by XABORT instruction.
2425   //  1  ! 7  If set, the transaction may succeed on a retry. This bit is always clear if bit 0 is set.
2426   //  2   13  Set if another logical processor conflicted with a memory address that was part of the transaction that aborted.
2427   //  3   10  Set if an internal buffer overflowed.
2428   //  4  ?12  Set if a debug breakpoint was hit.
2429   //  5  ?32  Set if an abort occurred during execution of a nested transaction.
2430   const  int tm_failure_bit[] = {Assembler::tm_tabort, // Note: Seems like signal handler sets this, too.
2431                                  Assembler::tm_failure_persistent, // inverted: transient
2432                                  Assembler::tm_trans_cf,
2433                                  Assembler::tm_footprint_of,
2434                                  Assembler::tm_non_trans_cf,
2435                                  Assembler::tm_suspended};
2436   const bool tm_failure_inv[] = {false, true, false, false, false, false};
2437   assert(sizeof(tm_failure_bit)/sizeof(int) == RTMLockingCounters::ABORT_STATUS_LIMIT, "adapt mapping!");
2438 
2439   const Register addr_Reg = R0;
2440   // Keep track of offset to where rtm_counters_Reg had pointed to.
2441   int counters_offs = RTMLockingCounters::abort_count_offset();
2442   addi(addr_Reg, rtm_counters_Reg, counters_offs);
2443   const Register temp_Reg = rtm_counters_Reg;
2444 
2445   //atomic_inc_ptr(addr_Reg, temp_Reg); We don't increment atomically
2446   ldx(temp_Reg, addr_Reg);
2447   addi(temp_Reg, temp_Reg, 1);
2448   stdx(temp_Reg, addr_Reg);
2449 
2450   if (PrintPreciseRTMLockingStatistics) {
2451     int counters_offs_delta = RTMLockingCounters::abortX_count_offset() - counters_offs;
2452 
2453     //mftexasr(abort_status); done by caller
2454     for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
2455       counters_offs += counters_offs_delta;
2456       li(temp_Reg, counters_offs_delta); // can't use addi with R0
2457       add(addr_Reg, addr_Reg, temp_Reg); // point to next counter
2458       counters_offs_delta = sizeof(uintx);
2459 
2460       Label check_abort;
2461       rldicr_(temp_Reg, abort_status, tm_failure_bit[i], 0);
2462       if (tm_failure_inv[i]) {
2463         bne(CCR0, check_abort);
2464       } else {
2465         beq(CCR0, check_abort);
2466       }
2467       //atomic_inc_ptr(addr_Reg, temp_Reg); We don't increment atomically
2468       ldx(temp_Reg, addr_Reg);
2469       addi(temp_Reg, temp_Reg, 1);
2470       stdx(temp_Reg, addr_Reg);
2471       bind(check_abort);
2472     }
2473   }
2474   li(temp_Reg, -counters_offs); // can't use addi with R0
2475   add(rtm_counters_Reg, addr_Reg, temp_Reg); // restore
2476 }
2477 
2478 // Branch if (random & (count-1) != 0), count is 2^n
2479 // tmp and CR0 are killed
2480 void MacroAssembler::branch_on_random_using_tb(Register tmp, int count, Label& brLabel) {
2481   mftb(tmp);
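  // mftb reads the time base; its low bits serve as a cheap pseudo-random value.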
2482   andi_(tmp, tmp, count-1);
2483   bne(CCR0, brLabel);
2484 }
2485 
2486 // Perform abort ratio calculation, set no_rtm bit if high ratio.
2487 // input:  rtm_counters_Reg (RTMLockingCounters* address) - KILLED
2488 void MacroAssembler::rtm_abort_ratio_calculation(Register rtm_counters_Reg,
2489                                                  RTMLockingCounters* rtm_counters,
2490                                                  Metadata* method_data) {
2491   Label L_done, L_check_always_rtm1, L_check_always_rtm2;
2492 
2493   if (RTMLockingCalculationDelay > 0) {
2494     // Delay calculation.
2495     ld(rtm_counters_Reg, (RegisterOrConstant)(intptr_t)RTMLockingCounters::rtm_calculation_flag_addr());
2496     cmpdi(CCR0, rtm_counters_Reg, 0);
2497     beq(CCR0, L_done);
2498     load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload
2499   }
2500   // Abort ratio calculation only if abort_count > RTMAbortThreshold.
2501   //   Aborted transactions = abort_count * 100
2502   //   All transactions = total_count *  RTMTotalCountIncrRate
2503   //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
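  //   E.g. (illustrative values): with RTMAbortRatio = 50 and
  //   RTMTotalCountIncrRate = 1, 60 aborts out of 100 counted transactions give
  //   60 * 100 >= 100 * 1 * 50, so the no_rtm bit is set.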
2504   ld(R0, RTMLockingCounters::abort_count_offset(), rtm_counters_Reg);
2505   if (is_simm(RTMAbortThreshold, 16)) {   // cmpdi can handle 16bit immediate only.
2506     cmpdi(CCR0, R0, RTMAbortThreshold);
2507     blt(CCR0, L_check_always_rtm2);  // reload of rtm_counters_Reg not necessary
2508   } else {
2509     load_const_optimized(rtm_counters_Reg, RTMAbortThreshold);
2510     cmpd(CCR0, R0, rtm_counters_Reg);
2511     blt(CCR0, L_check_always_rtm1);  // reload of rtm_counters_Reg required
2512   }
2513   mulli(R0, R0, 100);
2514 
2515   const Register tmpReg = rtm_counters_Reg;
2516   ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg);
2517   mulli(tmpReg, tmpReg, RTMTotalCountIncrRate); // allowable range: int16
2518   mulli(tmpReg, tmpReg, RTMAbortRatio);         // allowable range: int16
2519   cmpd(CCR0, R0, tmpReg);
2520   blt(CCR0, L_check_always_rtm1); // jump to reload
2521   if (method_data != NULL) {
2522     // Set rtm_state to "no rtm" in MDO.
2523     // Not using a metadata relocation. Method and Class Loader are kept alive anyway.
2524     // (See nmethod::metadata_do and CodeBuffer::finalize_oop_references.)
2525     load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg);
2526     atomic_ori_int(R0, tmpReg, NoRTM);
2527   }
2528   b(L_done);
2529 
2530   bind(L_check_always_rtm1);
2531   load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload
2532   bind(L_check_always_rtm2);
2533   ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg);
2534   int64_t thresholdValue = RTMLockingThreshold / RTMTotalCountIncrRate;
2535   if (is_simm(thresholdValue, 16)) {   // cmpdi can handle 16bit immediate only.
2536     cmpdi(CCR0, tmpReg, thresholdValue);
2537   } else {
2538     load_const_optimized(R0, thresholdValue);
2539     cmpd(CCR0, tmpReg, R0);
2540   }
2541   blt(CCR0, L_done);
2542   if (method_data != NULL) {
2543     // Set rtm_state to "always rtm" in MDO.
2544     // Not using a metadata relocation. See above.
2545     load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg);
2546     atomic_ori_int(R0, tmpReg, UseRTM);
2547   }
2548   bind(L_done);
2549 }
2550 
2551 // Update counters and perform abort ratio calculation.
2552 // input: abort_status_Reg
2553 void MacroAssembler::rtm_profiling(Register abort_status_Reg, Register temp_Reg,
2554                                    RTMLockingCounters* rtm_counters,
2555                                    Metadata* method_data,
2556                                    bool profile_rtm) {
2557 
2558   assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2559   // Update rtm counters based on state at abort.
2560   // Reads abort_status_Reg, updates flags.
2561   assert_different_registers(abort_status_Reg, temp_Reg);
2562   load_const_optimized(temp_Reg, (address)rtm_counters, R0);
2563   rtm_counters_update(abort_status_Reg, temp_Reg);
2564   if (profile_rtm) {
2565     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2566     rtm_abort_ratio_calculation(temp_Reg, rtm_counters, method_data);
2567   }
2568 }
2569 
2570 // Retry on abort if abort's status indicates non-persistent failure.
2571 // inputs: retry_count_Reg
2572 //       : abort_status_Reg
2573 // output: retry_count_Reg decremented by 1
2574 void MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg,
2575                                              Label& retryLabel, Label* checkRetry) {
2576   Label doneRetry;
2577   rldicr_(R0, abort_status_Reg, tm_failure_persistent, 0);
2578   bne(CCR0, doneRetry);
2579   if (checkRetry) { bind(*checkRetry); }
2580   addic_(retry_count_Reg, retry_count_Reg, -1);
2581   blt(CCR0, doneRetry);
2582   b(retryLabel);
2583   bind(doneRetry);
2584 }
2585 
2586 // Spin and retry if lock is busy.
2587 // inputs: owner_addr_Reg (monitor address)
2588 //       : retry_count_Reg
2589 // output: retry_count_Reg decremented by 1
2590 // CTR is killed
2591 void MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register owner_addr_Reg, Label& retryLabel) {
2592   Label SpinLoop, doneRetry, doRetry;
2593   addic_(retry_count_Reg, retry_count_Reg, -1);
2594   blt(CCR0, doneRetry);
2595 
2596   if (RTMSpinLoopCount > 1) {
2597     li(R0, RTMSpinLoopCount);
2598     mtctr(R0);
2599   }
2600 
2601   // low thread priority
2602   smt_prio_low();
2603   bind(SpinLoop);
2604 
2605   if (RTMSpinLoopCount > 1) {
2606     bdz(doRetry);
2607     ld(R0, 0, owner_addr_Reg);
2608     cmpdi(CCR0, R0, 0);
2609     bne(CCR0, SpinLoop);
2610   }
2611 
2612   bind(doRetry);
2613 
2614   // restore thread priority to default in userspace
2615 #ifdef LINUX
2616   smt_prio_medium_low();
2617 #else
2618   smt_prio_medium();
2619 #endif
2620 
2621   b(retryLabel);
2622 
2623   bind(doneRetry);
2624 }
2625 
2626 // Use RTM for normal stack locks.
2627 // Input: objReg (object to lock)
2628 void MacroAssembler::rtm_stack_locking(ConditionRegister flag,
2629                                        Register obj, Register mark_word, Register tmp,
2630                                        Register retry_on_abort_count_Reg,
2631                                        RTMLockingCounters* stack_rtm_counters,
2632                                        Metadata* method_data, bool profile_rtm,
2633                                        Label& DONE_LABEL, Label& IsInflated) {
2634   assert(UseRTMForStackLocks, "why call this otherwise?");
2635   assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
2636   Label L_rtm_retry, L_decrement_retry, L_on_abort;
2637 
2638   if (RTMRetryCount > 0) {
2639     load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
2640     bind(L_rtm_retry);
2641   }
2642   andi_(R0, mark_word, markOopDesc::monitor_value);  // inflated vs stack-locked|neutral|biased
2643   bne(CCR0, IsInflated);
2644 
2645   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2646     Label L_noincrement;
2647     if (RTMTotalCountIncrRate > 1) {
2648       branch_on_random_using_tb(tmp, RTMTotalCountIncrRate, L_noincrement);
2649     }
2650     assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
2651     load_const_optimized(tmp, (address)stack_rtm_counters->total_count_addr(), R0);
2652     //atomic_inc_ptr(tmp, /*temp, will be reloaded*/mark_word); We don't increment atomically
2653     ldx(mark_word, tmp);
2654     addi(mark_word, mark_word, 1);
2655     stdx(mark_word, tmp);
2656     bind(L_noincrement);
2657   }
2658   tbegin_();
2659   beq(CCR0, L_on_abort);
2660   ld(mark_word, oopDesc::mark_offset_in_bytes(), obj);         // Reload in transaction, conflicts need to be tracked.
2661   andi(R0, mark_word, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
2662   cmpwi(flag, R0, markOopDesc::unlocked_value);                // bits = 001 unlocked
2663   beq(flag, DONE_LABEL);                                       // all done if unlocked
2664 
2665   if (UseRTMXendForLockBusy) {
2666     tend_();
2667     b(L_decrement_retry);
2668   } else {
2669     tabort_();
2670   }
2671   bind(L_on_abort);
2672   const Register abort_status_Reg = tmp;
2673   mftexasr(abort_status_Reg);
2674   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2675     rtm_profiling(abort_status_Reg, /*temp*/mark_word, stack_rtm_counters, method_data, profile_rtm);
2676   }
2677   ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // reload
2678   if (RTMRetryCount > 0) {
2679     // Retry on lock abort if abort status is not permanent.
2680     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry, &L_decrement_retry);
2681   } else {
2682     bind(L_decrement_retry);
2683   }
2684 }
2685 
2686 // Use RTM for inflating locks
2687 // inputs: obj       (object to lock)
2688 //         mark_word (current header - KILLED)
2689 //         boxReg    (on-stack box address (displaced header location) - KILLED)
2690 void MacroAssembler::rtm_inflated_locking(ConditionRegister flag,
2691                                           Register obj, Register mark_word, Register boxReg,
2692                                           Register retry_on_busy_count_Reg, Register retry_on_abort_count_Reg,
2693                                           RTMLockingCounters* rtm_counters,
2694                                           Metadata* method_data, bool profile_rtm,
2695                                           Label& DONE_LABEL) {
2696   assert(UseRTMLocking, "why call this otherwise?");
2697   Label L_rtm_retry, L_decrement_retry, L_on_abort;
2698   // Clean monitor_value bit to get valid pointer.
2699   int owner_offset = ObjectMonitor::owner_offset_in_bytes() - markOopDesc::monitor_value;
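       // The mark word of an inflated object is the ObjectMonitor* tagged with monitor_value
       // in its low bits, so mark_word + owner_offset addresses m->owner without untagging first.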
2700 
2701   // Store non-null, using boxReg instead of (intptr_t)markOopDesc::unused_mark().
2702   std(boxReg, BasicLock::displaced_header_offset_in_bytes(), boxReg);
2703   const Register tmpReg = boxReg;
2704   const Register owner_addr_Reg = mark_word;
2705   addi(owner_addr_Reg, mark_word, owner_offset);
2706 
2707   if (RTMRetryCount > 0) {
2708     load_const_optimized(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy.
2709     load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort.
2710     bind(L_rtm_retry);
2711   }
2712   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2713     Label L_noincrement;
2714     if (RTMTotalCountIncrRate > 1) {
2715       branch_on_random_using_tb(R0, RTMTotalCountIncrRate, L_noincrement);
2716     }
2717     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2718     load_const(R0, (address)rtm_counters->total_count_addr(), tmpReg);
2719     //atomic_inc_ptr(R0, tmpReg); We don't increment atomically
2720     ldx(tmpReg, R0);
2721     addi(tmpReg, tmpReg, 1);
2722     stdx(tmpReg, R0);
2723     bind(L_noincrement);
2724   }
2725   tbegin_();
2726   beq(CCR0, L_on_abort);
2727   // We don't reload mark word. Will only be reset at safepoint.
2728   ld(R0, 0, owner_addr_Reg); // Load in transaction, conflicts need to be tracked.
2729   cmpdi(flag, R0, 0);
2730   beq(flag, DONE_LABEL);
2731 
2732   if (UseRTMXendForLockBusy) {
2733     tend_();
2734     b(L_decrement_retry);
2735   } else {
2736     tabort_();
2737   }
2738   bind(L_on_abort);
2739   const Register abort_status_Reg = tmpReg;
2740   mftexasr(abort_status_Reg);
2741   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2742     rtm_profiling(abort_status_Reg, /*temp*/ owner_addr_Reg, rtm_counters, method_data, profile_rtm);
2743     // Restore owner_addr_Reg
2744     ld(mark_word, oopDesc::mark_offset_in_bytes(), obj);
2745 #ifdef ASSERT
2746     andi_(R0, mark_word, markOopDesc::monitor_value);
2747     asm_assert_ne("must be inflated", 0xa754); // Deflating only allowed at safepoint.
2748 #endif
2749     addi(owner_addr_Reg, mark_word, owner_offset);
2750   }
2751   if (RTMRetryCount > 0) {
2752     // Retry on lock abort if abort status is not permanent.
2753     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
2754   }
2755 
2756   // Appears unlocked - try to swing _owner from null to non-null.
2757   cmpxchgd(flag, /*current val*/ R0, (intptr_t)0, /*new val*/ R16_thread, owner_addr_Reg,
2758            MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2759            MacroAssembler::cmpxchgx_hint_acquire_lock(), noreg, &L_decrement_retry, true);
2760 
2761   if (RTMRetryCount > 0) {
2762     // CAS succeeded: done. On failure (L_decrement_retry) spin and retry.
2763     b(DONE_LABEL);
2764     bind(L_decrement_retry);
2765     // Spin and retry if lock is busy.
2766     rtm_retry_lock_on_busy(retry_on_busy_count_Reg, owner_addr_Reg, L_rtm_retry);
2767   } else {
2768     bind(L_decrement_retry);
2769   }
2770 }
2771 
2772 #endif //  INCLUDE_RTM_OPT
2773 
2774 // "The box" is the space on the stack where we copy the object mark.
2775 void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register oop, Register box,
2776                                                Register temp, Register displaced_header, Register current_header,
2777                                                bool try_bias,
2778                                                RTMLockingCounters* rtm_counters,
2779                                                RTMLockingCounters* stack_rtm_counters,
2780                                                Metadata* method_data,
2781                                                bool use_rtm, bool profile_rtm) {
2782   assert_different_registers(oop, box, temp, displaced_header, current_header);
2783   assert(flag != CCR0, "bad condition register");
2784   Label cont;
2785   Label object_has_monitor;
2786   Label cas_failed;
2787 
2788   // Load markOop from object into displaced_header.
2789   ld(displaced_header, oopDesc::mark_offset_in_bytes(), oop);
2790 
2791 
2792   // Always do locking in runtime.
2793   if (EmitSync & 0x01) {
2794     cmpdi(flag, oop, 0); // Oop can't be 0 here => always false.
2795     return;
2796   }
2797 
2798   if (try_bias) {
2799     biased_locking_enter(flag, oop, displaced_header, temp, current_header, cont);
2800   }
2801 
2802 #if INCLUDE_RTM_OPT
2803   if (UseRTMForStackLocks && use_rtm) {
2804     rtm_stack_locking(flag, oop, displaced_header, temp, /*temp*/ current_header,
2805                       stack_rtm_counters, method_data, profile_rtm,
2806                       cont, object_has_monitor);
2807   }
2808 #endif // INCLUDE_RTM_OPT
2809 
2810   // Handle existing monitor.
2811   if ((EmitSync & 0x02) == 0) {
2812     // The object has an existing monitor iff (mark & monitor_value) != 0.
2813     andi_(temp, displaced_header, markOopDesc::monitor_value);
2814     bne(CCR0, object_has_monitor);
2815   }
2816 
2817   // Set displaced_header to be (markOop of object | UNLOCK_VALUE).
2818   ori(displaced_header, displaced_header, markOopDesc::unlocked_value);
2819 
2820   // Load Compare Value application register.
2821 
2822   // Initialize the box. (Must happen before we update the object mark!)
2823   std(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2824 
2825   // Must fence, otherwise, preceding store(s) may float below cmpxchg.
2826   // Compare object markOop with mark and if equal exchange scratch1 with object markOop.
2827   cmpxchgd(/*flag=*/flag,
2828            /*current_value=*/current_header,
2829            /*compare_value=*/displaced_header,
2830            /*exchange_value=*/box,
2831            /*where=*/oop,
2832            MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2833            MacroAssembler::cmpxchgx_hint_acquire_lock(),
2834            noreg,
2835            &cas_failed,
2836            /*check without membar and ldarx first*/true);
2837   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2838 
2839   // If the compare-and-exchange succeeded, then we found an unlocked
2840   // object and we have now locked it.
2841   b(cont);
2842 
2843   bind(cas_failed);
2844   // We did not see an unlocked object so try the fast recursive case.
2845 
2846   // Check if the owner is self by comparing the value in the markOop of object
2847   // (current_header) with the stack pointer.
2848   sub(current_header, current_header, R1_SP);
2849   load_const_optimized(temp, ~(os::vm_page_size()-1) | markOopDesc::lock_mask_in_place);
2850 
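       // The AND below yields 0 only if the mark, seen as a stack address, lies within one
       // page above SP and its low lock bits are clear, i.e. we already own this stack lock.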
2851   and_(R0/*==0?*/, current_header, temp);
2852   // If the condition is true we are done (cont) and hence we can store 0 as the
2853   // displaced header in the box, which indicates that it is a recursive lock.
2854   mcrf(flag, CCR0);
2855   std(R0/*==0, perhaps*/, BasicLock::displaced_header_offset_in_bytes(), box);
2856 
2857   // Handle existing monitor.
2858   if ((EmitSync & 0x02) == 0) {
2859     b(cont);
2860 
2861     bind(object_has_monitor);
2862     // The object's monitor m is unlocked iff m->owner == NULL,
2863     // otherwise m->owner may contain a thread or a stack address.
2864 
2865 #if INCLUDE_RTM_OPT
2866     // Use the same RTM locking code in 32- and 64-bit VM.
2867     if (use_rtm) {
2868       rtm_inflated_locking(flag, oop, displaced_header, box, temp, /*temp*/ current_header,
2869                            rtm_counters, method_data, profile_rtm, cont);
2870     } else {
2871 #endif // INCLUDE_RTM_OPT
2872 
2873     // Try to CAS m->owner from NULL to current thread.
2874     addi(temp, displaced_header, ObjectMonitor::owner_offset_in_bytes()-markOopDesc::monitor_value);
2875     cmpxchgd(/*flag=*/flag,
2876              /*current_value=*/current_header,
2877              /*compare_value=*/(intptr_t)0,
2878              /*exchange_value=*/R16_thread,
2879              /*where=*/temp,
2880              MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2881              MacroAssembler::cmpxchgx_hint_acquire_lock());
2882 
2883     // Store a non-null value into the box.
2884     std(box, BasicLock::displaced_header_offset_in_bytes(), box);
2885 
2886 #   ifdef ASSERT
2887     bne(flag, cont);
2888     // We have acquired the monitor, check some invariants.
2889     addi(/*monitor=*/temp, temp, -ObjectMonitor::owner_offset_in_bytes());
2890     // Invariant 1: _recursions should be 0.
2891     //assert(ObjectMonitor::recursions_size_in_bytes() == 8, "unexpected size");
2892     asm_assert_mem8_is_zero(ObjectMonitor::recursions_offset_in_bytes(), temp,
2893                             "monitor->_recursions should be 0", -1);
2894 #   endif
2895 
2896 #if INCLUDE_RTM_OPT
2897     } // use_rtm()
2898 #endif
2899   }
2900 
2901   bind(cont);
2902   // flag == EQ indicates success
2903   // flag == NE indicates failure
2904 }
2905 
2906 void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register oop, Register box,
2907                                                  Register temp, Register displaced_header, Register current_header,
2908                                                  bool try_bias, bool use_rtm) {
2909   assert_different_registers(oop, box, temp, displaced_header, current_header);
2910   assert(flag != CCR0, "bad condition register");
2911   Label cont;
2912   Label object_has_monitor;
2913 
2914   // Always do locking in runtime.
2915   if (EmitSync & 0x01) {
2916     cmpdi(flag, oop, 0); // Oop can't be 0 here => always false.
2917     return;
2918   }
2919 
2920   if (try_bias) {
2921     biased_locking_exit(flag, oop, current_header, cont);
2922   }
2923 
2924 #if INCLUDE_RTM_OPT
2925   if (UseRTMForStackLocks && use_rtm) {
2926     assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
2927     Label L_regular_unlock;
2928     ld(current_header, oopDesc::mark_offset_in_bytes(), oop);         // fetch markword
2929     andi(R0, current_header, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
2930     cmpwi(flag, R0, markOopDesc::unlocked_value);                     // bits = 001 unlocked
2931     bne(flag, L_regular_unlock);                                      // else RegularLock
2932     tend_();                                                          // otherwise end...
2933     b(cont);                                                          // ... and we're done
2934     bind(L_regular_unlock);
2935   }
2936 #endif
2937 
2938   // Find the lock address and load the displaced header from the stack.
2939   ld(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2940 
2941   // If the displaced header is 0, we have a recursive unlock.
2942   cmpdi(flag, displaced_header, 0);
2943   beq(flag, cont);
2944 
2945   // Handle existing monitor.
2946   if ((EmitSync & 0x02) == 0) {
2947     // The object has an existing monitor iff (mark & monitor_value) != 0.
2948     RTM_OPT_ONLY( if (!(UseRTMForStackLocks && use_rtm)) ) // skip load if already done
2949     ld(current_header, oopDesc::mark_offset_in_bytes(), oop);
2950     andi_(R0, current_header, markOopDesc::monitor_value);
2951     bne(CCR0, object_has_monitor);
2952   }
2953 
2954   // Check if it is still a lightweight lock; this is true if we see
2955   // the stack address of the basicLock in the markOop of the object.
2956   // Cmpxchg sets flag to cmpd(current_header, box).
2957   cmpxchgd(/*flag=*/flag,
2958            /*current_value=*/current_header,
2959            /*compare_value=*/box,
2960            /*exchange_value=*/displaced_header,
2961            /*where=*/oop,
2962            MacroAssembler::MemBarRel,
2963            MacroAssembler::cmpxchgx_hint_release_lock(),
2964            noreg,
2965            &cont);
2966 
2967   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2968 
2969   // Handle existing monitor.
2970   if ((EmitSync & 0x02) == 0) {
2971     b(cont);
2972 
2973     bind(object_has_monitor);
2974     addi(current_header, current_header, -markOopDesc::monitor_value); // monitor
2975     ld(temp,             ObjectMonitor::owner_offset_in_bytes(), current_header);
2976 
2977     // It's inflated.
2978 #if INCLUDE_RTM_OPT
2979     if (use_rtm) {
2980       Label L_regular_inflated_unlock;
2981       // Clean monitor_value bit to get valid pointer
2982       cmpdi(flag, temp, 0);
2983       bne(flag, L_regular_inflated_unlock);
2984       tend_();
2985       b(cont);
2986       bind(L_regular_inflated_unlock);
2987     }
2988 #endif
2989 
2990     ld(displaced_header, ObjectMonitor::recursions_offset_in_bytes(), current_header);
2991     xorr(temp, R16_thread, temp);      // Will be 0 if we are the owner.
2992     orr(temp, temp, displaced_header); // Will be 0 if there are 0 recursions.
2993     cmpdi(flag, temp, 0);
2994     bne(flag, cont);
2995 
2996     ld(temp,             ObjectMonitor::EntryList_offset_in_bytes(), current_header);
2997     ld(displaced_header, ObjectMonitor::cxq_offset_in_bytes(), current_header);
2998     orr(temp, temp, displaced_header); // Will be 0 if both are 0.
2999     cmpdi(flag, temp, 0);
3000     bne(flag, cont);
3001     release();
3002     std(temp, ObjectMonitor::owner_offset_in_bytes(), current_header);
3003   }
3004 
3005   bind(cont);
3006   // flag == EQ indicates success
3007   // flag == NE indicates failure
3008 }
3009 
3010 // Write serialization page so VM thread can do a pseudo remote membar.
3011 // We use the current thread pointer to calculate a thread specific
3012 // offset to write to within the page. This minimizes bus traffic
3013 // due to cache line collision.
3014 void MacroAssembler::serialize_memory(Register thread, Register tmp1, Register tmp2) {
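       // Offset written: (thread >> serialize_page_shift) & (page_size - sizeof(int)),
       // an int-aligned, thread-specific slot within the serialization page.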
3015   srdi(tmp2, thread, os::get_serialize_page_shift_count());
3016 
3017   int mask = os::vm_page_size() - sizeof(int);
3018   if (Assembler::is_simm(mask, 16)) {
3019     andi(tmp2, tmp2, mask);
3020   } else {
3021     lis(tmp1, (int)((signed short) (mask >> 16)));
3022     ori(tmp1, tmp1, mask & 0x0000ffff);
3023     andr(tmp2, tmp2, tmp1);
3024   }
3025 
3026   load_const(tmp1, (long) os::get_memory_serialize_page());
3027   release();
3028   stwx(R0, tmp1, tmp2);
3029 }
3030 
3031 void MacroAssembler::safepoint_poll(Label& slow_path, Register temp_reg) {
3032   if (SafepointMechanism::uses_thread_local_poll()) {
3033     ld(temp_reg, in_bytes(Thread::polling_page_offset()), R16_thread);
3034     // Armed page has poll_bit set.
3035     andi_(temp_reg, temp_reg, SafepointMechanism::poll_bit());
3036   } else {
3037     lwz(temp_reg, (RegisterOrConstant)(intptr_t)SafepointSynchronize::address_of_state());
3038     cmpwi(CCR0, temp_reg, SafepointSynchronize::_not_synchronized);
3039   }
3040   bne(CCR0, slow_path);
3041 }
3042 
3043 
3044 // GC barrier helper macros
3045 
3046 // Write the card table byte if needed.
3047 void MacroAssembler::card_write_barrier_post(Register Rstore_addr, Register Rnew_val, Register Rtmp) {
3048   CardTableBarrierSet* bs =
3049     barrier_set_cast<CardTableBarrierSet>(Universe::heap()->barrier_set());
3050   assert(bs->kind() == BarrierSet::CardTableBarrierSet, "wrong barrier");
3051   CardTable* ct = bs->card_table();
3052 #ifdef ASSERT
3053   cmpdi(CCR0, Rnew_val, 0);
3054   asm_assert_ne("null oop not allowed", 0x321);
3055 #endif
3056   card_table_write(ct->byte_map_base(), Rtmp, Rstore_addr);
3057 }
3058 
3059 // Write the card table byte.
3060 void MacroAssembler::card_table_write(jbyte* byte_map_base, Register Rtmp, Register Robj) {
3061   assert_different_registers(Robj, Rtmp, R0);
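       // The card of an address is byte_map_base[addr >> card_shift]; Robj is reused
       // (and killed) to hold the shifted card index.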
3062   load_const_optimized(Rtmp, (address)byte_map_base, R0);
3063   srdi(Robj, Robj, CardTable::card_shift);
3064   li(R0, 0); // dirty
3065   if (UseConcMarkSweepGC) membar(Assembler::StoreStore);
3066   stbx(R0, Rtmp, Robj);
3067 }
3068 
3069 // Kills R31 if value is a volatile register.
3070 void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2, bool needs_frame) {
3071   Label done;
3072   cmpdi(CCR0, value, 0);
3073   beq(CCR0, done);         // Use NULL as-is.
3074 
3075   clrrdi(tmp1, value, JNIHandles::weak_tag_size);
3076 #if INCLUDE_ALL_GCS
3077   if (UseG1GC) { andi_(tmp2, value, JNIHandles::weak_tag_mask); }
3078 #endif
3079   ld(value, 0, tmp1);      // Resolve (untagged) jobject.
3080 
3081 #if INCLUDE_ALL_GCS
3082   if (UseG1GC) {
3083     Label not_weak;
3084     beq(CCR0, not_weak);   // Test for jweak tag.
3085     verify_oop(value);
3086     g1_write_barrier_pre(noreg, // obj
3087                          noreg, // offset
3088                          value, // pre_val
3089                          tmp1, tmp2, needs_frame);
3090     bind(not_weak);
3091   }
3092 #endif // INCLUDE_ALL_GCS
3093   verify_oop(value);
3094   bind(done);
3095 }
3096 
3097 #if INCLUDE_ALL_GCS
3098 // General G1 pre-barrier generator.
3099 // Goal: record the previous value if it is not null.
3100 void MacroAssembler::g1_write_barrier_pre(Register Robj, RegisterOrConstant offset, Register Rpre_val,
3101                                           Register Rtmp1, Register Rtmp2, bool needs_frame) {
3102   Label runtime, filtered;
3103 
3104   // Is marking active?
3105   if (in_bytes(SATBMarkQueue::byte_width_of_active()) == 4) {
3106     lwz(Rtmp1, in_bytes(G1ThreadLocalData::satb_mark_queue_active_offset()), R16_thread);
3107   } else {
3108     guarantee(in_bytes(SATBMarkQueue::byte_width_of_active()) == 1, "Assumption");
3109     lbz(Rtmp1, in_bytes(G1ThreadLocalData::satb_mark_queue_active_offset()), R16_thread);
3110   }
3111   cmpdi(CCR0, Rtmp1, 0);
3112   beq(CCR0, filtered);
3113 
3114   // Do we need to load the previous value?
3115   if (Robj != noreg) {
3116     // Load the previous value...
3117     if (UseCompressedOops) {
3118       lwz(Rpre_val, offset, Robj);
3119     } else {
3120       ld(Rpre_val, offset, Robj);
3121     }
3122     // Previous value has been loaded into Rpre_val.
3123   }
3124   assert(Rpre_val != noreg, "must have a real register");
3125 
3126   // Is the previous value null?
3127   cmpdi(CCR0, Rpre_val, 0);
3128   beq(CCR0, filtered);
3129 
3130   if (Robj != noreg && UseCompressedOops) {
3131     decode_heap_oop_not_null(Rpre_val);
3132   }
3133 
3134   // Not filtered, so the previous value must be recorded: either in the
3135   // thread-local SATB buffer, or via the runtime call below if the buffer is full.
3138 
3139   // Can we store original value in the thread's buffer?
3140   // Is index == 0?
3141   // (The index field is typed as size_t.)
3142   const Register Rbuffer = Rtmp1, Rindex = Rtmp2;
3143 
3144   ld(Rindex, in_bytes(G1ThreadLocalData::satb_mark_queue_index_offset()), R16_thread);
3145   cmpdi(CCR0, Rindex, 0);
3146   beq(CCR0, runtime); // If index == 0, goto runtime.
3147   ld(Rbuffer, in_bytes(G1ThreadLocalData::satb_mark_queue_buffer_offset()), R16_thread);
3148 
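       // The queue index is a byte count growing downward: the new entry goes to
       // buffer + (index - wordSize) and the decremented index is stored back.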
3149   addi(Rindex, Rindex, -wordSize); // Decrement index.
3150   std(Rindex, in_bytes(G1ThreadLocalData::satb_mark_queue_index_offset()), R16_thread);
3151 
3152   // Record the previous value.
3153   stdx(Rpre_val, Rbuffer, Rindex);
3154   b(filtered);
3155 
3156   bind(runtime);
3157 
3158   // May need to preserve LR. Also needed if current frame is not compatible with C calling convention.
3159   if (needs_frame) {
3160     save_LR_CR(Rtmp1);
3161     push_frame_reg_args(0, Rtmp2);
3162   }
3163 
3164   if (Rpre_val->is_volatile() && Robj == noreg) mr(R31, Rpre_val); // Save pre_val across C call if it was preloaded.
3165   call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), Rpre_val, R16_thread);
3166   if (Rpre_val->is_volatile() && Robj == noreg) mr(Rpre_val, R31); // restore
3167 
3168   if (needs_frame) {
3169     pop_frame();
3170     restore_LR_CR(Rtmp1);
3171   }
3172 
3173   bind(filtered);
3174 }
3175 
3176 // General G1 post-barrier generator
3177 // Store cross-region card.
3178 void MacroAssembler::g1_write_barrier_post(Register Rstore_addr, Register Rnew_val, Register Rtmp1, Register Rtmp2, Register Rtmp3, Label *filtered_ext) {
3179   Label runtime, filtered_int;
3180   Label& filtered = (filtered_ext != NULL) ? *filtered_ext : filtered_int;
3181   assert_different_registers(Rstore_addr, Rnew_val, Rtmp1, Rtmp2);
3182 
3183   G1BarrierSet* bs =
3184     barrier_set_cast<G1BarrierSet>(Universe::heap()->barrier_set());
3185   CardTable* ct = bs->card_table();
3186 
3187   // Does store cross heap regions?
3188   if (G1RSBarrierRegionFilter) {
3189     xorr(Rtmp1, Rstore_addr, Rnew_val);
3190     srdi_(Rtmp1, Rtmp1, HeapRegion::LogOfHRGrainBytes);
3191     beq(CCR0, filtered);
3192   }
3193 
3194   // Crosses regions, storing NULL?
3195 #ifdef ASSERT
3196   cmpdi(CCR0, Rnew_val, 0);
3197   asm_assert_ne("null oop not allowed (G1)", 0x322); // Checked by caller on PPC64, so following branch is obsolete:
3198   //beq(CCR0, filtered);
3199 #endif
3200 
3201   // Storing region crossing non-NULL, is card already dirty?
3202   assert(sizeof(*ct->byte_map_base()) == sizeof(jbyte), "adjust this code");
3203   const Register Rcard_addr = Rtmp1;
3204   Register Rbase = Rtmp2;
3205   load_const_optimized(Rbase, (address)ct->byte_map_base(), /*temp*/ Rtmp3);
3206 
3207   srdi(Rcard_addr, Rstore_addr, CardTable::card_shift);
3208 
3209   // Load the current card value; young-region cards (g1_young_card_val) need no further action.
3210   lbzx(/*card value*/ Rtmp3, Rbase, Rcard_addr);
3211   cmpwi(CCR0, Rtmp3, (int)G1CardTable::g1_young_card_val());
3212   beq(CCR0, filtered);
3213 
3214   membar(Assembler::StoreLoad);
3215   lbzx(/*card value*/ Rtmp3, Rbase, Rcard_addr);  // Reload after membar.
3216   cmpwi(CCR0, Rtmp3 /* card value */, CardTable::dirty_card_val());
3217   beq(CCR0, filtered);
3218 
3219   // Storing a region crossing, non-NULL oop, card is clean.
3220   // Dirty card and log.
3221   li(Rtmp3, CardTable::dirty_card_val());
3222   //release(); // G1: oops are allowed to get visible after dirty marking.
3223   stbx(Rtmp3, Rbase, Rcard_addr);
3224 
3225   add(Rcard_addr, Rbase, Rcard_addr); // This is the address which needs to get enqueued.
3226   Rbase = noreg; // end of lifetime
3227 
3228   const Register Rqueue_index = Rtmp2,
3229                  Rqueue_buf   = Rtmp3;
3230   ld(Rqueue_index, in_bytes(G1ThreadLocalData::dirty_card_queue_index_offset()), R16_thread);
3231   cmpdi(CCR0, Rqueue_index, 0);
3232   beq(CCR0, runtime); // index == 0 then jump to runtime
3233   ld(Rqueue_buf, in_bytes(G1ThreadLocalData::dirty_card_queue_buffer_offset()), R16_thread);
3234 
3235   addi(Rqueue_index, Rqueue_index, -wordSize); // decrement index
3236   std(Rqueue_index, in_bytes(G1ThreadLocalData::dirty_card_queue_index_offset()), R16_thread);
3237 
3238   stdx(Rcard_addr, Rqueue_buf, Rqueue_index); // store card
3239   b(filtered);
3240 
3241   bind(runtime);
3242 
3243   // Save the live input values.
3244   call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), Rcard_addr, R16_thread);
3245 
3246   bind(filtered_int);
3247 }
3248 #endif // INCLUDE_ALL_GCS
3249 
3250 // Values for last_Java_pc, and last_Java_sp must comply to the rules
3251 // in frame_ppc.hpp.
3252 void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) {
3253   // Always set last_Java_pc and flags first because once last_Java_sp
3254   // is visible, has_last_Java_frame is true and users will look at the
3255   // rest of the fields. (Note: flags should always be zero before we
3256   // get here, so they don't need to be set.)
3257 
3258   // Verify that last_Java_pc was zeroed on return to Java
3259   asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread,
3260                           "last_Java_pc not zeroed before leaving Java", 0x200);
3261 
3262   // When returning from a call out of Java mode, the frame anchor's
3263   // last_Java_pc is always reset to NULL. It is set here so that,
3264   // if we are doing a call to native code (not the VM), we capture the
3265   // known pc and don't have to rely on the native call having a
3266   // standard frame linkage where we can find the pc.
3267   if (last_Java_pc != noreg)
3268     std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
3269 
3270   // Set last_Java_sp last.
3271   std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
3272 }
3273 
3274 void MacroAssembler::reset_last_Java_frame(void) {
3275   asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()),
3276                              R16_thread, "SP was not set, still zero", 0x202);
3277 
3278   BLOCK_COMMENT("reset_last_Java_frame {");
3279   li(R0, 0);
3280 
3281   // _last_Java_sp = 0
3282   std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
3283 
3284   // _last_Java_pc = 0
3285   std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
3286   BLOCK_COMMENT("} reset_last_Java_frame");
3287 }
3288 
3289 void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1) {
3290   assert_different_registers(sp, tmp1);
3291 
3292   // sp points to a TOP_IJAVA_FRAME, retrieve frame's PC via
3293   // TOP_IJAVA_FRAME_ABI.
3294   // FIXME: assert that we really have a TOP_IJAVA_FRAME here!
3295   address entry = pc();
3296   load_const_optimized(tmp1, entry);
3297 
3298   set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1);
3299 }
3300 
3301 void MacroAssembler::get_vm_result(Register oop_result) {
3302   // Read:
3303   //   R16_thread
3304   //   R16_thread->in_bytes(JavaThread::vm_result_offset())
3305   //
3306   // Updated:
3307   //   oop_result
3308   //   R16_thread->in_bytes(JavaThread::vm_result_offset())
3309 
3310   verify_thread();
3311 
3312   ld(oop_result, in_bytes(JavaThread::vm_result_offset()), R16_thread);
3313   li(R0, 0);
3314   std(R0, in_bytes(JavaThread::vm_result_offset()), R16_thread);
3315 
3316   verify_oop(oop_result);
3317 }
3318 
3319 void MacroAssembler::get_vm_result_2(Register metadata_result) {
3320   // Read:
3321   //   R16_thread
3322   //   R16_thread->in_bytes(JavaThread::vm_result_2_offset())
3323   //
3324   // Updated:
3325   //   metadata_result
3326   //   R16_thread->in_bytes(JavaThread::vm_result_2_offset())
3327 
3328   ld(metadata_result, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
3329   li(R0, 0);
3330   std(R0, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
3331 }
3332 
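     // Encoding: narrowKlass = (klass - narrow_klass_base) >> narrow_klass_shift.
     // Returns the register that holds the encoded value (dst, or src if nothing had to be done).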
3333 Register MacroAssembler::encode_klass_not_null(Register dst, Register src) {
3334   Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided.
3335   if (Universe::narrow_klass_base() != 0) {
3336     // Use dst as temp if it is free.
3337     sub_const_optimized(dst, current, Universe::narrow_klass_base(), R0);
3338     current = dst;
3339   }
3340   if (Universe::narrow_klass_shift() != 0) {
3341     srdi(dst, current, Universe::narrow_klass_shift());
3342     current = dst;
3343   }
3344   return current;
3345 }
3346 
3347 void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) {
3348   if (UseCompressedClassPointers) {
3349     Register compressedKlass = encode_klass_not_null(ck, klass);
3350     stw(compressedKlass, oopDesc::klass_offset_in_bytes(), dst_oop);
3351   } else {
3352     std(klass, oopDesc::klass_offset_in_bytes(), dst_oop);
3353   }
3354 }
3355 
3356 void MacroAssembler::store_klass_gap(Register dst_oop, Register val) {
3357   if (UseCompressedClassPointers) {
3358     if (val == noreg) {
3359       val = R0;
3360       li(val, 0);
3361     }
3362     stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop); // klass gap if compressed
3363   }
3364 }
3365 
3366 int MacroAssembler::instr_size_for_decode_klass_not_null() {
3367   if (!UseCompressedClassPointers) return 0;
3368   int num_instrs = 1;  // shift or move
3369   if (Universe::narrow_klass_base() != 0) num_instrs = 7;  // shift + load const + add
3370   return num_instrs * BytesPerInstWord;
3371 }
3372 
3373 void MacroAssembler::decode_klass_not_null(Register dst, Register src) {
3374   assert(dst != R0, "Dst reg may not be R0, as R0 is used here.");
3375   if (src == noreg) src = dst;
3376   Register shifted_src = src;
3377   if (Universe::narrow_klass_shift() != 0 ||
3378       (Universe::narrow_klass_base() == 0 && src != dst)) {  // Move required.
3379     shifted_src = dst;
3380     sldi(shifted_src, src, Universe::narrow_klass_shift());
3381   }
3382   if (Universe::narrow_klass_base() != 0) {
3383     add_const_optimized(dst, shifted_src, Universe::narrow_klass_base(), R0);
3384   }
3385 }
3386 
3387 void MacroAssembler::load_klass(Register dst, Register src) {
3388   if (UseCompressedClassPointers) {
3389     lwz(dst, oopDesc::klass_offset_in_bytes(), src);
3390     // Attention: no null check here!
3391     decode_klass_not_null(dst, dst);
3392   } else {
3393     ld(dst, oopDesc::klass_offset_in_bytes(), src);
3394   }
3395 }
3396 
3397 // ((OopHandle)result).resolve();
3398 void MacroAssembler::resolve_oop_handle(Register result) {
3399   // OopHandle::resolve is an indirection.
3400   ld(result, 0, result);
3401 }
3402 
3403 void MacroAssembler::load_mirror_from_const_method(Register mirror, Register const_method) {
3404   ld(mirror, in_bytes(ConstMethod::constants_offset()), const_method);
3405   ld(mirror, ConstantPool::pool_holder_offset_in_bytes(), mirror);
3406   ld(mirror, in_bytes(Klass::java_mirror_offset()), mirror);
3407   resolve_oop_handle(mirror);
3408 }
3409 
3410 // Clear Array
3411 // For very short arrays. tmp == R0 is allowed.
3412 void MacroAssembler::clear_memory_unrolled(Register base_ptr, int cnt_dwords, Register tmp, int offset) {
3413   if (cnt_dwords > 0) { li(tmp, 0); }
3414   for (int i = 0; i < cnt_dwords; ++i) { std(tmp, offset + i * 8, base_ptr); }
3415 }
3416 
3417 // Version for constant short array length. Kills base_ptr. tmp == R0 is allowed.
3418 void MacroAssembler::clear_memory_constlen(Register base_ptr, int cnt_dwords, Register tmp) {
3419   if (cnt_dwords < 8) {
3420     clear_memory_unrolled(base_ptr, cnt_dwords, tmp);
3421     return;
3422   }
3423 
3424   Label loop;
3425   const long loopcnt   = cnt_dwords >> 1,
3426              remainder = cnt_dwords & 1;
3427 
3428   li(tmp, loopcnt);
3429   mtctr(tmp);
3430   li(tmp, 0);
3431   bind(loop);
3432     std(tmp, 0, base_ptr);
3433     std(tmp, 8, base_ptr);
3434     addi(base_ptr, base_ptr, 16);
3435     bdnz(loop);
3436   if (remainder) { std(tmp, 0, base_ptr); }
3437 }
3438 
3439 // Kills both input registers. tmp == R0 is allowed.
3440 void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp, long const_cnt) {
3441   // Procedure for large arrays (uses data cache block zero instruction).
3442     Label startloop, fast, fastloop, small_rest, restloop, done;
3443     const int cl_size         = VM_Version::L1_data_cache_line_size(),
3444               cl_dwords       = cl_size >> 3,
3445               cl_dw_addr_bits = exact_log2(cl_dwords),
3446               dcbz_min        = 1,  // Min count of dcbz executions, needs to be >0.
3447               min_cnt         = ((dcbz_min + 1) << cl_dw_addr_bits) - 1;
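         // min_cnt guarantees that at least dcbz_min whole cache lines can be zeroed with dcbz:
         // up to (cl_dwords - 1) dwords to reach cache-line alignment plus dcbz_min * cl_dwords dwords.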
3448 
3449   if (const_cnt >= 0) {
3450     // Constant case.
3451     if (const_cnt < min_cnt) {
3452       clear_memory_constlen(base_ptr, const_cnt, tmp);
3453       return;
3454     }
3455     load_const_optimized(cnt_dwords, const_cnt, tmp);
3456   } else {
3457     // cnt_dwords already loaded in register. Need to check size.
3458     cmpdi(CCR1, cnt_dwords, min_cnt); // Big enough? (ensure >= dcbz_min lines included).
3459     blt(CCR1, small_rest);
3460   }
3461     rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits); // Extract dword offset within first cache line.
3462     beq(CCR0, fast);                                  // Already 128byte aligned.
3463 
3464     subfic(tmp, tmp, cl_dwords);
3465     mtctr(tmp);                        // Set ctr to hit 128byte boundary (0<ctr<cl_dwords).
3466     subf(cnt_dwords, tmp, cnt_dwords); // rest.
3467     li(tmp, 0);
3468 
3469   bind(startloop);                     // Clear at the beginning to reach 128byte boundary.
3470     std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
3471     addi(base_ptr, base_ptr, 8);
3472     bdnz(startloop);
3473 
3474   bind(fast);                                  // Clear 128byte blocks.
3475     srdi(tmp, cnt_dwords, cl_dw_addr_bits);    // Loop count for 128byte loop (>0).
3476     andi(cnt_dwords, cnt_dwords, cl_dwords-1); // Rest in dwords.
3477     mtctr(tmp);                                // Load counter.
3478 
3479   bind(fastloop);
3480     dcbz(base_ptr);                    // Clear 128byte aligned block.
3481     addi(base_ptr, base_ptr, cl_size);
3482     bdnz(fastloop);
3483 
3484   bind(small_rest);
3485     cmpdi(CCR0, cnt_dwords, 0);        // size 0?
3486     beq(CCR0, done);                   // rest == 0
3487     li(tmp, 0);
3488     mtctr(cnt_dwords);                 // Load counter.
3489 
3490   bind(restloop);                      // Clear rest.
3491     std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
3492     addi(base_ptr, base_ptr, 8);
3493     bdnz(restloop);
3494 
3495   bind(done);
3496 }
3497 
3498 /////////////////////////////////////////// String intrinsics ////////////////////////////////////////////
3499 
3500 #ifdef COMPILER2
3501 // Intrinsics for CompactStrings
3502 
3503 // Compress char[] to byte[] by compressing 16 bytes at once.
3504 void MacroAssembler::string_compress_16(Register src, Register dst, Register cnt,
3505                                         Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5,
3506                                         Label& Lfailure) {
3507 
3508   const Register tmp0 = R0;
3509   assert_different_registers(src, dst, cnt, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5);
3510   Label Lloop, Lslow;
3511 
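       // Fast path: each iteration loads 16 bytes (8 UTF-16 chars), checks via the 0x00FF...
       // mask in tmp1 that all high bytes are zero (latin1), and stores the 8 low bytes.
       // The remaining cnt % 8 chars are left to the caller's slow path.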
3512   // Check if cnt >= 8 (= 16 bytes)
3513   lis(tmp1, 0xFF);                // tmp1 = 0x00FF00FF00FF00FF
3514   srwi_(tmp2, cnt, 3);
3515   beq(CCR0, Lslow);
3516   ori(tmp1, tmp1, 0xFF);
3517   rldimi(tmp1, tmp1, 32, 0);
3518   mtctr(tmp2);
3519 
3520   // 2x unrolled loop
3521   bind(Lloop);
3522   ld(tmp2, 0, src);               // _0_1_2_3 (Big Endian)
3523   ld(tmp4, 8, src);               // _4_5_6_7
3524 
3525   orr(tmp0, tmp2, tmp4);
3526   rldicl(tmp3, tmp2, 6*8, 64-24); // _____1_2
3527   rldimi(tmp2, tmp2, 2*8, 2*8);   // _0_2_3_3
3528   rldicl(tmp5, tmp4, 6*8, 64-24); // _____5_6
3529   rldimi(tmp4, tmp4, 2*8, 2*8);   // _4_6_7_7
3530 
3531   andc_(tmp0, tmp0, tmp1);
3532   bne(CCR0, Lfailure);            // Not latin1.
3533   addi(src, src, 16);
3534 
3535   rlwimi(tmp3, tmp2, 0*8, 24, 31);// _____1_3
3536   srdi(tmp2, tmp2, 3*8);          // ____0_2_
3537   rlwimi(tmp5, tmp4, 0*8, 24, 31);// _____5_7
3538   srdi(tmp4, tmp4, 3*8);          // ____4_6_
3539 
3540   orr(tmp2, tmp2, tmp3);          // ____0123
3541   orr(tmp4, tmp4, tmp5);          // ____4567
3542 
3543   stw(tmp2, 0, dst);
3544   stw(tmp4, 4, dst);
3545   addi(dst, dst, 8);
3546   bdnz(Lloop);
3547 
3548   bind(Lslow);                    // Fallback to slow version
3549 }
3550 
3551 // Compress char[] to byte[]. cnt must be positive int.
3552 void MacroAssembler::string_compress(Register src, Register dst, Register cnt, Register tmp, Label& Lfailure) {
3553   Label Lloop;
3554   mtctr(cnt);
3555 
3556   bind(Lloop);
3557   lhz(tmp, 0, src);
3558   cmplwi(CCR0, tmp, 0xff);
3559   bgt(CCR0, Lfailure);            // Not latin1.
3560   addi(src, src, 2);
3561   stb(tmp, 0, dst);
3562   addi(dst, dst, 1);
3563   bdnz(Lloop);
3564 }
3565 
3566 // Inflate byte[] to char[] by inflating 16 bytes at once.
3567 void MacroAssembler::string_inflate_16(Register src, Register dst, Register cnt,
3568                                        Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) {
3569   const Register tmp0 = R0;
3570   assert_different_registers(src, dst, cnt, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5);
3571   Label Lloop, Lslow;
3572 
3573   // Check if cnt >= 8
3574   srwi_(tmp2, cnt, 3);
3575   beq(CCR0, Lslow);
3576   lis(tmp1, 0xFF);                // tmp1 = 0x00FF00FF
3577   ori(tmp1, tmp1, 0xFF);
3578   mtctr(tmp2);
3579 
3580   // 2x unrolled loop
3581   bind(Lloop);
3582   lwz(tmp2, 0, src);              // ____0123 (Big Endian)
3583   lwz(tmp4, 4, src);              // ____4567
3584   addi(src, src, 8);
3585 
3586   rldicl(tmp3, tmp2, 7*8, 64-8);  // _______2
3587   rlwimi(tmp2, tmp2, 3*8, 16, 23);// ____0113
3588   rldicl(tmp5, tmp4, 7*8, 64-8);  // _______6
3589   rlwimi(tmp4, tmp4, 3*8, 16, 23);// ____4557
3590 
3591   andc(tmp0, tmp2, tmp1);         // ____0_1_
3592   rlwimi(tmp2, tmp3, 2*8, 0, 23); // _____2_3
3593   andc(tmp3, tmp4, tmp1);         // ____4_5_
3594   rlwimi(tmp4, tmp5, 2*8, 0, 23); // _____6_7
3595 
3596   rldimi(tmp2, tmp0, 3*8, 0*8);   // _0_1_2_3
3597   rldimi(tmp4, tmp3, 3*8, 0*8);   // _4_5_6_7
3598 
3599   std(tmp2, 0, dst);
3600   std(tmp4, 8, dst);
3601   addi(dst, dst, 16);
3602   bdnz(Lloop);
3603 
3604   bind(Lslow);                    // Fallback to slow version
3605 }
3606 
3607 // Inflate byte[] to char[]. cnt must be positive int.
3608 void MacroAssembler::string_inflate(Register src, Register dst, Register cnt, Register tmp) {
3609   Label Lloop;
3610   mtctr(cnt);
3611 
3612   bind(Lloop);
3613   lbz(tmp, 0, src);
3614   addi(src, src, 1);
3615   sth(tmp, 0, dst);
3616   addi(dst, dst, 2);
3617   bdnz(Lloop);
3618 }
3619 
3620 void MacroAssembler::string_compare(Register str1, Register str2,
3621                                     Register cnt1, Register cnt2,
3622                                     Register tmp1, Register result, int ae) {
3623   const Register tmp0 = R0,
3624                  diff = tmp1;
3625 
3626   assert_different_registers(str1, str2, cnt1, cnt2, tmp0, tmp1, result);
3627   Label Ldone, Lslow, Lloop, Lreturn_diff;
3628 
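       // The result follows String.compareTo: the difference of the first mismatching
       // characters, or the length difference if one string is a prefix of the other.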
3629   // Note: Making use of the fact that compareTo(a, b) == -compareTo(b, a)
3630   // we interchange str1 and str2 in the UL case and negate the result.
3631   // Like this, str1 is always latin1 encoded, except for the UU case.
3632   // In addition, we need 0 (or sign which is 0) extend.
3633 
3634   if (ae == StrIntrinsicNode::UU) {
3635     srwi(cnt1, cnt1, 1);
3636   } else {
3637     clrldi(cnt1, cnt1, 32);
3638   }
3639 
3640   if (ae != StrIntrinsicNode::LL) {
3641     srwi(cnt2, cnt2, 1);
3642   } else {
3643     clrldi(cnt2, cnt2, 32);
3644   }
3645 
3646   // See if the lengths are different, and calculate min in cnt1.
3647   // Save diff in case we need it for a tie-breaker.
3648   subf_(diff, cnt2, cnt1); // diff = cnt1 - cnt2
3649   // if (diff > 0) { cnt1 = cnt2; }
3650   if (VM_Version::has_isel()) {
3651     isel(cnt1, CCR0, Assembler::greater, /*invert*/ false, cnt2);
3652   } else {
3653     Label Lskip;
3654     blt(CCR0, Lskip);
3655     mr(cnt1, cnt2);
3656     bind(Lskip);
3657   }
3658 
3659   // Rename registers
3660   Register chr1 = result;
3661   Register chr2 = tmp0;
3662 
3663   // Compare multiple characters in fast loop (only implemented for same encoding).
3664   int stride1 = 8, stride2 = 8;
3665   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3666     int log2_chars_per_iter = (ae == StrIntrinsicNode::LL) ? 3 : 2;
3667     Label Lfastloop, Lskipfast;
3668 
3669     srwi_(tmp0, cnt1, log2_chars_per_iter);
3670     beq(CCR0, Lskipfast);
3671     rldicl(cnt2, cnt1, 0, 64 - log2_chars_per_iter); // Remaining characters.
3672     li(cnt1, 1 << log2_chars_per_iter); // Initialize for failure case: Rescan characters from current iteration.
3673     mtctr(tmp0);
3674 
3675     bind(Lfastloop);
3676     ld(chr1, 0, str1);
3677     ld(chr2, 0, str2);
3678     cmpd(CCR0, chr1, chr2);
3679     bne(CCR0, Lslow);
3680     addi(str1, str1, stride1);
3681     addi(str2, str2, stride2);
3682     bdnz(Lfastloop);
3683     mr(cnt1, cnt2); // Remaining characters.
3684     bind(Lskipfast);
3685   }
3686 
3687   // Loop which searches the first difference character by character.
3688   cmpwi(CCR0, cnt1, 0);
3689   beq(CCR0, Lreturn_diff);
3690   bind(Lslow);
3691   mtctr(cnt1);
3692 
3693   switch (ae) {
3694     case StrIntrinsicNode::LL: stride1 = 1; stride2 = 1; break;
3695     case StrIntrinsicNode::UL: // fallthru (see comment above)
3696     case StrIntrinsicNode::LU: stride1 = 1; stride2 = 2; break;
3697     case StrIntrinsicNode::UU: stride1 = 2; stride2 = 2; break;
3698     default: ShouldNotReachHere(); break;
3699   }
3700 
3701   bind(Lloop);
3702   if (stride1 == 1) { lbz(chr1, 0, str1); } else { lhz(chr1, 0, str1); }
3703   if (stride2 == 1) { lbz(chr2, 0, str2); } else { lhz(chr2, 0, str2); }
3704   subf_(result, chr2, chr1); // result = chr1 - chr2
3705   bne(CCR0, Ldone);
3706   addi(str1, str1, stride1);
3707   addi(str2, str2, stride2);
3708   bdnz(Lloop);
3709 
3710   // If strings are equal up to min length, return the length difference.
3711   bind(Lreturn_diff);
3712   mr(result, diff);
3713 
3714   // Otherwise, return the difference between the first mismatched chars.
3715   bind(Ldone);
3716   if (ae == StrIntrinsicNode::UL) {
3717     neg(result, result); // Negate result (see note above).
3718   }
3719 }
3720 
3721 void MacroAssembler::array_equals(bool is_array_equ, Register ary1, Register ary2,
3722                                   Register limit, Register tmp1, Register result, bool is_byte) {
3723   const Register tmp0 = R0;
3724   assert_different_registers(ary1, ary2, limit, tmp0, tmp1, result);
3725   Label Ldone, Lskiploop, Lloop, Lfastloop, Lskipfast;
3726   bool limit_needs_shift = false;
3727 
3728   if (is_array_equ) {
3729     const int length_offset = arrayOopDesc::length_offset_in_bytes();
3730     const int base_offset   = arrayOopDesc::base_offset_in_bytes(is_byte ? T_BYTE : T_CHAR);
3731 
3732     // Return true if the same array.
3733     cmpd(CCR0, ary1, ary2);
3734     beq(CCR0, Lskiploop);
3735 
3736     // Return false if one of them is NULL.
3737     cmpdi(CCR0, ary1, 0);
3738     cmpdi(CCR1, ary2, 0);
3739     li(result, 0);
3740     cror(CCR0, Assembler::equal, CCR1, Assembler::equal);
3741     beq(CCR0, Ldone);
3742 
3743     // Load the lengths of arrays.
3744     lwz(limit, length_offset, ary1);
3745     lwz(tmp0, length_offset, ary2);
3746 
3747     // Return false if the two arrays are not of equal length.
3748     cmpw(CCR0, limit, tmp0);
3749     bne(CCR0, Ldone);
3750 
3751     // Load array addresses.
3752     addi(ary1, ary1, base_offset);
3753     addi(ary2, ary2, base_offset);
3754   } else {
3755     limit_needs_shift = !is_byte;
3756     li(result, 0); // Assume not equal.
3757   }
3758 
3759   // Rename registers
3760   Register chr1 = tmp0;
3761   Register chr2 = tmp1;
3762 
3763   // Compare 8 bytes per iteration in fast loop.
3764   const int log2_chars_per_iter = is_byte ? 3 : 2;
3765 
3766   srwi_(tmp0, limit, log2_chars_per_iter + (limit_needs_shift ? 1 : 0));
3767   beq(CCR0, Lskipfast);
3768   mtctr(tmp0);
3769 
3770   bind(Lfastloop);
3771   ld(chr1, 0, ary1);
3772   ld(chr2, 0, ary2);
3773   addi(ary1, ary1, 8);
3774   addi(ary2, ary2, 8);
3775   cmpd(CCR0, chr1, chr2);
3776   bne(CCR0, Ldone);
3777   bdnz(Lfastloop);
3778 
3779   bind(Lskipfast);
3780   rldicl_(limit, limit, limit_needs_shift ? 64 - 1 : 0, 64 - log2_chars_per_iter); // Remaining characters.
3781   beq(CCR0, Lskiploop);
3782   mtctr(limit);
3783 
3784   // Character by character.
3785   bind(Lloop);
3786   if (is_byte) {
3787     lbz(chr1, 0, ary1);
3788     lbz(chr2, 0, ary2);
3789     addi(ary1, ary1, 1);
3790     addi(ary2, ary2, 1);
3791   } else {
3792     lhz(chr1, 0, ary1);
3793     lhz(chr2, 0, ary2);
3794     addi(ary1, ary1, 2);
3795     addi(ary2, ary2, 2);
3796   }
3797   cmpw(CCR0, chr1, chr2);
3798   bne(CCR0, Ldone);
3799   bdnz(Lloop);
3800 
3801   bind(Lskiploop);
3802   li(result, 1); // All characters are equal.
3803   bind(Ldone);
3804 }
3805 
3806 void MacroAssembler::string_indexof(Register result, Register haystack, Register haycnt,
3807                                     Register needle, ciTypeArray* needle_values, Register needlecnt, int needlecntval,
3808                                     Register tmp1, Register tmp2, Register tmp3, Register tmp4, int ae) {
3809 
3810   // Ensure 0<needlecnt<=haycnt in ideal graph as prerequisite!
3811   Label L_TooShort, L_Found, L_NotFound, L_End;
3812   Register last_addr = haycnt, // Kill haycnt at the beginning.
3813   addr      = tmp1,
3814   n_start   = tmp2,
3815   ch1       = tmp3,
3816   ch2       = R0;
3817 
3818   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
3819   const int h_csize = (ae == StrIntrinsicNode::LL) ? 1 : 2;
3820   const int n_csize = (ae == StrIntrinsicNode::UU) ? 2 : 1;
3821 
3822   // **************************************************************************************************
3823   // Prepare for main loop: optimized for needle count >=2, bail out otherwise.
3824   // **************************************************************************************************
3825 
3826   // Compute last haystack addr to use if no match gets found.
3827   clrldi(haycnt, haycnt, 32);         // Ensure positive int is valid as 64 bit value.
3828   addi(addr, haystack, -h_csize);     // Accesses use pre-increment.
3829   if (needlecntval == 0) { // variable needlecnt
3830    cmpwi(CCR6, needlecnt, 2);
3831    clrldi(needlecnt, needlecnt, 32);  // Ensure positive int is valid as 64 bit value.
3832    blt(CCR6, L_TooShort);             // Variable needlecnt: handle short needle separately.
3833   }
3834 
3835   if (n_csize == 2) { lwz(n_start, 0, needle); } else { lhz(n_start, 0, needle); } // Load first 2 characters of needle.
3836 
3837   if (needlecntval == 0) { // variable needlecnt
3838    subf(ch1, needlecnt, haycnt);      // Last character index to compare is haycnt-needlecnt.
3839    addi(needlecnt, needlecnt, -2);    // Rest of needle.
3840   } else { // constant needlecnt
3841   guarantee(needlecntval != 1, "IndexOf with single-character needle must be handled separately");
3842   assert((needlecntval & 0x7fff) == needlecntval, "wrong immediate");
3843    addi(ch1, haycnt, -needlecntval);  // Last character index to compare is haycnt-needlecnt.
3844    if (needlecntval > 3) { li(needlecnt, needlecntval - 2); } // Rest of needle.
3845   }
3846 
3847   if (h_csize == 2) { slwi(ch1, ch1, 1); } // Scale to number of bytes.
3848 
3849   if (ae == StrIntrinsicNode::UL) {
3850    srwi(tmp4, n_start, 1*8);          // ___0
3851    rlwimi(n_start, tmp4, 2*8, 0, 23); // _0_1
3852   }
3853 
3854   add(last_addr, haystack, ch1);      // Point to last address to compare (haystack+2*(haycnt-needlecnt)).
3855 
3856   // Main Loop (now we have at least 2 characters).
3857   Label L_OuterLoop, L_InnerLoop, L_FinalCheck, L_Comp1, L_Comp2;
3858   bind(L_OuterLoop); // Search for 1st 2 characters.
3859   Register addr_diff = tmp4;
3860    subf(addr_diff, addr, last_addr);  // Difference between already checked address and last address to check.
3861    addi(addr, addr, h_csize);         // This is the new address we want to use for comparing.
3862    srdi_(ch2, addr_diff, h_csize);
3863    beq(CCR0, L_FinalCheck);           // 2 characters left?
3864    mtctr(ch2);                        // num of characters / 2
3865   bind(L_InnerLoop);                  // Main work horse (2x unrolled search loop)
3866    if (h_csize == 2) {                // Load 2 characters of haystack (ignore alignment).
3867     lwz(ch1, 0, addr);
3868     lwz(ch2, 2, addr);
3869    } else {
3870     lhz(ch1, 0, addr);
3871     lhz(ch2, 1, addr);
3872    }
3873    cmpw(CCR0, ch1, n_start);          // Compare 2 characters (1 would be sufficient but try to reduce branches to CompLoop).
3874    cmpw(CCR1, ch2, n_start);
3875    beq(CCR0, L_Comp1);                // Did we find the needle start?
3876    beq(CCR1, L_Comp2);
3877    addi(addr, addr, 2 * h_csize);
3878    bdnz(L_InnerLoop);
3879   bind(L_FinalCheck);
3880    andi_(addr_diff, addr_diff, h_csize); // Remaining characters not covered by InnerLoop: (num of characters) & 1.
3881    beq(CCR0, L_NotFound);
3882    if (h_csize == 2) { lwz(ch1, 0, addr); } else { lhz(ch1, 0, addr); } // One position left at which we have to compare.
3883    cmpw(CCR1, ch1, n_start);
3884    beq(CCR1, L_Comp1);
3885   bind(L_NotFound);
3886    li(result, -1);                    // not found
3887    b(L_End);
3888 
3889    // **************************************************************************************************
3890    // Special Case: unfortunately, the variable needle case can be called with needlecnt<2
3891    // **************************************************************************************************
3892   if (needlecntval == 0) {           // We have to handle these cases separately.
3893   Label L_OneCharLoop;
3894   bind(L_TooShort);
3895    mtctr(haycnt);
3896    if (n_csize == 2) { lhz(n_start, 0, needle); } else { lbz(n_start, 0, needle); } // First character of needle
3897   bind(L_OneCharLoop);
3898    if (h_csize == 2) { lhzu(ch1, 2, addr); } else { lbzu(ch1, 1, addr); }
3899    cmpw(CCR1, ch1, n_start);
3900    beq(CCR1, L_Found);               // Did we find the one character needle?
3901    bdnz(L_OneCharLoop);
3902    li(result, -1);                   // Not found.
3903    b(L_End);
3904   }
3905 
3906   // **************************************************************************************************
3907   // Regular Case Part II: compare rest of needle (first 2 characters have been compared already)
3908   // **************************************************************************************************
3909 
3910   // Compare the rest
3911   bind(L_Comp2);
3912    addi(addr, addr, h_csize);        // First comparison has failed, 2nd one hit.
3913   bind(L_Comp1);                     // Addr points to possible needle start.
3914   if (needlecntval != 2) {           // Const needlecnt==2?
3915    if (needlecntval != 3) {
3916     if (needlecntval == 0) { beq(CCR6, L_Found); } // Variable needlecnt==2?
3917     Register n_ind = tmp4,
3918              h_ind = n_ind;
3919     li(n_ind, 2 * n_csize);          // First 2 characters are already compared, use index 2.
3920     mtctr(needlecnt);                // Decremented by 2, still > 0.
3921    Label L_CompLoop;
3922    bind(L_CompLoop);
3923     if (ae == StrIntrinsicNode::UL) {
3924       h_ind = ch1;
3925       sldi(h_ind, n_ind, 1);
3926     }
3927     if (n_csize == 2) { lhzx(ch2, needle, n_ind); } else { lbzx(ch2, needle, n_ind); }
3928     if (h_csize == 2) { lhzx(ch1, addr, h_ind); } else { lbzx(ch1, addr, h_ind); }
3929     cmpw(CCR1, ch1, ch2);
3930     bne(CCR1, L_OuterLoop);
3931     addi(n_ind, n_ind, n_csize);
3932     bdnz(L_CompLoop);
3933    } else { // No loop required if there's only one needle character left.
3934     if (n_csize == 2) { lhz(ch2, 2 * 2, needle); } else { lbz(ch2, 2 * 1, needle); }
3935     if (h_csize == 2) { lhz(ch1, 2 * 2, addr); } else { lbz(ch1, 2 * 1, addr); }
3936     cmpw(CCR1, ch1, ch2);
3937     bne(CCR1, L_OuterLoop);
3938    }
3939   }
3940   // Return index ...
3941   bind(L_Found);
3942    subf(result, haystack, addr);     // relative to haystack, ...
3943    if (h_csize == 2) { srdi(result, result, 1); } // in characters.
3944   bind(L_End);
3945 } // string_indexof
3946 
3947 void MacroAssembler::string_indexof_char(Register result, Register haystack, Register haycnt,
3948                                          Register needle, jchar needleChar, Register tmp1, Register tmp2, bool is_byte) {
3949   assert_different_registers(haystack, haycnt, needle, tmp1, tmp2);
3950 
3951   Label L_InnerLoop, L_FinalCheck, L_Found1, L_Found2, L_NotFound, L_End;
3952   Register addr = tmp1,
3953            ch1 = tmp2,
3954            ch2 = R0;
3955 
3956   const int h_csize = is_byte ? 1 : 2;
3957 
3958 //4:
3959    srwi_(tmp2, haycnt, 1);   // Shift right by exact_log2(UNROLL_FACTOR).
3960    mr(addr, haystack);
3961    beq(CCR0, L_FinalCheck);
3962    mtctr(tmp2);              // Move to count register.
3963 //8:
3964   bind(L_InnerLoop);         // Main work horse (2x unrolled search loop).
3965    if (!is_byte) {
3966     lhz(ch1, 0, addr);
3967     lhz(ch2, 2, addr);
3968    } else {
3969     lbz(ch1, 0, addr);
3970     lbz(ch2, 1, addr);
3971    }
3972    (needle != R0) ? cmpw(CCR0, ch1, needle) : cmplwi(CCR0, ch1, (unsigned int)needleChar);
3973    (needle != R0) ? cmpw(CCR1, ch2, needle) : cmplwi(CCR1, ch2, (unsigned int)needleChar);
3974    beq(CCR0, L_Found1);      // Did we find the needle?
3975    beq(CCR1, L_Found2);
3976    addi(addr, addr, 2 * h_csize);
3977    bdnz(L_InnerLoop);
3978 //16:
3979   bind(L_FinalCheck);
3980    andi_(R0, haycnt, 1);
3981    beq(CCR0, L_NotFound);
3982    if (!is_byte) { lhz(ch1, 0, addr); } else { lbz(ch1, 0, addr); } // One position left at which we have to compare.
3983    (needle != R0) ? cmpw(CCR1, ch1, needle) : cmplwi(CCR1, ch1, (unsigned int)needleChar);
3984    beq(CCR1, L_Found1);
3985 //21:
3986   bind(L_NotFound);
3987    li(result, -1);           // Not found.
3988    b(L_End);
3989 
3990   bind(L_Found2);
3991    addi(addr, addr, h_csize);
3992 //24:
3993   bind(L_Found1);            // Return index ...
3994    subf(result, haystack, addr); // relative to haystack, ...
3995    if (!is_byte) { srdi(result, result, 1); } // in characters.
3996   bind(L_End);
3997 } // string_indexof_char
3998 
3999 
4000 void MacroAssembler::has_negatives(Register src, Register cnt, Register result,
4001                                    Register tmp1, Register tmp2) {
4002   const Register tmp0 = R0;
4003   assert_different_registers(src, result, cnt, tmp0, tmp1, tmp2);
4004   Label Lfastloop, Lslow, Lloop, Lnoneg, Ldone;
4005 
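       // Fast path handles 16 bytes per iteration: OR the two doublewords and AND with
       // 0x8080808080808080; the result is nonzero iff some byte has its sign bit set.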
4006   // Check if cnt >= 16 (bytes)
4007   lis(tmp1, (int)(short)0x8080);  // tmp1 = 0x8080808080808080
4008   srwi_(tmp2, cnt, 4);
4009   li(result, 1);                  // Assume there's a negative byte.
4010   beq(CCR0, Lslow);
4011   ori(tmp1, tmp1, 0x8080);
4012   rldimi(tmp1, tmp1, 32, 0);
4013   mtctr(tmp2);
4014 
4015   // 2x unrolled loop
4016   bind(Lfastloop);
4017   ld(tmp2, 0, src);
4018   ld(tmp0, 8, src);
4019 
4020   orr(tmp0, tmp2, tmp0);
4021 
4022   and_(tmp0, tmp0, tmp1);
4023   bne(CCR0, Ldone);               // Found negative byte.
4024   addi(src, src, 16);
4025 
4026   bdnz(Lfastloop);
4027 
4028   bind(Lslow);                    // Fallback to slow version
4029   rldicl_(tmp0, cnt, 0, 64-4);
4030   beq(CCR0, Lnoneg);
4031   mtctr(tmp0);
4032   bind(Lloop);
4033   lbz(tmp0, 0, src);
4034   addi(src, src, 1);
4035   andi_(tmp0, tmp0, 0x80);
4036   bne(CCR0, Ldone);               // Found negative byte.
4037   bdnz(Lloop);
4038   bind(Lnoneg);
4039   li(result, 0);
4040 
4041   bind(Ldone);
4042 }
4043 
4044 #endif // COMPILER2
4045 
4046 // Helpers for Intrinsic Emitters
4047 //
4048 // Revert the byte order of a 32bit value in a register
4049 //   src: 0x44556677
4050 //   dst: 0x77665544
4051 // Three steps to obtain the result:
4052 //  1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word
4053 //     into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared.
4054 //     This value initializes dst.
4055 //  2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost
4056 //     byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go.
4057 //     This value is mask inserted into dst with a [0..23] mask of 1s.
4058 //  3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position.
4059 //     This value is mask inserted into dst with a [8..15] mask of 1s.
4060 void MacroAssembler::load_reverse_32(Register dst, Register src) {
4061   assert_different_registers(dst, src);
4062 
4063   rldicl(dst, src, (4+1)*8, 56);       // Rotate byte 4 into position 7 (rightmost), clear all to the left.
4064   rlwimi(dst, src,     3*8,  0, 23);   // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone.
4065   rlwimi(dst, src,     1*8,  8, 15);   // Insert byte 6 into position 5, leave the rest alone.
4066 }
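
// For reference, the three instructions above compute the same value as this plain C
// byte swap (illustrative sketch only, not used by the VM):
//   uint32_t reverse_32(uint32_t src) {
//     return ((src >> 24) & 0x000000ff) | ((src >>  8) & 0x0000ff00) |
//            ((src <<  8) & 0x00ff0000) | ((src << 24) & 0xff000000);
//   }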
4067 
4068 // Calculate the column addresses of the crc32 lookup table into distinct registers.
4069 // This loop-invariant calculation is moved out of the loop body, reducing the loop
4070 // body size from 20 to 16 instructions.
4071 // Returns the offset that was used to calculate the address of column tc3.
4072 // Due to register shortage, setting tc3 may overwrite table. With the return offset
4073 // at hand, the original table address can be easily reconstructed.
4074 int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) {
4075 
4076 #ifdef VM_LITTLE_ENDIAN
4077   // This is what we implement (the DOLIT4 part):
4078   // =========================================================================
4079   // #define DOLIT4 c ^= *buf4++; \
4080   //         c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^ \
4081   //             crc_table[1][(c >> 16) & 0xff] ^ crc_table[0][c >> 24]
4082   // #define DOLIT32 DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4
4083   // =========================================================================
4084   const int ix0 = 3*(4*CRC32_COLUMN_SIZE);
4085   const int ix1 = 2*(4*CRC32_COLUMN_SIZE);
4086   const int ix2 = 1*(4*CRC32_COLUMN_SIZE);
4087   const int ix3 = 0*(4*CRC32_COLUMN_SIZE);
4088 #else
4089   // This is what we implement (the DOBIG4 part):
4090   // =========================================================================
4091   // #define DOBIG4 c ^= *++buf4; \
4092   //         c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \
4093   //             crc_table[6][(c >> 16) & 0xff] ^ crc_table[7][c >> 24]
4094   // #define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4
4095   // =========================================================================
4096   const int ix0 = 4*(4*CRC32_COLUMN_SIZE);
4097   const int ix1 = 5*(4*CRC32_COLUMN_SIZE);
4098   const int ix2 = 6*(4*CRC32_COLUMN_SIZE);
4099   const int ix3 = 7*(4*CRC32_COLUMN_SIZE);
4100 #endif
4101   assert_different_registers(table, tc0, tc1, tc2);
4102   assert(table == tc3, "must be!");
4103 
4104   addi(tc0, table, ix0);
4105   addi(tc1, table, ix1);
4106   addi(tc2, table, ix2);
4107   if (ix3 != 0) addi(tc3, table, ix3);
4108 
4109   return ix3;
4110 }
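
// Typical use of the returned offset (sketch): after the unrolled loop, the original
// table address is recovered with
//   if (offset != 0) addi(table, table, -offset);
// as done in the kernel_crc32_* routines below.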
4111 
4112 /**
4113  * uint32_t crc;
4114  * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
4115  */
4116 void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) {
4117   assert_different_registers(crc, table, tmp);
4118   assert_different_registers(val, table);
4119 
4120   if (crc == val) {                   // Must rotate first to use the unmodified value.
4121     rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
4122                                       // As we use a word (4-byte) instruction, we have to adapt the mask bit positions.
4123     srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
4124   } else {
4125     srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
4126     rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
4127   }
4128   lwzx(tmp, table, tmp);
4129   xorr(crc, crc, tmp);
4130 }
4131 
4132 /**
4133  * uint32_t crc;
4134  * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
4135  */
4136 void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
4137   fold_byte_crc32(crc, crc, table, tmp);
4138 }
4139 
4140 /**
4141  * Emits code to update CRC-32 with a byte value according to constants in table.
4142  *
4143  * @param [in,out] crc   Register containing the crc.
4144  * @param [in]     val   Register containing the byte to fold into the CRC.
4145  * @param [in]     table Register containing the table of crc constants.
4146  *
4147  * uint32_t crc;
4148  * val = crc_table[(val ^ crc) & 0xFF];
4149  * crc = val ^ (crc >> 8);
4150  */
4151 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
4152   BLOCK_COMMENT("update_byte_crc32:");
4153   xorr(val, val, crc);
4154   fold_byte_crc32(crc, val, table, val);
4155 }
4156 
4157 /**
4158  * @param crc   register containing existing CRC (32-bit)
4159  * @param buf   register pointing to input byte buffer (byte*)
4160  * @param len   register containing number of bytes
4161  * @param table register pointing to CRC table
4162  */
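// Plain C equivalent of the emitted single-byte loop (illustrative sketch):
//   while (len-- != 0) { crc = table[(*buf++ ^ crc) & 0xff] ^ (crc >> 8); }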
4163 void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table,
4164                                            Register data, bool loopAlignment) {
4165   assert_different_registers(crc, buf, len, table, data);
4166 
4167   Label L_mainLoop, L_done;
4168   const int mainLoop_stepping  = 1;
4169   const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4;
4170 
4171   // Process all bytes in a single-byte loop.
4172   clrldi_(len, len, 32);                         // Enforce 32 bit. Anything to do?
4173   beq(CCR0, L_done);
4174 
4175   mtctr(len);
4176   align(mainLoop_alignment);
4177   BIND(L_mainLoop);
4178     lbz(data, 0, buf);                           // Byte from buffer, zero-extended.
4179     addi(buf, buf, mainLoop_stepping);           // Advance buffer position.
4180     update_byte_crc32(crc, data, table);
4181     bdnz(L_mainLoop);                            // Iterate.
4182 
4183   bind(L_done);
4184 }
4185 
4186 /**
4187  * Emits code to update CRC-32 with a 4-byte value according to constants in table
4188  * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c
4189  */
4190 // A note on the lookup table address(es):
4191 // The lookup table consists of two sets of four columns each.
4192 // The columns {0..3} are used for little-endian machines.
4193 // The columns {4..7} are used for big-endian machines.
4194 // To save the effort of adding the column offset to the table address each time
4195 // a table element is looked up, it is possible to pass the pre-calculated
4196 // column addresses.
4197 // Uses R9..R12 as work registers. They must be saved/restored by the caller, if necessary.
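// Illustrative C sketch of one word step, with the pre-computed column base addresses
// tc0..tc3 viewed as uint32_t* (column order depends on endianness, see crc32_table_columns):
//   c ^= *(const uint32_t*)(buf + bufDisp);
//   c = tc0[c & 0xff] ^ tc1[(c >> 8) & 0xff] ^ tc2[(c >> 16) & 0xff] ^ tc3[c >> 24];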
4198 void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc,
4199                                         Register t0,  Register t1,  Register t2,  Register t3,
4200                                         Register tc0, Register tc1, Register tc2, Register tc3) {
4201   assert_different_registers(crc, t3);
4202 
4203   // XOR crc with next four bytes of buffer.
4204   lwz(t3, bufDisp, buf);
4205   if (bufInc != 0) {
4206     addi(buf, buf, bufInc);
4207   }
4208   xorr(t3, t3, crc);
4209 
4210   // Chop crc into 4 single-byte pieces, shifted left 2 bits, to form the table indices.
4211   rlwinm(t0, t3,  2,         24-2, 31-2);  // ((t1 >>  0) & 0xff) << 2
4212   rlwinm(t1, t3,  32+(2- 8), 24-2, 31-2);  // ((t1 >>  8) & 0xff) << 2
4213   rlwinm(t2, t3,  32+(2-16), 24-2, 31-2);  // ((t1 >> 16) & 0xff) << 2
4214   rlwinm(t3, t3,  32+(2-24), 24-2, 31-2);  // ((t1 >> 24) & 0xff) << 2
4215 
4216   // Use the pre-calculated column addresses.
4217   // Load pre-calculated table values.
4218   lwzx(t0, tc0, t0);
4219   lwzx(t1, tc1, t1);
4220   lwzx(t2, tc2, t2);
4221   lwzx(t3, tc3, t3);
4222 
4223   // Calculate new crc from table values.
4224   xorr(t0,  t0, t1);
4225   xorr(t2,  t2, t3);
4226   xorr(crc, t0, t2);  // Now crc contains the final checksum value.
4227 }
4228 
4229 /**
4230  * @param crc   register containing existing CRC (32-bit)
4231  * @param buf   register pointing to input byte buffer (byte*)
4232  * @param len   register containing number of bytes
4233  * @param table register pointing to CRC table
4234  *
4235  * Uses R9..R12 as work registers. Must be saved/restored by the caller!
4236  */
4237 void MacroAssembler::kernel_crc32_2word(Register crc, Register buf, Register len, Register table,
4238                                         Register t0,  Register t1,  Register t2,  Register t3,
4239                                         Register tc0, Register tc1, Register tc2, Register tc3,
4240                                         bool invertCRC) {
4241   assert_different_registers(crc, buf, len, table);
4242 
4243   Label L_mainLoop, L_tail;
4244   Register  tmp  = t0;
4245   Register  data = t0;
4246   Register  tmp2 = t1;
4247   const int mainLoop_stepping  = 8;
4248   const int tailLoop_stepping  = 1;
4249   const int log_stepping       = exact_log2(mainLoop_stepping);
4250   const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
4251   const int complexThreshold   = 2*mainLoop_stepping;
4252 
4253   // Don't test for len <= 0 here. This pathological case should not occur anyway.
4254   // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles
4255   // for all well-behaved cases. The situation itself is detected and handled correctly
4256   // within update_byteLoop_crc32.
4257   assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
4258 
4259   BLOCK_COMMENT("kernel_crc32_2word {");
4260 
4261   if (invertCRC) {
4262     nand(crc, crc, crc);                      // 1s complement of crc
4263   }
4264 
4265   // Check for short (<mainLoop_stepping) buffer.
4266   cmpdi(CCR0, len, complexThreshold);
4267   blt(CCR0, L_tail);
4268 
4269   // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
4270   // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
4271   {
4272     // Align buf addr to mainLoop_stepping boundary.
4273     neg(tmp2, buf);                           // Calculate # preLoop iterations for alignment.
4274     rldicl(tmp2, tmp2, 0, 64-log_stepping);   // Keep only the low log_stepping bits (here bits 61..63), i.e. the bytes needed for alignment.
4275 
4276     if (complexThreshold > mainLoop_stepping) {
4277       sub(len, len, tmp2);                       // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4278     } else {
4279       sub(tmp, len, tmp2);                       // Remaining bytes for main loop.
4280       cmpdi(CCR0, tmp, mainLoop_stepping);
4281       blt(CCR0, L_tail);                         // For less than one mainloop_stepping left, do only tail processing
4282       mr(len, tmp);                              // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4283     }
4284     update_byteLoop_crc32(crc, buf, tmp2, table, data, false);
4285   }
4286 
4287   srdi(tmp2, len, log_stepping);                 // #iterations for mainLoop
4288   andi(len, len, mainLoop_stepping-1);           // remaining bytes for tailLoop
4289   mtctr(tmp2);
4290 
4291 #ifdef VM_LITTLE_ENDIAN
4292   Register crc_rv = crc;
4293 #else
4294   Register crc_rv = tmp;                         // Load_reverse needs separate registers to work on.
4295                                                  // Occupies tmp, but frees up crc.
4296   load_reverse_32(crc_rv, crc);                  // Revert byte order because we are dealing with big-endian data.
4297   tmp = crc;
4298 #endif
4299 
4300   int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
4301 
4302   align(mainLoop_alignment);                     // Octoword-aligned loop address. Shows 2% improvement.
4303   BIND(L_mainLoop);
4304     update_1word_crc32(crc_rv, buf, table, 0, 0, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
4305     update_1word_crc32(crc_rv, buf, table, 4, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
4306     bdnz(L_mainLoop);
4307 
4308 #ifndef VM_LITTLE_ENDIAN
4309   load_reverse_32(crc, crc_rv);                  // Revert byte order because we are dealing with big-endian data.
4310   tmp = crc_rv;                                  // tmp uses its original register again.
4311 #endif
4312 
4313   // Restore original table address for tailLoop.
4314   if (reconstructTableOffset != 0) {
4315     addi(table, table, -reconstructTableOffset);
4316   }
4317 
4318   // Process last few (<complexThreshold) bytes of buffer.
4319   BIND(L_tail);
4320   update_byteLoop_crc32(crc, buf, len, table, data, false);
4321 
4322   if (invertCRC) {
4323     nand(crc, crc, crc);                      // 1s complement of crc
4324   }
4325   BLOCK_COMMENT("} kernel_crc32_2word");
4326 }
4327 
4328 /**
4329  * @param crc   register containing existing CRC (32-bit)
4330  * @param buf   register pointing to input byte buffer (byte*)
4331  * @param len   register containing number of bytes
4332  * @param table register pointing to CRC table
4333  *
4334  * Uses R9..R12 as work registers. Must be saved/restored by the caller!
4335  */
4336 void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table,
4337                                         Register t0,  Register t1,  Register t2,  Register t3,
4338                                         Register tc0, Register tc1, Register tc2, Register tc3,
4339                                         bool invertCRC) {
4340   assert_different_registers(crc, buf, len, table);
4341 
4342   Label L_mainLoop, L_tail;
4343   Register  tmp          = t0;
4344   Register  data         = t0;
4345   Register  tmp2         = t1;
4346   const int mainLoop_stepping  = 4;
4347   const int tailLoop_stepping  = 1;
4348   const int log_stepping       = exact_log2(mainLoop_stepping);
4349   const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
4350   const int complexThreshold   = 2*mainLoop_stepping;
4351 
4352   // Don't test for len <= 0 here. This pathological case should not occur anyway.
4353   // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles
4354   // for all well-behaved cases. The situation itself is detected and handled correctly
4355   // within update_byteLoop_crc32.
4356   assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
4357 
4358   BLOCK_COMMENT("kernel_crc32_1word {");
4359 
4360   if (invertCRC) {
4361     nand(crc, crc, crc);                      // 1s complement of crc
4362   }
4363 
4364   // Check for short (<mainLoop_stepping) buffer.
4365   cmpdi(CCR0, len, complexThreshold);
4366   blt(CCR0, L_tail);
4367 
4368   // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
4369   // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
4370   {
4371     // Align buf addr to mainLoop_stepping boundary.
4372     neg(tmp2, buf);                              // Calculate # preLoop iterations for alignment.
4373     rldicl(tmp2, tmp2, 0, 64-log_stepping);      // Keep only the low log_stepping bits (here bits 62..63), i.e. the bytes needed for alignment.
4374 
4375     if (complexThreshold > mainLoop_stepping) {
4376       sub(len, len, tmp2);                       // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4377     } else {
4378       sub(tmp, len, tmp2);                       // Remaining bytes for main loop.
4379       cmpdi(CCR0, tmp, mainLoop_stepping);
4380       blt(CCR0, L_tail);                         // For less than one mainloop_stepping left, do only tail processing
4381       mr(len, tmp);                              // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4382     }
4383     update_byteLoop_crc32(crc, buf, tmp2, table, data, false);
4384   }
4385 
4386   srdi(tmp2, len, log_stepping);                 // #iterations for mainLoop
4387   andi(len, len, mainLoop_stepping-1);           // remaining bytes for tailLoop
4388   mtctr(tmp2);
4389 
4390 #ifdef VM_LITTLE_ENDIAN
4391   Register crc_rv = crc;
4392 #else
4393   Register crc_rv = tmp;                         // Load_reverse needs separate registers to work on.
4394                                                  // Occupies tmp, but frees up crc.
4395   load_reverse_32(crc_rv, crc);                  // Revert byte order because we are dealing with big-endian data.
4396   tmp = crc;
4397 #endif
4398 
4399   int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
4400 
4401   align(mainLoop_alignment);                     // Octoword-aligned loop address. Shows 2% improvement.
4402   BIND(L_mainLoop);
4403     update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
4404     bdnz(L_mainLoop);
4405 
4406 #ifndef VM_LITTLE_ENDIAN
4407   load_reverse_32(crc, crc_rv);                  // Revert byte order because we are dealing with big-endian data.
4408   tmp = crc_rv;                                  // tmp uses its original register again.
4409 #endif
4410 
4411   // Restore original table address for tailLoop.
4412   if (reconstructTableOffset != 0) {
4413     addi(table, table, -reconstructTableOffset);
4414   }
4415 
4416   // Process last few (<complexThreshold) bytes of buffer.
4417   BIND(L_tail);
4418   update_byteLoop_crc32(crc, buf, len, table, data, false);
4419 
4420   if (invertCRC) {
4421     nand(crc, crc, crc);                      // 1s complement of crc
4422   }
4423   BLOCK_COMMENT("} kernel_crc32_1word");
4424 }
4425 
4426 /**
4427  * @param crc   register containing existing CRC (32-bit)
4428  * @param buf   register pointing to input byte buffer (byte*)
4429  * @param len   register containing number of bytes
4430  * @param table register pointing to CRC table
4431  *
4432  * Uses R7_ARG5, R8_ARG6 as work registers.
4433  */
4434 void MacroAssembler::kernel_crc32_1byte(Register crc, Register buf, Register len, Register table,
4435                                         Register t0,  Register t1,  Register t2,  Register t3,
4436                                         bool invertCRC) {
4437   assert_different_registers(crc, buf, len, table);
4438 
4439   Register  data = t0;                   // Holds the current byte to be folded into crc.
4440 
4441   BLOCK_COMMENT("kernel_crc32_1byte {");
4442 
4443   if (invertCRC) {
4444     nand(crc, crc, crc);                      // 1s complement of crc
4445   }
4446 
4447   // Process all bytes in a single-byte loop.
4448   update_byteLoop_crc32(crc, buf, len, table, data, true);
4449 
4450   if (invertCRC) {
4451     nand(crc, crc, crc);                      // 1s complement of crc
4452   }
4453   BLOCK_COMMENT("} kernel_crc32_1byte");
4454 }
4455 
4456 /**
4457  * @param crc             register containing existing CRC (32-bit)
4458  * @param buf             register pointing to input byte buffer (byte*)
4459  * @param len             register containing number of bytes
4460  * @param table           register pointing to CRC table
4461  * @param constants       register pointing to CRC table for 128-bit aligned memory
4462  * @param barretConstants register pointing to table for barrett reduction
4463  * @param t0-t4           temp registers
4464  */
4465 void MacroAssembler::kernel_crc32_1word_vpmsum(Register crc, Register buf, Register len, Register table,
4466                                                Register constants, Register barretConstants,
4467                                                Register t0, Register t1, Register t2, Register t3, Register t4,
4468                                                bool invertCRC) {
4469   assert_different_registers(crc, buf, len, table);
4470 
4471   Label L_alignedHead, L_tail;
4472 
4473   BLOCK_COMMENT("kernel_crc32_1word_vpmsum {");
4474 
4475   // 1. ~c
4476   if (invertCRC) {
4477     nand(crc, crc, crc);                      // 1s complement of crc
4478   }
4479 
4480   // 2. use kernel_crc32_1word for short len
4481   clrldi(len, len, 32);
4482   cmpdi(CCR0, len, 512);
4483   blt(CCR0, L_tail);
4484 
4485   // 3. calculate from 0 to first aligned address
4486   const int alignment = 16;
4487   Register prealign = t0;
4488 
4489   andi_(prealign, buf, alignment - 1);
4490   beq(CCR0, L_alignedHead);
4491   subfic(prealign, prealign, alignment);
4492 
4493   subf(len, prealign, len);
4494   update_byteLoop_crc32(crc, buf, prealign, table, t2, false);
4495 
4496   // 4. calculate from first aligned address as far as possible
4497   BIND(L_alignedHead);
4498   kernel_crc32_1word_aligned(crc, buf, len, constants, barretConstants, t0, t1, t2, t3, t4);
4499 
4500   // 5. remaining bytes
4501   BIND(L_tail);
4502   Register tc0 = t4;
4503   Register tc1 = constants;
4504   Register tc2 = barretConstants;
4505   kernel_crc32_1word(crc, buf, len, table, t0, t1, t2, t3, tc0, tc1, tc2, table, false);
4506 
4507   // 6. ~c
4508   if (invertCRC) {
4509     nand(crc, crc, crc);                      // 1s complement of crc
4510   }
4511 
4512   BLOCK_COMMENT("} kernel_crc32_1word_vpmsum");
4513 }
4514 
4515 /**
4516  * @param crc             register containing existing CRC (32-bit)
4517  * @param buf             register pointing to input byte buffer (byte*)
4518  * @param len             register containing number of bytes (will get updated to remaining bytes)
4519  * @param constants       register pointing to CRC table for 128-bit aligned memory
4520  * @param barretConstants register pointing to table for barrett reduction
4521  * @param t0-t4           temp registers
4522  * Precondition: len should be >= 512. Otherwise, nothing will be done.
4523  */
4524 void MacroAssembler::kernel_crc32_1word_aligned(Register crc, Register buf, Register len,
4525     Register constants, Register barretConstants,
4526     Register t0, Register t1, Register t2, Register t3, Register t4) {
4527 
4528   // Save non-volatile vector registers (frameless).
4529   Register offset = t1;
4530   int offsetInt = 0;
4531   offsetInt -= 16; li(offset, offsetInt); stvx(VR20, offset, R1_SP);
4532   offsetInt -= 16; li(offset, offsetInt); stvx(VR21, offset, R1_SP);
4533   offsetInt -= 16; li(offset, offsetInt); stvx(VR22, offset, R1_SP);
4534   offsetInt -= 16; li(offset, offsetInt); stvx(VR23, offset, R1_SP);
4535   offsetInt -= 16; li(offset, offsetInt); stvx(VR24, offset, R1_SP);
4536   offsetInt -= 16; li(offset, offsetInt); stvx(VR25, offset, R1_SP);
4537 #ifndef VM_LITTLE_ENDIAN
4538   offsetInt -= 16; li(offset, offsetInt); stvx(VR26, offset, R1_SP);
4539 #endif
4540   offsetInt -= 8; std(R14, offsetInt, R1_SP);
4541   offsetInt -= 8; std(R15, offsetInt, R1_SP);
4542   offsetInt -= 8; std(R16, offsetInt, R1_SP);
4543   offsetInt -= 8; std(R17, offsetInt, R1_SP);
4544 
4545   // The implementation uses an inner loop which processes between 256 and 16 * unroll_factor
4546   // bytes per iteration. The basic scheme is:
4547   // lvx: load vector (Big Endian needs reversal)
4548   // vpmsumw: carry-less 32 bit multiplications with constant representing a large CRC shift
4549   // vxor: xor partial results together to get unroll_factor2 vectors
4550 
4551   // Outer loop performs the CRC shifts needed to combine the unroll_factor2 vectors.
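
  // Rough idea (sketch): the buffer is processed as unroll_factor2 interleaved lanes. Each
  // 128-bit chunk is carry-less multiplied (vpmsumw) by a constant representing the power of x
  // (modulo the CRC polynomial) corresponding to its distance from the end of the block, so
  // the partial products can simply be xor-accumulated. A final Barrett reduction at the end
  // of this function folds the accumulated value down to the 32-bit CRC.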
4552 
4553   // Using 16 * unroll_factor / unroll_factor_2 bytes for constants.
4554   const int unroll_factor = 2048;
4555   const int unroll_factor2 = 8;
4556 
4557   // Support registers.
4558   Register offs[] = { noreg, t0, t1, t2, t3, t4, crc /* will live in VCRC */, R14 };
4559   Register num_bytes = R15,
4560            loop_count = R16,
4561            cur_const = R17;
4562   // Constant array for outer loop: unroll_factor2 - 1 registers,
4563   // Constant array for inner loop: unroll_factor / unroll_factor2 registers.
4564   VectorRegister consts0[] = { VR16, VR17, VR18, VR19, VR20, VR21, VR22 },
4565                  consts1[] = { VR23, VR24 };
4566   // Data register arrays: 2 arrays with unroll_factor2 registers.
4567   VectorRegister data0[] = { VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7 },
4568                  data1[] = { VR8, VR9, VR10, VR11, VR12, VR13, VR14, VR15 };
4569 
4570   VectorRegister VCRC = data0[0];
4571   VectorRegister Vc = VR25;
4572   VectorRegister swap_bytes = VR26; // Only for Big Endian.
4573 
4574   // We have at least 1 iteration (ensured by caller).
4575   Label L_outer_loop, L_inner_loop, L_last;
4576 
4577   // If supported, set DSCR pre-fetch to deepest.
4578   if (VM_Version::has_mfdscr()) {
4579     load_const_optimized(t0, VM_Version::_dscr_val | 7);
4580     mtdscr(t0);
4581   }
4582 
4583   mtvrwz(VCRC, crc); // crc lives in VCRC now.
4584 
4585   for (int i = 1; i < unroll_factor2; ++i) {
4586     li(offs[i], 16 * i);
4587   }
4588 
4589   // Load consts for outer loop
4590   lvx(consts0[0], constants);
4591   for (int i = 1; i < unroll_factor2 - 1; ++i) {
4592     lvx(consts0[i], offs[i], constants);
4593   }
4594   addi(constants, constants, (unroll_factor2 - 1) * 16);
4595 
4596   load_const_optimized(num_bytes, 16 * unroll_factor);
4597   load_const_optimized(loop_count, unroll_factor / (2 * unroll_factor2) - 1); // One double-iteration peeled off.
4598 
4599   // Reuse data registers outside of the loop.
4600   VectorRegister Vtmp = data1[0];
4601   VectorRegister Vtmp2 = data1[1];
4602   VectorRegister zeroes = data1[2];
4603 
4604   vspltisb(Vtmp, 0);
4605   vsldoi(VCRC, Vtmp, VCRC, 8); // 96 bit zeroes, 32 bit CRC.
4606 
4607   // Load vector for vpermxor (to xor both 64 bit parts together)
4608   lvsl(Vtmp, buf);   // 000102030405060708090a0b0c0d0e0f
4609   vspltisb(Vc, 4);
4610   vsl(Vc, Vtmp, Vc); // 00102030405060708090a0b0c0d0e0f0
4611   xxspltd(Vc->to_vsr(), Vc->to_vsr(), 0);
4612   vor(Vc, Vtmp, Vc); // 001122334455667708192a3b4c5d6e7f
4613 
4614 #ifdef VM_LITTLE_ENDIAN
4615 #define BE_swap_bytes(x)
4616 #else
4617   vspltisb(Vtmp2, 0xf);
4618   vxor(swap_bytes, Vtmp, Vtmp2);
4619 #define BE_swap_bytes(x) vperm(x, x, x, swap_bytes)
4620 #endif
4621 
4622   cmpd(CCR0, len, num_bytes);
4623   blt(CCR0, L_last);
4624 
4625   // ********** Main loop start **********
4626   align(32);
4627   bind(L_outer_loop);
4628 
4629   // Begin of unrolled first iteration (no xor).
4630   lvx(data1[0], buf);
4631   mr(cur_const, constants);
4632   for (int i = 1; i < unroll_factor2 / 2; ++i) {
4633     lvx(data1[i], offs[i], buf);
4634   }
4635   vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
4636   lvx(consts1[0], cur_const);
4637   mtctr(loop_count);
4638   for (int i = 0; i < unroll_factor2 / 2; ++i) {
4639     BE_swap_bytes(data1[i]);
4640     if (i == 0) { vxor(data1[0], data1[0], VCRC); } // xor in previous CRC.
4641     lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf);
4642     vpmsumw(data0[i], data1[i], consts1[0]);
4643   }
4644   addi(buf, buf, 16 * unroll_factor2);
4645   subf(len, num_bytes, len);
4646   lvx(consts1[1], offs[1], cur_const);
4647   addi(cur_const, cur_const, 32);
4648   // Begin of unrolled second iteration (head).
4649   for (int i = 0; i < unroll_factor2 / 2; ++i) {
4650     BE_swap_bytes(data1[i + unroll_factor2 / 2]);
4651     if (i == 0) { lvx(data1[0], buf); } else { lvx(data1[i], offs[i], buf); }
4652     vpmsumw(data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[0]);
4653   }
4654   for (int i = 0; i < unroll_factor2 / 2; ++i) {
4655     BE_swap_bytes(data1[i]);
4656     lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf);
4657     vpmsumw(data1[i], data1[i], consts1[1]);
4658   }
4659   addi(buf, buf, 16 * unroll_factor2);
4660 
4661   // Generate the most performance-relevant code. Loads + half of the vpmsumw have been generated.
4662   // Double-iteration allows using the 2 constant registers alternatingly.
4663   align(32);
4664   bind(L_inner_loop);
4665   for (int j = 1; j < 3; ++j) { // j < unroll_factor / unroll_factor2 - 1 for complete unrolling.
4666     if (j & 1) {
4667       lvx(consts1[0], cur_const);
4668     } else {
4669       lvx(consts1[1], offs[1], cur_const);
4670       addi(cur_const, cur_const, 32);
4671     }
4672     for (int i = 0; i < unroll_factor2; ++i) {
4673       int idx = i + unroll_factor2 / 2, inc = 0; // For modulo-scheduled input.
4674       if (idx >= unroll_factor2) { idx -= unroll_factor2; inc = 1; }
4675       BE_swap_bytes(data1[idx]);
4676       vxor(data0[i], data0[i], data1[i]);
4677       if (i == 0) lvx(data1[0], buf); else lvx(data1[i], offs[i], buf);
4678       vpmsumw(data1[idx], data1[idx], consts1[(j + inc) & 1]);
4679     }
4680     addi(buf, buf, 16 * unroll_factor2);
4681   }
4682   bdnz(L_inner_loop);
4683 
4684   // Tail of last iteration (no loads).
4685   for (int i = 0; i < unroll_factor2 / 2; ++i) {
4686     BE_swap_bytes(data1[i + unroll_factor2 / 2]);
4687     vxor(data0[i], data0[i], data1[i]);
4688     vpmsumw(data1[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[1]);
4689   }
4690   for (int i = 0; i < unroll_factor2 / 2; ++i) {
4691     vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]); // First half of fixup shifts.
4692     vxor(data0[i + unroll_factor2 / 2], data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2]);
4693   }
4694 
4695   // Last data register is ok, other ones need fixup shift.
4696   for (int i = unroll_factor2 / 2; i < unroll_factor2 - 1; ++i) {
4697     vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]);
4698   }
4699 
4700   // Combine to 128 bit result vector VCRC = data0[0].
4701   for (int i = 1; i < unroll_factor2; i<<=1) {
4702     for (int j = 0; j <= unroll_factor2 - 2*i; j+=2*i) {
4703       vxor(data0[j], data0[j], data0[j+i]);
4704     }
4705   }
4706   cmpd(CCR0, len, num_bytes);
4707   bge(CCR0, L_outer_loop);
4708 
4709   // Last chance with lower num_bytes.
4710   bind(L_last);
4711   srdi(loop_count, len, exact_log2(16 * 2 * unroll_factor2)); // Use double-iterations.
4712   add_const_optimized(constants, constants, 16 * (unroll_factor / unroll_factor2)); // Point behind last one.
4713   sldi(R0, loop_count, exact_log2(16 * 2)); // Bytes of constants to be used.
4714   clrrdi(num_bytes, len, exact_log2(16 * 2 * unroll_factor2));
4715   subf(constants, R0, constants); // Point to constant to be used first.
4716 
4717   addic_(loop_count, loop_count, -1); // One double-iteration peeled off.
4718   bgt(CCR0, L_outer_loop);
4719   // ********** Main loop end **********
4720 #undef BE_swap_bytes
4721 
4722   // Restore DSCR pre-fetch value.
4723   if (VM_Version::has_mfdscr()) {
4724     load_const_optimized(t0, VM_Version::_dscr_val);
4725     mtdscr(t0);
4726   }
4727 
4728   vspltisb(zeroes, 0);
4729 
4730   // Combine to 64 bit result.
4731   vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
4732 
4733   // Reduce to 32 bit CRC: Remainder by multiply-high.
4734   lvx(Vtmp, barretConstants);
4735   vsldoi(Vtmp2, zeroes, VCRC, 12);  // Extract high 32 bit.
4736   vpmsumd(Vtmp2, Vtmp2, Vtmp);      // Multiply by inverse long poly.
4737   vsldoi(Vtmp2, zeroes, Vtmp2, 12); // Extract high 32 bit.
4738   vsldoi(Vtmp, zeroes, Vtmp, 8);
4739   vpmsumd(Vtmp2, Vtmp2, Vtmp);      // Multiply quotient by long poly.
4740   vxor(VCRC, VCRC, Vtmp2);          // Remainder fits into 32 bit.
4741 
4742   // Move result. len is already updated.
4743   vsldoi(VCRC, VCRC, zeroes, 8);
4744   mfvrd(crc, VCRC);
4745 
4746   // Restore non-volatile Vector registers (frameless).
4747   offsetInt = 0;
4748   offsetInt -= 16; li(offset, offsetInt); lvx(VR20, offset, R1_SP);
4749   offsetInt -= 16; li(offset, offsetInt); lvx(VR21, offset, R1_SP);
4750   offsetInt -= 16; li(offset, offsetInt); lvx(VR22, offset, R1_SP);
4751   offsetInt -= 16; li(offset, offsetInt); lvx(VR23, offset, R1_SP);
4752   offsetInt -= 16; li(offset, offsetInt); lvx(VR24, offset, R1_SP);
4753   offsetInt -= 16; li(offset, offsetInt); lvx(VR25, offset, R1_SP);
4754 #ifndef VM_LITTLE_ENDIAN
4755   offsetInt -= 16; li(offset, offsetInt); lvx(VR26, offset, R1_SP);
4756 #endif
4757   offsetInt -= 8;  ld(R14, offsetInt, R1_SP);
4758   offsetInt -= 8;  ld(R15, offsetInt, R1_SP);
4759   offsetInt -= 8;  ld(R16, offsetInt, R1_SP);
4760   offsetInt -= 8;  ld(R17, offsetInt, R1_SP);
4761 }
4762 
4763 void MacroAssembler::kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp, bool invertCRC) {
4764   assert_different_registers(crc, buf, /* len,  not used!! */ table, tmp);
4765 
4766   BLOCK_COMMENT("kernel_crc32_singleByte:");
4767   if (invertCRC) {
4768     nand(crc, crc, crc);                // 1s complement of crc
4769   }
4770 
4771   lbz(tmp, 0, buf);                     // Byte from buffer, zero-extended.
4772   update_byte_crc32(crc, tmp, table);
4773 
4774   if (invertCRC) {
4775     nand(crc, crc, crc);                // 1s complement of crc
4776   }
4777 }
4778 
4779 void MacroAssembler::kernel_crc32_singleByteReg(Register crc, Register val, Register table, bool invertCRC) {
4780   assert_different_registers(crc, val, table);
4781 
4782   BLOCK_COMMENT("kernel_crc32_singleByteReg:");
4783   if (invertCRC) {
4784     nand(crc, crc, crc);                // 1s complement of crc
4785   }
4786 
4787   update_byte_crc32(crc, val, table);
4788 
4789   if (invertCRC) {
4790     nand(crc, crc, crc);                // 1s complement of crc
4791   }
4792 }
4793 
4794 // dest_lo += src1 + src2
4795 // dest_hi += carry out of each of the two additions above
4796 void MacroAssembler::add2_with_carry(Register dest_hi,
4797                                      Register dest_lo,
4798                                      Register src1, Register src2) {
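  // Illustrative C sketch (assuming a 128-bit integer type is available):
  //   unsigned __int128 sum = ((unsigned __int128)dest_hi << 64) + dest_lo;
  //   sum += src1;  sum += src2;
  //   dest_lo = (uint64_t)sum;  dest_hi = (uint64_t)(sum >> 64);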
4799   li(R0, 0);
4800   addc(dest_lo, dest_lo, src1);
4801   adde(dest_hi, dest_hi, R0);
4802   addc(dest_lo, dest_lo, src2);
4803   adde(dest_hi, dest_hi, R0);
4804 }
4805 
4806 // Multiply 64 bit by 64 bit first loop.
4807 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart,
4808                                            Register x_xstart,
4809                                            Register y, Register y_idx,
4810                                            Register z,
4811                                            Register carry,
4812                                            Register product_high, Register product,
4813                                            Register idx, Register kdx,
4814                                            Register tmp) {
4815   //  jlong carry, x[], y[], z[];
4816   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
4817   //    huge_128 product = y[idx] * x[xstart] + carry;
4818   //    z[kdx] = (jlong)product;
4819   //    carry  = (jlong)(product >>> 64);
4820   //  }
4821   //  z[xstart] = carry;
4822 
4823   Label L_first_loop, L_first_loop_exit;
4824   Label L_one_x, L_one_y, L_multiply;
4825 
4826   addic_(xstart, xstart, -1);
4827   blt(CCR0, L_one_x);   // Special case: length of x is 1.
4828 
4829   // Load next two integers of x.
4830   sldi(tmp, xstart, LogBytesPerInt);
4831   ldx(x_xstart, x, tmp);
4832 #ifdef VM_LITTLE_ENDIAN
4833   rldicl(x_xstart, x_xstart, 32, 0);
4834 #endif
4835 
4836   align(32, 16);
4837   bind(L_first_loop);
4838 
4839   cmpdi(CCR0, idx, 1);
4840   blt(CCR0, L_first_loop_exit);
4841   addi(idx, idx, -2);
4842   beq(CCR0, L_one_y);
4843 
4844   // Load next two integers of y.
4845   sldi(tmp, idx, LogBytesPerInt);
4846   ldx(y_idx, y, tmp);
4847 #ifdef VM_LITTLE_ENDIAN
4848   rldicl(y_idx, y_idx, 32, 0);
4849 #endif
4850 
4851 
4852   bind(L_multiply);
4853   multiply64(product_high, product, x_xstart, y_idx);
4854 
4855   li(tmp, 0);
4856   addc(product, product, carry);         // Add carry to result.
4857   adde(product_high, product_high, tmp); // Add carry of the last addition.
4858   addi(kdx, kdx, -2);
4859 
4860   // Store result.
4861 #ifdef VM_LITTLE_ENDIAN
4862   rldicl(product, product, 32, 0);
4863 #endif
4864   sldi(tmp, kdx, LogBytesPerInt);
4865   stdx(product, z, tmp);
4866   mr_if_needed(carry, product_high);
4867   b(L_first_loop);
4868 
4869 
4870   bind(L_one_y); // Load one 32 bit portion of y as (0,value).
4871 
4872   lwz(y_idx, 0, y);
4873   b(L_multiply);
4874 
4875 
4876   bind(L_one_x); // Load one 32 bit portion of x as (0,value).
4877 
4878   lwz(x_xstart, 0, x);
4879   b(L_first_loop);
4880 
4881   bind(L_first_loop_exit);
4882 }
4883 
4884 // Multiply 64 bit by 64 bit and add 128 bit.
4885 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y,
4886                                             Register z, Register yz_idx,
4887                                             Register idx, Register carry,
4888                                             Register product_high, Register product,
4889                                             Register tmp, int offset) {
4890 
4891   //  huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
4892   //  z[kdx] = (jlong)product;
4893 
4894   sldi(tmp, idx, LogBytesPerInt);
4895   if (offset) {
4896     addi(tmp, tmp, offset);
4897   }
4898   ldx(yz_idx, y, tmp);
4899 #ifdef VM_LITTLE_ENDIAN
4900   rldicl(yz_idx, yz_idx, 32, 0);
4901 #endif
4902 
4903   multiply64(product_high, product, x_xstart, yz_idx);
4904   ldx(yz_idx, z, tmp);
4905 #ifdef VM_LITTLE_ENDIAN
4906   rldicl(yz_idx, yz_idx, 32, 0);
4907 #endif
4908 
4909   add2_with_carry(product_high, product, carry, yz_idx);
4910 
4911   sldi(tmp, idx, LogBytesPerInt);
4912   if (offset) {
4913     addi(tmp, tmp, offset);
4914   }
4915 #ifdef VM_LITTLE_ENDIAN
4916   rldicl(product, product, 32, 0);
4917 #endif
4918   stdx(product, z, tmp);
4919 }
4920 
4921 // Multiply 128 bit by 128 bit. Unrolled inner loop.
4922 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart,
4923                                              Register y, Register z,
4924                                              Register yz_idx, Register idx, Register carry,
4925                                              Register product_high, Register product,
4926                                              Register carry2, Register tmp) {
4927 
4928   //  jlong carry, x[], y[], z[];
4929   //  int kdx = ystart+1;
4930   //  for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
4931   //    huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
4932   //    z[kdx+idx+1] = (jlong)product;
4933   //    jlong carry2 = (jlong)(product >>> 64);
4934   //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
4935   //    z[kdx+idx] = (jlong)product;
4936   //    carry = (jlong)(product >>> 64);
4937   //  }
4938   //  idx += 2;
4939   //  if (idx > 0) {
4940   //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
4941   //    z[kdx+idx] = (jlong)product;
4942   //    carry = (jlong)(product >>> 64);
4943   //  }
4944 
4945   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
4946   const Register jdx = R0;
4947 
4948   // Scale the index.
4949   srdi_(jdx, idx, 2);
4950   beq(CCR0, L_third_loop_exit);
4951   mtctr(jdx);
4952 
4953   align(32, 16);
4954   bind(L_third_loop);
4955 
4956   addi(idx, idx, -4);
4957 
4958   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8);
4959   mr_if_needed(carry2, product_high);
4960 
4961   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0);
4962   mr_if_needed(carry, product_high);
4963   bdnz(L_third_loop);
4964 
4965   bind(L_third_loop_exit);  // Handle any left-over operand parts.
4966 
4967   andi_(idx, idx, 0x3);
4968   beq(CCR0, L_post_third_loop_done);
4969 
4970   Label L_check_1;
4971 
4972   addic_(idx, idx, -2);
4973   blt(CCR0, L_check_1);
4974 
4975   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0);
4976   mr_if_needed(carry, product_high);
4977 
4978   bind(L_check_1);
4979 
4980   addi(idx, idx, 0x2);
4981   andi_(idx, idx, 0x1);
4982   addic_(idx, idx, -1);
4983   blt(CCR0, L_post_third_loop_done);
4984 
4985   sldi(tmp, idx, LogBytesPerInt);
4986   lwzx(yz_idx, y, tmp);
4987   multiply64(product_high, product, x_xstart, yz_idx);
4988   lwzx(yz_idx, z, tmp);
4989 
4990   add2_with_carry(product_high, product, yz_idx, carry);
4991 
4992   sldi(tmp, idx, LogBytesPerInt);
4993   stwx(product, z, tmp);
4994   srdi(product, product, 32);
4995 
4996   sldi(product_high, product_high, 32);
4997   orr(product, product, product_high);
4998   mr_if_needed(carry, product);
4999 
5000   bind(L_post_third_loop_done);
5001 }   // multiply_128_x_128_loop
5002 
5003 void MacroAssembler::muladd(Register out, Register in,
5004                             Register offset, Register len, Register k,
5005                             Register tmp1, Register tmp2, Register carry) {
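
  // Java-style sketch of the assumed semantics (cf. java.math.BigInteger::implMulAdd);
  // in the generated code, 'offset' and 'len' are handled as byte offsets into the int arrays:
  //   long kLong = k & 0xffffffffL, carry = 0;
  //   for (int j = len - 1; j >= 0; j--, offset--) {
  //     long product = (in[j] & 0xffffffffL) * kLong + (out[offset] & 0xffffffffL) + carry;
  //     out[offset] = (int)product;
  //     carry = product >>> 32;
  //   }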
5006 
5007   // Labels
5008   Label LOOP, SKIP;
5009 
5010   // Make sure length is positive.
5011   cmpdi  (CCR0,    len,     0);
5012 
5013   // Prepare variables
5014   subi   (offset,  offset,  4);
5015   li     (carry,   0);
5016   ble    (CCR0,    SKIP);
5017 
5018   mtctr  (len);
5019   subi   (len,     len,     1    );
5020   sldi   (len,     len,     2    );
5021 
5022   // Main loop
5023   bind(LOOP);
5024   lwzx   (tmp1,    len,     in   );
5025   lwzx   (tmp2,    offset,  out  );
5026   mulld  (tmp1,    tmp1,    k    );
5027   add    (tmp2,    carry,   tmp2 );
5028   add    (tmp2,    tmp1,    tmp2 );
5029   stwx   (tmp2,    offset,  out  );
5030   srdi   (carry,   tmp2,    32   );
5031   subi   (offset,  offset,  4    );
5032   subi   (len,     len,     4    );
5033   bdnz   (LOOP);
5034   bind(SKIP);
5035 }
5036 
5037 void MacroAssembler::multiply_to_len(Register x, Register xlen,
5038                                      Register y, Register ylen,
5039                                      Register z, Register zlen,
5040                                      Register tmp1, Register tmp2,
5041                                      Register tmp3, Register tmp4,
5042                                      Register tmp5, Register tmp6,
5043                                      Register tmp7, Register tmp8,
5044                                      Register tmp9, Register tmp10,
5045                                      Register tmp11, Register tmp12,
5046                                      Register tmp13) {
5047 
5048   ShortBranchVerifier sbv(this);
5049 
5050   assert_different_registers(x, xlen, y, ylen, z, zlen,
5051                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
5052   assert_different_registers(x, xlen, y, ylen, z, zlen,
5053                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp7);
5054   assert_different_registers(x, xlen, y, ylen, z, zlen,
5055                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp8);
5056 
5057   const Register idx = tmp1;
5058   const Register kdx = tmp2;
5059   const Register xstart = tmp3;
5060 
5061   const Register y_idx = tmp4;
5062   const Register carry = tmp5;
5063   const Register product = tmp6;
5064   const Register product_high = tmp7;
5065   const Register x_xstart = tmp8;
5066   const Register tmp = tmp9;
5067 
5068   // First Loop.
5069   //
5070   //  final static long LONG_MASK = 0xffffffffL;
5071   //  int xstart = xlen - 1;
5072   //  int ystart = ylen - 1;
5073   //  long carry = 0;
5074   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
5075   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
5076   //    z[kdx] = (int)product;
5077   //    carry = product >>> 32;
5078   //  }
5079   //  z[xstart] = (int)carry;
5080 
5081   mr_if_needed(idx, ylen);        // idx = ylen
5082   mr_if_needed(kdx, zlen);        // kdx = xlen + ylen
5083   li(carry, 0);                   // carry = 0
5084 
5085   Label L_done;
5086 
5087   addic_(xstart, xlen, -1);
5088   blt(CCR0, L_done);
5089 
5090   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z,
5091                         carry, product_high, product, idx, kdx, tmp);
5092 
5093   Label L_second_loop;
5094 
5095   cmpdi(CCR0, kdx, 0);
5096   beq(CCR0, L_second_loop);
5097 
5098   Label L_carry;
5099 
5100   addic_(kdx, kdx, -1);
5101   beq(CCR0, L_carry);
5102 
5103   // Store lower 32 bits of carry.
5104   sldi(tmp, kdx, LogBytesPerInt);
5105   stwx(carry, z, tmp);
5106   srdi(carry, carry, 32);
5107   addi(kdx, kdx, -1);
5108 
5109 
5110   bind(L_carry);
5111 
5112   // Store upper 32 bits of carry.
5113   sldi(tmp, kdx, LogBytesPerInt);
5114   stwx(carry, z, tmp);
5115 
5116   // Second and third (nested) loops.
5117   //
5118   //  for (int i = xstart-1; i >= 0; i--) { // Second loop
5119   //    carry = 0;
5120   //    for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
5121   //      long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
5122   //                     (z[k] & LONG_MASK) + carry;
5123   //      z[k] = (int)product;
5124   //      carry = product >>> 32;
5125   //    }
5126   //    z[i] = (int)carry;
5127   //  }
5128   //
5129   //  i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx
5130 
5131   bind(L_second_loop);
5132 
5133   li(carry, 0);                   // carry = 0;
5134 
5135   addic_(xstart, xstart, -1);     // i = xstart-1;
5136   blt(CCR0, L_done);
5137 
5138   Register zsave = tmp10;
5139 
5140   mr(zsave, z);
5141 
5142 
5143   Label L_last_x;
5144 
5145   sldi(tmp, xstart, LogBytesPerInt);
5146   add(z, z, tmp);                 // z = z + k - j
5147   addi(z, z, 4);
5148   addic_(xstart, xstart, -1);     // i = xstart-1;
5149   blt(CCR0, L_last_x);
5150 
5151   sldi(tmp, xstart, LogBytesPerInt);
5152   ldx(x_xstart, x, tmp);
5153 #ifdef VM_LITTLE_ENDIAN
5154   rldicl(x_xstart, x_xstart, 32, 0);
5155 #endif
5156 
5157 
5158   Label L_third_loop_prologue;
5159 
5160   bind(L_third_loop_prologue);
5161 
5162   Register xsave = tmp11;
5163   Register xlensave = tmp12;
5164   Register ylensave = tmp13;
5165 
5166   mr(xsave, x);
5167   mr(xlensave, xstart);
5168   mr(ylensave, ylen);
5169 
5170 
5171   multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen,
5172                           carry, product_high, product, x, tmp);
5173 
5174   mr(z, zsave);
5175   mr(x, xsave);
5176   mr(xlen, xlensave);   // This is the decrement of the loop counter!
5177   mr(ylen, ylensave);
5178 
5179   addi(tmp3, xlen, 1);
5180   sldi(tmp, tmp3, LogBytesPerInt);
5181   stwx(carry, z, tmp);
5182   addic_(tmp3, tmp3, -1);
5183   blt(CCR0, L_done);
5184 
5185   srdi(carry, carry, 32);
5186   sldi(tmp, tmp3, LogBytesPerInt);
5187   stwx(carry, z, tmp);
5188   b(L_second_loop);
5189 
5190   // Infrequently executed code is moved outside of the loops.
5191   bind(L_last_x);
5192 
5193   lwz(x_xstart, 0, x);
5194   b(L_third_loop_prologue);
5195 
5196   bind(L_done);
5197 }   // multiply_to_len
5198 
5199 void MacroAssembler::asm_assert(bool check_equal, const char *msg, int id) {
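  // Expects the caller to have set CCR0 with a preceding compare; stops with 'msg'
  // unless the EQ bit of CCR0 matches 'check_equal'.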
5200 #ifdef ASSERT
5201   Label ok;
5202   if (check_equal) {
5203     beq(CCR0, ok);
5204   } else {
5205     bne(CCR0, ok);
5206   }
5207   stop(msg, id);
5208   bind(ok);
5209 #endif
5210 }
5211 
5212 void MacroAssembler::asm_assert_mems_zero(bool check_equal, int size, int mem_offset,
5213                                           Register mem_base, const char* msg, int id) {
5214 #ifdef ASSERT
5215   switch (size) {
5216     case 4:
5217       lwz(R0, mem_offset, mem_base);
5218       cmpwi(CCR0, R0, 0);
5219       break;
5220     case 8:
5221       ld(R0, mem_offset, mem_base);
5222       cmpdi(CCR0, R0, 0);
5223       break;
5224     default:
5225       ShouldNotReachHere();
5226   }
5227   asm_assert(check_equal, msg, id);
5228 #endif // ASSERT
5229 }
5230 
5231 void MacroAssembler::verify_thread() {
5232   if (VerifyThread) {
5233     unimplemented("'VerifyThread' currently not implemented on PPC");
5234   }
5235 }
5236 
5237 // READ: oop. KILL: R0. Volatile floats perhaps.
5238 void MacroAssembler::verify_oop(Register oop, const char* msg) {
5239   if (!VerifyOops) {
5240     return;
5241   }
5242 
5243   address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
5244   const Register tmp = R11; // Will be preserved.
5245   const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
5246   save_volatile_gprs(R1_SP, -nbytes_save); // except R0
5247 
5248   mr_if_needed(R4_ARG2, oop);
5249   save_LR_CR(tmp); // save in old frame
5250   push_frame_reg_args(nbytes_save, tmp);
5251   // load FunctionDescriptor** / entry_address *
5252   load_const_optimized(tmp, fd, R0);
5253   // load FunctionDescriptor* / entry_address
5254   ld(tmp, 0, tmp);
5255   load_const_optimized(R3_ARG1, (address)msg, R0);
5256   // Call destination for its side effect.
5257   call_c(tmp);
5258 
5259   pop_frame();
5260   restore_LR_CR(tmp);
5261   restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
5262 }
5263 
5264 void MacroAssembler::verify_oop_addr(RegisterOrConstant offs, Register base, const char* msg) {
5265   if (!VerifyOops) {
5266     return;
5267   }
5268 
5269   address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
5270   const Register tmp = R11; // Will be preserved.
5271   const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
5272   save_volatile_gprs(R1_SP, -nbytes_save); // except R0
5273 
5274   ld(R4_ARG2, offs, base);
5275   save_LR_CR(tmp); // save in old frame
5276   push_frame_reg_args(nbytes_save, tmp);
5277   // load FunctionDescriptor** / entry_address *
5278   load_const_optimized(tmp, fd, R0);
5279   // load FunctionDescriptor* / entry_address
5280   ld(tmp, 0, tmp);
5281   load_const_optimized(R3_ARG1, (address)msg, R0);
5282   // Call destination for its side effect.
5283   call_c(tmp);
5284 
5285   pop_frame();
5286   restore_LR_CR(tmp);
5287   restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
5288 }
5289 
5290 const char* stop_types[] = {
5291   "stop",
5292   "untested",
5293   "unimplemented",
5294   "shouldnotreachhere"
5295 };
5296 
5297 static void stop_on_request(int tp, const char* msg) {
5298   tty->print("PPC assembly code requires stop: (%s) %s\n", stop_types[tp%/*stop_end*/4], msg);
5299   guarantee(false, "PPC assembly code requires stop: %s", msg);
5300 }
5301 
5302 // Call a C-function that prints output.
5303 void MacroAssembler::stop(int type, const char* msg, int id) {
5304 #ifndef PRODUCT
5305   block_comment(err_msg("stop: %s %s {", stop_types[type%stop_end], msg));
5306 #else
5307   block_comment("stop {");
5308 #endif
5309 
5310   // setup arguments
5311   load_const_optimized(R3_ARG1, type);
5312   load_const_optimized(R4_ARG2, (void *)msg, /*tmp=*/R0);
5313   call_VM_leaf(CAST_FROM_FN_PTR(address, stop_on_request), R3_ARG1, R4_ARG2);
5314   illtrap();
5315   emit_int32(id);
5316   block_comment("} stop;");
5317 }
5318 
5319 #ifndef PRODUCT
5320 // Write pattern 0x0101010101010101 in memory region [low-before, high+after].
5321 // Val, addr are temp registers.
5322 // If low == addr, addr is killed.
5323 // High is preserved.
5324 void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) {
5325   if (!ZapMemory) return;
5326 
5327   assert_different_registers(low, val);
5328 
5329   BLOCK_COMMENT("zap memory region {");
5330   load_const_optimized(val, 0x0101010101010101);
5331   int size = before + after;
5332   if (low == high && size < 5 && size > 0) {
5333     int offset = -before*BytesPerWord;
5334     for (int i = 0; i < size; ++i) {
5335       std(val, offset, low);
5336       offset += (1*BytesPerWord);
5337     }
5338   } else {
5339     addi(addr, low, -before*BytesPerWord);
5340     assert_different_registers(high, val);
5341     if (after) addi(high, high, after * BytesPerWord);
5342     Label loop;
5343     bind(loop);
5344     std(val, 0, addr);
5345     addi(addr, addr, 8);
5346     cmpd(CCR6, addr, high);
5347     ble(CCR6, loop);
5348     if (after) addi(high, high, -after * BytesPerWord);  // Correct back to old value.
5349   }
5350   BLOCK_COMMENT("} zap memory region");
5351 }
5352 
5353 #endif // !PRODUCT
5354 
5355 void SkipIfEqualZero::skip_to_label_if_equal_zero(MacroAssembler* masm, Register temp,
5356                                                   const bool* flag_addr, Label& label) {
5357   int simm16_offset = masm->load_const_optimized(temp, (address)flag_addr, R0, true);
5358   assert(sizeof(bool) == 1, "PowerPC ABI");
5359   masm->lbz(temp, simm16_offset, temp);
5360   masm->cmpwi(CCR0, temp, 0);
5361   masm->beq(CCR0, label);
5362 }
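
// Typical use (sketch): constructing a SkipIfEqualZero on a flag address emits the
// conditional branch above; all code emitted until the destructor runs is skipped
// when the flag is false.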
5363 
5364 SkipIfEqualZero::SkipIfEqualZero(MacroAssembler* masm, Register temp, const bool* flag_addr) : _masm(masm), _label() {
5365   skip_to_label_if_equal_zero(masm, temp, flag_addr, _label);
5366 }
5367 
5368 SkipIfEqualZero::~SkipIfEqualZero() {
5369   _masm->bind(_label);
5370 }