1 /*
   2  * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2012, 2018, SAP SE. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.inline.hpp"
  28 #include "compiler/disassembler.hpp"
  29 #include "gc/shared/cardTable.hpp"
  30 #include "gc/shared/cardTableBarrierSet.hpp"
  31 #include "gc/shared/collectedHeap.inline.hpp"
  32 #include "interpreter/interpreter.hpp"
  33 #include "memory/resourceArea.hpp"
  34 #include "nativeInst_ppc.hpp"
  35 #include "prims/methodHandles.hpp"
  36 #include "runtime/biasedLocking.hpp"
  37 #include "runtime/icache.hpp"
  38 #include "runtime/interfaceSupport.inline.hpp"
  39 #include "runtime/objectMonitor.hpp"
  40 #include "runtime/os.hpp"
  41 #include "runtime/safepoint.hpp"
  42 #include "runtime/safepointMechanism.hpp"
  43 #include "runtime/sharedRuntime.hpp"
  44 #include "runtime/stubRoutines.hpp"
  45 #include "utilities/macros.hpp"
  46 #if INCLUDE_ALL_GCS
  47 #include "gc/g1/g1BarrierSet.hpp"
  48 #include "gc/g1/g1CardTable.hpp"
  49 #include "gc/g1/heapRegion.hpp"
  50 #endif // INCLUDE_ALL_GCS
  51 #ifdef COMPILER2
  52 #include "opto/intrinsicnode.hpp"
  53 #endif
  54 
  55 #ifdef PRODUCT
  56 #define BLOCK_COMMENT(str) // nothing
  57 #else
  58 #define BLOCK_COMMENT(str) block_comment(str)
  59 #endif
  60 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  61 
  62 #ifdef ASSERT
  63 // On RISC, there's no benefit to verifying instruction boundaries.
  64 bool AbstractAssembler::pd_check_instruction_mark() { return false; }
  65 #endif
  66 
  67 void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) {
  68   assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range");
  69   if (Assembler::is_simm(si31, 16)) {
  70     ld(d, si31, a);
  71     if (emit_filler_nop) nop();
  72   } else {
  73     const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31);
  74     const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31);
  75     addis(d, a, hi);
  76     ld(d, lo, d);
  77   }
  78 }
  79 
  80 void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) {
  81   assert_different_registers(d, a);
  82   ld_largeoffset_unchecked(d, si31, a, emit_filler_nop);
  83 }
  84 
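      // Load a value of size_in_bytes (8, 4, 2 or 1) from [base + offs] into dst, sign- or
      // zero-extending it as requested. Informal usage sketch (the offset value is hypothetical):
      //   load_sized_value(R5, 8 /* some field offset */, R3, 2, /*is_signed=*/true); // emits lha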
  85 void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base,
  86                                       size_t size_in_bytes, bool is_signed) {
  87   switch (size_in_bytes) {
  88   case  8:              ld(dst, offs, base);                         break;
  89   case  4:  is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break;
  90   case  2:  is_signed ? lha(dst, offs, base) : lhz(dst, offs, base); break;
  91   case  1:  lbz(dst, offs, base); if (is_signed) extsb(dst, dst);    break; // lba doesn't exist :(
  92   default:  ShouldNotReachHere();
  93   }
  94 }
  95 
  96 void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base,
  97                                        size_t size_in_bytes) {
  98   switch (size_in_bytes) {
  99   case  8:  std(dst, offs, base); break;
 100   case  4:  stw(dst, offs, base); break;
 101   case  2:  sth(dst, offs, base); break;
 102   case  1:  stb(dst, offs, base); break;
 103   default:  ShouldNotReachHere();
 104   }
 105 }
 106 
 107 void MacroAssembler::align(int modulus, int max, int rem) {
 108   int padding = (rem + modulus - (offset() % modulus)) % modulus;
 109   if (padding > max) return;
 110   for (int c = (padding >> 2); c > 0; --c) { nop(); }
 111 }
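      // Example: with modulus == 16, rem == 0 and offset() % 16 == 12, the padding is
      // (0 + 16 - 12) % 16 == 4 bytes, i.e. one nop; if the padding exceeded 'max',
      // nothing would be emitted.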
 112 
  113 // Issue instructions that calculate the given address (e.g. a method's TOC) from the global TOC.
 114 void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16,
 115                                                        bool add_relocation, bool emit_dummy_addr) {
 116   int offset = -1;
 117   if (emit_dummy_addr) {
 118     offset = -128; // dummy address
 119   } else if (addr != (address)(intptr_t)-1) {
 120     offset = MacroAssembler::offset_to_global_toc(addr);
 121   }
 122 
 123   if (hi16) {
 124     addis(dst, R29_TOC, MacroAssembler::largeoffset_si16_si16_hi(offset));
 125   }
 126   if (lo16) {
 127     if (add_relocation) {
 128       // Relocate at the addi to avoid confusion with a load from the method's TOC.
 129       relocate(internal_word_Relocation::spec(addr));
 130     }
 131     addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset));
 132   }
 133 }
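      // With both hi16 and lo16 requested, the emitted sequence is roughly
      //   addis dst, R29_TOC, hi16(offset)
      //   addi  dst, dst,     lo16(offset)
      // where largeoffset_si16_si16_hi/_lo split the global-TOC-relative offset so that
      // the two 16-bit immediates add back up to it.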
 134 
 135 address MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) {
 136   const int offset = MacroAssembler::offset_to_global_toc(addr);
 137 
 138   const address inst2_addr = a;
 139   const int inst2 = *(int *)inst2_addr;
 140 
 141   // The relocation points to the second instruction, the addi,
 142   // and the addi reads and writes the same register dst.
 143   const int dst = inv_rt_field(inst2);
 144   assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");
 145 
 146   // Now, find the preceding addis which writes to dst.
 147   int inst1 = 0;
 148   address inst1_addr = inst2_addr - BytesPerInstWord;
 149   while (inst1_addr >= bound) {
 150     inst1 = *(int *) inst1_addr;
 151     if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
 152       // Stop, found the addis which writes dst.
 153       break;
 154     }
 155     inst1_addr -= BytesPerInstWord;
 156   }
 157 
 158   assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
 159   set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset));
 160   set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset));
 161   return inst1_addr;
 162 }
 163 
 164 address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) {
 165   const address inst2_addr = a;
 166   const int inst2 = *(int *)inst2_addr;
 167 
 168   // The relocation points to the second instruction, the addi,
 169   // and the addi reads and writes the same register dst.
 170   const int dst = inv_rt_field(inst2);
 171   assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");
 172 
 173   // Now, find the preceding addis which writes to dst.
 174   int inst1 = 0;
 175   address inst1_addr = inst2_addr - BytesPerInstWord;
 176   while (inst1_addr >= bound) {
 177     inst1 = *(int *) inst1_addr;
 178     if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
 179       // stop, found the addis which writes dst
 180       break;
 181     }
 182     inst1_addr -= BytesPerInstWord;
 183   }
 184 
 185   assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
 186 
 187   int offset = (get_imm(inst1_addr, 0) << 16) + get_imm(inst2_addr, 0);
 188   // -1 is a special case
 189   if (offset == -1) {
 190     return (address)(intptr_t)-1;
 191   } else {
 192     return global_toc() + offset;
 193   }
 194 }
 195 
 196 #ifdef _LP64
 197 // Patch compressed oops or klass constants.
 198 // Assembler sequence is
 199 // 1) compressed oops:
 200 //    lis  rx = const.hi
 201 //    ori rx = rx | const.lo
 202 // 2) compressed klass:
 203 //    lis  rx = const.hi
 204 //    clrldi rx = rx & 0xFFFFffff // clearMS32b, optional
 205 //    ori rx = rx | const.lo
  206 // An intervening clrldi is simply skipped over (passed by) when patching.
 207 address MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) {
 208   assert(UseCompressedOops, "Should only patch compressed oops");
 209 
 210   const address inst2_addr = a;
 211   const int inst2 = *(int *)inst2_addr;
 212 
 213   // The relocation points to the second instruction, the ori,
 214   // and the ori reads and writes the same register dst.
 215   const int dst = inv_rta_field(inst2);
 216   assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
 217   // Now, find the preceding addis which writes to dst.
 218   int inst1 = 0;
 219   address inst1_addr = inst2_addr - BytesPerInstWord;
 220   bool inst1_found = false;
 221   while (inst1_addr >= bound) {
 222     inst1 = *(int *)inst1_addr;
 223     if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
 224     inst1_addr -= BytesPerInstWord;
 225   }
 226   assert(inst1_found, "inst is not lis");
 227 
 228   int xc = (data >> 16) & 0xffff;
 229   int xd = (data >>  0) & 0xffff;
 230 
 231   set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo
 232   set_imm((int *)inst2_addr,        (xd)); // unsigned int
 233   return inst1_addr;
 234 }
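      // Example: patching in the narrow oop value 0x12345678 sets the lis immediate to
      // 0x1234 and the ori immediate to 0x5678.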
 235 
 236 // Get compressed oop or klass constant.
 237 narrowOop MacroAssembler::get_narrow_oop(address a, address bound) {
 238   assert(UseCompressedOops, "Should only patch compressed oops");
 239 
 240   const address inst2_addr = a;
 241   const int inst2 = *(int *)inst2_addr;
 242 
 243   // The relocation points to the second instruction, the ori,
 244   // and the ori reads and writes the same register dst.
 245   const int dst = inv_rta_field(inst2);
 246   assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
 247   // Now, find the preceding lis which writes to dst.
 248   int inst1 = 0;
 249   address inst1_addr = inst2_addr - BytesPerInstWord;
 250   bool inst1_found = false;
 251 
 252   while (inst1_addr >= bound) {
 253     inst1 = *(int *) inst1_addr;
 254     if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break;}
 255     inst1_addr -= BytesPerInstWord;
 256   }
 257   assert(inst1_found, "inst is not lis");
 258 
 259   uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff));
 260   uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16);
 261 
 262   return (int) (xl | xh);
 263 }
 264 #endif // _LP64
 265 
 266 // Returns true if successful.
 267 bool MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a,
 268                                                 Register toc, bool fixed_size) {
 269   int toc_offset = 0;
 270   // Use RelocationHolder::none for the constant pool entry, otherwise
 271   // we will end up with a failing NativeCall::verify(x) where x is
 272   // the address of the constant pool entry.
 273   // FIXME: We should insert relocation information for oops at the constant
 274   // pool entries instead of inserting it at the loads; patching of a constant
 275   // pool entry should be less expensive.
 276   address const_address = address_constant((address)a.value(), RelocationHolder::none);
 277   if (const_address == NULL) { return false; } // allocation failure
 278   // Relocate at the pc of the load.
 279   relocate(a.rspec());
 280   toc_offset = (int)(const_address - code()->consts()->start());
 281   ld_largeoffset_unchecked(dst, toc_offset, toc, fixed_size);
 282   return true;
 283 }
 284 
 285 bool MacroAssembler::is_load_const_from_method_toc_at(address a) {
 286   const address inst1_addr = a;
 287   const int inst1 = *(int *)inst1_addr;
 288 
  289   // The relocation points to the ld or the addis.
  290   return (is_ld(inst1)) ||
  291          (is_addis(inst1) && inv_ra_field(inst1) != 0);
 292 }
 293 
 294 int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) {
 295   assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc");
 296 
 297   const address inst1_addr = a;
 298   const int inst1 = *(int *)inst1_addr;
 299 
 300   if (is_ld(inst1)) {
 301     return inv_d1_field(inst1);
 302   } else if (is_addis(inst1)) {
 303     const int dst = inv_rt_field(inst1);
 304 
 305     // Now, find the succeeding ld which reads and writes to dst.
 306     address inst2_addr = inst1_addr + BytesPerInstWord;
 307     int inst2 = 0;
 308     while (true) {
 309       inst2 = *(int *) inst2_addr;
 310       if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) {
 311         // Stop, found the ld which reads and writes dst.
 312         break;
 313       }
 314       inst2_addr += BytesPerInstWord;
 315     }
 316     return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2);
 317   }
 318   ShouldNotReachHere();
 319   return 0;
 320 }
 321 
 322 // Get the constant from a `load_const' sequence.
 323 long MacroAssembler::get_const(address a) {
 324   assert(is_load_const_at(a), "not a load of a constant");
 325   const int *p = (const int*) a;
 326   unsigned long x = (((unsigned long) (get_imm(a,0) & 0xffff)) << 48);
 327   if (is_ori(*(p+1))) {
 328     x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 32);
 329     x |= (((unsigned long) (get_imm(a,3) & 0xffff)) << 16);
 330     x |= (((unsigned long) (get_imm(a,4) & 0xffff)));
 331   } else if (is_lis(*(p+1))) {
 332     x |= (((unsigned long) (get_imm(a,2) & 0xffff)) << 32);
 333     x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 16);
 334     x |= (((unsigned long) (get_imm(a,3) & 0xffff)));
 335   } else {
 336     ShouldNotReachHere();
 337     return (long) 0;
 338   }
 339   return (long) x;
 340 }
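      // The immediate word indices used above and in patch_const() below match the two
      // load_const forms: without a temp register (lis, ori, sldi, oris, ori; immediates in
      // words 0, 1, 3, 4) and with a temp register (lis, lis, ori, ori, then a rotate-insert
      // that merges the halves; immediates in words 0, 2, 1, 3). This mapping is inferred
      // from the index arithmetic here, not spelled out elsewhere in this file.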
 341 
 342 // Patch the 64 bit constant of a `load_const' sequence. This is a low
 343 // level procedure. It neither flushes the instruction cache nor is it
  344 // mt-safe.
 345 void MacroAssembler::patch_const(address a, long x) {
 346   assert(is_load_const_at(a), "not a load of a constant");
 347   int *p = (int*) a;
 348   if (is_ori(*(p+1))) {
 349     set_imm(0 + p, (x >> 48) & 0xffff);
 350     set_imm(1 + p, (x >> 32) & 0xffff);
 351     set_imm(3 + p, (x >> 16) & 0xffff);
 352     set_imm(4 + p, x & 0xffff);
 353   } else if (is_lis(*(p+1))) {
 354     set_imm(0 + p, (x >> 48) & 0xffff);
 355     set_imm(2 + p, (x >> 32) & 0xffff);
 356     set_imm(1 + p, (x >> 16) & 0xffff);
 357     set_imm(3 + p, x & 0xffff);
 358   } else {
 359     ShouldNotReachHere();
 360   }
 361 }
 362 
 363 AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) {
 364   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
 365   int index = oop_recorder()->allocate_metadata_index(obj);
 366   RelocationHolder rspec = metadata_Relocation::spec(index);
 367   return AddressLiteral((address)obj, rspec);
 368 }
 369 
 370 AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) {
 371   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
 372   int index = oop_recorder()->find_index(obj);
 373   RelocationHolder rspec = metadata_Relocation::spec(index);
 374   return AddressLiteral((address)obj, rspec);
 375 }
 376 
 377 AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) {
 378   assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
 379   int oop_index = oop_recorder()->allocate_oop_index(obj);
 380   return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
 381 }
 382 
 383 AddressLiteral MacroAssembler::constant_oop_address(jobject obj) {
 384   assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
 385   int oop_index = oop_recorder()->find_index(obj);
 386   return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
 387 }
 388 
 389 RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
 390                                                       Register tmp, int offset) {
 391   intptr_t value = *delayed_value_addr;
 392   if (value != 0) {
 393     return RegisterOrConstant(value + offset);
 394   }
 395 
 396   // Load indirectly to solve generation ordering problem.
 397   // static address, no relocation
 398   int simm16_offset = load_const_optimized(tmp, delayed_value_addr, noreg, true);
 399   ld(tmp, simm16_offset, tmp); // must be aligned ((xa & 3) == 0)
 400 
 401   if (offset != 0) {
 402     addi(tmp, tmp, offset);
 403   }
 404 
 405   return RegisterOrConstant(tmp);
 406 }
 407 
 408 #ifndef PRODUCT
 409 void MacroAssembler::pd_print_patched_instruction(address branch) {
 410   Unimplemented(); // TODO: PPC port
 411 }
 412 #endif // ndef PRODUCT
 413 
 414 // Conditional far branch for destinations encodable in 24+2 bits.
 415 void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) {
 416 
 417   // If requested by flag optimize, relocate the bc_far as a
 418   // runtime_call and prepare for optimizing it when the code gets
 419   // relocated.
 420   if (optimize == bc_far_optimize_on_relocate) {
 421     relocate(relocInfo::runtime_call_type);
 422   }
 423 
 424   // variant 2:
 425   //
 426   //    b!cxx SKIP
 427   //    bxx   DEST
 428   //  SKIP:
 429   //
 430 
 431   const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
 432                                                 opposite_bcond(inv_boint_bcond(boint)));
 433 
 434   // We emit two branches.
 435   // First, a conditional branch which jumps around the far branch.
 436   const address not_taken_pc = pc() + 2 * BytesPerInstWord;
 437   const address bc_pc        = pc();
 438   bc(opposite_boint, biint, not_taken_pc);
 439 
 440   const int bc_instr = *(int*)bc_pc;
 441   assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition");
 442   assert(opposite_boint == inv_bo_field(bc_instr), "postcondition");
 443   assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))),
 444                                      opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))),
 445          "postcondition");
 446   assert(biint == inv_bi_field(bc_instr), "postcondition");
 447 
 448   // Second, an unconditional far branch which jumps to dest.
 449   // Note: target(dest) remembers the current pc (see CodeSection::target)
 450   //       and returns the current pc if the label is not bound yet; when
 451   //       the label gets bound, the unconditional far branch will be patched.
 452   const address target_pc = target(dest);
 453   const address b_pc  = pc();
 454   b(target_pc);
 455 
 456   assert(not_taken_pc == pc(),                     "postcondition");
 457   assert(dest.is_bound() || target_pc == b_pc, "postcondition");
 458 }
 459 
 460 // 1 or 2 instructions
 461 void MacroAssembler::bc_far_optimized(int boint, int biint, Label& dest) {
 462   if (dest.is_bound() && is_within_range_of_bcxx(target(dest), pc())) {
 463     bc(boint, biint, dest);
 464   } else {
 465     bc_far(boint, biint, dest, MacroAssembler::bc_far_optimize_on_relocate);
 466   }
 467 }
 468 
 469 bool MacroAssembler::is_bc_far_at(address instruction_addr) {
 470   return is_bc_far_variant1_at(instruction_addr) ||
 471          is_bc_far_variant2_at(instruction_addr) ||
 472          is_bc_far_variant3_at(instruction_addr);
 473 }
 474 
 475 address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) {
 476   if (is_bc_far_variant1_at(instruction_addr)) {
 477     const address instruction_1_addr = instruction_addr;
 478     const int instruction_1 = *(int*)instruction_1_addr;
 479     return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr);
 480   } else if (is_bc_far_variant2_at(instruction_addr)) {
 481     const address instruction_2_addr = instruction_addr + 4;
 482     return bxx_destination(instruction_2_addr);
 483   } else if (is_bc_far_variant3_at(instruction_addr)) {
 484     return instruction_addr + 8;
 485   }
 486   // variant 4 ???
 487   ShouldNotReachHere();
 488   return NULL;
 489 }
 490 void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) {
 491 
 492   if (is_bc_far_variant3_at(instruction_addr)) {
 493     // variant 3, far cond branch to the next instruction, already patched to nops:
 494     //
 495     //    nop
 496     //    endgroup
 497     //  SKIP/DEST:
 498     //
 499     return;
 500   }
 501 
 502   // first, extract boint and biint from the current branch
 503   int boint = 0;
 504   int biint = 0;
 505 
 506   ResourceMark rm;
 507   const int code_size = 2 * BytesPerInstWord;
 508   CodeBuffer buf(instruction_addr, code_size);
 509   MacroAssembler masm(&buf);
 510   if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) {
 511     // Far branch to next instruction: Optimize it by patching nops (produce variant 3).
 512     masm.nop();
 513     masm.endgroup();
 514   } else {
 515     if (is_bc_far_variant1_at(instruction_addr)) {
 516       // variant 1, the 1st instruction contains the destination address:
 517       //
 518       //    bcxx  DEST
 519       //    nop
 520       //
 521       const int instruction_1 = *(int*)(instruction_addr);
 522       boint = inv_bo_field(instruction_1);
 523       biint = inv_bi_field(instruction_1);
 524     } else if (is_bc_far_variant2_at(instruction_addr)) {
 525       // variant 2, the 2nd instruction contains the destination address:
 526       //
 527       //    b!cxx SKIP
 528       //    bxx   DEST
 529       //  SKIP:
 530       //
 531       const int instruction_1 = *(int*)(instruction_addr);
 532       boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))),
 533           opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1))));
 534       biint = inv_bi_field(instruction_1);
 535     } else {
 536       // variant 4???
 537       ShouldNotReachHere();
 538     }
 539 
 540     // second, set the new branch destination and optimize the code
 541     if (dest != instruction_addr + 4 && // the bc_far is still unbound!
 542         masm.is_within_range_of_bcxx(dest, instruction_addr)) {
 543       // variant 1:
 544       //
 545       //    bcxx  DEST
 546       //    nop
 547       //
 548       masm.bc(boint, biint, dest);
 549       masm.nop();
 550     } else {
 551       // variant 2:
 552       //
 553       //    b!cxx SKIP
 554       //    bxx   DEST
 555       //  SKIP:
 556       //
 557       const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
 558                                                     opposite_bcond(inv_boint_bcond(boint)));
 559       const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord;
 560       masm.bc(opposite_boint, biint, not_taken_pc);
 561       masm.b(dest);
 562     }
 563   }
 564   ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
 565 }
 566 
  567 // Emit a patchable 64 bit absolute call/jump; the sequence is NOT mt-safe to patch.
 568 void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) {
 569   // get current pc
 570   uint64_t start_pc = (uint64_t) pc();
 571 
 572   const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last
 573   const address pc_of_b  = (address) (start_pc + (0*BytesPerInstWord)); // b is first
 574 
 575   // relocate here
 576   if (rt != relocInfo::none) {
 577     relocate(rt);
 578   }
 579 
 580   if ( ReoptimizeCallSequences &&
 581        (( link && is_within_range_of_b(dest, pc_of_bl)) ||
 582         (!link && is_within_range_of_b(dest, pc_of_b)))) {
 583     // variant 2:
 584     // Emit an optimized, pc-relative call/jump.
 585 
 586     if (link) {
 587       // some padding
 588       nop();
 589       nop();
 590       nop();
 591       nop();
 592       nop();
 593       nop();
 594 
 595       // do the call
 596       assert(pc() == pc_of_bl, "just checking");
 597       bl(dest, relocInfo::none);
 598     } else {
 599       // do the jump
 600       assert(pc() == pc_of_b, "just checking");
 601       b(dest, relocInfo::none);
 602 
 603       // some padding
 604       nop();
 605       nop();
 606       nop();
 607       nop();
 608       nop();
 609       nop();
 610     }
 611 
 612     // Assert that we can identify the emitted call/jump.
 613     assert(is_bxx64_patchable_variant2_at((address)start_pc, link),
 614            "can't identify emitted call");
 615   } else {
 616     // variant 1:
 617     mr(R0, R11);  // spill R11 -> R0.
 618 
 619     // Load the destination address into CTR,
 620     // calculate destination relative to global toc.
 621     calculate_address_from_global_toc(R11, dest, true, true, false);
 622 
 623     mtctr(R11);
 624     mr(R11, R0);  // spill R11 <- R0.
 625     nop();
 626 
 627     // do the call/jump
 628     if (link) {
 629       bctrl();
  630     } else {
 631       bctr();
 632     }
 633     // Assert that we can identify the emitted call/jump.
 634     assert(is_bxx64_patchable_variant1b_at((address)start_pc, link),
 635            "can't identify emitted call");
 636   }
 637 
 638   // Assert that we can identify the emitted call/jump.
 639   assert(is_bxx64_patchable_at((address)start_pc, link),
 640          "can't identify emitted call");
 641   assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest,
 642          "wrong encoding of dest address");
 643 }
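      // Both emitted forms are seven instructions long (bxx64_patchable_size):
      //   variant 2 (pc-relative):   bl as the last word after six nops, or b as the first
      //                              word followed by six nops;
      //   variant 1b (toc-relative): mr R0,R11; addis R11,R29_TOC,hi; addi R11,R11,lo;
      //                              mtctr R11; mr R11,R0; nop; bctr/bctrl.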
 644 
 645 // Identify a bxx64_patchable instruction.
 646 bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) {
 647   return is_bxx64_patchable_variant1b_at(instruction_addr, link)
 648     //|| is_bxx64_patchable_variant1_at(instruction_addr, link)
 649       || is_bxx64_patchable_variant2_at(instruction_addr, link);
 650 }
 651 
  652 // Does the bxx64_patchable instruction use a pc-relative encoding of
 653 // the call destination?
 654 bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) {
 655   // variant 2 is pc-relative
 656   return is_bxx64_patchable_variant2_at(instruction_addr, link);
 657 }
 658 
 659 // Identify variant 1.
 660 bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) {
 661   unsigned int* instr = (unsigned int*) instruction_addr;
 662   return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
  663     && is_mtctr(instr[5]) // mtctr
 664     && is_load_const_at(instruction_addr);
 665 }
 666 
 667 // Identify variant 1b: load destination relative to global toc.
 668 bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) {
 669   unsigned int* instr = (unsigned int*) instruction_addr;
 670   return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
 671     && is_mtctr(instr[3]) // mtctr
 672     && is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr);
 673 }
 674 
 675 // Identify variant 2.
 676 bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) {
 677   unsigned int* instr = (unsigned int*) instruction_addr;
 678   if (link) {
 679     return is_bl (instr[6])  // bl dest is last
 680       && is_nop(instr[0])  // nop
 681       && is_nop(instr[1])  // nop
 682       && is_nop(instr[2])  // nop
 683       && is_nop(instr[3])  // nop
 684       && is_nop(instr[4])  // nop
 685       && is_nop(instr[5]); // nop
 686   } else {
 687     return is_b  (instr[0])  // b  dest is first
 688       && is_nop(instr[1])  // nop
 689       && is_nop(instr[2])  // nop
 690       && is_nop(instr[3])  // nop
 691       && is_nop(instr[4])  // nop
 692       && is_nop(instr[5])  // nop
 693       && is_nop(instr[6]); // nop
 694   }
 695 }
 696 
 697 // Set dest address of a bxx64_patchable instruction.
 698 void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) {
 699   ResourceMark rm;
 700   int code_size = MacroAssembler::bxx64_patchable_size;
 701   CodeBuffer buf(instruction_addr, code_size);
 702   MacroAssembler masm(&buf);
 703   masm.bxx64_patchable(dest, relocInfo::none, link);
 704   ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
 705 }
 706 
 707 // Get dest address of a bxx64_patchable instruction.
 708 address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) {
 709   if (is_bxx64_patchable_variant1_at(instruction_addr, link)) {
 710     return (address) (unsigned long) get_const(instruction_addr);
 711   } else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) {
 712     unsigned int* instr = (unsigned int*) instruction_addr;
 713     if (link) {
 714       const int instr_idx = 6; // bl is last
 715       int branchoffset = branch_destination(instr[instr_idx], 0);
 716       return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
 717     } else {
 718       const int instr_idx = 0; // b is first
 719       int branchoffset = branch_destination(instr[instr_idx], 0);
 720       return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
 721     }
 722   // Load dest relative to global toc.
 723   } else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) {
 724     return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord,
 725                                                                instruction_addr);
 726   } else {
 727     ShouldNotReachHere();
 728     return NULL;
 729   }
 730 }
 731 
 732 // Uses ordering which corresponds to ABI:
 733 //    _savegpr0_14:  std  r14,-144(r1)
 734 //    _savegpr0_15:  std  r15,-136(r1)
 735 //    _savegpr0_16:  std  r16,-128(r1)
 736 void MacroAssembler::save_nonvolatile_gprs(Register dst, int offset) {
 737   std(R14, offset, dst);   offset += 8;
 738   std(R15, offset, dst);   offset += 8;
 739   std(R16, offset, dst);   offset += 8;
 740   std(R17, offset, dst);   offset += 8;
 741   std(R18, offset, dst);   offset += 8;
 742   std(R19, offset, dst);   offset += 8;
 743   std(R20, offset, dst);   offset += 8;
 744   std(R21, offset, dst);   offset += 8;
 745   std(R22, offset, dst);   offset += 8;
 746   std(R23, offset, dst);   offset += 8;
 747   std(R24, offset, dst);   offset += 8;
 748   std(R25, offset, dst);   offset += 8;
 749   std(R26, offset, dst);   offset += 8;
 750   std(R27, offset, dst);   offset += 8;
 751   std(R28, offset, dst);   offset += 8;
 752   std(R29, offset, dst);   offset += 8;
 753   std(R30, offset, dst);   offset += 8;
 754   std(R31, offset, dst);   offset += 8;
 755 
 756   stfd(F14, offset, dst);   offset += 8;
 757   stfd(F15, offset, dst);   offset += 8;
 758   stfd(F16, offset, dst);   offset += 8;
 759   stfd(F17, offset, dst);   offset += 8;
 760   stfd(F18, offset, dst);   offset += 8;
 761   stfd(F19, offset, dst);   offset += 8;
 762   stfd(F20, offset, dst);   offset += 8;
 763   stfd(F21, offset, dst);   offset += 8;
 764   stfd(F22, offset, dst);   offset += 8;
 765   stfd(F23, offset, dst);   offset += 8;
 766   stfd(F24, offset, dst);   offset += 8;
 767   stfd(F25, offset, dst);   offset += 8;
 768   stfd(F26, offset, dst);   offset += 8;
 769   stfd(F27, offset, dst);   offset += 8;
 770   stfd(F28, offset, dst);   offset += 8;
 771   stfd(F29, offset, dst);   offset += 8;
 772   stfd(F30, offset, dst);   offset += 8;
 773   stfd(F31, offset, dst);
 774 }
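      // In total 18 GPRs (R14-R31) and 18 FPRs (F14-F31) are stored, 8 bytes each,
      // so the area written starting at 'offset' spans 36 * 8 = 288 bytes.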
 775 
 776 // Uses ordering which corresponds to ABI:
 777 //    _restgpr0_14:  ld   r14,-144(r1)
 778 //    _restgpr0_15:  ld   r15,-136(r1)
 779 //    _restgpr0_16:  ld   r16,-128(r1)
 780 void MacroAssembler::restore_nonvolatile_gprs(Register src, int offset) {
 781   ld(R14, offset, src);   offset += 8;
 782   ld(R15, offset, src);   offset += 8;
 783   ld(R16, offset, src);   offset += 8;
 784   ld(R17, offset, src);   offset += 8;
 785   ld(R18, offset, src);   offset += 8;
 786   ld(R19, offset, src);   offset += 8;
 787   ld(R20, offset, src);   offset += 8;
 788   ld(R21, offset, src);   offset += 8;
 789   ld(R22, offset, src);   offset += 8;
 790   ld(R23, offset, src);   offset += 8;
 791   ld(R24, offset, src);   offset += 8;
 792   ld(R25, offset, src);   offset += 8;
 793   ld(R26, offset, src);   offset += 8;
 794   ld(R27, offset, src);   offset += 8;
 795   ld(R28, offset, src);   offset += 8;
 796   ld(R29, offset, src);   offset += 8;
 797   ld(R30, offset, src);   offset += 8;
 798   ld(R31, offset, src);   offset += 8;
 799 
 800   // FP registers
 801   lfd(F14, offset, src);   offset += 8;
 802   lfd(F15, offset, src);   offset += 8;
 803   lfd(F16, offset, src);   offset += 8;
 804   lfd(F17, offset, src);   offset += 8;
 805   lfd(F18, offset, src);   offset += 8;
 806   lfd(F19, offset, src);   offset += 8;
 807   lfd(F20, offset, src);   offset += 8;
 808   lfd(F21, offset, src);   offset += 8;
 809   lfd(F22, offset, src);   offset += 8;
 810   lfd(F23, offset, src);   offset += 8;
 811   lfd(F24, offset, src);   offset += 8;
 812   lfd(F25, offset, src);   offset += 8;
 813   lfd(F26, offset, src);   offset += 8;
 814   lfd(F27, offset, src);   offset += 8;
 815   lfd(F28, offset, src);   offset += 8;
 816   lfd(F29, offset, src);   offset += 8;
 817   lfd(F30, offset, src);   offset += 8;
 818   lfd(F31, offset, src);
 819 }
 820 
 821 // For verify_oops.
 822 void MacroAssembler::save_volatile_gprs(Register dst, int offset) {
 823   std(R2,  offset, dst);   offset += 8;
 824   std(R3,  offset, dst);   offset += 8;
 825   std(R4,  offset, dst);   offset += 8;
 826   std(R5,  offset, dst);   offset += 8;
 827   std(R6,  offset, dst);   offset += 8;
 828   std(R7,  offset, dst);   offset += 8;
 829   std(R8,  offset, dst);   offset += 8;
 830   std(R9,  offset, dst);   offset += 8;
 831   std(R10, offset, dst);   offset += 8;
 832   std(R11, offset, dst);   offset += 8;
 833   std(R12, offset, dst);   offset += 8;
 834 
 835   stfd(F0, offset, dst);   offset += 8;
 836   stfd(F1, offset, dst);   offset += 8;
 837   stfd(F2, offset, dst);   offset += 8;
 838   stfd(F3, offset, dst);   offset += 8;
 839   stfd(F4, offset, dst);   offset += 8;
 840   stfd(F5, offset, dst);   offset += 8;
 841   stfd(F6, offset, dst);   offset += 8;
 842   stfd(F7, offset, dst);   offset += 8;
 843   stfd(F8, offset, dst);   offset += 8;
 844   stfd(F9, offset, dst);   offset += 8;
 845   stfd(F10, offset, dst);  offset += 8;
 846   stfd(F11, offset, dst);  offset += 8;
 847   stfd(F12, offset, dst);  offset += 8;
 848   stfd(F13, offset, dst);
 849 }
 850 
 851 // For verify_oops.
 852 void MacroAssembler::restore_volatile_gprs(Register src, int offset) {
 853   ld(R2,  offset, src);   offset += 8;
 854   ld(R3,  offset, src);   offset += 8;
 855   ld(R4,  offset, src);   offset += 8;
 856   ld(R5,  offset, src);   offset += 8;
 857   ld(R6,  offset, src);   offset += 8;
 858   ld(R7,  offset, src);   offset += 8;
 859   ld(R8,  offset, src);   offset += 8;
 860   ld(R9,  offset, src);   offset += 8;
 861   ld(R10, offset, src);   offset += 8;
 862   ld(R11, offset, src);   offset += 8;
 863   ld(R12, offset, src);   offset += 8;
 864 
 865   lfd(F0, offset, src);   offset += 8;
 866   lfd(F1, offset, src);   offset += 8;
 867   lfd(F2, offset, src);   offset += 8;
 868   lfd(F3, offset, src);   offset += 8;
 869   lfd(F4, offset, src);   offset += 8;
 870   lfd(F5, offset, src);   offset += 8;
 871   lfd(F6, offset, src);   offset += 8;
 872   lfd(F7, offset, src);   offset += 8;
 873   lfd(F8, offset, src);   offset += 8;
 874   lfd(F9, offset, src);   offset += 8;
 875   lfd(F10, offset, src);  offset += 8;
 876   lfd(F11, offset, src);  offset += 8;
 877   lfd(F12, offset, src);  offset += 8;
 878   lfd(F13, offset, src);
 879 }
 880 
 881 void MacroAssembler::save_LR_CR(Register tmp) {
 882   mfcr(tmp);
 883   std(tmp, _abi(cr), R1_SP);
 884   mflr(tmp);
 885   std(tmp, _abi(lr), R1_SP);
 886   // Tmp must contain lr on exit! (see return_addr and prolog in ppc64.ad)
 887 }
 888 
 889 void MacroAssembler::restore_LR_CR(Register tmp) {
 890   assert(tmp != R1_SP, "must be distinct");
 891   ld(tmp, _abi(lr), R1_SP);
 892   mtlr(tmp);
 893   ld(tmp, _abi(cr), R1_SP);
 894   mtcr(tmp);
 895 }
 896 
 897 address MacroAssembler::get_PC_trash_LR(Register result) {
 898   Label L;
 899   bl(L);
 900   bind(L);
 901   address lr_pc = pc();
 902   mflr(result);
 903   return lr_pc;
 904 }
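      // The bl targets the label bound immediately after it, so it only deposits the
      // address of the following instruction in LR; mflr then copies that address into
      // 'result'. LR is clobbered, hence the _trash_LR suffix.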
 905 
 906 void MacroAssembler::resize_frame(Register offset, Register tmp) {
 907 #ifdef ASSERT
 908   assert_different_registers(offset, tmp, R1_SP);
 909   andi_(tmp, offset, frame::alignment_in_bytes-1);
 910   asm_assert_eq("resize_frame: unaligned", 0x204);
 911 #endif
 912 
 913   // tmp <- *(SP)
 914   ld(tmp, _abi(callers_sp), R1_SP);
 915   // addr <- SP + offset;
 916   // *(addr) <- tmp;
 917   // SP <- addr
 918   stdux(tmp, R1_SP, offset);
 919 }
 920 
 921 void MacroAssembler::resize_frame(int offset, Register tmp) {
 922   assert(is_simm(offset, 16), "too big an offset");
 923   assert_different_registers(tmp, R1_SP);
 924   assert((offset & (frame::alignment_in_bytes-1))==0, "resize_frame: unaligned");
 925   // tmp <- *(SP)
 926   ld(tmp, _abi(callers_sp), R1_SP);
 927   // addr <- SP + offset;
 928   // *(addr) <- tmp;
 929   // SP <- addr
 930   stdu(tmp, offset, R1_SP);
 931 }
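      // stdu/stdux store the caller's SP at the new top of stack and update R1_SP in a
      // single instruction, so the back link and the stack pointer never get out of sync.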
 932 
 933 void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) {
 934   // (addr == tmp1) || (addr == tmp2) is allowed here!
 935   assert(tmp1 != tmp2, "must be distinct");
 936 
 937   // compute offset w.r.t. current stack pointer
  938   // tmp1 <- addr - SP (!)
 939   subf(tmp1, R1_SP, addr);
 940 
 941   // atomically update SP keeping back link.
 942   resize_frame(tmp1/* offset */, tmp2/* tmp */);
 943 }
 944 
 945 void MacroAssembler::push_frame(Register bytes, Register tmp) {
 946 #ifdef ASSERT
 947   assert(bytes != R0, "r0 not allowed here");
 948   andi_(R0, bytes, frame::alignment_in_bytes-1);
 949   asm_assert_eq("push_frame(Reg, Reg): unaligned", 0x203);
 950 #endif
 951   neg(tmp, bytes);
 952   stdux(R1_SP, R1_SP, tmp);
 953 }
 954 
 955 // Push a frame of size `bytes'.
 956 void MacroAssembler::push_frame(unsigned int bytes, Register tmp) {
 957   long offset = align_addr(bytes, frame::alignment_in_bytes);
 958   if (is_simm(-offset, 16)) {
 959     stdu(R1_SP, -offset, R1_SP);
 960   } else {
 961     load_const_optimized(tmp, -offset);
 962     stdux(R1_SP, R1_SP, tmp);
 963   }
 964 }
 965 
 966 // Push a frame of size `bytes' plus abi_reg_args on top.
 967 void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) {
 968   push_frame(bytes + frame::abi_reg_args_size, tmp);
 969 }
 970 
  971 // Set up a new C frame with a spill area for non-volatile GPRs and
 972 // additional space for local variables.
 973 void MacroAssembler::push_frame_reg_args_nonvolatiles(unsigned int bytes,
 974                                                       Register tmp) {
 975   push_frame(bytes + frame::abi_reg_args_size + frame::spill_nonvolatiles_size, tmp);
 976 }
 977 
 978 // Pop current C frame.
 979 void MacroAssembler::pop_frame() {
 980   ld(R1_SP, _abi(callers_sp), R1_SP);
 981 }
 982 
 983 #if defined(ABI_ELFv2)
 984 address MacroAssembler::branch_to(Register r_function_entry, bool and_link) {
 985   // TODO(asmundak): make sure the caller uses R12 as function descriptor
  986   // most of the time.
 987   if (R12 != r_function_entry) {
 988     mr(R12, r_function_entry);
 989   }
 990   mtctr(R12);
 991   // Do a call or a branch.
 992   if (and_link) {
 993     bctrl();
 994   } else {
 995     bctr();
 996   }
 997   _last_calls_return_pc = pc();
 998 
 999   return _last_calls_return_pc;
1000 }
1001 
1002 // Call a C function via a function descriptor and use full C
1003 // calling conventions. Updates and returns _last_calls_return_pc.
1004 address MacroAssembler::call_c(Register r_function_entry) {
1005   return branch_to(r_function_entry, /*and_link=*/true);
1006 }
1007 
1008 // For tail calls: only branch, don't link, so callee returns to caller of this function.
1009 address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) {
1010   return branch_to(r_function_entry, /*and_link=*/false);
1011 }
1012 
1013 address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) {
1014   load_const(R12, function_entry, R0);
1015   return branch_to(R12,  /*and_link=*/true);
1016 }
1017 
1018 #else
1019 // Generic version of a call to C function via a function descriptor
1020 // with variable support for C calling conventions (TOC, ENV, etc.).
1021 // Updates and returns _last_calls_return_pc.
1022 address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call,
1023                                   bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) {
1024   // we emit standard ptrgl glue code here
1025   assert((function_descriptor != R0), "function_descriptor cannot be R0");
1026 
1027   // retrieve necessary entries from the function descriptor
1028   ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor);
1029   mtctr(R0);
1030 
1031   if (load_toc_of_callee) {
1032     ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor);
1033   }
1034   if (load_env_of_callee) {
1035     ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor);
1036   } else if (load_toc_of_callee) {
1037     li(R11, 0);
1038   }
1039 
1040   // do a call or a branch
1041   if (and_link) {
1042     bctrl();
1043   } else {
1044     bctr();
1045   }
1046   _last_calls_return_pc = pc();
1047 
1048   return _last_calls_return_pc;
1049 }
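      // On the non-ELFv2 ABIs a function descriptor is the triple { entry, toc, env }:
      // the code above branches via CTR to 'entry' and optionally installs the callee's
      // TOC and environment pointer.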
1050 
1051 // Call a C function via a function descriptor and use full C calling
1052 // conventions.
1053 // We don't use the TOC in generated code, so there is no need to save
1054 // and restore its value.
1055 address MacroAssembler::call_c(Register fd) {
1056   return branch_to(fd, /*and_link=*/true,
1057                        /*save toc=*/false,
1058                        /*restore toc=*/false,
1059                        /*load toc=*/true,
1060                        /*load env=*/true);
1061 }
1062 
1063 address MacroAssembler::call_c_and_return_to_caller(Register fd) {
1064   return branch_to(fd, /*and_link=*/false,
1065                        /*save toc=*/false,
1066                        /*restore toc=*/false,
1067                        /*load toc=*/true,
1068                        /*load env=*/true);
1069 }
1070 
1071 address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) {
1072   if (rt != relocInfo::none) {
1073     // this call needs to be relocatable
1074     if (!ReoptimizeCallSequences
1075         || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
1076         || fd == NULL   // support code-size estimation
1077         || !fd->is_friend_function()
1078         || fd->entry() == NULL) {
1079       // it's not a friend function as defined by class FunctionDescriptor,
1080       // so do a full call-c here.
1081       load_const(R11, (address)fd, R0);
1082 
1083       bool has_env = (fd != NULL && fd->env() != NULL);
1084       return branch_to(R11, /*and_link=*/true,
1085                             /*save toc=*/false,
1086                             /*restore toc=*/false,
1087                             /*load toc=*/true,
1088                             /*load env=*/has_env);
1089     } else {
1090       // It's a friend function. Load the entry point and don't care about
1091       // toc and env. Use an optimizable call instruction, but ensure the
1092       // same code-size as in the case of a non-friend function.
1093       nop();
1094       nop();
1095       nop();
1096       bl64_patchable(fd->entry(), rt);
1097       _last_calls_return_pc = pc();
1098       return _last_calls_return_pc;
1099     }
1100   } else {
 1101     // This call does not need to be relocatable; do more aggressive
1102     // optimizations.
1103     if (!ReoptimizeCallSequences
1104       || !fd->is_friend_function()) {
1105       // It's not a friend function as defined by class FunctionDescriptor,
1106       // so do a full call-c here.
1107       load_const(R11, (address)fd, R0);
1108       return branch_to(R11, /*and_link=*/true,
1109                             /*save toc=*/false,
1110                             /*restore toc=*/false,
1111                             /*load toc=*/true,
1112                             /*load env=*/true);
1113     } else {
1114       // it's a friend function, load the entry point and don't care about
1115       // toc and env.
1116       address dest = fd->entry();
1117       if (is_within_range_of_b(dest, pc())) {
1118         bl(dest);
1119       } else {
1120         bl64_patchable(dest, rt);
1121       }
1122       _last_calls_return_pc = pc();
1123       return _last_calls_return_pc;
1124     }
1125   }
1126 }
1127 
1128 // Call a C function.  All constants needed reside in TOC.
1129 //
1130 // Read the address to call from the TOC.
1131 // Read env from TOC, if fd specifies an env.
1132 // Read new TOC from TOC.
1133 address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd,
1134                                          relocInfo::relocType rt, Register toc) {
1135   if (!ReoptimizeCallSequences
1136     || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
1137     || !fd->is_friend_function()) {
1138     // It's not a friend function as defined by class FunctionDescriptor,
1139     // so do a full call-c here.
1140     assert(fd->entry() != NULL, "function must be linked");
1141 
1142     AddressLiteral fd_entry(fd->entry());
1143     bool success = load_const_from_method_toc(R11, fd_entry, toc, /*fixed_size*/ true);
1144     mtctr(R11);
1145     if (fd->env() == NULL) {
1146       li(R11, 0);
1147       nop();
1148     } else {
1149       AddressLiteral fd_env(fd->env());
1150       success = success && load_const_from_method_toc(R11, fd_env, toc, /*fixed_size*/ true);
1151     }
1152     AddressLiteral fd_toc(fd->toc());
1153     // Set R2_TOC (load from toc)
1154     success = success && load_const_from_method_toc(R2_TOC, fd_toc, toc, /*fixed_size*/ true);
1155     bctrl();
1156     _last_calls_return_pc = pc();
1157     if (!success) { return NULL; }
1158   } else {
1159     // It's a friend function, load the entry point and don't care about
1160     // toc and env. Use an optimizable call instruction, but ensure the
1161     // same code-size as in the case of a non-friend function.
1162     nop();
1163     bl64_patchable(fd->entry(), rt);
1164     _last_calls_return_pc = pc();
1165   }
1166   return _last_calls_return_pc;
1167 }
1168 #endif // ABI_ELFv2
1169 
1170 void MacroAssembler::call_VM_base(Register oop_result,
1171                                   Register last_java_sp,
1172                                   address  entry_point,
1173                                   bool     check_exceptions) {
1174   BLOCK_COMMENT("call_VM {");
1175   // Determine last_java_sp register.
1176   if (!last_java_sp->is_valid()) {
1177     last_java_sp = R1_SP;
1178   }
1179   set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1);
1180 
1181   // ARG1 must hold thread address.
1182   mr(R3_ARG1, R16_thread);
1183 #if defined(ABI_ELFv2)
1184   address return_pc = call_c(entry_point, relocInfo::none);
1185 #else
1186   address return_pc = call_c((FunctionDescriptor*)entry_point, relocInfo::none);
1187 #endif
1188 
1189   reset_last_Java_frame();
1190 
1191   // Check for pending exceptions.
1192   if (check_exceptions) {
1193     // We don't check for exceptions here.
1194     ShouldNotReachHere();
1195   }
1196 
1197   // Get oop result if there is one and reset the value in the thread.
1198   if (oop_result->is_valid()) {
1199     get_vm_result(oop_result);
1200   }
1201 
1202   _last_calls_return_pc = return_pc;
1203   BLOCK_COMMENT("} call_VM");
1204 }
1205 
1206 void MacroAssembler::call_VM_leaf_base(address entry_point) {
1207   BLOCK_COMMENT("call_VM_leaf {");
1208 #if defined(ABI_ELFv2)
1209   call_c(entry_point, relocInfo::none);
1210 #else
1211   call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, entry_point), relocInfo::none);
1212 #endif
1213   BLOCK_COMMENT("} call_VM_leaf");
1214 }
1215 
1216 void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions) {
1217   call_VM_base(oop_result, noreg, entry_point, check_exceptions);
1218 }
1219 
1220 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1,
1221                              bool check_exceptions) {
1222   // R3_ARG1 is reserved for the thread.
1223   mr_if_needed(R4_ARG2, arg_1);
1224   call_VM(oop_result, entry_point, check_exceptions);
1225 }
1226 
1227 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2,
1228                              bool check_exceptions) {
1229   // R3_ARG1 is reserved for the thread
1230   mr_if_needed(R4_ARG2, arg_1);
1231   assert(arg_2 != R4_ARG2, "smashed argument");
1232   mr_if_needed(R5_ARG3, arg_2);
1233   call_VM(oop_result, entry_point, check_exceptions);
1234 }
1235 
1236 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3,
1237                              bool check_exceptions) {
1238   // R3_ARG1 is reserved for the thread
1239   mr_if_needed(R4_ARG2, arg_1);
1240   assert(arg_2 != R4_ARG2, "smashed argument");
1241   mr_if_needed(R5_ARG3, arg_2);
1242   mr_if_needed(R6_ARG4, arg_3);
1243   call_VM(oop_result, entry_point, check_exceptions);
1244 }
1245 
1246 void MacroAssembler::call_VM_leaf(address entry_point) {
1247   call_VM_leaf_base(entry_point);
1248 }
1249 
1250 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) {
1251   mr_if_needed(R3_ARG1, arg_1);
1252   call_VM_leaf(entry_point);
1253 }
1254 
1255 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) {
1256   mr_if_needed(R3_ARG1, arg_1);
1257   assert(arg_2 != R3_ARG1, "smashed argument");
1258   mr_if_needed(R4_ARG2, arg_2);
1259   call_VM_leaf(entry_point);
1260 }
1261 
1262 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) {
1263   mr_if_needed(R3_ARG1, arg_1);
1264   assert(arg_2 != R3_ARG1, "smashed argument");
1265   mr_if_needed(R4_ARG2, arg_2);
1266   assert(arg_3 != R3_ARG1 && arg_3 != R4_ARG2, "smashed argument");
1267   mr_if_needed(R5_ARG3, arg_3);
1268   call_VM_leaf(entry_point);
1269 }
1270 
1271 // Check whether instruction is a read access to the polling page
1272 // which was emitted by load_from_polling_page(..).
1273 bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext,
1274                                                address* polling_address_ptr) {
1275   if (!is_ld(instruction))
1276     return false; // It's not a ld. Fail.
1277 
1278   int rt = inv_rt_field(instruction);
1279   int ra = inv_ra_field(instruction);
1280   int ds = inv_ds_field(instruction);
1281   if (!(ds == 0 && ra != 0 && rt == 0)) {
1282     return false; // It's not a ld(r0, X, ra). Fail.
1283   }
1284 
1285   if (!ucontext) {
1286     // Set polling address.
1287     if (polling_address_ptr != NULL) {
1288       *polling_address_ptr = NULL;
1289     }
1290     return true; // No ucontext given. Can't check value of ra. Assume true.
1291   }
1292 
1293 #ifdef LINUX
1294   // Ucontext given. Check that register ra contains the address of
 1295   // the safepoint polling page.
1296   ucontext_t* uc = (ucontext_t*) ucontext;
1297   // Set polling address.
1298   address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds;
1299   if (polling_address_ptr != NULL) {
1300     *polling_address_ptr = addr;
1301   }
1302   return os::is_poll_address(addr);
1303 #else
1304   // Not on Linux, ucontext must be NULL.
1305   ShouldNotReachHere();
1306   return false;
1307 #endif
1308 }
1309 
1310 bool MacroAssembler::is_memory_serialization(int instruction, JavaThread* thread, void* ucontext) {
1311 #ifdef LINUX
1312   ucontext_t* uc = (ucontext_t*) ucontext;
1313 
1314   if (is_stwx(instruction) || is_stwux(instruction)) {
1315     int ra = inv_ra_field(instruction);
1316     int rb = inv_rb_field(instruction);
1317 
1318     // look up content of ra and rb in ucontext
 1319     address ra_val = (address)uc->uc_mcontext.regs->gpr[ra];
 1320     long rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
 1321     return os::is_memory_serialize_page(thread, ra_val + rb_val);
1322   } else if (is_stw(instruction) || is_stwu(instruction)) {
1323     int ra = inv_ra_field(instruction);
1324     int d1 = inv_d1_field(instruction);
1325 
1326     // look up content of ra in ucontext
 1327     address ra_val = (address)uc->uc_mcontext.regs->gpr[ra];
 1328     return os::is_memory_serialize_page(thread, ra_val + d1);
1329   } else {
1330     return false;
1331   }
1332 #else
1333   // workaround not needed on !LINUX :-)
1334   ShouldNotCallThis();
1335   return false;
1336 #endif
1337 }
1338 
1339 void MacroAssembler::bang_stack_with_offset(int offset) {
1340   // When increasing the stack, the old stack pointer will be written
 1341   // to the new top of stack according to the PPC64 ABI.
1342   // Therefore, stack banging is not necessary when increasing
1343   // the stack by <= os::vm_page_size() bytes.
1344   // When increasing the stack by a larger amount, this method is
1345   // called repeatedly to bang the intermediate pages.
1346 
1347   // Stack grows down, caller passes positive offset.
1348   assert(offset > 0, "must bang with positive offset");
1349 
1350   long stdoffset = -offset;
1351 
1352   if (is_simm(stdoffset, 16)) {
1353     // Signed 16 bit offset, a simple std is ok.
1354     if (UseLoadInstructionsForStackBangingPPC64) {
1355       ld(R0, (int)(signed short)stdoffset, R1_SP);
1356     } else {
 1357       std(R0, (int)(signed short)stdoffset, R1_SP);
1358     }
1359   } else if (is_simm(stdoffset, 31)) {
1360     const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset);
1361     const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset);
1362 
1363     Register tmp = R11;
1364     addis(tmp, R1_SP, hi);
1365     if (UseLoadInstructionsForStackBangingPPC64) {
1366       ld(R0,  lo, tmp);
1367     } else {
1368       std(R0, lo, tmp);
1369     }
1370   } else {
1371     ShouldNotReachHere();
1372   }
1373 }
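      // Example: banging with offset == 0x12340 gives stdoffset == -0x12340, which does not
      // fit into a signed 16-bit displacement, so the addis + std (or addis + ld) pair with
      // the largeoffset hi/lo split is used; offsets of at most 32 KB take the single-instruction path.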
1374 
1375 // If instruction is a stack bang of the form
1376 //    std    R0,    x(Ry),       (see bang_stack_with_offset())
1377 //    stdu   R1_SP, x(R1_SP),    (see push_frame(), resize_frame())
1378 // or stdux  R1_SP, Rx, R1_SP    (see push_frame(), resize_frame())
1379 // return the banged address. Otherwise, return 0.
1380 address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) {
1381 #ifdef LINUX
1382   ucontext_t* uc = (ucontext_t*) ucontext;
1383   int rs = inv_rs_field(instruction);
1384   int ra = inv_ra_field(instruction);
1385   if (   (is_ld(instruction)   && rs == 0 &&  UseLoadInstructionsForStackBangingPPC64)
1386       || (is_std(instruction)  && rs == 0 && !UseLoadInstructionsForStackBangingPPC64)
1387       || (is_stdu(instruction) && rs == 1)) {
1388     int ds = inv_ds_field(instruction);
1389     // return banged address
1390     return ds+(address)uc->uc_mcontext.regs->gpr[ra];
1391   } else if (is_stdux(instruction) && rs == 1) {
1392     int rb = inv_rb_field(instruction);
1393     address sp = (address)uc->uc_mcontext.regs->gpr[1];
1394     long rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
1395     return ra != 1 || rb_val >= 0 ? NULL         // not a stack bang
1396                                   : sp + rb_val; // banged address
1397   }
1398   return NULL; // not a stack bang
1399 #else
1400   // workaround not needed on !LINUX :-)
1401   ShouldNotCallThis();
1402   return NULL;
1403 #endif
1404 }
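      // Example: for an interrupted 'stdu R1_SP, -4096(R1_SP)' the banged address is
      // gpr[1] - 4096, i.e. the ds field added to the interrupted context's SP.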
1405 
1406 void MacroAssembler::reserved_stack_check(Register return_pc) {
1407   // Test if reserved zone needs to be enabled.
1408   Label no_reserved_zone_enabling;
1409 
1410   ld_ptr(R0, JavaThread::reserved_stack_activation_offset(), R16_thread);
1411   cmpld(CCR0, R1_SP, R0);
1412   blt_predict_taken(CCR0, no_reserved_zone_enabling);
1413 
1414   // Enable reserved zone again, throw stack overflow exception.
1415   push_frame_reg_args(0, R0);
1416   call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), R16_thread);
1417   pop_frame();
1418   mtlr(return_pc);
1419   load_const_optimized(R0, StubRoutines::throw_delayed_StackOverflowError_entry());
1420   mtctr(R0);
1421   bctr();
1422 
1423   should_not_reach_here();
1424 
1425   bind(no_reserved_zone_enabling);
1426 }
1427 
1428 void MacroAssembler::getandsetd(Register dest_current_value, Register exchange_value, Register addr_base,
1429                                 bool cmpxchgx_hint) {
1430   Label retry;
1431   bind(retry);
1432   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1433   stdcx_(exchange_value, addr_base);
1434   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1435     bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1436   } else {
1437     bne(                  CCR0, retry); // StXcx_ sets CCR0.
1438   }
1439 }
1440 
1441 void MacroAssembler::getandaddd(Register dest_current_value, Register inc_value, Register addr_base,
1442                                 Register tmp, bool cmpxchgx_hint) {
1443   Label retry;
1444   bind(retry);
1445   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1446   add(tmp, dest_current_value, inc_value);
1447   stdcx_(tmp, addr_base);
1448   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1449     bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1450   } else {
1451     bne(                  CCR0, retry); // StXcx_ sets CCR0.
1452   }
1453 }
1454 
1455 // Word/sub-word atomic helper functions
1456 
1457 // Temps and addr_base are killed if size < 4 and the processor does not support the respective instructions.
1458 // Only signed types are supported with size < 4.
1459 // Atomic add always kills tmp1.
1460 void MacroAssembler::atomic_get_and_modify_generic(Register dest_current_value, Register exchange_value,
1461                                                    Register addr_base, Register tmp1, Register tmp2, Register tmp3,
1462                                                    bool cmpxchgx_hint, bool is_add, int size) {
1463   // Sub-word instructions are available since Power 8.
1464   // For older processors, instruction_type != size holds, and we
1465   // emulate the sub-word instructions by constructing a 4-byte value
1466   // that leaves the other bytes unchanged.
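       //
       // Rough C-like sketch of the emulated (sub-word) case below, shown for a
       // little-endian byte access; names are descriptive only, not real helpers:
       //
       //   shift = (addr & 3) * 8;               // bit position of the byte in its word
       //   base  = addr & ~3;                    // aligned word address
       //   do {
       //     val32   = load_reserved_word(base);
       //     current = (val32 >> shift) & 0xff;  // old byte, sign-extended at the end
       //     newval  = is_add ? current + delta : exchange;
       //     merged  = val32 ^ (((current ^ newval) & 0xff) << shift); // splice new byte in
       //   } while (!store_conditional_word(base, merged));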
1467   const int instruction_type = VM_Version::has_lqarx() ? size : 4;
1468 
1469   Label retry;
1470   Register shift_amount = noreg,
1471            val32 = dest_current_value,
1472            modval = is_add ? tmp1 : exchange_value;
1473 
1474   if (instruction_type != size) {
1475     assert_different_registers(tmp1, tmp2, tmp3, dest_current_value, exchange_value, addr_base);
1476     modval = tmp1;
1477     shift_amount = tmp2;
1478     val32 = tmp3;
1479     // Need some preparation: compute shift amount, align address. Note: shorts must be 2-byte aligned.
1480 #ifdef VM_LITTLE_ENDIAN
1481     rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
1482     clrrdi(addr_base, addr_base, 2);
1483 #else
1484     xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
1485     clrrdi(addr_base, addr_base, 2);
1486     rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
1487 #endif
1488   }
1489 
1490   // atomic emulation loop
1491   bind(retry);
1492 
1493   switch (instruction_type) {
1494     case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
1495     case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
1496     case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
1497     default: ShouldNotReachHere();
1498   }
1499 
1500   if (instruction_type != size) {
1501     srw(dest_current_value, val32, shift_amount);
1502   }
1503 
1504   if (is_add) { add(modval, dest_current_value, exchange_value); }
1505 
1506   if (instruction_type != size) {
1507     // Transform exchange value such that the replacement can be done by one xor instruction.
1508     xorr(modval, dest_current_value, is_add ? modval : exchange_value);
1509     clrldi(modval, modval, (size == 1) ? 56 : 48);
1510     slw(modval, modval, shift_amount);
1511     xorr(modval, val32, modval);
1512   }
1513 
1514   switch (instruction_type) {
1515     case 4: stwcx_(modval, addr_base); break;
1516     case 2: sthcx_(modval, addr_base); break;
1517     case 1: stbcx_(modval, addr_base); break;
1518     default: ShouldNotReachHere();
1519   }
1520 
1521   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1522     bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1523   } else {
1524     bne(                  CCR0, retry); // StXcx_ sets CCR0.
1525   }
1526 
1527   // l?arx zero-extends, but Java wants byte/short values sign-extended.
1528   if (size == 1) {
1529     extsb(dest_current_value, dest_current_value);
1530   } else if (size == 2) {
1531     extsh(dest_current_value, dest_current_value);
1532   }
1533 }
1534 
1535 // Temps, addr_base and exchange_value are killed if size < 4 and the processor does not support the respective instructions.
1536 // Only signed types are supported with size < 4.
1537 void MacroAssembler::cmpxchg_loop_body(ConditionRegister flag, Register dest_current_value,
1538                                        Register compare_value, Register exchange_value,
1539                                        Register addr_base, Register tmp1, Register tmp2,
1540                                        Label &retry, Label &failed, bool cmpxchgx_hint, int size) {
1541   // Sub-word instructions are available since Power 8.
1542   // For older processors, instruction_type != size holds, and we
1543   // emulate the sub-word instructions by constructing a 4-byte value
1544   // that leaves the other bytes unchanged.
1545   const int instruction_type = VM_Version::has_lqarx() ? size : 4;
1546 
1547   Register shift_amount = noreg,
1548            val32 = dest_current_value,
1549            modval = exchange_value;
1550 
1551   if (instruction_type != size) {
1552     assert_different_registers(tmp1, tmp2, dest_current_value, compare_value, exchange_value, addr_base);
1553     shift_amount = tmp1;
1554     val32 = tmp2;
1555     modval = tmp2;
1556     // Need some preparation: compute shift amount, align address. Note: shorts must be 2-byte aligned.
1557 #ifdef VM_LITTLE_ENDIAN
1558     rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
1559     clrrdi(addr_base, addr_base, 2);
1560 #else
1561     xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
1562     clrrdi(addr_base, addr_base, 2);
1563     rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
1564 #endif
1565     // Transform exchange value such that the replacement can be done by one xor instruction.
1566     xorr(exchange_value, compare_value, exchange_value);
1567     clrldi(exchange_value, exchange_value, (size == 1) ? 56 : 48);
1568     slw(exchange_value, exchange_value, shift_amount);
1569   }
1570 
1571   // atomic emulation loop
1572   bind(retry);
1573 
1574   switch (instruction_type) {
1575     case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
1576     case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
1577     case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
1578     default: ShouldNotReachHere();
1579   }
1580 
1581   if (instruction_type != size) {
1582     srw(dest_current_value, val32, shift_amount);
1583   }
1584   if (size == 1) {
1585     extsb(dest_current_value, dest_current_value);
1586   } else if (size == 2) {
1587     extsh(dest_current_value, dest_current_value);
1588   }
1589 
1590   cmpw(flag, dest_current_value, compare_value);
1591   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1592     bne_predict_not_taken(flag, failed);
1593   } else {
1594     bne(                  flag, failed);
1595   }
1596   // branch to failed => (flag == ne), (dest_current_value != compare_value)
1597   // fall through    => (flag == eq), (dest_current_value == compare_value)
1598 
1599   if (instruction_type != size) {
1600     xorr(modval, val32, exchange_value);
1601   }
1602 
1603   switch (instruction_type) {
1604     case 4: stwcx_(modval, addr_base); break;
1605     case 2: sthcx_(modval, addr_base); break;
1606     case 1: stbcx_(modval, addr_base); break;
1607     default: ShouldNotReachHere();
1608   }
1609 }
1610 
1611 // CmpxchgX sets condition register to cmpX(current, compare).
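     //
     // Rough structure of the code emitted below (comment-only sketch; the real code
     // also handles sub-word emulation and the int_flag_success bookkeeping):
     //
     //   if (contention_hint)       { if (*addr_base != compare_value) goto failed; }
     //   if (semantics & MemBarRel) release();
     //   retry: current = load_reserved(addr_base);
     //          if (current != compare_value) goto failed;
     //          if (!store_conditional(addr_base, exchange_value)) goto (weak ? failed : retry);
     //   if (semantics & MemBarFenceAfter) fence(); else if (semantics & MemBarAcq) isync();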
1612 void MacroAssembler::cmpxchg_generic(ConditionRegister flag, Register dest_current_value,
1613                                      Register compare_value, Register exchange_value,
1614                                      Register addr_base, Register tmp1, Register tmp2,
1615                                      int semantics, bool cmpxchgx_hint,
1616                                      Register int_flag_success, bool contention_hint, bool weak, int size) {
1617   Label retry;
1618   Label failed;
1619   Label done;
1620 
1621   // Save one branch if result is returned via register and
1622   // result register is different from the other ones.
1623   bool use_result_reg    = (int_flag_success != noreg);
1624   bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value &&
1625                             int_flag_success != exchange_value && int_flag_success != addr_base &&
1626                             int_flag_success != tmp1 && int_flag_success != tmp2);
1627   assert(!weak || flag == CCR0, "weak only supported with CCR0");
1628   assert(size == 1 || size == 2 || size == 4, "unsupported");
1629 
1630   if (use_result_reg && preset_result_reg) {
1631     li(int_flag_success, 0); // preset (assume cas failed)
1632   }
1633 
1634   // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1635   if (contention_hint) { // Don't try to reserve if cmp fails.
1636     switch (size) {
1637       case 1: lbz(dest_current_value, 0, addr_base); extsb(dest_current_value, dest_current_value); break;
1638       case 2: lha(dest_current_value, 0, addr_base); break;
1639       case 4: lwz(dest_current_value, 0, addr_base); break;
1640       default: ShouldNotReachHere();
1641     }
1642     cmpw(flag, dest_current_value, compare_value);
1643     bne(flag, failed);
1644   }
1645 
1646   // release/fence semantics
1647   if (semantics & MemBarRel) {
1648     release();
1649   }
1650 
1651   cmpxchg_loop_body(flag, dest_current_value, compare_value, exchange_value, addr_base, tmp1, tmp2,
1652                     retry, failed, cmpxchgx_hint, size);
1653   if (!weak || use_result_reg) {
1654     if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1655       bne_predict_not_taken(CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
1656     } else {
1657       bne(                  CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
1658     }
1659   }
1660   // fall through    => (flag == eq), (dest_current_value == compare_value), (swapped)
1661 
1662   // Result in register (must do this at the end because int_flag_success can be the
1663   // same register as one above).
1664   if (use_result_reg) {
1665     li(int_flag_success, 1);
1666   }
1667 
1668   if (semantics & MemBarFenceAfter) {
1669     fence();
1670   } else if (semantics & MemBarAcq) {
1671     isync();
1672   }
1673 
1674   if (use_result_reg && !preset_result_reg) {
1675     b(done);
1676   }
1677 
1678   bind(failed);
1679   if (use_result_reg && !preset_result_reg) {
1680     li(int_flag_success, 0);
1681   }
1682 
1683   bind(done);
1684   // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1685   // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1686 }
1687 
1688 // Performs an atomic compare-exchange:
1689 //   if (compare_value == *addr_base)
1690 //     *addr_base = exchange_value
1691 //     int_flag_success = 1;
1692 //   else
1693 //     int_flag_success = 0;
1694 //
1695 // ConditionRegister flag       = cmp(compare_value, *addr_base)
1696 // Register dest_current_value  = *addr_base
1697 // Register compare_value       Used to compare with value in memory
1698 // Register exchange_value      Written to memory if compare_value == *addr_base
1699 // Register addr_base           The memory location to compareXChange
1700 // Register int_flag_success    Set to 1 if exchange_value was written to *addr_base
1701 //
1702 // To avoid the costly compare-exchange, the value can be tested beforehand (contention_hint).
1703 // Several special cases exist to avoid generating unnecessary code.
1704 //
1705 void MacroAssembler::cmpxchgd(ConditionRegister flag,
1706                               Register dest_current_value, RegisterOrConstant compare_value, Register exchange_value,
1707                               Register addr_base, int semantics, bool cmpxchgx_hint,
1708                               Register int_flag_success, Label* failed_ext, bool contention_hint, bool weak) {
1709   Label retry;
1710   Label failed_int;
1711   Label& failed = (failed_ext != NULL) ? *failed_ext : failed_int;
1712   Label done;
1713 
1714   // Save one branch if result is returned via register and result register is different from the other ones.
1715   bool use_result_reg    = (int_flag_success!=noreg);
1716   bool preset_result_reg = (int_flag_success!=dest_current_value && int_flag_success!=compare_value.register_or_noreg() &&
1717                             int_flag_success!=exchange_value && int_flag_success!=addr_base);
1718   assert(!weak || flag == CCR0, "weak only supported with CCR0");
1719   assert(int_flag_success == noreg || failed_ext == NULL, "cannot have both");
1720 
1721   if (use_result_reg && preset_result_reg) {
1722     li(int_flag_success, 0); // preset (assume cas failed)
1723   }
1724 
1725   // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1726   if (contention_hint) { // Don't try to reserve if cmp fails.
1727     ld(dest_current_value, 0, addr_base);
1728     cmpd(flag, compare_value, dest_current_value);
1729     bne(flag, failed);
1730   }
1731 
1732   // release/fence semantics
1733   if (semantics & MemBarRel) {
1734     release();
1735   }
1736 
1737   // atomic emulation loop
1738   bind(retry);
1739 
1740   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1741   cmpd(flag, compare_value, dest_current_value);
1742   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1743     bne_predict_not_taken(flag, failed);
1744   } else {
1745     bne(                  flag, failed);
1746   }
1747 
1748   stdcx_(exchange_value, addr_base);
1749   if (!weak || use_result_reg || failed_ext) {
1750     if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1751       bne_predict_not_taken(CCR0, weak ? failed : retry); // stXcx_ sets CCR0
1752     } else {
1753       bne(                  CCR0, weak ? failed : retry); // stXcx_ sets CCR0
1754     }
1755   }
1756 
1757   // result in register (must do this at the end because int_flag_success can be the same register as one above)
1758   if (use_result_reg) {
1759     li(int_flag_success, 1);
1760   }
1761 
1762   if (semantics & MemBarFenceAfter) {
1763     fence();
1764   } else if (semantics & MemBarAcq) {
1765     isync();
1766   }
1767 
1768   if (use_result_reg && !preset_result_reg) {
1769     b(done);
1770   }
1771 
1772   bind(failed_int);
1773   if (use_result_reg && !preset_result_reg) {
1774     li(int_flag_success, 0);
1775   }
1776 
1777   bind(done);
1778   // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1779   // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1780 }
1781 
1782 // Look up the method for a megamorphic invokeinterface call.
1783 // The target method is determined by <intf_klass, itable_index>.
1784 // The receiver klass is in recv_klass.
1785 // On success, the result will be in method_result, and execution falls through.
1786 // On failure, execution transfers to the given label.
1787 void MacroAssembler::lookup_interface_method(Register recv_klass,
1788                                              Register intf_klass,
1789                                              RegisterOrConstant itable_index,
1790                                              Register method_result,
1791                                              Register scan_temp,
1792                                              Register temp2,
1793                                              Label& L_no_such_interface,
1794                                              bool return_method) {
1795   assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);
1796 
1797   // Compute start of first itableOffsetEntry (which is at the end of the vtable).
1798   int vtable_base = in_bytes(Klass::vtable_start_offset());
1799   int itentry_off = itableMethodEntry::method_offset_in_bytes();
1800   int logMEsize   = exact_log2(itableMethodEntry::size() * wordSize);
1801   int scan_step   = itableOffsetEntry::size() * wordSize;
1802   int log_vte_size= exact_log2(vtableEntry::size_in_bytes());
1803 
1804   lwz(scan_temp, in_bytes(Klass::vtable_length_offset()), recv_klass);
1805   // %%% We should store the aligned, prescaled offset in the klassoop.
1806   // Then the next several instructions would fold away.
1807 
1808   sldi(scan_temp, scan_temp, log_vte_size);
1809   addi(scan_temp, scan_temp, vtable_base);
1810   add(scan_temp, recv_klass, scan_temp);
1811 
1812   // Adjust recv_klass by scaled itable_index, so we can free itable_index.
1813   if (return_method) {
1814     if (itable_index.is_register()) {
1815       Register itable_offset = itable_index.as_register();
1816       sldi(method_result, itable_offset, logMEsize);
1817       if (itentry_off) { addi(method_result, method_result, itentry_off); }
1818       add(method_result, method_result, recv_klass);
1819     } else {
1820       long itable_offset = (long)itable_index.as_constant();
1821       // static address, no relocation
1822       add_const_optimized(method_result, recv_klass, (itable_offset << logMEsize) + itentry_off, temp2);
1823     }
1824   }
1825 
1826   // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
1827   //   if (scan->interface() == intf) {
1828   //     result = (klass + scan->offset() + itable_index);
1829   //   }
1830   // }
1831   Label search, found_method;
1832 
1833   for (int peel = 1; peel >= 0; peel--) {
1834     // %%%% Could load both offset and interface in one ldx, if they were
1835     // in the opposite order. This would save a load.
1836     ld(temp2, itableOffsetEntry::interface_offset_in_bytes(), scan_temp);
1837 
1838     // Check that this entry is non-null. A null entry means that
1839     // the receiver class doesn't implement the interface, and wasn't the
1840     // same as when the caller was compiled.
1841     cmpd(CCR0, temp2, intf_klass);
1842 
1843     if (peel) {
1844       beq(CCR0, found_method);
1845     } else {
1846       bne(CCR0, search);
1847       // (invert the test to fall through to found_method...)
1848     }
1849 
1850     if (!peel) break;
1851 
1852     bind(search);
1853 
1854     cmpdi(CCR0, temp2, 0);
1855     beq(CCR0, L_no_such_interface);
1856     addi(scan_temp, scan_temp, scan_step);
1857   }
1858 
1859   bind(found_method);
1860 
1861   // Got a hit.
1862   if (return_method) {
1863     int ito_offset = itableOffsetEntry::offset_offset_in_bytes();
1864     lwz(scan_temp, ito_offset, scan_temp);
1865     ldx(method_result, scan_temp, method_result);
1866   }
1867 }
1868 
1869 // virtual method calling
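     // In effect (comment-only sketch, accessor names illustrative): loads the
     // vtableEntry at recv_klass + vtable_start_offset + vtable_index * wordSize
     // and places its method pointer in R19_method.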
1870 void MacroAssembler::lookup_virtual_method(Register recv_klass,
1871                                            RegisterOrConstant vtable_index,
1872                                            Register method_result) {
1873 
1874   assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg());
1875 
1876   const int base = in_bytes(Klass::vtable_start_offset());
1877   assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
1878 
1879   if (vtable_index.is_register()) {
1880     sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord);
1881     add(recv_klass, vtable_index.as_register(), recv_klass);
1882   } else {
1883     addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord);
1884   }
1885   ld(R19_method, base + vtableEntry::method_offset_in_bytes(), recv_klass);
1886 }
1887 
1888 /////////////////////////////////////////// subtype checking ////////////////////////////////////////////
1889 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
1890                                                    Register super_klass,
1891                                                    Register temp1_reg,
1892                                                    Register temp2_reg,
1893                                                    Label* L_success,
1894                                                    Label* L_failure,
1895                                                    Label* L_slow_path,
1896                                                    RegisterOrConstant super_check_offset) {
1897 
1898   const Register check_cache_offset = temp1_reg;
1899   const Register cached_super       = temp2_reg;
1900 
1901   assert_different_registers(sub_klass, super_klass, check_cache_offset, cached_super);
1902 
1903   int sco_offset = in_bytes(Klass::super_check_offset_offset());
1904   int sc_offset  = in_bytes(Klass::secondary_super_cache_offset());
1905 
1906   bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
1907   bool need_slow_path = (must_load_sco || super_check_offset.constant_or_zero() == sco_offset);
1908 
1909   Label L_fallthrough;
1910   int label_nulls = 0;
1911   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
1912   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
1913   if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
1914   assert(label_nulls <= 1 ||
1915          (L_slow_path == &L_fallthrough && label_nulls <= 2 && !need_slow_path),
1916          "at most one NULL in the batch, usually");
1917 
1918   // If the pointers are equal, we are done (e.g., String[] elements).
1919   // This self-check enables sharing of secondary supertype arrays among
1920   // non-primary types such as array-of-interface. Otherwise, each such
1921   // type would need its own customized SSA.
1922   // We move this check to the front of the fast path because many
1923   // type checks are in fact trivially successful in this manner,
1924   // so we get a nicely predicted branch right at the start of the check.
1925   cmpd(CCR0, sub_klass, super_klass);
1926   beq(CCR0, *L_success);
1927 
1928   // Check the supertype display:
1929   if (must_load_sco) {
1930     // The super check offset is always positive...
1931     lwz(check_cache_offset, sco_offset, super_klass);
1932     super_check_offset = RegisterOrConstant(check_cache_offset);
1933     // super_check_offset is register.
1934     assert_different_registers(sub_klass, super_klass, cached_super, super_check_offset.as_register());
1935   }
1936   // The loaded value is the offset from KlassOopDesc.
1937 
1938   ld(cached_super, super_check_offset, sub_klass);
1939   cmpd(CCR0, cached_super, super_klass);
1940 
1941   // This check has worked decisively for primary supers.
1942   // Secondary supers are sought in the super_cache ('super_cache_addr').
1943   // (Secondary supers are interfaces and very deeply nested subtypes.)
1944   // This works in the same check above because of a tricky aliasing
1945   // between the super_cache and the primary super display elements.
1946   // (The 'super_check_addr' can address either, as the case requires.)
1947   // Note that the cache is updated below if it does not help us find
1948   // what we need immediately.
1949   // So if it was a primary super, we can just fail immediately.
1950   // Otherwise, it's the slow path for us (no success at this point).
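       //
       // C-like sketch of the decision made below (illustration only; field names
       // are descriptive, not the actual VM accessors):
       //
       //   offset = super_klass->super_check_offset;                   // may be a known constant
       //   if (*(Klass**)((address)sub_klass + offset) == super_klass) goto success;
       //   if (offset != secondary_super_cache_offset) goto failure;   // decisive for primary supers
       //   goto slow_path;                                             // search the secondary supers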
1951 
1952 #define FINAL_JUMP(label) if (&(label) != &L_fallthrough) { b(label); }
1953 
1954   if (super_check_offset.is_register()) {
1955     beq(CCR0, *L_success);
1956     cmpwi(CCR0, super_check_offset.as_register(), sc_offset);
1957     if (L_failure == &L_fallthrough) {
1958       beq(CCR0, *L_slow_path);
1959     } else {
1960       bne(CCR0, *L_failure);
1961       FINAL_JUMP(*L_slow_path);
1962     }
1963   } else {
1964     if (super_check_offset.as_constant() == sc_offset) {
1965       // Need a slow path; fast failure is impossible.
1966       if (L_slow_path == &L_fallthrough) {
1967         beq(CCR0, *L_success);
1968       } else {
1969         bne(CCR0, *L_slow_path);
1970         FINAL_JUMP(*L_success);
1971       }
1972     } else {
1973       // No slow path; it's a fast decision.
1974       if (L_failure == &L_fallthrough) {
1975         beq(CCR0, *L_success);
1976       } else {
1977         bne(CCR0, *L_failure);
1978         FINAL_JUMP(*L_success);
1979       }
1980     }
1981   }
1982 
1983   bind(L_fallthrough);
1984 #undef FINAL_JUMP
1985 }
1986 
1987 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
1988                                                    Register super_klass,
1989                                                    Register temp1_reg,
1990                                                    Register temp2_reg,
1991                                                    Label* L_success,
1992                                                    Register result_reg) {
1993   const Register array_ptr = temp1_reg; // current value from cache array
1994   const Register temp      = temp2_reg;
1995 
1996   assert_different_registers(sub_klass, super_klass, array_ptr, temp);
1997 
1998   int source_offset = in_bytes(Klass::secondary_supers_offset());
1999   int target_offset = in_bytes(Klass::secondary_super_cache_offset());
2000 
2001   int length_offset = Array<Klass*>::length_offset_in_bytes();
2002   int base_offset   = Array<Klass*>::base_offset_in_bytes();
2003 
2004   Label hit, loop, failure, fallthru;
2005 
2006   ld(array_ptr, source_offset, sub_klass);
2007 
2008   // TODO: PPC port: assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated.");
2009   lwz(temp, length_offset, array_ptr);
2010   cmpwi(CCR0, temp, 0);
2011   beq(CCR0, result_reg!=noreg ? failure : fallthru); // length 0
2012 
2013   mtctr(temp); // load ctr
2014 
2015   bind(loop);
2016   // Klass pointers in the table are no longer compressed.
2017   ld(temp, base_offset, array_ptr);
2018   cmpd(CCR0, temp, super_klass);
2019   beq(CCR0, hit);
2020   addi(array_ptr, array_ptr, BytesPerWord);
2021   bdnz(loop);
2022 
2023   bind(failure);
2024   if (result_reg!=noreg) li(result_reg, 1); // load non-zero result (indicates a miss)
2025   b(fallthru);
2026 
2027   bind(hit);
2028   std(super_klass, target_offset, sub_klass); // save result to cache
2029   if (result_reg != noreg) { li(result_reg, 0); } // load zero result (indicates a hit)
2030   if (L_success != NULL) { b(*L_success); }
2031   else if (result_reg == noreg) { blr(); } // return with CR0.eq if neither label nor result reg provided
2032 
2033   bind(fallthru);
2034 }
2035 
2036 // Try fast path, then go to slow one if not successful
2037 void MacroAssembler::check_klass_subtype(Register sub_klass,
2038                          Register super_klass,
2039                          Register temp1_reg,
2040                          Register temp2_reg,
2041                          Label& L_success) {
2042   Label L_failure;
2043   check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success, &L_failure);
2044   check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success);
2045   bind(L_failure); // Fallthru if not successful.
2046 }
2047 
2048 void MacroAssembler::check_method_handle_type(Register mtype_reg, Register mh_reg,
2049                                               Register temp_reg,
2050                                               Label& wrong_method_type) {
2051   assert_different_registers(mtype_reg, mh_reg, temp_reg);
2052   // Compare method type against that of the receiver.
2053   load_heap_oop_not_null(temp_reg, delayed_value(java_lang_invoke_MethodHandle::type_offset_in_bytes, temp_reg), mh_reg);
2054   cmpd(CCR0, temp_reg, mtype_reg);
2055   bne(CCR0, wrong_method_type);
2056 }
2057 
2058 RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot,
2059                                                    Register temp_reg,
2060                                                    int extra_slot_offset) {
2061   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
2062   int stackElementSize = Interpreter::stackElementSize;
2063   int offset = extra_slot_offset * stackElementSize;
2064   if (arg_slot.is_constant()) {
2065     offset += arg_slot.as_constant() * stackElementSize;
2066     return offset;
2067   } else {
2068     assert(temp_reg != noreg, "must specify");
2069     sldi(temp_reg, arg_slot.as_register(), exact_log2(stackElementSize));
2070     if (offset != 0)
2071       addi(temp_reg, temp_reg, offset);
2072     return temp_reg;
2073   }
2074 }
2075 
2076 // Supports temp2_reg = R0.
2077 void MacroAssembler::biased_locking_enter(ConditionRegister cr_reg, Register obj_reg,
2078                                           Register mark_reg, Register temp_reg,
2079                                           Register temp2_reg, Label& done, Label* slow_case) {
2080   assert(UseBiasedLocking, "why call this otherwise?");
2081 
2082 #ifdef ASSERT
2083   assert_different_registers(obj_reg, mark_reg, temp_reg, temp2_reg);
2084 #endif
2085 
2086   Label cas_label;
2087 
2088   // Branch to done if fast path fails and no slow_case provided.
2089   Label *slow_case_int = (slow_case != NULL) ? slow_case : &done;
2090 
2091   // Biased locking
2092   // See whether the lock is currently biased toward our thread and
2093   // whether the epoch is still valid
2094   // Note that the runtime guarantees sufficient alignment of JavaThread
2095   // pointers to allow age to be placed into low bits
2096   assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits,
2097          "biased locking makes assumptions about bit layout");
2098 
2099   if (PrintBiasedLockingStatistics) {
2100     load_const(temp2_reg, (address) BiasedLocking::total_entry_count_addr(), temp_reg);
2101     lwzx(temp_reg, temp2_reg);
2102     addi(temp_reg, temp_reg, 1);
2103     stwx(temp_reg, temp2_reg);
2104   }
2105 
2106   andi(temp_reg, mark_reg, markOopDesc::biased_lock_mask_in_place);
2107   cmpwi(cr_reg, temp_reg, markOopDesc::biased_lock_pattern);
2108   bne(cr_reg, cas_label);
2109 
2110   load_klass(temp_reg, obj_reg);
2111 
2112   load_const_optimized(temp2_reg, ~((int) markOopDesc::age_mask_in_place));
2113   ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
2114   orr(temp_reg, R16_thread, temp_reg);
2115   xorr(temp_reg, mark_reg, temp_reg);
2116   andr(temp_reg, temp_reg, temp2_reg);
2117   cmpdi(cr_reg, temp_reg, 0);
2118   if (PrintBiasedLockingStatistics) {
2119     Label l;
2120     bne(cr_reg, l);
2121     load_const(temp2_reg, (address) BiasedLocking::biased_lock_entry_count_addr());
2122     lwzx(mark_reg, temp2_reg);
2123     addi(mark_reg, mark_reg, 1);
2124     stwx(mark_reg, temp2_reg);
2125     // restore mark_reg
2126     ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg);
2127     bind(l);
2128   }
2129   beq(cr_reg, done);
2130 
2131   Label try_revoke_bias;
2132   Label try_rebias;
2133 
2134   // At this point we know that the header has the bias pattern and
2135   // that we are not the bias owner in the current epoch. We need to
2136   // figure out more details about the state of the header in order to
2137   // know what operations can be legally performed on the object's
2138   // header.
2139 
2140   // If the low three bits in the xor result aren't clear, that means
2141   // the prototype header is no longer biased and we have to revoke
2142   // the bias on this object.
2143   andi(temp2_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
2144   cmpwi(cr_reg, temp2_reg, 0);
2145   bne(cr_reg, try_revoke_bias);
2146 
2147   // Biasing is still enabled for this data type. See whether the
2148   // epoch of the current bias is still valid, meaning that the epoch
2149   // bits of the mark word are equal to the epoch bits of the
2150   // prototype header. (Note that the prototype header's epoch bits
2151   // only change at a safepoint.) If not, attempt to rebias the object
2152   // toward the current thread. Note that we must be absolutely sure
2153   // that the current epoch is invalid in order to do this because
2154   // otherwise the manipulations it performs on the mark word are
2155   // illegal.
2156 
2157   int shift_amount = 64 - markOopDesc::epoch_shift;
2158   // rotate epoch bits to right (little) end and set other bits to 0
2159   // [ big part | epoch | little part ] -> [ 0..0 | epoch ]
2160   rldicl_(temp2_reg, temp_reg, shift_amount, 64 - markOopDesc::epoch_bits);
2161   // branch if epoch bits are != 0, i.e. they differ, because the epoch has been incremented
2162   bne(CCR0, try_rebias);
2163 
2164   // The epoch of the current bias is still valid but we know nothing
2165   // about the owner; it might be set or it might be clear. Try to
2166   // acquire the bias of the object using an atomic operation. If this
2167   // fails we will go in to the runtime to revoke the object's bias.
2168   // Note that we first construct the presumed unbiased header so we
2169   // don't accidentally blow away another thread's valid bias.
2170   andi(mark_reg, mark_reg, (markOopDesc::biased_lock_mask_in_place |
2171                                 markOopDesc::age_mask_in_place |
2172                                 markOopDesc::epoch_mask_in_place));
2173   orr(temp_reg, R16_thread, mark_reg);
2174 
2175   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2176 
2177   // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg).
2178   cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
2179            /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
2180            /*where=*/obj_reg,
2181            MacroAssembler::MemBarAcq,
2182            MacroAssembler::cmpxchgx_hint_acquire_lock(),
2183            noreg, slow_case_int); // bail out if failed
2184 
2185   // If the biasing toward our thread failed, this means that
2186   // another thread succeeded in biasing it toward itself and we
2187   // need to revoke that bias. The revocation will occur in the
2188   // interpreter runtime in the slow case.
2189   if (PrintBiasedLockingStatistics) {
2190     load_const(temp2_reg, (address) BiasedLocking::anonymously_biased_lock_entry_count_addr(), temp_reg);
2191     lwzx(temp_reg, temp2_reg);
2192     addi(temp_reg, temp_reg, 1);
2193     stwx(temp_reg, temp2_reg);
2194   }
2195   b(done);
2196 
2197   bind(try_rebias);
2198   // At this point we know the epoch has expired, meaning that the
2199   // current "bias owner", if any, is actually invalid. Under these
2200   // circumstances _only_, we are allowed to use the current header's
2201   // value as the comparison value when doing the cas to acquire the
2202   // bias in the current epoch. In other words, we allow transfer of
2203   // the bias from one thread to another directly in this situation.
2204   load_klass(temp_reg, obj_reg);
2205   andi(temp2_reg, mark_reg, markOopDesc::age_mask_in_place);
2206   orr(temp2_reg, R16_thread, temp2_reg);
2207   ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
2208   orr(temp_reg, temp2_reg, temp_reg);
2209 
2210   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2211 
2212   cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
2213                  /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
2214                  /*where=*/obj_reg,
2215                  MacroAssembler::MemBarAcq,
2216                  MacroAssembler::cmpxchgx_hint_acquire_lock(),
2217                  noreg, slow_case_int); // bail out if failed
2218 
2219   // If the biasing toward our thread failed, this means that
2220   // another thread succeeded in biasing it toward itself and we
2221   // need to revoke that bias. The revocation will occur in the
2222   // interpreter runtime in the slow case.
2223   if (PrintBiasedLockingStatistics) {
2224     load_const(temp2_reg, (address) BiasedLocking::rebiased_lock_entry_count_addr(), temp_reg);
2225     lwzx(temp_reg, temp2_reg);
2226     addi(temp_reg, temp_reg, 1);
2227     stwx(temp_reg, temp2_reg);
2228   }
2229   b(done);
2230 
2231   bind(try_revoke_bias);
2232   // The prototype mark in the klass doesn't have the bias bit set any
2233   // more, indicating that objects of this data type are not supposed
2234   // to be biased any more. We are going to try to reset the mark of
2235   // this object to the prototype value and fall through to the
2236   // CAS-based locking scheme. Note that if our CAS fails, it means
2237   // that another thread raced us for the privilege of revoking the
2238   // bias of this particular object, so it's okay to continue in the
2239   // normal locking code.
2240   load_klass(temp_reg, obj_reg);
2241   ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
2242   andi(temp2_reg, mark_reg, markOopDesc::age_mask_in_place);
2243   orr(temp_reg, temp_reg, temp2_reg);
2244 
2245   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2246 
2247   // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg).
2248   cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
2249                  /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
2250                  /*where=*/obj_reg,
2251                  MacroAssembler::MemBarAcq,
2252                  MacroAssembler::cmpxchgx_hint_acquire_lock());
2253 
2254   // reload markOop in mark_reg before continuing with lightweight locking
2255   ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg);
2256 
2257   // Fall through to the normal CAS-based lock, because no matter what
2258   // the result of the above CAS, some thread must have succeeded in
2259   // removing the bias bit from the object's header.
2260   if (PrintBiasedLockingStatistics) {
2261     Label l;
2262     bne(cr_reg, l);
2263     load_const(temp2_reg, (address) BiasedLocking::revoked_lock_entry_count_addr(), temp_reg);
2264     lwzx(temp_reg, temp2_reg);
2265     addi(temp_reg, temp_reg, 1);
2266     stwx(temp_reg, temp2_reg);
2267     bind(l);
2268   }
2269 
2270   bind(cas_label);
2271 }
2272 
2273 void MacroAssembler::biased_locking_exit (ConditionRegister cr_reg, Register mark_addr, Register temp_reg, Label& done) {
2274   // Check for biased locking unlock case, which is a no-op
2275   // Note: we do not have to check the thread ID for two reasons.
2276   // First, the interpreter checks for IllegalMonitorStateException at
2277   // a higher level. Second, if the bias was revoked while we held the
2278   // lock, the object could not be rebiased toward another thread, so
2279   // the bias bit would be clear.
2280 
2281   ld(temp_reg, 0, mark_addr);
2282   andi(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
2283 
2284   cmpwi(cr_reg, temp_reg, markOopDesc::biased_lock_pattern);
2285   beq(cr_reg, done);
2286 }
2287 
2288 // allocation (for C1)
2289 void MacroAssembler::eden_allocate(
2290   Register obj,                      // result: pointer to object after successful allocation
2291   Register var_size_in_bytes,        // object size in bytes if unknown at compile time; invalid otherwise
2292   int      con_size_in_bytes,        // object size in bytes if   known at compile time
2293   Register t1,                       // temp register
2294   Register t2,                       // temp register
2295   Label&   slow_case                 // continuation point if fast allocation fails
2296 ) {
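       // Inline eden allocation is not implemented here; always branch to the slow path.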
2297   b(slow_case);
2298 }
2299 
2300 void MacroAssembler::tlab_allocate(
2301   Register obj,                      // result: pointer to object after successful allocation
2302   Register var_size_in_bytes,        // object size in bytes if unknown at compile time; invalid otherwise
2303   int      con_size_in_bytes,        // object size in bytes if   known at compile time
2304   Register t1,                       // temp register
2305   Label&   slow_case                 // continuation point if fast allocation fails
2306 ) {
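       // Bump-pointer allocation from the thread-local allocation buffer (TLAB),
       // roughly (comment-only sketch):
       //   obj = thread->tlab_top;  new_top = obj + size;
       //   if (new_top > thread->tlab_end) goto slow_case;
       //   thread->tlab_top = new_top;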
2307   // make sure arguments make sense
2308   assert_different_registers(obj, var_size_in_bytes, t1);
2309   assert(0 <= con_size_in_bytes && is_simm13(con_size_in_bytes), "illegal object size");
2310   assert((con_size_in_bytes & MinObjAlignmentInBytesMask) == 0, "object size is not multiple of alignment");
2311 
2312   const Register new_top = t1;
2313   //verify_tlab(); not implemented
2314 
2315   ld(obj, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
2316   ld(R0, in_bytes(JavaThread::tlab_end_offset()), R16_thread);
2317   if (var_size_in_bytes == noreg) {
2318     addi(new_top, obj, con_size_in_bytes);
2319   } else {
2320     add(new_top, obj, var_size_in_bytes);
2321   }
2322   cmpld(CCR0, new_top, R0);
2323   bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_case);
2324 
2325 #ifdef ASSERT
2326   // make sure new free pointer is properly aligned
2327   {
2328     Label L;
2329     andi_(R0, new_top, MinObjAlignmentInBytesMask);
2330     beq(CCR0, L);
2331     stop("updated TLAB free is not properly aligned", 0x934);
2332     bind(L);
2333   }
2334 #endif // ASSERT
2335 
2336   // update the tlab top pointer
2337   std(new_top, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
2338   //verify_tlab(); not implemented
2339 }
2340 void MacroAssembler::incr_allocated_bytes(RegisterOrConstant size_in_bytes, Register t1, Register t2) {
2341   unimplemented("incr_allocated_bytes");
2342 }
2343 
2344 address MacroAssembler::emit_trampoline_stub(int destination_toc_offset,
2345                                              int insts_call_instruction_offset, Register Rtoc) {
2346   // Start the stub.
2347   address stub = start_a_stub(64);
2348   if (stub == NULL) { return NULL; } // CodeCache full: bail out
2349 
2350   // Create a trampoline stub relocation which relates this trampoline stub
2351   // with the call instruction at insts_call_instruction_offset in the
2352   // instructions code-section.
2353   relocate(trampoline_stub_Relocation::spec(code()->insts()->start() + insts_call_instruction_offset));
2354   const int stub_start_offset = offset();
2355 
2356   // For java_to_interp stubs we use R11_scratch1 as scratch register
2357   // and in call trampoline stubs we use R12_scratch2. This way we
2358   // can distinguish them (see is_NativeCallTrampolineStub_at()).
2359   Register reg_scratch = R12_scratch2;
2360 
2361   // Now, create the trampoline stub's code:
2362   // - load the TOC
2363   // - load the call target from the constant pool
2364   // - call
2365   if (Rtoc == noreg) {
2366     calculate_address_from_global_toc(reg_scratch, method_toc());
2367     Rtoc = reg_scratch;
2368   }
2369 
2370   ld_largeoffset_unchecked(reg_scratch, destination_toc_offset, Rtoc, false);
2371   mtctr(reg_scratch);
2372   bctr();
2373 
2374   const address stub_start_addr = addr_at(stub_start_offset);
2375 
2376   // Assert that the encoded destination_toc_offset can be identified and that it is correct.
2377   assert(destination_toc_offset == NativeCallTrampolineStub_at(stub_start_addr)->destination_toc_offset(),
2378          "encoded offset into the constant pool must match");
2379   // Trampoline_stub_size should be good.
2380   assert((uint)(offset() - stub_start_offset) <= trampoline_stub_size, "should be good size");
2381   assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");
2382 
2383   // End the stub.
2384   end_a_stub();
2385   return stub;
2386 }
2387 
2388 // TM on PPC64.
2389 void MacroAssembler::atomic_inc_ptr(Register addr, Register result, int simm16) {
2390   Label retry;
2391   bind(retry);
2392   ldarx(result, addr, /*hint*/ false);
2393   addi(result, result, simm16);
2394   stdcx_(result, addr);
2395   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
2396     bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
2397   } else {
2398     bne(                  CCR0, retry); // stXcx_ sets CCR0
2399   }
2400 }
2401 
2402 void MacroAssembler::atomic_ori_int(Register addr, Register result, int uimm16) {
2403   Label retry;
2404   bind(retry);
2405   lwarx(result, addr, /*hint*/ false);
2406   ori(result, result, uimm16);
2407   stwcx_(result, addr);
2408   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
2409     bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
2410   } else {
2411     bne(                  CCR0, retry); // stXcx_ sets CCR0
2412   }
2413 }
2414 
2415 #if INCLUDE_RTM_OPT
2416 
2417 // Update rtm_counters based on abort status
2418 // input: abort_status
2419 //        rtm_counters (RTMLockingCounters*)
2420 void MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters_Reg) {
2421   // Mapping to keep PreciseRTMLockingStatistics similar to x86.
2422   // x86 ppc (! means inverted, ? means not the same)
2423   //  0   31  Set if abort caused by XABORT instruction.
2424   //  1  ! 7  If set, the transaction may succeed on a retry. This bit is always clear if bit 0 is set.
2425   //  2   13  Set if another logical processor conflicted with a memory address that was part of the transaction that aborted.
2426   //  3   10  Set if an internal buffer overflowed.
2427   //  4  ?12  Set if a debug breakpoint was hit.
2428   //  5  ?32  Set if an abort occurred during execution of a nested transaction.
2429   const  int tm_failure_bit[] = {Assembler::tm_tabort, // Note: Seems like signal handler sets this, too.
2430                                  Assembler::tm_failure_persistent, // inverted: transient
2431                                  Assembler::tm_trans_cf,
2432                                  Assembler::tm_footprint_of,
2433                                  Assembler::tm_non_trans_cf,
2434                                  Assembler::tm_suspended};
2435   const bool tm_failure_inv[] = {false, true, false, false, false, false};
2436   assert(sizeof(tm_failure_bit)/sizeof(int) == RTMLockingCounters::ABORT_STATUS_LIMIT, "adapt mapping!");
2437 
2438   const Register addr_Reg = R0;
2439   // Keep track of offset to where rtm_counters_Reg had pointed to.
2440   int counters_offs = RTMLockingCounters::abort_count_offset();
2441   addi(addr_Reg, rtm_counters_Reg, counters_offs);
2442   const Register temp_Reg = rtm_counters_Reg;
2443 
2444   //atomic_inc_ptr(addr_Reg, temp_Reg); We don't increment atomically
2445   ldx(temp_Reg, addr_Reg);
2446   addi(temp_Reg, temp_Reg, 1);
2447   stdx(temp_Reg, addr_Reg);
2448 
2449   if (PrintPreciseRTMLockingStatistics) {
2450     int counters_offs_delta = RTMLockingCounters::abortX_count_offset() - counters_offs;
2451 
2452     //mftexasr(abort_status); done by caller
2453     for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
2454       counters_offs += counters_offs_delta;
2455       li(temp_Reg, counters_offs_delta); // can't use addi with R0
2456       add(addr_Reg, addr_Reg, temp_Reg); // point to next counter
2457       counters_offs_delta = sizeof(uintx);
2458 
2459       Label check_abort;
2460       rldicr_(temp_Reg, abort_status, tm_failure_bit[i], 0);
2461       if (tm_failure_inv[i]) {
2462         bne(CCR0, check_abort);
2463       } else {
2464         beq(CCR0, check_abort);
2465       }
2466       //atomic_inc_ptr(addr_Reg, temp_Reg); We don't increment atomically
2467       ldx(temp_Reg, addr_Reg);
2468       addi(temp_Reg, temp_Reg, 1);
2469       stdx(temp_Reg, addr_Reg);
2470       bind(check_abort);
2471     }
2472   }
2473   li(temp_Reg, -counters_offs); // can't use addi with R0
2474   add(rtm_counters_Reg, addr_Reg, temp_Reg); // restore
2475 }
2476 
2477 // Branch if ((random & (count-1)) != 0); count must be a power of two.
2478 // tmp and CR0 are killed
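     // Roughly equivalent to: if ((time_base & (count - 1)) != 0) goto brLabel;
     // the low bits of the time base register serve as a cheap pseudo-random source.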
2479 void MacroAssembler::branch_on_random_using_tb(Register tmp, int count, Label& brLabel) {
2480   mftb(tmp);
2481   andi_(tmp, tmp, count-1);
2482   bne(CCR0, brLabel);
2483 }
2484 
2485 // Perform abort ratio calculation, set no_rtm bit if high ratio.
2486 // input:  rtm_counters_Reg (RTMLockingCounters* address) - KILLED
2487 void MacroAssembler::rtm_abort_ratio_calculation(Register rtm_counters_Reg,
2488                                                  RTMLockingCounters* rtm_counters,
2489                                                  Metadata* method_data) {
2490   Label L_done, L_check_always_rtm1, L_check_always_rtm2;
2491 
2492   if (RTMLockingCalculationDelay > 0) {
2493     // Delay calculation.
2494     ld(rtm_counters_Reg, (RegisterOrConstant)(intptr_t)RTMLockingCounters::rtm_calculation_flag_addr());
2495     cmpdi(CCR0, rtm_counters_Reg, 0);
2496     beq(CCR0, L_done);
2497     load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload
2498   }
2499   // Abort ratio calculation only if abort_count > RTMAbortThreshold.
2500   //   Aborted transactions = abort_count * 100
2501   //   All transactions = total_count *  RTMTotalCountIncrRate
2502   //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
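       //
       // In C-like terms (comment-only sketch of the logic emitted below):
       //   if (abort_count >= RTMAbortThreshold &&
       //       abort_count * 100 >= total_count * RTMTotalCountIncrRate * RTMAbortRatio) {
       //     rtm_state = NoRTM;   // high abort ratio: disable RTM for this method
       //   } else if (total_count >= RTMLockingThreshold / RTMTotalCountIncrRate) {
       //     rtm_state = UseRTM;  // low abort ratio and enough attempts: always use RTM
       //   }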
2503   ld(R0, RTMLockingCounters::abort_count_offset(), rtm_counters_Reg);
2504   if (is_simm(RTMAbortThreshold, 16)) {   // cmpdi can handle 16-bit immediates only.
2505     cmpdi(CCR0, R0, RTMAbortThreshold);
2506     blt(CCR0, L_check_always_rtm2);  // reload of rtm_counters_Reg not necessary
2507   } else {
2508     load_const_optimized(rtm_counters_Reg, RTMAbortThreshold);
2509     cmpd(CCR0, R0, rtm_counters_Reg);
2510     blt(CCR0, L_check_always_rtm1);  // reload of rtm_counters_Reg required
2511   }
2512   mulli(R0, R0, 100);
2513 
2514   const Register tmpReg = rtm_counters_Reg;
2515   ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg);
2516   mulli(tmpReg, tmpReg, RTMTotalCountIncrRate); // allowable range: int16
2517   mulli(tmpReg, tmpReg, RTMAbortRatio);         // allowable range: int16
2518   cmpd(CCR0, R0, tmpReg);
2519   blt(CCR0, L_check_always_rtm1); // jump to reload
2520   if (method_data != NULL) {
2521     // Set rtm_state to "no rtm" in MDO.
2522     // Not using a metadata relocation. Method and Class Loader are kept alive anyway.
2523     // (See nmethod::metadata_do and CodeBuffer::finalize_oop_references.)
2524     load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg);
2525     atomic_ori_int(R0, tmpReg, NoRTM);
2526   }
2527   b(L_done);
2528 
2529   bind(L_check_always_rtm1);
2530   load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload
2531   bind(L_check_always_rtm2);
2532   ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg);
2533   int64_t thresholdValue = RTMLockingThreshold / RTMTotalCountIncrRate;
2534   if (is_simm(thresholdValue, 16)) {   // cmpdi can handle 16-bit immediates only.
2535     cmpdi(CCR0, tmpReg, thresholdValue);
2536   } else {
2537     load_const_optimized(R0, thresholdValue);
2538     cmpd(CCR0, tmpReg, R0);
2539   }
2540   blt(CCR0, L_done);
2541   if (method_data != NULL) {
2542     // Set rtm_state to "always rtm" in MDO.
2543     // Not using a metadata relocation. See above.
2544     load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg);
2545     atomic_ori_int(R0, tmpReg, UseRTM);
2546   }
2547   bind(L_done);
2548 }
2549 
2550 // Update counters and perform abort ratio calculation.
2551 // input: abort_status_Reg
2552 void MacroAssembler::rtm_profiling(Register abort_status_Reg, Register temp_Reg,
2553                                    RTMLockingCounters* rtm_counters,
2554                                    Metadata* method_data,
2555                                    bool profile_rtm) {
2556 
2557   assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2558   // Update rtm counters based on state at abort.
2559   // Reads abort_status_Reg, updates flags.
2560   assert_different_registers(abort_status_Reg, temp_Reg);
2561   load_const_optimized(temp_Reg, (address)rtm_counters, R0);
2562   rtm_counters_update(abort_status_Reg, temp_Reg);
2563   if (profile_rtm) {
2564     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2565     rtm_abort_ratio_calculation(temp_Reg, rtm_counters, method_data);
2566   }
2567 }
2568 
2569 // Retry on abort if abort's status indicates non-persistent failure.
2570 // inputs: retry_count_Reg
2571 //       : abort_status_Reg
2572 // output: retry_count_Reg decremented by 1
2573 void MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg,
2574                                              Label& retryLabel, Label* checkRetry) {
2575   Label doneRetry;
2576   rldicr_(R0, abort_status_Reg, tm_failure_persistent, 0);
2577   bne(CCR0, doneRetry);
2578   if (checkRetry) { bind(*checkRetry); }
2579   addic_(retry_count_Reg, retry_count_Reg, -1);
2580   blt(CCR0, doneRetry);
2581   b(retryLabel);
2582   bind(doneRetry);
2583 }
2584 
2585 // Spin and retry if lock is busy.
2586 // inputs: owner_addr_Reg (monitor address)
2587 //       : retry_count_Reg
2588 // output: retry_count_Reg decremented by 1
2589 // CTR is killed
2590 void MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register owner_addr_Reg, Label& retryLabel) {
2591   Label SpinLoop, doneRetry, doRetry;
2592   addic_(retry_count_Reg, retry_count_Reg, -1);
2593   blt(CCR0, doneRetry);
2594 
2595   if (RTMSpinLoopCount > 1) {
2596     li(R0, RTMSpinLoopCount);
2597     mtctr(R0);
2598   }
2599 
2600   // low thread priority
2601   smt_prio_low();
2602   bind(SpinLoop);
2603 
2604   if (RTMSpinLoopCount > 1) {
2605     bdz(doRetry);
2606     ld(R0, 0, owner_addr_Reg);
2607     cmpdi(CCR0, R0, 0);
2608     bne(CCR0, SpinLoop);
2609   }
2610 
2611   bind(doRetry);
2612 
2613   // restore thread priority to default in userspace
2614 #ifdef LINUX
2615   smt_prio_medium_low();
2616 #else
2617   smt_prio_medium();
2618 #endif
2619 
2620   b(retryLabel);
2621 
2622   bind(doneRetry);
2623 }
2624 
2625 // Use RTM for normal stack locks.
2626 // Input: objReg (object to lock)
2627 void MacroAssembler::rtm_stack_locking(ConditionRegister flag,
2628                                        Register obj, Register mark_word, Register tmp,
2629                                        Register retry_on_abort_count_Reg,
2630                                        RTMLockingCounters* stack_rtm_counters,
2631                                        Metadata* method_data, bool profile_rtm,
2632                                        Label& DONE_LABEL, Label& IsInflated) {
2633   assert(UseRTMForStackLocks, "why call this otherwise?");
2634   assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
2635   Label L_rtm_retry, L_decrement_retry, L_on_abort;
2636 
2637   if (RTMRetryCount > 0) {
2638     load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
2639     bind(L_rtm_retry);
2640   }
2641   andi_(R0, mark_word, markOopDesc::monitor_value);  // inflated vs stack-locked|neutral|biased
2642   bne(CCR0, IsInflated);
2643 
2644   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2645     Label L_noincrement;
2646     if (RTMTotalCountIncrRate > 1) {
2647       branch_on_random_using_tb(tmp, RTMTotalCountIncrRate, L_noincrement);
2648     }
2649     assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
2650     load_const_optimized(tmp, (address)stack_rtm_counters->total_count_addr(), R0);
2651     //atomic_inc_ptr(tmp, /*temp, will be reloaded*/mark_word); We don't increment atomically
2652     ldx(mark_word, tmp);
2653     addi(mark_word, mark_word, 1);
2654     stdx(mark_word, tmp);
2655     bind(L_noincrement);
2656   }
2657   tbegin_();
2658   beq(CCR0, L_on_abort);
2659   ld(mark_word, oopDesc::mark_offset_in_bytes(), obj);         // Reload in transaction, conflicts need to be tracked.
2660   andi(R0, mark_word, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
2661   cmpwi(flag, R0, markOopDesc::unlocked_value);                // bits = 001 unlocked
2662   beq(flag, DONE_LABEL);                                       // all done if unlocked
2663 
2664   if (UseRTMXendForLockBusy) {
2665     tend_();
2666     b(L_decrement_retry);
2667   } else {
2668     tabort_();
2669   }
2670   bind(L_on_abort);
2671   const Register abort_status_Reg = tmp;
2672   mftexasr(abort_status_Reg);
2673   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2674     rtm_profiling(abort_status_Reg, /*temp*/mark_word, stack_rtm_counters, method_data, profile_rtm);
2675   }
2676   ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // reload
2677   if (RTMRetryCount > 0) {
2678     // Retry on lock abort if abort status is not permanent.
2679     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry, &L_decrement_retry);
2680   } else {
2681     bind(L_decrement_retry);
2682   }
2683 }
2684 
2685 // Use RTM for inflating locks
2686 // inputs: obj       (object to lock)
2687 //         mark_word (current header - KILLED)
2688 //         boxReg    (on-stack box address (displaced header location) - KILLED)
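     // Rough sketch of the protocol emitted below (illustrative pseudo-C only;
     // helper names are placeholders):
     //   box->displaced_header = box;                    // any non-NULL value
     //   owner_addr = (address)mark_word + owner_offset; // &monitor->_owner
     //   if (tbegin()) {
     //     if (*owner_addr == NULL) goto DONE;           // appears unlocked: elide the lock
     //     end_or_abort_transaction();                   // busy
     //   }
     //   // Aborted: profile, retry on non-persistent aborts, then fall back to
     //   // CAS(owner_addr, NULL -> thread), spinning and retrying while busy.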
2689 void MacroAssembler::rtm_inflated_locking(ConditionRegister flag,
2690                                           Register obj, Register mark_word, Register boxReg,
2691                                           Register retry_on_busy_count_Reg, Register retry_on_abort_count_Reg,
2692                                           RTMLockingCounters* rtm_counters,
2693                                           Metadata* method_data, bool profile_rtm,
2694                                           Label& DONE_LABEL) {
2695   assert(UseRTMLocking, "why call this otherwise?");
2696   Label L_rtm_retry, L_decrement_retry, L_on_abort;
2697   // Clean monitor_value bit to get valid pointer.
2698   int owner_offset = ObjectMonitor::owner_offset_in_bytes() - markOopDesc::monitor_value;
2699 
2700   // Store non-null, using boxReg instead of (intptr_t)markOopDesc::unused_mark().
2701   std(boxReg, BasicLock::displaced_header_offset_in_bytes(), boxReg);
2702   const Register tmpReg = boxReg;
2703   const Register owner_addr_Reg = mark_word;
2704   addi(owner_addr_Reg, mark_word, owner_offset);
2705 
2706   if (RTMRetryCount > 0) {
2707     load_const_optimized(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy.
2708     load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort.
2709     bind(L_rtm_retry);
2710   }
2711   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2712     Label L_noincrement;
2713     if (RTMTotalCountIncrRate > 1) {
2714       branch_on_random_using_tb(R0, RTMTotalCountIncrRate, L_noincrement);
2715     }
2716     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2717     load_const(R0, (address)rtm_counters->total_count_addr(), tmpReg);
2718     //atomic_inc_ptr(R0, tmpReg); We don't increment atomically
2719     ldx(tmpReg, R0);
2720     addi(tmpReg, tmpReg, 1);
2721     stdx(tmpReg, R0);
2722     bind(L_noincrement);
2723   }
2724   tbegin_();
2725   beq(CCR0, L_on_abort);
2726   // We don't reload mark word. Will only be reset at safepoint.
2727   ld(R0, 0, owner_addr_Reg); // Load in transaction, conflicts need to be tracked.
2728   cmpdi(flag, R0, 0);
2729   beq(flag, DONE_LABEL);
2730 
2731   if (UseRTMXendForLockBusy) {
2732     tend_();
2733     b(L_decrement_retry);
2734   } else {
2735     tabort_();
2736   }
2737   bind(L_on_abort);
2738   const Register abort_status_Reg = tmpReg;
2739   mftexasr(abort_status_Reg);
2740   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2741     rtm_profiling(abort_status_Reg, /*temp*/ owner_addr_Reg, rtm_counters, method_data, profile_rtm);
2742     // Restore owner_addr_Reg
2743     ld(mark_word, oopDesc::mark_offset_in_bytes(), obj);
2744 #ifdef ASSERT
2745     andi_(R0, mark_word, markOopDesc::monitor_value);
2746     asm_assert_ne("must be inflated", 0xa754); // Deflating only allowed at safepoint.
2747 #endif
2748     addi(owner_addr_Reg, mark_word, owner_offset);
2749   }
2750   if (RTMRetryCount > 0) {
2751     // Retry on lock abort if abort status is not permanent.
2752     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
2753   }
2754 
2755   // Appears unlocked - try to swing _owner from null to non-null.
2756   cmpxchgd(flag, /*current val*/ R0, (intptr_t)0, /*new val*/ R16_thread, owner_addr_Reg,
2757            MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2758            MacroAssembler::cmpxchgx_hint_acquire_lock(), noreg, &L_decrement_retry, true);
2759 
2760   if (RTMRetryCount > 0) {
2761     // On success: done. Otherwise: retry.
2762     b(DONE_LABEL);
2763     bind(L_decrement_retry);
2764     // Spin and retry if lock is busy.
2765     rtm_retry_lock_on_busy(retry_on_busy_count_Reg, owner_addr_Reg, L_rtm_retry);
2766   } else {
2767     bind(L_decrement_retry);
2768   }
2769 }
2770 
2771 #endif //  INCLUDE_RTM_OPT
2772 
2773 // "The box" is the space on the stack where we copy the object mark.
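     // Rough sketch of the fast-locking protocol emitted below (illustrative only;
     // biased locking and RTM are handled separately):
     //   mark = obj->mark();
     //   if (mark & monitor_value) goto inflated_path;             // existing monitor
     //   box->displaced_header = mark | unlocked_value;            // initialize the box
     //   if (CAS(&obj->mark, mark | unlocked_value, box)) done;    // stack-locked
     //   else if (mark is an address on our own stack)             // recursive stack-lock
     //     { box->displaced_header = 0; done; }
     //   else flag = NE;                                           // contended: caller's slow path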
2774 void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register oop, Register box,
2775                                                Register temp, Register displaced_header, Register current_header,
2776                                                bool try_bias,
2777                                                RTMLockingCounters* rtm_counters,
2778                                                RTMLockingCounters* stack_rtm_counters,
2779                                                Metadata* method_data,
2780                                                bool use_rtm, bool profile_rtm) {
2781   assert_different_registers(oop, box, temp, displaced_header, current_header);
2782   assert(flag != CCR0, "bad condition register");
2783   Label cont;
2784   Label object_has_monitor;
2785   Label cas_failed;
2786 
2787   // Load markOop from object into displaced_header.
2788   ld(displaced_header, oopDesc::mark_offset_in_bytes(), oop);
2789 
2790 
2791   // Always do locking in runtime.
2792   if (EmitSync & 0x01) {
2793     cmpdi(flag, oop, 0); // Oop can't be 0 here => always false.
2794     return;
2795   }
2796 
2797   if (try_bias) {
2798     biased_locking_enter(flag, oop, displaced_header, temp, current_header, cont);
2799   }
2800 
2801 #if INCLUDE_RTM_OPT
2802   if (UseRTMForStackLocks && use_rtm) {
2803     rtm_stack_locking(flag, oop, displaced_header, temp, /*temp*/ current_header,
2804                       stack_rtm_counters, method_data, profile_rtm,
2805                       cont, object_has_monitor);
2806   }
2807 #endif // INCLUDE_RTM_OPT
2808 
2809   // Handle existing monitor.
2810   if ((EmitSync & 0x02) == 0) {
2811     // The object has an existing monitor iff (mark & monitor_value) != 0.
2812     andi_(temp, displaced_header, markOopDesc::monitor_value);
2813     bne(CCR0, object_has_monitor);
2814   }
2815 
2816   // Set displaced_header to be (markOop of object | UNLOCK_VALUE).
2817   ori(displaced_header, displaced_header, markOopDesc::unlocked_value);
2818 
2819   // Load Compare Value application register.
2820 
2821   // Initialize the box. (Must happen before we update the object mark!)
2822   std(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2823 
2824   // Must fence, otherwise, preceding store(s) may float below cmpxchg.
2825   // Compare object markOop with mark and if equal exchange scratch1 with object markOop.
2826   cmpxchgd(/*flag=*/flag,
2827            /*current_value=*/current_header,
2828            /*compare_value=*/displaced_header,
2829            /*exchange_value=*/box,
2830            /*where=*/oop,
2831            MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2832            MacroAssembler::cmpxchgx_hint_acquire_lock(),
2833            noreg,
2834            &cas_failed,
2835            /*check without membar and ldarx first*/true);
2836   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2837 
2838   // If the compare-and-exchange succeeded, then we found an unlocked
2839   // object and we have now locked it.
2840   b(cont);
2841 
2842   bind(cas_failed);
2843   // We did not see an unlocked object so try the fast recursive case.
2844 
2845   // Check if the owner is self by comparing the value in the markOop of object
2846   // (current_header) with the stack pointer.
2847   sub(current_header, current_header, R1_SP);
2848   load_const_optimized(temp, ~(os::vm_page_size()-1) | markOopDesc::lock_mask_in_place);
2849 
2850   and_(R0/*==0?*/, current_header, temp);
2851   // If the condition is true we are done (cont) and hence we can store 0 as the
2852   // displaced header in the box, which indicates that it is a recursive lock.
2853   mcrf(flag,CCR0);
2854   std(R0/*==0, perhaps*/, BasicLock::displaced_header_offset_in_bytes(), box);
2855 
2856   // Handle existing monitor.
2857   if ((EmitSync & 0x02) == 0) {
2858     b(cont);
2859 
2860     bind(object_has_monitor);
2861     // The object's monitor m is unlocked iff m->owner == NULL,
2862     // otherwise m->owner may contain a thread or a stack address.
2863 
2864 #if INCLUDE_RTM_OPT
2865     // Use the same RTM locking code in 32- and 64-bit VM.
2866     if (use_rtm) {
2867       rtm_inflated_locking(flag, oop, displaced_header, box, temp, /*temp*/ current_header,
2868                            rtm_counters, method_data, profile_rtm, cont);
2869     } else {
2870 #endif // INCLUDE_RTM_OPT
2871 
2872     // Try to CAS m->owner from NULL to current thread.
2873     addi(temp, displaced_header, ObjectMonitor::owner_offset_in_bytes()-markOopDesc::monitor_value);
2874     cmpxchgd(/*flag=*/flag,
2875              /*current_value=*/current_header,
2876              /*compare_value=*/(intptr_t)0,
2877              /*exchange_value=*/R16_thread,
2878              /*where=*/temp,
2879              MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2880              MacroAssembler::cmpxchgx_hint_acquire_lock());
2881 
2882     // Store a non-null value into the box.
2883     std(box, BasicLock::displaced_header_offset_in_bytes(), box);
2884 
2885 #   ifdef ASSERT
2886     bne(flag, cont);
2887     // We have acquired the monitor, check some invariants.
2888     addi(/*monitor=*/temp, temp, -ObjectMonitor::owner_offset_in_bytes());
2889     // Invariant 1: _recursions should be 0.
2890     //assert(ObjectMonitor::recursions_size_in_bytes() == 8, "unexpected size");
2891     asm_assert_mem8_is_zero(ObjectMonitor::recursions_offset_in_bytes(), temp,
2892                             "monitor->_recursions should be 0", -1);
2893 #   endif
2894 
2895 #if INCLUDE_RTM_OPT
2896     } // use_rtm()
2897 #endif
2898   }
2899 
2900   bind(cont);
2901   // flag == EQ indicates success
2902   // flag == NE indicates failure
2903 }
2904 
2905 void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register oop, Register box,
2906                                                  Register temp, Register displaced_header, Register current_header,
2907                                                  bool try_bias, bool use_rtm) {
2908   assert_different_registers(oop, box, temp, displaced_header, current_header);
2909   assert(flag != CCR0, "bad condition register");
2910   Label cont;
2911   Label object_has_monitor;
2912 
2913   // Always do locking in runtime.
2914   if (EmitSync & 0x01) {
2915     cmpdi(flag, oop, 0); // Oop can't be 0 here => always false.
2916     return;
2917   }
2918 
2919   if (try_bias) {
2920     biased_locking_exit(flag, oop, current_header, cont);
2921   }
2922 
2923 #if INCLUDE_RTM_OPT
2924   if (UseRTMForStackLocks && use_rtm) {
2925     assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
2926     Label L_regular_unlock;
2927     ld(current_header, oopDesc::mark_offset_in_bytes(), oop);         // fetch markword
2928     andi(R0, current_header, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
2929     cmpwi(flag, R0, markOopDesc::unlocked_value);                     // bits = 001 unlocked
2930     bne(flag, L_regular_unlock);                                      // else RegularLock
2931     tend_();                                                          // otherwise end...
2932     b(cont);                                                          // ... and we're done
2933     bind(L_regular_unlock);
2934   }
2935 #endif
2936 
2937   // Find the lock address and load the displaced header from the stack.
2938   ld(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2939 
2940   // If the displaced header is 0, we have a recursive unlock.
2941   cmpdi(flag, displaced_header, 0);
2942   beq(flag, cont);
2943 
2944   // Handle existing monitor.
2945   if ((EmitSync & 0x02) == 0) {
2946     // The object has an existing monitor iff (mark & monitor_value) != 0.
2947     RTM_OPT_ONLY( if (!(UseRTMForStackLocks && use_rtm)) ) // skip load if already done
2948     ld(current_header, oopDesc::mark_offset_in_bytes(), oop);
2949     andi_(R0, current_header, markOopDesc::monitor_value);
2950     bne(CCR0, object_has_monitor);
2951   }
2952 
2953   // Check if it is still a lightweight lock; this is true if we see
2954   // the stack address of the basicLock in the markOop of the object.
2955   // Cmpxchg sets flag to cmpd(current_header, box).
2956   cmpxchgd(/*flag=*/flag,
2957            /*current_value=*/current_header,
2958            /*compare_value=*/box,
2959            /*exchange_value=*/displaced_header,
2960            /*where=*/oop,
2961            MacroAssembler::MemBarRel,
2962            MacroAssembler::cmpxchgx_hint_release_lock(),
2963            noreg,
2964            &cont);
2965 
2966   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2967 
2968   // Handle existing monitor.
2969   if ((EmitSync & 0x02) == 0) {
2970     b(cont);
2971 
2972     bind(object_has_monitor);
2973     addi(current_header, current_header, -markOopDesc::monitor_value); // monitor
2974     ld(temp,             ObjectMonitor::owner_offset_in_bytes(), current_header);
2975 
2976     // It's inflated.
2977 #if INCLUDE_RTM_OPT
2978     if (use_rtm) {
2979       Label L_regular_inflated_unlock;
2980       // Clean monitor_value bit to get valid pointer
2981       cmpdi(flag, temp, 0);
2982       bne(flag, L_regular_inflated_unlock);
2983       tend_();
2984       b(cont);
2985       bind(L_regular_inflated_unlock);
2986     }
2987 #endif
2988 
2989     ld(displaced_header, ObjectMonitor::recursions_offset_in_bytes(), current_header);
2990     xorr(temp, R16_thread, temp);      // Will be 0 if we are the owner.
2991     orr(temp, temp, displaced_header); // Will be 0 if there are 0 recursions.
2992     cmpdi(flag, temp, 0);
2993     bne(flag, cont);
2994 
2995     ld(temp,             ObjectMonitor::EntryList_offset_in_bytes(), current_header);
2996     ld(displaced_header, ObjectMonitor::cxq_offset_in_bytes(), current_header);
2997     orr(temp, temp, displaced_header); // Will be 0 if both are 0.
2998     cmpdi(flag, temp, 0);
2999     bne(flag, cont);
3000     release();
3001     std(temp, ObjectMonitor::owner_offset_in_bytes(), current_header);
3002   }
3003 
3004   bind(cont);
3005   // flag == EQ indicates success
3006   // flag == NE indicates failure
3007 }
3008 
3009 // Write serialization page so VM thread can do a pseudo remote membar.
3010 // We use the current thread pointer to calculate a thread-specific
3011 // offset to write to within the page. This minimizes bus traffic
3012 // due to cache line collisions.
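     // In effect (illustrative sketch; the stored value is irrelevant, only the
     // release-store to the serialization page matters):
     //   size_t offset = ((uintptr_t)thread >> os::get_serialize_page_shift_count())
     //                   & (os::vm_page_size() - sizeof(int));
     //   release();
     //   *(volatile int*)(os::get_memory_serialize_page() + offset) = 0;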
3013 void MacroAssembler::serialize_memory(Register thread, Register tmp1, Register tmp2) {
3014   srdi(tmp2, thread, os::get_serialize_page_shift_count());
3015 
3016   int mask = os::vm_page_size() - sizeof(int);
3017   if (Assembler::is_simm(mask, 16)) {
3018     andi(tmp2, tmp2, mask);
3019   } else {
3020     lis(tmp1, (int)((signed short) (mask >> 16)));
3021     ori(tmp1, tmp1, mask & 0x0000ffff);
3022     andr(tmp2, tmp2, tmp1);
3023   }
3024 
3025   load_const(tmp1, (long) os::get_memory_serialize_page());
3026   release();
3027   stwx(R0, tmp1, tmp2);
3028 }
3029 
3030 void MacroAssembler::safepoint_poll(Label& slow_path, Register temp_reg) {
3031   if (SafepointMechanism::uses_thread_local_poll()) {
3032     ld(temp_reg, in_bytes(Thread::polling_page_offset()), R16_thread);
3033     // Armed page has poll_bit set.
3034     andi_(temp_reg, temp_reg, SafepointMechanism::poll_bit());
3035   } else {
3036     lwz(temp_reg, (RegisterOrConstant)(intptr_t)SafepointSynchronize::address_of_state());
3037     cmpwi(CCR0, temp_reg, SafepointSynchronize::_not_synchronized);
3038   }
3039   bne(CCR0, slow_path);
3040 }
3041 
3042 
3043 // GC barrier helper macros
3044 
3045 // Write the card table byte if needed.
3046 void MacroAssembler::card_write_barrier_post(Register Rstore_addr, Register Rnew_val, Register Rtmp) {
3047   CardTableBarrierSet* bs =
3048     barrier_set_cast<CardTableBarrierSet>(Universe::heap()->barrier_set());
3049   assert(bs->kind() == BarrierSet::CardTableBarrierSet, "wrong barrier");
3050   CardTable* ct = bs->card_table();
3051 #ifdef ASSERT
3052   cmpdi(CCR0, Rnew_val, 0);
3053   asm_assert_ne("null oop not allowed", 0x321);
3054 #endif
3055   card_table_write(ct->byte_map_base(), Rtmp, Rstore_addr);
3056 }
3057 
3058 // Write the card table byte.
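     // Conceptually (illustrative sketch):
     //   byte_map_base[(uintptr_t)obj >> CardTable::card_shift] = 0;  // 0 == dirty
     // (with a StoreStore barrier in front of the byte store when CMS is in use)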
3059 void MacroAssembler::card_table_write(jbyte* byte_map_base, Register Rtmp, Register Robj) {
3060   assert_different_registers(Robj, Rtmp, R0);
3061   load_const_optimized(Rtmp, (address)byte_map_base, R0);
3062   srdi(Robj, Robj, CardTable::card_shift);
3063   li(R0, 0); // dirty
3064   if (UseConcMarkSweepGC) membar(Assembler::StoreStore);
3065   stbx(R0, Rtmp, Robj);
3066 }
3067 
3068 // Kills R31 if value is a volatile register.
3069 void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2, bool needs_frame) {
3070   Label done;
3071   cmpdi(CCR0, value, 0);
3072   beq(CCR0, done);         // Use NULL as-is.
3073 
3074   clrrdi(tmp1, value, JNIHandles::weak_tag_size);
3075 #if INCLUDE_ALL_GCS
3076   if (UseG1GC) { andi_(tmp2, value, JNIHandles::weak_tag_mask); }
3077 #endif
3078   ld(value, 0, tmp1);      // Resolve (untagged) jobject.
3079 
3080 #if INCLUDE_ALL_GCS
3081   if (UseG1GC) {
3082     Label not_weak;
3083     beq(CCR0, not_weak);   // Test for jweak tag.
3084     verify_oop(value);
3085     g1_write_barrier_pre(noreg, // obj
3086                          noreg, // offset
3087                          value, // pre_val
3088                          tmp1, tmp2, needs_frame);
3089     bind(not_weak);
3090   }
3091 #endif // INCLUDE_ALL_GCS
3092   verify_oop(value);
3093   bind(done);
3094 }
3095 
3096 #if INCLUDE_ALL_GCS
3097 // General G1 pre-barrier generator.
3098 // Goal: record the previous value if it is not null.
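     // The emitted code corresponds roughly to this sketch (illustrative only;
     // 'queue' stands for the thread's SATB mark queue):
     //   if (!queue.active) return;                         // no concurrent marking
     //   pre_val = (Robj != noreg) ? *(Robj + offset) : Rpre_val;
     //   if (pre_val == NULL) return;                       // nothing to record
     //   if (queue.index == 0) {                            // buffer full
     //     call_VM_leaf(g1_wb_pre, pre_val, thread);        // runtime slow path
     //   } else {
     //     queue.index -= wordSize;
     //     queue.buf[queue.index] = pre_val;                // record in thread-local buffer
     //   }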
3099 void MacroAssembler::g1_write_barrier_pre(Register Robj, RegisterOrConstant offset, Register Rpre_val,
3100                                           Register Rtmp1, Register Rtmp2, bool needs_frame) {
3101   Label runtime, filtered;
3102 
3103   // Is marking active?
3104   if (in_bytes(SATBMarkQueue::byte_width_of_active()) == 4) {
3105     lwz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_active()), R16_thread);
3106   } else {
3107     guarantee(in_bytes(SATBMarkQueue::byte_width_of_active()) == 1, "Assumption");
3108     lbz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_active()), R16_thread);
3109   }
3110   cmpdi(CCR0, Rtmp1, 0);
3111   beq(CCR0, filtered);
3112 
3113   // Do we need to load the previous value?
3114   if (Robj != noreg) {
3115     // Load the previous value...
3116     if (UseCompressedOops) {
3117       lwz(Rpre_val, offset, Robj);
3118     } else {
3119       ld(Rpre_val, offset, Robj);
3120     }
3121     // Previous value has been loaded into Rpre_val.
3122   }
3123   assert(Rpre_val != noreg, "must have a real register");
3124 
3125   // Is the previous value null?
3126   cmpdi(CCR0, Rpre_val, 0);
3127   beq(CCR0, filtered);
3128 
3129   if (Robj != noreg && UseCompressedOops) {
3130     decode_heap_oop_not_null(Rpre_val);
3131   }
3132 
3133   // OK, it's not filtered, so we need to record the previous value. Try to
3134   // store it in the thread's SATB buffer first; if the buffer is full
3135   // (index == 0), fall back to the enqueue runtime stub, passing the
3136   // previous value as argument.
3137 
3138   // Can we store original value in the thread's buffer?
3139   // Is index == 0?
3140   // (The index field is typed as size_t.)
3141   const Register Rbuffer = Rtmp1, Rindex = Rtmp2;
3142 
3143   ld(Rindex, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_index()), R16_thread);
3144   cmpdi(CCR0, Rindex, 0);
3145   beq(CCR0, runtime); // If index == 0, goto runtime.
3146   ld(Rbuffer, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_buf()), R16_thread);
3147 
3148   addi(Rindex, Rindex, -wordSize); // Decrement index.
3149   std(Rindex, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_index()), R16_thread);
3150 
3151   // Record the previous value.
3152   stdx(Rpre_val, Rbuffer, Rindex);
3153   b(filtered);
3154 
3155   bind(runtime);
3156 
3157   // May need to preserve LR. Also needed if current frame is not compatible with C calling convention.
3158   if (needs_frame) {
3159     save_LR_CR(Rtmp1);
3160     push_frame_reg_args(0, Rtmp2);
3161   }
3162 
3163   if (Rpre_val->is_volatile() && Robj == noreg) mr(R31, Rpre_val); // Save pre_val across C call if it was preloaded.
3164   call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), Rpre_val, R16_thread);
3165   if (Rpre_val->is_volatile() && Robj == noreg) mr(Rpre_val, R31); // restore
3166 
3167   if (needs_frame) {
3168     pop_frame();
3169     restore_LR_CR(Rtmp1);
3170   }
3171 
3172   bind(filtered);
3173 }
3174 
3175 // General G1 post-barrier generator
3176 // Store cross-region card.
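     // The emitted code corresponds roughly to this sketch (illustrative only):
     //   if (((store_addr ^ new_val) >> LogOfHRGrainBytes) == 0) return;  // same region
     //   card = &byte_map_base[store_addr >> card_shift];
     //   if (*card == g1_young_card_val) return;
     //   StoreLoad_barrier();
     //   if (*card == dirty_card_val) return;                             // already dirty
     //   *card = dirty_card_val;
     //   // then enqueue the card address in the thread's dirty card queue,
     //   // calling g1_wb_post if the queue buffer is full.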
3177 void MacroAssembler::g1_write_barrier_post(Register Rstore_addr, Register Rnew_val, Register Rtmp1, Register Rtmp2, Register Rtmp3, Label *filtered_ext) {
3178   Label runtime, filtered_int;
3179   Label& filtered = (filtered_ext != NULL) ? *filtered_ext : filtered_int;
3180   assert_different_registers(Rstore_addr, Rnew_val, Rtmp1, Rtmp2);
3181 
3182   G1BarrierSet* bs =
3183     barrier_set_cast<G1BarrierSet>(Universe::heap()->barrier_set());
3184   CardTable* ct = bs->card_table();
3185 
3186   // Does store cross heap regions?
3187   if (G1RSBarrierRegionFilter) {
3188     xorr(Rtmp1, Rstore_addr, Rnew_val);
3189     srdi_(Rtmp1, Rtmp1, HeapRegion::LogOfHRGrainBytes);
3190     beq(CCR0, filtered);
3191   }
3192 
3193   // Crosses regions, storing NULL?
3194 #ifdef ASSERT
3195   cmpdi(CCR0, Rnew_val, 0);
3196   asm_assert_ne("null oop not allowed (G1)", 0x322); // Checked by caller on PPC64, so following branch is obsolete:
3197   //beq(CCR0, filtered);
3198 #endif
3199 
3200   // Storing region crossing non-NULL, is card already dirty?
3201   assert(sizeof(*ct->byte_map_base()) == sizeof(jbyte), "adjust this code");
3202   const Register Rcard_addr = Rtmp1;
3203   Register Rbase = Rtmp2;
3204   load_const_optimized(Rbase, (address)ct->byte_map_base(), /*temp*/ Rtmp3);
3205 
3206   srdi(Rcard_addr, Rstore_addr, CardTable::card_shift);
3207 
3208   // Get the address of the card.
3209   lbzx(/*card value*/ Rtmp3, Rbase, Rcard_addr);
3210   cmpwi(CCR0, Rtmp3, (int)G1CardTable::g1_young_card_val());
3211   beq(CCR0, filtered);
3212 
3213   membar(Assembler::StoreLoad);
3214   lbzx(/*card value*/ Rtmp3, Rbase, Rcard_addr);  // Reload after membar.
3215   cmpwi(CCR0, Rtmp3 /* card value */, CardTable::dirty_card_val());
3216   beq(CCR0, filtered);
3217 
3218   // Storing a region crossing, non-NULL oop, card is clean.
3219   // Dirty card and log.
3220   li(Rtmp3, CardTable::dirty_card_val());
3221   //release(); // G1: oops are allowed to get visible after dirty marking.
3222   stbx(Rtmp3, Rbase, Rcard_addr);
3223 
3224   add(Rcard_addr, Rbase, Rcard_addr); // This is the address which needs to get enqueued.
3225   Rbase = noreg; // end of lifetime
3226 
3227   const Register Rqueue_index = Rtmp2,
3228                  Rqueue_buf   = Rtmp3;
3229   ld(Rqueue_index, in_bytes(JavaThread::dirty_card_queue_offset() + DirtyCardQueue::byte_offset_of_index()), R16_thread);
3230   cmpdi(CCR0, Rqueue_index, 0);
3231   beq(CCR0, runtime); // index == 0 then jump to runtime
3232   ld(Rqueue_buf, in_bytes(JavaThread::dirty_card_queue_offset() + DirtyCardQueue::byte_offset_of_buf()), R16_thread);
3233 
3234   addi(Rqueue_index, Rqueue_index, -wordSize); // decrement index
3235   std(Rqueue_index, in_bytes(JavaThread::dirty_card_queue_offset() + DirtyCardQueue::byte_offset_of_index()), R16_thread);
3236 
3237   stdx(Rcard_addr, Rqueue_buf, Rqueue_index); // store card
3238   b(filtered);
3239 
3240   bind(runtime);
3241 
3242   // Save the live input values.
3243   call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), Rcard_addr, R16_thread);
3244 
3245   bind(filtered_int);
3246 }
3247 #endif // INCLUDE_ALL_GCS
3248 
3249 // Values for last_Java_pc and last_Java_sp must comply with the rules
3250 // in frame_ppc.hpp.
3251 void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) {
3252   // Always set last_Java_pc and flags first because once last_Java_sp
3253   // is visible, has_last_Java_frame is true and users will look at the
3254   // rest of the fields. (Note: flags should always be zero before we
3255   // get here, so they don't need to be set.)
3256 
3257   // Verify that last_Java_pc was zeroed on return to Java
3258   asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread,
3259                           "last_Java_pc not zeroed before leaving Java", 0x200);
3260 
3261   // When returning from calling out from Java mode, the frame anchor's
3262   // last_Java_pc will always be set to NULL. It is set here so that,
3263   // if we are doing a call to native (not VM) code, we capture the
3264   // known pc and don't have to rely on the native call having a
3265   // standard frame linkage where we can find the pc.
3266   if (last_Java_pc != noreg)
3267     std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
3268 
3269   // Set last_Java_sp last.
3270   std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
3271 }
3272 
3273 void MacroAssembler::reset_last_Java_frame(void) {
3274   asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()),
3275                              R16_thread, "SP was not set, still zero", 0x202);
3276 
3277   BLOCK_COMMENT("reset_last_Java_frame {");
3278   li(R0, 0);
3279 
3280   // _last_Java_sp = 0
3281   std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
3282 
3283   // _last_Java_pc = 0
3284   std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
3285   BLOCK_COMMENT("} reset_last_Java_frame");
3286 }
3287 
3288 void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1) {
3289   assert_different_registers(sp, tmp1);
3290 
3291   // sp points to a TOP_IJAVA_FRAME, retrieve frame's PC via
3292   // TOP_IJAVA_FRAME_ABI.
3293   // FIXME: assert that we really have a TOP_IJAVA_FRAME here!
3294   address entry = pc();
3295   load_const_optimized(tmp1, entry);
3296 
3297   set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1);
3298 }
3299 
3300 void MacroAssembler::get_vm_result(Register oop_result) {
3301   // Read:
3302   //   R16_thread
3303   //   R16_thread->in_bytes(JavaThread::vm_result_offset())
3304   //
3305   // Updated:
3306   //   oop_result
3307   //   R16_thread->in_bytes(JavaThread::vm_result_offset())
3308 
3309   verify_thread();
3310 
3311   ld(oop_result, in_bytes(JavaThread::vm_result_offset()), R16_thread);
3312   li(R0, 0);
3313   std(R0, in_bytes(JavaThread::vm_result_offset()), R16_thread);
3314 
3315   verify_oop(oop_result);
3316 }
3317 
3318 void MacroAssembler::get_vm_result_2(Register metadata_result) {
3319   // Read:
3320   //   R16_thread
3321   //   R16_thread->in_bytes(JavaThread::vm_result_2_offset())
3322   //
3323   // Updated:
3324   //   metadata_result
3325   //   R16_thread->in_bytes(JavaThread::vm_result_2_offset())
3326 
3327   ld(metadata_result, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
3328   li(R0, 0);
3329   std(R0, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
3330 }
3331 
3332 Register MacroAssembler::encode_klass_not_null(Register dst, Register src) {
3333   Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided.
3334   if (Universe::narrow_klass_base() != 0) {
3335     // Use dst as temp if it is free.
3336     sub_const_optimized(dst, current, Universe::narrow_klass_base(), R0);
3337     current = dst;
3338   }
3339   if (Universe::narrow_klass_shift() != 0) {
3340     srdi(dst, current, Universe::narrow_klass_shift());
3341     current = dst;
3342   }
3343   return current;
3344 }
3345 
3346 void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) {
3347   if (UseCompressedClassPointers) {
3348     Register compressedKlass = encode_klass_not_null(ck, klass);
3349     stw(compressedKlass, oopDesc::klass_offset_in_bytes(), dst_oop);
3350   } else {
3351     std(klass, oopDesc::klass_offset_in_bytes(), dst_oop);
3352   }
3353 }
3354 
3355 void MacroAssembler::store_klass_gap(Register dst_oop, Register val) {
3356   if (UseCompressedClassPointers) {
3357     if (val == noreg) {
3358       val = R0;
3359       li(val, 0);
3360     }
3361     stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop); // klass gap if compressed
3362   }
3363 }
3364 
3365 int MacroAssembler::instr_size_for_decode_klass_not_null() {
3366   if (!UseCompressedClassPointers) return 0;
3367   int num_instrs = 1;  // shift or move
3368   if (Universe::narrow_klass_base() != 0) num_instrs = 7;  // shift + load const + add
3369   return num_instrs * BytesPerInstWord;
3370 }
3371 
3372 void MacroAssembler::decode_klass_not_null(Register dst, Register src) {
3373   assert(dst != R0, "Dst reg may not be R0, as R0 is used here.");
3374   if (src == noreg) src = dst;
3375   Register shifted_src = src;
3376   if (Universe::narrow_klass_shift() != 0 ||
3377       (Universe::narrow_klass_base() == 0 && src != dst)) {  // Move required.
3378     shifted_src = dst;
3379     sldi(shifted_src, src, Universe::narrow_klass_shift());
3380   }
3381   if (Universe::narrow_klass_base() != 0) {
3382     add_const_optimized(dst, shifted_src, Universe::narrow_klass_base(), R0);
3383   }
3384 }
3385 
3386 void MacroAssembler::load_klass(Register dst, Register src) {
3387   if (UseCompressedClassPointers) {
3388     lwz(dst, oopDesc::klass_offset_in_bytes(), src);
3389     // Attention: no null check here!
3390     decode_klass_not_null(dst, dst);
3391   } else {
3392     ld(dst, oopDesc::klass_offset_in_bytes(), src);
3393   }
3394 }
3395 
3396 // ((OopHandle)result).resolve();
3397 void MacroAssembler::resolve_oop_handle(Register result) {
3398   // OopHandle::resolve is an indirection.
3399   ld(result, 0, result);
3400 }
3401 
3402 void MacroAssembler::load_mirror_from_const_method(Register mirror, Register const_method) {
3403   ld(mirror, in_bytes(ConstMethod::constants_offset()), const_method);
3404   ld(mirror, ConstantPool::pool_holder_offset_in_bytes(), mirror);
3405   ld(mirror, in_bytes(Klass::java_mirror_offset()), mirror);
3406   resolve_oop_handle(mirror);
3407 }
3408 
3409 // Clear Array
3410 // For very short arrays. tmp == R0 is allowed.
3411 void MacroAssembler::clear_memory_unrolled(Register base_ptr, int cnt_dwords, Register tmp, int offset) {
3412   if (cnt_dwords > 0) { li(tmp, 0); }
3413   for (int i = 0; i < cnt_dwords; ++i) { std(tmp, offset + i * 8, base_ptr); }
3414 }
3415 
3416 // Version for constant short array length. Kills base_ptr. tmp == R0 is allowed.
3417 void MacroAssembler::clear_memory_constlen(Register base_ptr, int cnt_dwords, Register tmp) {
3418   if (cnt_dwords < 8) {
3419     clear_memory_unrolled(base_ptr, cnt_dwords, tmp);
3420     return;
3421   }
3422 
3423   Label loop;
3424   const long loopcnt   = cnt_dwords >> 1,
3425              remainder = cnt_dwords & 1;
3426 
3427   li(tmp, loopcnt);
3428   mtctr(tmp);
3429   li(tmp, 0);
3430   bind(loop);
3431     std(tmp, 0, base_ptr);
3432     std(tmp, 8, base_ptr);
3433     addi(base_ptr, base_ptr, 16);
3434     bdnz(loop);
3435   if (remainder) { std(tmp, 0, base_ptr); }
3436 }
3437 
3438 // Kills both input registers. tmp == R0 is allowed.
3439 void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp, long const_cnt) {
3440   // Procedure for large arrays (uses data cache block zero instruction).
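       // Strategy (illustrative sketch):
       //   1) store 8-byte zeros until base_ptr is aligned to a cache line,
       //   2) dcbz one whole cache line per loop iteration,
       //   3) store the remaining (cnt_dwords % cl_dwords) doublewords with std.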
3441     Label startloop, fast, fastloop, small_rest, restloop, done;
3442     const int cl_size         = VM_Version::L1_data_cache_line_size(),
3443               cl_dwords       = cl_size >> 3,
3444               cl_dw_addr_bits = exact_log2(cl_dwords),
3445               dcbz_min        = 1,  // Min count of dcbz executions, needs to be >0.
3446               min_cnt         = ((dcbz_min + 1) << cl_dw_addr_bits) - 1;
3447 
3448   if (const_cnt >= 0) {
3449     // Constant case.
3450     if (const_cnt < min_cnt) {
3451       clear_memory_constlen(base_ptr, const_cnt, tmp);
3452       return;
3453     }
3454     load_const_optimized(cnt_dwords, const_cnt, tmp);
3455   } else {
3456     // cnt_dwords already loaded in register. Need to check size.
3457     cmpdi(CCR1, cnt_dwords, min_cnt); // Big enough? (ensure >= dcbz_min lines included).
3458     blt(CCR1, small_rest);
3459   }
3460     rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits); // Extract dword offset within first cache line.
3461     beq(CCR0, fast);                                  // Already 128byte aligned.
3462 
3463     subfic(tmp, tmp, cl_dwords);
3464     mtctr(tmp);                        // Set ctr to hit 128byte boundary (0<ctr<cl_dwords).
3465     subf(cnt_dwords, tmp, cnt_dwords); // rest.
3466     li(tmp, 0);
3467 
3468   bind(startloop);                     // Clear at the beginning to reach 128byte boundary.
3469     std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
3470     addi(base_ptr, base_ptr, 8);
3471     bdnz(startloop);
3472 
3473   bind(fast);                                  // Clear 128byte blocks.
3474     srdi(tmp, cnt_dwords, cl_dw_addr_bits);    // Loop count for 128byte loop (>0).
3475     andi(cnt_dwords, cnt_dwords, cl_dwords-1); // Rest in dwords.
3476     mtctr(tmp);                                // Load counter.
3477 
3478   bind(fastloop);
3479     dcbz(base_ptr);                    // Clear 128byte aligned block.
3480     addi(base_ptr, base_ptr, cl_size);
3481     bdnz(fastloop);
3482 
3483   bind(small_rest);
3484     cmpdi(CCR0, cnt_dwords, 0);        // size 0?
3485     beq(CCR0, done);                   // rest == 0
3486     li(tmp, 0);
3487     mtctr(cnt_dwords);                 // Load counter.
3488 
3489   bind(restloop);                      // Clear rest.
3490     std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
3491     addi(base_ptr, base_ptr, 8);
3492     bdnz(restloop);
3493 
3494   bind(done);
3495 }
3496 
3497 /////////////////////////////////////////// String intrinsics ////////////////////////////////////////////
3498 
3499 #ifdef COMPILER2
3500 // Intrinsics for CompactStrings
3501 
3502 // Compress char[] to byte[] by compressing 16 bytes at once.
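     // Per main-loop iteration the effect is (illustrative C sketch, src is jchar*,
     // dst is jbyte*):
     //   for (int j = 0; j < 8; j++) {
     //     if (src[j] > 0xFF) goto Lfailure;   // not latin1
     //     dst[j] = (jbyte)src[j];
     //   }
     //   src += 8; dst += 8;
     //   // any remaining cnt % 8 characters are left to the caller's slow path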
3503 void MacroAssembler::string_compress_16(Register src, Register dst, Register cnt,
3504                                         Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5,
3505                                         Label& Lfailure) {
3506 
3507   const Register tmp0 = R0;
3508   assert_different_registers(src, dst, cnt, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5);
3509   Label Lloop, Lslow;
3510 
3511   // Check if cnt >= 8 (= 16 bytes)
3512   lis(tmp1, 0xFF);                // tmp1 = 0x00FF00FF00FF00FF
3513   srwi_(tmp2, cnt, 3);
3514   beq(CCR0, Lslow);
3515   ori(tmp1, tmp1, 0xFF);
3516   rldimi(tmp1, tmp1, 32, 0);
3517   mtctr(tmp2);
3518 
3519   // 2x unrolled loop
3520   bind(Lloop);
3521   ld(tmp2, 0, src);               // _0_1_2_3 (Big Endian)
3522   ld(tmp4, 8, src);               // _4_5_6_7
3523 
3524   orr(tmp0, tmp2, tmp4);
3525   rldicl(tmp3, tmp2, 6*8, 64-24); // _____1_2
3526   rldimi(tmp2, tmp2, 2*8, 2*8);   // _0_2_3_3
3527   rldicl(tmp5, tmp4, 6*8, 64-24); // _____5_6
3528   rldimi(tmp4, tmp4, 2*8, 2*8);   // _4_6_7_7
3529 
3530   andc_(tmp0, tmp0, tmp1);
3531   bne(CCR0, Lfailure);            // Not latin1.
3532   addi(src, src, 16);
3533 
3534   rlwimi(tmp3, tmp2, 0*8, 24, 31);// _____1_3
3535   srdi(tmp2, tmp2, 3*8);          // ____0_2_
3536   rlwimi(tmp5, tmp4, 0*8, 24, 31);// _____5_7
3537   srdi(tmp4, tmp4, 3*8);          // ____4_6_
3538 
3539   orr(tmp2, tmp2, tmp3);          // ____0123
3540   orr(tmp4, tmp4, tmp5);          // ____4567
3541 
3542   stw(tmp2, 0, dst);
3543   stw(tmp4, 4, dst);
3544   addi(dst, dst, 8);
3545   bdnz(Lloop);
3546 
3547   bind(Lslow);                    // Fallback to slow version
3548 }
3549 
3550 // Compress char[] to byte[]. cnt must be positive int.
3551 void MacroAssembler::string_compress(Register src, Register dst, Register cnt, Register tmp, Label& Lfailure) {
3552   Label Lloop;
3553   mtctr(cnt);
3554 
3555   bind(Lloop);
3556   lhz(tmp, 0, src);
3557   cmplwi(CCR0, tmp, 0xff);
3558   bgt(CCR0, Lfailure);            // Not latin1.
3559   addi(src, src, 2);
3560   stb(tmp, 0, dst);
3561   addi(dst, dst, 1);
3562   bdnz(Lloop);
3563 }
3564 
3565 // Inflate byte[] to char[] by inflating 16 bytes at once.
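     // Per main-loop iteration the effect is (illustrative C sketch, src is jbyte*,
     // dst is jchar*):
     //   for (int j = 0; j < 8; j++) {
     //     dst[j] = (jchar)(src[j] & 0xFF);    // zero-extend each byte
     //   }
     //   src += 8; dst += 8;
     //   // any remaining cnt % 8 bytes are left to the caller's slow path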
3566 void MacroAssembler::string_inflate_16(Register src, Register dst, Register cnt,
3567                                        Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) {
3568   const Register tmp0 = R0;
3569   assert_different_registers(src, dst, cnt, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5);
3570   Label Lloop, Lslow;
3571 
3572   // Check if cnt >= 8
3573   srwi_(tmp2, cnt, 3);
3574   beq(CCR0, Lslow);
3575   lis(tmp1, 0xFF);                // tmp1 = 0x00FF00FF
3576   ori(tmp1, tmp1, 0xFF);
3577   mtctr(tmp2);
3578 
3579   // 2x unrolled loop
3580   bind(Lloop);
3581   lwz(tmp2, 0, src);              // ____0123 (Big Endian)
3582   lwz(tmp4, 4, src);              // ____4567
3583   addi(src, src, 8);
3584 
3585   rldicl(tmp3, tmp2, 7*8, 64-8);  // _______2
3586   rlwimi(tmp2, tmp2, 3*8, 16, 23);// ____0113
3587   rldicl(tmp5, tmp4, 7*8, 64-8);  // _______6
3588   rlwimi(tmp4, tmp4, 3*8, 16, 23);// ____4557
3589 
3590   andc(tmp0, tmp2, tmp1);         // ____0_1_
3591   rlwimi(tmp2, tmp3, 2*8, 0, 23); // _____2_3
3592   andc(tmp3, tmp4, tmp1);         // ____4_5_
3593   rlwimi(tmp4, tmp5, 2*8, 0, 23); // _____6_7
3594 
3595   rldimi(tmp2, tmp0, 3*8, 0*8);   // _0_1_2_3
3596   rldimi(tmp4, tmp3, 3*8, 0*8);   // _4_5_6_7
3597 
3598   std(tmp2, 0, dst);
3599   std(tmp4, 8, dst);
3600   addi(dst, dst, 16);
3601   bdnz(Lloop);
3602 
3603   bind(Lslow);                    // Fallback to slow version
3604 }
3605 
3606 // Inflate byte[] to char[]. cnt must be positive int.
3607 void MacroAssembler::string_inflate(Register src, Register dst, Register cnt, Register tmp) {
3608   Label Lloop;
3609   mtctr(cnt);
3610 
3611   bind(Lloop);
3612   lbz(tmp, 0, src);
3613   addi(src, src, 1);
3614   sth(tmp, 0, dst);
3615   addi(dst, dst, 2);
3616   bdnz(Lloop);
3617 }
3618 
3619 void MacroAssembler::string_compare(Register str1, Register str2,
3620                                     Register cnt1, Register cnt2,
3621                                     Register tmp1, Register result, int ae) {
3622   const Register tmp0 = R0,
3623                  diff = tmp1;
3624 
3625   assert_different_registers(str1, str2, cnt1, cnt2, tmp0, tmp1, result);
3626   Label Ldone, Lslow, Lloop, Lreturn_diff;
3627 
3628   // Note: Making use of the fact that compareTo(a, b) == -compareTo(b, a),
3629   // we interchange str1 and str2 in the UL case and negate the result.
3630   // This way, str1 is always latin1 encoded, except for the UU case.
3631   // In addition, the counts need to be zero-extended (sign extension is equivalent since the sign is 0).
3632 
3633   if (ae == StrIntrinsicNode::UU) {
3634     srwi(cnt1, cnt1, 1);
3635   } else {
3636     clrldi(cnt1, cnt1, 32);
3637   }
3638 
3639   if (ae != StrIntrinsicNode::LL) {
3640     srwi(cnt2, cnt2, 1);
3641   } else {
3642     clrldi(cnt2, cnt2, 32);
3643   }
3644 
3645   // See if the lengths are different, and calculate min in cnt1.
3646   // Save diff in case we need it for a tie-breaker.
3647   subf_(diff, cnt2, cnt1); // diff = cnt1 - cnt2
3648   // if (diff > 0) { cnt1 = cnt2; }
3649   if (VM_Version::has_isel()) {
3650     isel(cnt1, CCR0, Assembler::greater, /*invert*/ false, cnt2);
3651   } else {
3652     Label Lskip;
3653     blt(CCR0, Lskip);
3654     mr(cnt1, cnt2);
3655     bind(Lskip);
3656   }
3657 
3658   // Rename registers
3659   Register chr1 = result;
3660   Register chr2 = tmp0;
3661 
3662   // Compare multiple characters in fast loop (only implemented for same encoding).
3663   int stride1 = 8, stride2 = 8;
3664   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3665     int log2_chars_per_iter = (ae == StrIntrinsicNode::LL) ? 3 : 2;
3666     Label Lfastloop, Lskipfast;
3667 
3668     srwi_(tmp0, cnt1, log2_chars_per_iter);
3669     beq(CCR0, Lskipfast);
3670     rldicl(cnt2, cnt1, 0, 64 - log2_chars_per_iter); // Remaining characters.
3671     li(cnt1, 1 << log2_chars_per_iter); // Initialize for failure case: Rescan characters from current iteration.
3672     mtctr(tmp0);
3673 
3674     bind(Lfastloop);
3675     ld(chr1, 0, str1);
3676     ld(chr2, 0, str2);
3677     cmpd(CCR0, chr1, chr2);
3678     bne(CCR0, Lslow);
3679     addi(str1, str1, stride1);
3680     addi(str2, str2, stride2);
3681     bdnz(Lfastloop);
3682     mr(cnt1, cnt2); // Remaining characters.
3683     bind(Lskipfast);
3684   }
3685 
3686   // Loop which searches the first difference character by character.
3687   cmpwi(CCR0, cnt1, 0);
3688   beq(CCR0, Lreturn_diff);
3689   bind(Lslow);
3690   mtctr(cnt1);
3691 
3692   switch (ae) {
3693     case StrIntrinsicNode::LL: stride1 = 1; stride2 = 1; break;
3694     case StrIntrinsicNode::UL: // fallthru (see comment above)
3695     case StrIntrinsicNode::LU: stride1 = 1; stride2 = 2; break;
3696     case StrIntrinsicNode::UU: stride1 = 2; stride2 = 2; break;
3697     default: ShouldNotReachHere(); break;
3698   }
3699 
3700   bind(Lloop);
3701   if (stride1 == 1) { lbz(chr1, 0, str1); } else { lhz(chr1, 0, str1); }
3702   if (stride2 == 1) { lbz(chr2, 0, str2); } else { lhz(chr2, 0, str2); }
3703   subf_(result, chr2, chr1); // result = chr1 - chr2
3704   bne(CCR0, Ldone);
3705   addi(str1, str1, stride1);
3706   addi(str2, str2, stride2);
3707   bdnz(Lloop);
3708 
3709   // If strings are equal up to min length, return the length difference.
3710   bind(Lreturn_diff);
3711   mr(result, diff);
3712 
3713   // Otherwise, return the difference between the first mismatched chars.
3714   bind(Ldone);
3715   if (ae == StrIntrinsicNode::UL) {
3716     neg(result, result); // Negate result (see note above).
3717   }
3718 }
3719 
3720 void MacroAssembler::array_equals(bool is_array_equ, Register ary1, Register ary2,
3721                                   Register limit, Register tmp1, Register result, bool is_byte) {
3722   const Register tmp0 = R0;
3723   assert_different_registers(ary1, ary2, limit, tmp0, tmp1, result);
3724   Label Ldone, Lskiploop, Lloop, Lfastloop, Lskipfast;
3725   bool limit_needs_shift = false;
3726 
3727   if (is_array_equ) {
3728     const int length_offset = arrayOopDesc::length_offset_in_bytes();
3729     const int base_offset   = arrayOopDesc::base_offset_in_bytes(is_byte ? T_BYTE : T_CHAR);
3730 
3731     // Return true if the same array.
3732     cmpd(CCR0, ary1, ary2);
3733     beq(CCR0, Lskiploop);
3734 
3735     // Return false if one of them is NULL.
3736     cmpdi(CCR0, ary1, 0);
3737     cmpdi(CCR1, ary2, 0);
3738     li(result, 0);
3739     cror(CCR0, Assembler::equal, CCR1, Assembler::equal);
3740     beq(CCR0, Ldone);
3741 
3742     // Load the lengths of arrays.
3743     lwz(limit, length_offset, ary1);
3744     lwz(tmp0, length_offset, ary2);
3745 
3746     // Return false if the two arrays are not equal length.
3747     cmpw(CCR0, limit, tmp0);
3748     bne(CCR0, Ldone);
3749 
3750     // Load array addresses.
3751     addi(ary1, ary1, base_offset);
3752     addi(ary2, ary2, base_offset);
3753   } else {
3754     limit_needs_shift = !is_byte;
3755     li(result, 0); // Assume not equal.
3756   }
3757 
3758   // Rename registers
3759   Register chr1 = tmp0;
3760   Register chr2 = tmp1;
3761 
3762   // Compare 8 bytes per iteration in fast loop.
3763   const int log2_chars_per_iter = is_byte ? 3 : 2;
3764 
3765   srwi_(tmp0, limit, log2_chars_per_iter + (limit_needs_shift ? 1 : 0));
3766   beq(CCR0, Lskipfast);
3767   mtctr(tmp0);
3768 
3769   bind(Lfastloop);
3770   ld(chr1, 0, ary1);
3771   ld(chr2, 0, ary2);
3772   addi(ary1, ary1, 8);
3773   addi(ary2, ary2, 8);
3774   cmpd(CCR0, chr1, chr2);
3775   bne(CCR0, Ldone);
3776   bdnz(Lfastloop);
3777 
3778   bind(Lskipfast);
3779   rldicl_(limit, limit, limit_needs_shift ? 64 - 1 : 0, 64 - log2_chars_per_iter); // Remaining characters.
3780   beq(CCR0, Lskiploop);
3781   mtctr(limit);
3782 
3783   // Character by character.
3784   bind(Lloop);
3785   if (is_byte) {
3786     lbz(chr1, 0, ary1);
3787     lbz(chr2, 0, ary2);
3788     addi(ary1, ary1, 1);
3789     addi(ary2, ary2, 1);
3790   } else {
3791     lhz(chr1, 0, ary1);
3792     lhz(chr2, 0, ary2);
3793     addi(ary1, ary1, 2);
3794     addi(ary2, ary2, 2);
3795   }
3796   cmpw(CCR0, chr1, chr2);
3797   bne(CCR0, Ldone);
3798   bdnz(Lloop);
3799 
3800   bind(Lskiploop);
3801   li(result, 1); // All characters are equal.
3802   bind(Ldone);
3803 }
3804 
3805 void MacroAssembler::string_indexof(Register result, Register haystack, Register haycnt,
3806                                     Register needle, ciTypeArray* needle_values, Register needlecnt, int needlecntval,
3807                                     Register tmp1, Register tmp2, Register tmp3, Register tmp4, int ae) {
3808 
3809   // Ensure 0<needlecnt<=haycnt in ideal graph as prerequisite!
3810   Label L_TooShort, L_Found, L_NotFound, L_End;
3811   Register last_addr = haycnt, // Kill haycnt at the beginning.
3812   addr      = tmp1,
3813   n_start   = tmp2,
3814   ch1       = tmp3,
3815   ch2       = R0;
3816 
3817   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
3818   const int h_csize = (ae == StrIntrinsicNode::LL) ? 1 : 2;
3819   const int n_csize = (ae == StrIntrinsicNode::UU) ? 2 : 1;
3820 
3821   // **************************************************************************************************
3822   // Prepare for main loop: optimized for needle count >=2, bail out otherwise.
3823   // **************************************************************************************************
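       // Overall, the search behaves like this sketch (illustrative only; the real
       // code scans two haystack positions per iteration):
       //   for (i = 0; i <= haycnt - needlecnt; i++) {
       //     if (haystack[i] == needle[0] && haystack[i+1] == needle[1]) {  // 2-char anchor
       //       for (j = 2; j < needlecnt && haystack[i+j] == needle[j]; j++) ;
       //       if (j == needlecnt) return i;                                // match found
       //     }
       //   }
       //   return -1;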
3824 
3825   // Compute last haystack addr to use if no match gets found.
3826   clrldi(haycnt, haycnt, 32);         // Ensure positive int is valid as 64 bit value.
3827   addi(addr, haystack, -h_csize);     // Accesses use pre-increment.
3828   if (needlecntval == 0) { // variable needlecnt
3829    cmpwi(CCR6, needlecnt, 2);
3830    clrldi(needlecnt, needlecnt, 32);  // Ensure positive int is valid as 64 bit value.
3831    blt(CCR6, L_TooShort);             // Variable needlecnt: handle short needle separately.
3832   }
3833 
3834   if (n_csize == 2) { lwz(n_start, 0, needle); } else { lhz(n_start, 0, needle); } // Load first 2 characters of needle.
3835 
3836   if (needlecntval == 0) { // variable needlecnt
3837    subf(ch1, needlecnt, haycnt);      // Last character index to compare is haycnt-needlecnt.
3838    addi(needlecnt, needlecnt, -2);    // Rest of needle.
3839   } else { // constant needlecnt
3840    guarantee(needlecntval != 1, "IndexOf with single-character needle must be handled separately");
3841    assert((needlecntval & 0x7fff) == needlecntval, "wrong immediate");
3842    addi(ch1, haycnt, -needlecntval);  // Last character index to compare is haycnt-needlecnt.
3843    if (needlecntval > 3) { li(needlecnt, needlecntval - 2); } // Rest of needle.
3844   }
3845 
3846   if (h_csize == 2) { slwi(ch1, ch1, 1); } // Scale to number of bytes.
3847 
3848   if (ae == StrIntrinsicNode::UL) {
3849    srwi(tmp4, n_start, 1*8);          // ___0
3850    rlwimi(n_start, tmp4, 2*8, 0, 23); // _0_1
3851   }
3852 
3853   add(last_addr, haystack, ch1);      // Point to last address to compare (haystack+2*(haycnt-needlecnt)).
3854 
3855   // Main Loop (now we have at least 2 characters).
3856   Label L_OuterLoop, L_InnerLoop, L_FinalCheck, L_Comp1, L_Comp2;
3857   bind(L_OuterLoop); // Search for 1st 2 characters.
3858   Register addr_diff = tmp4;
3859    subf(addr_diff, addr, last_addr);  // Difference between already checked address and last address to check.
3860    addi(addr, addr, h_csize);         // This is the new address we want to use for comparing.
3861    srdi_(ch2, addr_diff, h_csize);
3862    beq(CCR0, L_FinalCheck);           // 2 characters left?
3863    mtctr(ch2);                        // num of characters / 2
3864   bind(L_InnerLoop);                  // Main work horse (2x unrolled search loop)
3865    if (h_csize == 2) {                // Load 2 characters of haystack (ignore alignment).
3866     lwz(ch1, 0, addr);
3867     lwz(ch2, 2, addr);
3868    } else {
3869     lhz(ch1, 0, addr);
3870     lhz(ch2, 1, addr);
3871    }
3872    cmpw(CCR0, ch1, n_start);          // Compare 2 characters (1 would be sufficient but try to reduce branches to CompLoop).
3873    cmpw(CCR1, ch2, n_start);
3874    beq(CCR0, L_Comp1);                // Did we find the needle start?
3875    beq(CCR1, L_Comp2);
3876    addi(addr, addr, 2 * h_csize);
3877    bdnz(L_InnerLoop);
3878   bind(L_FinalCheck);
3879    andi_(addr_diff, addr_diff, h_csize); // Remaining characters not covered by InnerLoop: (num of characters) & 1.
3880    beq(CCR0, L_NotFound);
3881    if (h_csize == 2) { lwz(ch1, 0, addr); } else { lhz(ch1, 0, addr); } // One position left at which we have to compare.
3882    cmpw(CCR1, ch1, n_start);
3883    beq(CCR1, L_Comp1);
3884   bind(L_NotFound);
3885    li(result, -1);                    // not found
3886    b(L_End);
3887 
3888    // **************************************************************************************************
3889    // Special Case: unfortunately, the variable needle case can be called with needlecnt<2
3890    // **************************************************************************************************
3891   if (needlecntval == 0) {           // We have to handle these cases separately.
3892   Label L_OneCharLoop;
3893   bind(L_TooShort);
3894    mtctr(haycnt);
3895    if (n_csize == 2) { lhz(n_start, 0, needle); } else { lbz(n_start, 0, needle); } // First character of needle
3896   bind(L_OneCharLoop);
3897    if (h_csize == 2) { lhzu(ch1, 2, addr); } else { lbzu(ch1, 1, addr); }
3898    cmpw(CCR1, ch1, n_start);
3899    beq(CCR1, L_Found);               // Did we find the one character needle?
3900    bdnz(L_OneCharLoop);
3901    li(result, -1);                   // Not found.
3902    b(L_End);
3903   }
3904 
3905   // **************************************************************************************************
3906   // Regular Case Part II: compare rest of needle (first 2 characters have been compared already)
3907   // **************************************************************************************************
3908 
3909   // Compare the rest
3910   bind(L_Comp2);
3911    addi(addr, addr, h_csize);        // First comparison has failed, 2nd one hit.
3912   bind(L_Comp1);                     // Addr points to possible needle start.
3913   if (needlecntval != 2) {           // Const needlecnt==2?
3914    if (needlecntval != 3) {
3915     if (needlecntval == 0) { beq(CCR6, L_Found); } // Variable needlecnt==2?
3916     Register n_ind = tmp4,
3917              h_ind = n_ind;
3918     li(n_ind, 2 * n_csize);          // First 2 characters are already compared, use index 2.
3919     mtctr(needlecnt);                // Decremented by 2, still > 0.
3920    Label L_CompLoop;
3921    bind(L_CompLoop);
3922     if (ae == StrIntrinsicNode::UL) {
3923       h_ind = ch1;
3924       sldi(h_ind, n_ind, 1);
3925     }
3926     if (n_csize == 2) { lhzx(ch2, needle, n_ind); } else { lbzx(ch2, needle, n_ind); }
3927     if (h_csize == 2) { lhzx(ch1, addr, h_ind); } else { lbzx(ch1, addr, h_ind); }
3928     cmpw(CCR1, ch1, ch2);
3929     bne(CCR1, L_OuterLoop);
3930     addi(n_ind, n_ind, n_csize);
3931     bdnz(L_CompLoop);
3932    } else { // No loop required if there's only one needle character left.
3933     if (n_csize == 2) { lhz(ch2, 2 * 2, needle); } else { lbz(ch2, 2 * 1, needle); }
3934     if (h_csize == 2) { lhz(ch1, 2 * 2, addr); } else { lbz(ch1, 2 * 1, addr); }
3935     cmpw(CCR1, ch1, ch2);
3936     bne(CCR1, L_OuterLoop);
3937    }
3938   }
3939   // Return index ...
3940   bind(L_Found);
3941    subf(result, haystack, addr);     // relative to haystack, ...
3942    if (h_csize == 2) { srdi(result, result, 1); } // in characters.
3943   bind(L_End);
3944 } // string_indexof
3945 
3946 void MacroAssembler::string_indexof_char(Register result, Register haystack, Register haycnt,
3947                                          Register needle, jchar needleChar, Register tmp1, Register tmp2, bool is_byte) {
3948   assert_different_registers(haystack, haycnt, needle, tmp1, tmp2);
3949 
3950   Label L_InnerLoop, L_FinalCheck, L_Found1, L_Found2, L_NotFound, L_End;
3951   Register addr = tmp1,
3952            ch1 = tmp2,
3953            ch2 = R0;
3954 
3955   const int h_csize = is_byte ? 1 : 2;
3956 
3957 //4:
3958    srwi_(tmp2, haycnt, 1);   // Shift right by exact_log2(UNROLL_FACTOR).
3959    mr(addr, haystack);
3960    beq(CCR0, L_FinalCheck);
3961    mtctr(tmp2);              // Move to count register.
3962 //8:
3963   bind(L_InnerLoop);         // Main work horse (2x unrolled search loop).
3964    if (!is_byte) {
3965     lhz(ch1, 0, addr);
3966     lhz(ch2, 2, addr);
3967    } else {
3968     lbz(ch1, 0, addr);
3969     lbz(ch2, 1, addr);
3970    }
3971    (needle != R0) ? cmpw(CCR0, ch1, needle) : cmplwi(CCR0, ch1, (unsigned int)needleChar);
3972    (needle != R0) ? cmpw(CCR1, ch2, needle) : cmplwi(CCR1, ch2, (unsigned int)needleChar);
3973    beq(CCR0, L_Found1);      // Did we find the needle?
3974    beq(CCR1, L_Found2);
3975    addi(addr, addr, 2 * h_csize);
3976    bdnz(L_InnerLoop);
3977 //16:
3978   bind(L_FinalCheck);
3979    andi_(R0, haycnt, 1);
3980    beq(CCR0, L_NotFound);
3981    if (!is_byte) { lhz(ch1, 0, addr); } else { lbz(ch1, 0, addr); } // One position left at which we have to compare.
3982    (needle != R0) ? cmpw(CCR1, ch1, needle) : cmplwi(CCR1, ch1, (unsigned int)needleChar);
3983    beq(CCR1, L_Found1);
3984 //21:
3985   bind(L_NotFound);
3986    li(result, -1);           // Not found.
3987    b(L_End);
3988 
3989   bind(L_Found2);
3990    addi(addr, addr, h_csize);
3991 //24:
3992   bind(L_Found1);            // Return index ...
3993    subf(result, haystack, addr); // relative to haystack, ...
3994    if (!is_byte) { srdi(result, result, 1); } // in characters.
3995   bind(L_End);
3996 } // string_indexof_char
3997 
3998 
3999 void MacroAssembler::has_negatives(Register src, Register cnt, Register result,
4000                                    Register tmp1, Register tmp2) {
4001   const Register tmp0 = R0;
4002   assert_different_registers(src, result, cnt, tmp0, tmp1, tmp2);
4003   Label Lfastloop, Lslow, Lloop, Lnoneg, Ldone;
4004 
4005   // Check if cnt >= 8 (= 16 bytes)
4006   lis(tmp1, (int)(short)0x8080);  // tmp1 = 0x8080808080808080
4007   srwi_(tmp2, cnt, 4);
4008   li(result, 1);                  // Assume there's a negative byte.
4009   beq(CCR0, Lslow);
4010   ori(tmp1, tmp1, 0x8080);
4011   rldimi(tmp1, tmp1, 32, 0);
4012   mtctr(tmp2);
4013 
4014   // 2x unrolled loop
4015   bind(Lfastloop);
4016   ld(tmp2, 0, src);
4017   ld(tmp0, 8, src);
4018 
4019   orr(tmp0, tmp2, tmp0);
4020 
4021   and_(tmp0, tmp0, tmp1);
4022   bne(CCR0, Ldone);               // Found negative byte.
4023   addi(src, src, 16);
4024 
4025   bdnz(Lfastloop);
4026 
4027   bind(Lslow);                    // Fallback to slow version
4028   rldicl_(tmp0, cnt, 0, 64-4);    // Remaining bytes: cnt mod 16.
4029   beq(CCR0, Lnoneg);
4030   mtctr(tmp0);
4031   bind(Lloop);
4032   lbz(tmp0, 0, src);
4033   addi(src, src, 1);
4034   andi_(tmp0, tmp0, 0x80);
4035   bne(CCR0, Ldone);               // Found negative byte.
4036   bdnz(Lloop);
4037   bind(Lnoneg);
4038   li(result, 0);
4039 
4040   bind(Ldone);
4041 }
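
// Illustrative only (not part of the emitted code): the SWAR idea behind the fast loop
// above, as a plain C sketch. ORing two 8-byte chunks and ANDing with 0x8080...80 yields
// a non-zero value iff at least one byte has its sign bit set.
//
//   bool has_negatives_ref(const signed char* src, size_t cnt) {
//     const uint64_t mask = 0x8080808080808080ULL;
//     size_t i = 0;
//     for (; i + 16 <= cnt; i += 16) {              // 2x unrolled fast path
//       uint64_t a, b;
//       memcpy(&a, src + i, 8); memcpy(&b, src + i + 8, 8);
//       if ((a | b) & mask) return true;
//     }
//     for (; i < cnt; i++) {                        // byte-wise tail
//       if (src[i] < 0) return true;
//     }
//     return false;
//   }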
4042 
4043 #endif // COMPILER2
4044 
4045 // Helpers for Intrinsic Emitters
4046 //
4047 // Reverse the byte order of a 32-bit value in a register:
4048 //   src: 0x44556677
4049 //   dst: 0x77665544
4050 // Three steps to obtain the result:
4051 //  1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word
4052 //     into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared.
4053 //     This value initializes dst.
4054 //  2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost
4055 //     byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go.
4056 //     This value is mask inserted into dst with a [0..23] mask of 1s.
4057 //  3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position.
4058 //     This value is mask inserted into dst with a [8..15] mask of 1s.
4059 void MacroAssembler::load_reverse_32(Register dst, Register src) {
4060   assert_different_registers(dst, src);
4061 
4062   rldicl(dst, src, (4+1)*8, 56);       // Rotate byte 4 into position 7 (rightmost), clear all to the left.
4063   rlwimi(dst, src,     3*8,  0, 23);   // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone.
4064   rlwimi(dst, src,     1*8,  8, 15);   // Insert byte 6 into position 5, leave the rest alone.
4065 }
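
// Illustrative only (not part of the emitted code): the net effect of load_reverse_32
// as a plain C sketch; only the low 32 bits of src are significant.
//
//   uint32_t load_reverse_32_ref(uint32_t src) {    // 0x44556677 -> 0x77665544
//     return ((src & 0x000000ffu) << 24) |
//            ((src & 0x0000ff00u) <<  8) |
//            ((src & 0x00ff0000u) >>  8) |
//            ((src & 0xff000000u) >> 24);
//   }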
4066 
4067 // Calculate the column addresses of the crc32 lookup table into distinct registers.
4068 // This loop-invariant calculation is moved out of the loop body, reducing the loop
4069 // body size from 20 to 16 instructions.
4070 // Returns the offset that was used to calculate the address of column tc3.
4071 // Due to register shortage, setting tc3 may overwrite table. With the return offset
4072 // at hand, the original table address can be easily reconstructed.
4073 int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) {
4074 
4075 #ifdef VM_LITTLE_ENDIAN
4076   // This is what we implement (the DOLIT4 part):
4077   // =========================================================================
4078   // #define DOLIT4 c ^= *buf4++; \
4079   //         c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^ \
4080   //             crc_table[1][(c >> 16) & 0xff] ^ crc_table[0][c >> 24]
4081   // #define DOLIT32 DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4
4082   // =========================================================================
4083   const int ix0 = 3*(4*CRC32_COLUMN_SIZE);
4084   const int ix1 = 2*(4*CRC32_COLUMN_SIZE);
4085   const int ix2 = 1*(4*CRC32_COLUMN_SIZE);
4086   const int ix3 = 0*(4*CRC32_COLUMN_SIZE);
4087 #else
4088   // This is what we implement (the DOBIG4 part):
4089   // =========================================================================
4090   // #define DOBIG4 c ^= *++buf4; \
4091   //         c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \
4092   //             crc_table[6][(c >> 16) & 0xff] ^ crc_table[7][c >> 24]
4093   // #define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4
4094   // =========================================================================
4095   const int ix0 = 4*(4*CRC32_COLUMN_SIZE);
4096   const int ix1 = 5*(4*CRC32_COLUMN_SIZE);
4097   const int ix2 = 6*(4*CRC32_COLUMN_SIZE);
4098   const int ix3 = 7*(4*CRC32_COLUMN_SIZE);
4099 #endif
4100   assert_different_registers(table, tc0, tc1, tc2);
4101   assert(table == tc3, "must be!");
4102 
4103   addi(tc0, table, ix0);
4104   addi(tc1, table, ix1);
4105   addi(tc2, table, ix2);
4106   if (ix3 != 0) addi(tc3, table, ix3);
4107 
4108   return ix3;
4109 }
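
// Usage sketch (illustrative only): callers keep the returned offset so the original
// table address can be restored once the column registers are no longer needed,
// as done in kernel_crc32_1word/2word below:
//
//   int off = crc32_table_columns(table, tc0, tc1, tc2, tc3);  // tc3 may alias table
//   // ... main loop indexing via tc0..tc3 ...
//   if (off != 0) addi(table, table, -off);                    // reconstruct table address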
4110 
4111 /**
4112  * uint32_t crc;
4113  * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
4114  */
4115 void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) {
4116   assert_different_registers(crc, table, tmp);
4117   assert_different_registers(val, table);
4118 
4119   if (crc == val) {                   // Must rotate first to use the unmodified value.
4120     rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
4121                                       // As we use a word (4-byte) instruction, we have to adapt the mask bit positions.
4122     srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
4123   } else {
4124     srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
4125     rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
4126   }
4127   lwzx(tmp, table, tmp);
4128   xorr(crc, crc, tmp);
4129 }
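
// Illustrative only (not part of the emitted code): the scalar operation emitted above,
// as a plain C sketch. 'table' is a column of 256 precomputed 32-bit constants.
//
//   uint32_t fold_byte_crc32_ref(uint32_t crc, uint8_t val, const uint32_t* table) {
//     return table[val] ^ (crc >> 8);
//   }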
4130 
4131 /**
4132  * uint32_t crc;
4133  * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
4134  */
4135 void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
4136   fold_byte_crc32(crc, crc, table, tmp);
4137 }
4138 
4139 /**
4140  * Emits code to update CRC-32 with a byte value according to constants in table.
4141  *
4142  * @param [in,out]crc   Register containing the crc.
4143  * @param [in]val       Register containing the byte to fold into the CRC.
4144  * @param [in]table     Register containing the table of crc constants.
4145  *
4146  * uint32_t crc;
4147  * val = crc_table[(val ^ crc) & 0xFF];
4148  * crc = val ^ (crc >> 8);
4149  */
4150 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
4151   BLOCK_COMMENT("update_byte_crc32:");
4152   xorr(val, val, crc);
4153   fold_byte_crc32(crc, val, table, val);
4154 }
4155 
4156 /**
4157  * @param crc   register containing existing CRC (32-bit)
4158  * @param buf   register pointing to input byte buffer (byte*)
4159  * @param len   register containing number of bytes
4160  * @param table register pointing to CRC table
4161  */
4162 void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table,
4163                                            Register data, bool loopAlignment) {
4164   assert_different_registers(crc, buf, len, table, data);
4165 
4166   Label L_mainLoop, L_done;
4167   const int mainLoop_stepping  = 1;
4168   const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4;
4169 
4170   // Process all bytes in a single-byte loop.
4171   clrldi_(len, len, 32);                         // Enforce 32 bit. Anything to do?
4172   beq(CCR0, L_done);
4173 
4174   mtctr(len);
4175   align(mainLoop_alignment);
4176   BIND(L_mainLoop);
4177     lbz(data, 0, buf);                           // Byte from buffer, zero-extended.
4178     addi(buf, buf, mainLoop_stepping);           // Advance buffer position.
4179     update_byte_crc32(crc, data, table);
4180     bdnz(L_mainLoop);                            // Iterate.
4181 
4182   bind(L_done);
4183 }
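
// Illustrative only (not part of the emitted code): the byte loop above is the classic
// table-driven CRC-32 update, as a plain C sketch:
//
//   while (len-- > 0) {
//     crc = table[(crc ^ *buf++) & 0xff] ^ (crc >> 8);
//   }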
4184 
4185 /**
4186  * Emits code to update CRC-32 with a 4-byte value according to constants in table
4187  * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c
4188  */
4189 // A note on the lookup table address(es):
4190 // The lookup table consists of two sets of four columns each.
4191 // The columns {0..3} are used for little-endian machines.
4192 // The columns {4..7} are used for big-endian machines.
4193 // To save the effort of adding the column offset to the table address each time
4194 // a table element is looked up, it is possible to pass the pre-calculated
4195 // column addresses.
4196 // Uses R9..R12 as work registers. Must be saved/restored by caller, if necessary.
4197 void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc,
4198                                         Register t0,  Register t1,  Register t2,  Register t3,
4199                                         Register tc0, Register tc1, Register tc2, Register tc3) {
4200   assert_different_registers(crc, t3);
4201 
4202   // XOR crc with next four bytes of buffer.
4203   lwz(t3, bufDisp, buf);
4204   if (bufInc != 0) {
4205     addi(buf, buf, bufInc);
4206   }
4207   xorr(t3, t3, crc);
4208 
4209   // Chop crc into 4 single-byte pieces, shifted left 2 bits, to form the table indices.
4210   rlwinm(t0, t3,  2,         24-2, 31-2);  // ((t3 >>  0) & 0xff) << 2
4211   rlwinm(t1, t3,  32+(2- 8), 24-2, 31-2);  // ((t3 >>  8) & 0xff) << 2
4212   rlwinm(t2, t3,  32+(2-16), 24-2, 31-2);  // ((t3 >> 16) & 0xff) << 2
4213   rlwinm(t3, t3,  32+(2-24), 24-2, 31-2);  // ((t3 >> 24) & 0xff) << 2
4214 
4215   // Use the pre-calculated column addresses.
4216   // Load pre-calculated table values.
4217   lwzx(t0, tc0, t0);
4218   lwzx(t1, tc1, t1);
4219   lwzx(t2, tc2, t2);
4220   lwzx(t3, tc3, t3);
4221 
4222   // Calculate new crc from table values.
4223   xorr(t0,  t0, t1);
4224   xorr(t2,  t2, t3);
4225   xorr(crc, t0, t2);  // Now crc contains the final checksum value.
4226 }
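
// Illustrative only (not part of the emitted code): the word step emitted above is the
// little-endian DOLIT4 "slicing by 4" update quoted in crc32_table_columns (the big-endian
// build uses columns 4..7, i.e. DOBIG4). tc0..tc3 merely cache the column base addresses
// so no column offset must be added per lookup. Plain C sketch:
//
//   uint32_t c = crc ^ *buf4++;
//   c = crc_table[3][ c        & 0xff] ^ crc_table[2][(c >>  8) & 0xff] ^
//       crc_table[1][(c >> 16) & 0xff] ^ crc_table[0][ c >> 24        ];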
4227 
4228 /**
4229  * @param crc   register containing existing CRC (32-bit)
4230  * @param buf   register pointing to input byte buffer (byte*)
4231  * @param len   register containing number of bytes
4232  * @param table register pointing to CRC table
4233  *
4234  * Uses R9..R12 as work register. Must be saved/restored by caller!
4235  */
4236 void MacroAssembler::kernel_crc32_2word(Register crc, Register buf, Register len, Register table,
4237                                         Register t0,  Register t1,  Register t2,  Register t3,
4238                                         Register tc0, Register tc1, Register tc2, Register tc3,
4239                                         bool invertCRC) {
4240   assert_different_registers(crc, buf, len, table);
4241 
4242   Label L_mainLoop, L_tail;
4243   Register  tmp  = t0;
4244   Register  data = t0;
4245   Register  tmp2 = t1;
4246   const int mainLoop_stepping  = 8;
4247   const int tailLoop_stepping  = 1;
4248   const int log_stepping       = exact_log2(mainLoop_stepping);
4249   const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
4250   const int complexThreshold   = 2*mainLoop_stepping;
4251 
4252   // Don't test for len <= 0 here. This pathological case should not occur anyway.
4253   // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles
4254   // for all well-behaved cases. The situation itself is detected and handled correctly
4255   // within update_byteLoop_crc32.
4256   assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
4257 
4258   BLOCK_COMMENT("kernel_crc32_2word {");
4259 
4260   if (invertCRC) {
4261     nand(crc, crc, crc);                      // 1s complement of crc
4262   }
4263   // Check for short (<complexThreshold) buffer.
4264   // Check for short (<mainLoop_stepping) buffer.
4265   cmpdi(CCR0, len, complexThreshold);
4266   blt(CCR0, L_tail);
4267 
4268   // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
4269   // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
4270   {
4271     // Align buf addr to mainLoop_stepping boundary.
4272     neg(tmp2, buf);                           // Calculate # preLoop iterations for alignment.
4273     rldicl(tmp2, tmp2, 0, 64-log_stepping);   // Rotate by 0, clear left: keep only the low log_stepping bits (bits 61..63).
4274 
4275     if (complexThreshold > mainLoop_stepping) {
4276       sub(len, len, tmp2);                       // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4277     } else {
4278       sub(tmp, len, tmp2);                       // Remaining bytes for main loop.
4279       cmpdi(CCR0, tmp, mainLoop_stepping);
4280       blt(CCR0, L_tail);                         // For less than one mainLoop_stepping left, do only tail processing.
4281       mr(len, tmp);                              // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4282     }
4283     update_byteLoop_crc32(crc, buf, tmp2, table, data, false);
4284   }
4285 
4286   srdi(tmp2, len, log_stepping);                 // #iterations for mainLoop
4287   andi(len, len, mainLoop_stepping-1);           // remaining bytes for tailLoop
4288   mtctr(tmp2);
4289 
4290 #ifdef VM_LITTLE_ENDIAN
4291   Register crc_rv = crc;
4292 #else
4293   Register crc_rv = tmp;                         // Load_reverse needs separate registers to work on.
4294                                                  // Occupies tmp, but frees up crc.
4295   load_reverse_32(crc_rv, crc);                  // Revert byte order because we are dealing with big-endian data.
4296   tmp = crc;
4297 #endif
4298 
4299   int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
4300 
4301   align(mainLoop_alignment);                     // Octoword-aligned loop address. Shows 2% improvement.
4302   BIND(L_mainLoop);
4303     update_1word_crc32(crc_rv, buf, table, 0, 0, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
4304     update_1word_crc32(crc_rv, buf, table, 4, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
4305     bdnz(L_mainLoop);
4306 
4307 #ifndef VM_LITTLE_ENDIAN
4308   load_reverse_32(crc, crc_rv);                  // Revert byte order because we are dealing with big-endian data.
4309   tmp = crc_rv;                                  // Tmp uses its original register again.
4310 #endif
4311 
4312   // Restore original table address for tailLoop.
4313   if (reconstructTableOffset != 0) {
4314     addi(table, table, -reconstructTableOffset);
4315   }
4316 
4317   // Process last few (<complexThreshold) bytes of buffer.
4318   BIND(L_tail);
4319   update_byteLoop_crc32(crc, buf, len, table, data, false);
4320 
4321   if (invertCRC) {
4322     nand(crc, crc, crc);                      // 1s complement of crc
4323   }
4324   BLOCK_COMMENT("} kernel_crc32_2word");
4325 }
4326 
4327 /**
4328  * @param crc   register containing existing CRC (32-bit)
4329  * @param buf   register pointing to input byte buffer (byte*)
4330  * @param len   register containing number of bytes
4331  * @param table register pointing to CRC table
4332  *
4333  * Uses R9..R12 as work registers. Must be saved/restored by caller!
4334  */
4335 void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table,
4336                                         Register t0,  Register t1,  Register t2,  Register t3,
4337                                         Register tc0, Register tc1, Register tc2, Register tc3,
4338                                         bool invertCRC) {
4339   assert_different_registers(crc, buf, len, table);
4340 
4341   Label L_mainLoop, L_tail;
4342   Register  tmp          = t0;
4343   Register  data         = t0;
4344   Register  tmp2         = t1;
4345   const int mainLoop_stepping  = 4;
4346   const int tailLoop_stepping  = 1;
4347   const int log_stepping       = exact_log2(mainLoop_stepping);
4348   const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
4349   const int complexThreshold   = 2*mainLoop_stepping;
4350 
4351   // Don't test for len <= 0 here. This pathological case should not occur anyway.
4352   // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles
4353   // for all well-behaved cases. The situation itself is detected and handled correctly
4354   // within update_byteLoop_crc32.
4355   assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
4356 
4357   BLOCK_COMMENT("kernel_crc32_1word {");
4358 
4359   if (invertCRC) {
4360     nand(crc, crc, crc);                      // 1s complement of crc
4361   }
4362 
4363   // Check for short (<complexThreshold) buffer.
4364   cmpdi(CCR0, len, complexThreshold);
4365   blt(CCR0, L_tail);
4366 
4367   // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
4368   // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
4369   {
4370     // Align buf addr to mainLoop_stepping boundary.
4371     neg(tmp2, buf);                              // Calculate # preLoop iterations for alignment.
4372     rldicl(tmp2, tmp2, 0, 64-log_stepping);      // Rotate by 0, clear left: keep only the low log_stepping bits (bits 62..63).
4373 
4374     if (complexThreshold > mainLoop_stepping) {
4375       sub(len, len, tmp2);                       // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4376     } else {
4377       sub(tmp, len, tmp2);                       // Remaining bytes for main loop.
4378       cmpdi(CCR0, tmp, mainLoop_stepping);
4379       blt(CCR0, L_tail);                         // For less than one mainLoop_stepping left, do only tail processing.
4380       mr(len, tmp);                              // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4381     }
4382     update_byteLoop_crc32(crc, buf, tmp2, table, data, false);
4383   }
4384 
4385   srdi(tmp2, len, log_stepping);                 // #iterations for mainLoop
4386   andi(len, len, mainLoop_stepping-1);           // remaining bytes for tailLoop
4387   mtctr(tmp2);
4388 
4389 #ifdef VM_LITTLE_ENDIAN
4390   Register crc_rv = crc;
4391 #else
4392   Register crc_rv = tmp;                         // Load_reverse needs separate registers to work on.
4393                                                  // Occupies tmp, but frees up crc.
4394   load_reverse_32(crc_rv, crc);                  // Revert byte order because we are dealing with big-endian data.
4395   tmp = crc;
4396 #endif
4397 
4398   int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
4399 
4400   align(mainLoop_alignment);                     // Octoword-aligned loop address. Shows 2% improvement.
4401   BIND(L_mainLoop);
4402     update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
4403     bdnz(L_mainLoop);
4404 
4405 #ifndef VM_LITTLE_ENDIAN
4406   load_reverse_32(crc, crc_rv);                  // Revert byte order because we are dealing with big-endian data.
4407   tmp = crc_rv;                                  // Tmp uses its original register again.
4408 #endif
4409 
4410   // Restore original table address for tailLoop.
4411   if (reconstructTableOffset != 0) {
4412     addi(table, table, -reconstructTableOffset);
4413   }
4414 
4415   // Process last few (<complexThreshold) bytes of buffer.
4416   BIND(L_tail);
4417   update_byteLoop_crc32(crc, buf, len, table, data, false);
4418 
4419   if (invertCRC) {
4420     nand(crc, crc, crc);                      // 1s complement of crc
4421   }
4422   BLOCK_COMMENT("} kernel_crc32_1word");
4423 }
4424 
4425 /**
4426  * @param crc   register containing existing CRC (32-bit)
4427  * @param buf   register pointing to input byte buffer (byte*)
4428  * @param len   register containing number of bytes
4429  * @param table register pointing to CRC table
4430  *
4431  * Uses R7_ARG5, R8_ARG6 as work registers.
4432  */
4433 void MacroAssembler::kernel_crc32_1byte(Register crc, Register buf, Register len, Register table,
4434                                         Register t0,  Register t1,  Register t2,  Register t3,
4435                                         bool invertCRC) {
4436   assert_different_registers(crc, buf, len, table);
4437 
4438   Register  data = t0;                   // Holds the current byte to be folded into crc.
4439 
4440   BLOCK_COMMENT("kernel_crc32_1byte {");
4441 
4442   if (invertCRC) {
4443     nand(crc, crc, crc);                      // 1s complement of crc
4444   }
4445 
4446   // Process all bytes in a single-byte loop.
4447   update_byteLoop_crc32(crc, buf, len, table, data, true);
4448 
4449   if (invertCRC) {
4450     nand(crc, crc, crc);                      // 1s complement of crc
4451   }
4452   BLOCK_COMMENT("} kernel_crc32_1byte");
4453 }
4454 
4455 /**
4456  * @param crc             register containing existing CRC (32-bit)
4457  * @param buf             register pointing to input byte buffer (byte*)
4458  * @param len             register containing number of bytes
4459  * @param table           register pointing to CRC table
4460  * @param constants       register pointing to CRC table for 128-bit aligned memory
4461  * @param barretConstants register pointing to table for barrett reduction
4462  * @param t0-t4           temp registers
4463  */
4464 void MacroAssembler::kernel_crc32_1word_vpmsum(Register crc, Register buf, Register len, Register table,
4465                                                Register constants, Register barretConstants,
4466                                                Register t0, Register t1, Register t2, Register t3, Register t4,
4467                                                bool invertCRC) {
4468   assert_different_registers(crc, buf, len, table);
4469 
4470   Label L_alignedHead, L_tail;
4471 
4472   BLOCK_COMMENT("kernel_crc32_1word_vpmsum {");
4473 
4474   // 1. ~c
4475   if (invertCRC) {
4476     nand(crc, crc, crc);                      // 1s complement of crc
4477   }
4478 
4479   // 2. use kernel_crc32_1word for short len
4480   clrldi(len, len, 32);
4481   cmpdi(CCR0, len, 512);
4482   blt(CCR0, L_tail);
4483 
4484   // 3. calculate from 0 to first aligned address
4485   const int alignment = 16;
4486   Register prealign = t0;
4487 
4488   andi_(prealign, buf, alignment - 1);
4489   beq(CCR0, L_alignedHead);
4490   subfic(prealign, prealign, alignment);
4491 
4492   subf(len, prealign, len);
4493   update_byteLoop_crc32(crc, buf, prealign, table, t2, false);
4494 
4495   // 4. calculate from first aligned address as far as possible
4496   BIND(L_alignedHead);
4497   kernel_crc32_1word_aligned(crc, buf, len, constants, barretConstants, t0, t1, t2, t3, t4);
4498 
4499   // 5. remaining bytes
4500   BIND(L_tail);
4501   Register tc0 = t4;
4502   Register tc1 = constants;
4503   Register tc2 = barretConstants;
4504   kernel_crc32_1word(crc, buf, len, table, t0, t1, t2, t3, tc0, tc1, tc2, table, false);
4505 
4506   // 6. ~c
4507   if (invertCRC) {
4508     nand(crc, crc, crc);                      // 1s complement of crc
4509   }
4510 
4511   BLOCK_COMMENT("} kernel_crc32_1word_vpmsum");
4512 }
4513 
4514 /**
4515  * @param crc             register containing existing CRC (32-bit)
4516  * @param buf             register pointing to input byte buffer (byte*)
4517  * @param len             register containing number of bytes (will get updated to remaining bytes)
4518  * @param constants       register pointing to CRC table for 128-bit aligned memory
4519  * @param barretConstants register pointing to table for barrett reduction
4520  * @param t0-t4           temp registers
4521  * Precondition: len should be >= 512. Otherwise, nothing will be done.
4522  */
4523 void MacroAssembler::kernel_crc32_1word_aligned(Register crc, Register buf, Register len,
4524     Register constants, Register barretConstants,
4525     Register t0, Register t1, Register t2, Register t3, Register t4) {
4526 
4527   // Save non-volatile vector registers (frameless).
4528   Register offset = t1;
4529   int offsetInt = 0;
4530   offsetInt -= 16; li(offset, offsetInt); stvx(VR20, offset, R1_SP);
4531   offsetInt -= 16; li(offset, offsetInt); stvx(VR21, offset, R1_SP);
4532   offsetInt -= 16; li(offset, offsetInt); stvx(VR22, offset, R1_SP);
4533   offsetInt -= 16; li(offset, offsetInt); stvx(VR23, offset, R1_SP);
4534   offsetInt -= 16; li(offset, offsetInt); stvx(VR24, offset, R1_SP);
4535   offsetInt -= 16; li(offset, offsetInt); stvx(VR25, offset, R1_SP);
4536 #ifndef VM_LITTLE_ENDIAN
4537   offsetInt -= 16; li(offset, offsetInt); stvx(VR26, offset, R1_SP);
4538 #endif
4539   offsetInt -= 8; std(R14, offsetInt, R1_SP);
4540   offsetInt -= 8; std(R15, offsetInt, R1_SP);
4541   offsetInt -= 8; std(R16, offsetInt, R1_SP);
4542   offsetInt -= 8; std(R17, offsetInt, R1_SP);
4543 
4544   // Implementation uses an inner loop which uses between 256 and 16 * unroll_factor
4545   // bytes per iteration. The basic scheme is:
4546   // lvx: load vector (Big Endian needs reversal)
4547   // vpmsumw: carry-less 32 bit multiplications with constant representing a large CRC shift
4548   // vxor: xor partial results together to get unroll_factor2 vectors
4549 
4550   // Outer loop performs the CRC shifts needed to combine the unroll_factor2 vectors.
4551 
4552   // Using 16 * unroll_factor / unroll_factor_2 bytes for constants.
4553   const int unroll_factor = 2048;
4554   const int unroll_factor2 = 8;
4555 
4556   // Support registers.
4557   Register offs[] = { noreg, t0, t1, t2, t3, t4, crc /* will live in VCRC */, R14 };
4558   Register num_bytes = R15,
4559            loop_count = R16,
4560            cur_const = R17;
4561   // Constant array for outer loop: unroll_factor2 - 1 registers,
4562   // Constant array for inner loop: unroll_factor / unroll_factor2 registers.
4563   VectorRegister consts0[] = { VR16, VR17, VR18, VR19, VR20, VR21, VR22 },
4564                  consts1[] = { VR23, VR24 };
4565   // Data register arrays: 2 arrays with unroll_factor2 registers.
4566   VectorRegister data0[] = { VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7 },
4567                  data1[] = { VR8, VR9, VR10, VR11, VR12, VR13, VR14, VR15 };
4568 
4569   VectorRegister VCRC = data0[0];
4570   VectorRegister Vc = VR25;
4571   VectorRegister swap_bytes = VR26; // Only for Big Endian.
4572 
4573   // We have at least 1 iteration (ensured by caller).
4574   Label L_outer_loop, L_inner_loop, L_last;
4575 
4576   // If supported set DSCR pre-fetch to deepest.
4577   if (VM_Version::has_mfdscr()) {
4578     load_const_optimized(t0, VM_Version::_dscr_val | 7);
4579     mtdscr(t0);
4580   }
4581 
4582   mtvrwz(VCRC, crc); // crc now lives in VCRC.
4583 
4584   for (int i = 1; i < unroll_factor2; ++i) {
4585     li(offs[i], 16 * i);
4586   }
4587 
4588   // Load consts for outer loop
4589   lvx(consts0[0], constants);
4590   for (int i = 1; i < unroll_factor2 - 1; ++i) {
4591     lvx(consts0[i], offs[i], constants);
4592   }
4593   addi(constants, constants, (unroll_factor2 - 1) * 16);
4594 
4595   load_const_optimized(num_bytes, 16 * unroll_factor);
4596   load_const_optimized(loop_count, unroll_factor / (2 * unroll_factor2) - 1); // One double-iteration peeled off.
4597 
4598   // Reuse data registers outside of the loop.
4599   VectorRegister Vtmp = data1[0];
4600   VectorRegister Vtmp2 = data1[1];
4601   VectorRegister zeroes = data1[2];
4602 
4603   vspltisb(Vtmp, 0);
4604   vsldoi(VCRC, Vtmp, VCRC, 8); // 96 bit zeroes, 32 bit CRC.
4605 
4606   // Load vector for vpermxor (to xor both 64 bit parts together)
4607   lvsl(Vtmp, buf);   // 000102030405060708090a0b0c0d0e0f
4608   vspltisb(Vc, 4);
4609   vsl(Vc, Vtmp, Vc); // 00102030405060708090a0b0c0d0e0f0
4610   xxspltd(Vc->to_vsr(), Vc->to_vsr(), 0);
4611   vor(Vc, Vtmp, Vc); // 001122334455667708192a3b4c5d6e7f
4612 
4613 #ifdef VM_LITTLE_ENDIAN
4614 #define BE_swap_bytes(x)
4615 #else
4616   vspltisb(Vtmp2, 0xf);
4617   vxor(swap_bytes, Vtmp, Vtmp2);
4618 #define BE_swap_bytes(x) vperm(x, x, x, swap_bytes)
4619 #endif
4620 
4621   cmpd(CCR0, len, num_bytes);
4622   blt(CCR0, L_last);
4623 
4624   // ********** Main loop start **********
4625   align(32);
4626   bind(L_outer_loop);
4627 
4628   // Begin of unrolled first iteration (no xor).
4629   lvx(data1[0], buf);
4630   mr(cur_const, constants);
4631   for (int i = 1; i < unroll_factor2 / 2; ++i) {
4632     lvx(data1[i], offs[i], buf);
4633   }
4634   vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
4635   lvx(consts1[0], cur_const);
4636   mtctr(loop_count);
4637   for (int i = 0; i < unroll_factor2 / 2; ++i) {
4638     BE_swap_bytes(data1[i]);
4639     if (i == 0) { vxor(data1[0], data1[0], VCRC); } // xor in previous CRC.
4640     lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf);
4641     vpmsumw(data0[i], data1[i], consts1[0]);
4642   }
4643   addi(buf, buf, 16 * unroll_factor2);
4644   subf(len, num_bytes, len);
4645   lvx(consts1[1], offs[1], cur_const);
4646   addi(cur_const, cur_const, 32);
4647   // Begin of unrolled second iteration (head).
4648   for (int i = 0; i < unroll_factor2 / 2; ++i) {
4649     BE_swap_bytes(data1[i + unroll_factor2 / 2]);
4650     if (i == 0) { lvx(data1[0], buf); } else { lvx(data1[i], offs[i], buf); }
4651     vpmsumw(data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[0]);
4652   }
4653   for (int i = 0; i < unroll_factor2 / 2; ++i) {
4654     BE_swap_bytes(data1[i]);
4655     lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf);
4656     vpmsumw(data1[i], data1[i], consts1[1]);
4657   }
4658   addi(buf, buf, 16 * unroll_factor2);
4659 
4660   // Generate the most performance-relevant code. Loads + half of the vpmsumw have been generated.
4661   // Double-iteration allows using the 2 constant registers alternatingly.
4662   align(32);
4663   bind(L_inner_loop);
4664   for (int j = 1; j < 3; ++j) { // j < unroll_factor / unroll_factor2 - 1 for complete unrolling.
4665     if (j & 1) {
4666       lvx(consts1[0], cur_const);
4667     } else {
4668       lvx(consts1[1], offs[1], cur_const);
4669       addi(cur_const, cur_const, 32);
4670     }
4671     for (int i = 0; i < unroll_factor2; ++i) {
4672       int idx = i + unroll_factor2 / 2, inc = 0; // For modulo-scheduled input.
4673       if (idx >= unroll_factor2) { idx -= unroll_factor2; inc = 1; }
4674       BE_swap_bytes(data1[idx]);
4675       vxor(data0[i], data0[i], data1[i]);
4676       if (i == 0) lvx(data1[0], buf); else lvx(data1[i], offs[i], buf);
4677       vpmsumw(data1[idx], data1[idx], consts1[(j + inc) & 1]);
4678     }
4679     addi(buf, buf, 16 * unroll_factor2);
4680   }
4681   bdnz(L_inner_loop);
4682 
4683   // Tail of last iteration (no loads).
4684   for (int i = 0; i < unroll_factor2 / 2; ++i) {
4685     BE_swap_bytes(data1[i + unroll_factor2 / 2]);
4686     vxor(data0[i], data0[i], data1[i]);
4687     vpmsumw(data1[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[1]);
4688   }
4689   for (int i = 0; i < unroll_factor2 / 2; ++i) {
4690     vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]); // First half of fixup shifts.
4691     vxor(data0[i + unroll_factor2 / 2], data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2]);
4692   }
4693 
4694   // Last data register is ok, other ones need fixup shift.
4695   for (int i = unroll_factor2 / 2; i < unroll_factor2 - 1; ++i) {
4696     vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]);
4697   }
4698 
4699   // Combine to 128 bit result vector VCRC = data0[0].
4700   for (int i = 1; i < unroll_factor2; i<<=1) {
4701     for (int j = 0; j <= unroll_factor2 - 2*i; j+=2*i) {
4702       vxor(data0[j], data0[j], data0[j+i]);
4703     }
4704   }
4705   cmpd(CCR0, len, num_bytes);
4706   bge(CCR0, L_outer_loop);
4707 
4708   // Last chance with lower num_bytes.
4709   bind(L_last);
4710   srdi(loop_count, len, exact_log2(16 * 2 * unroll_factor2)); // Use double-iterations.
4711   add_const_optimized(constants, constants, 16 * (unroll_factor / unroll_factor2)); // Point behind last one.
4712   sldi(R0, loop_count, exact_log2(16 * 2)); // Bytes of constants to be used.
4713   clrrdi(num_bytes, len, exact_log2(16 * 2 * unroll_factor2));
4714   subf(constants, R0, constants); // Point to constant to be used first.
4715 
4716   addic_(loop_count, loop_count, -1); // One double-iteration peeled off.
4717   bgt(CCR0, L_outer_loop);
4718   // ********** Main loop end **********
4719 #undef BE_swap_bytes
4720 
4721   // Restore DSCR pre-fetch value.
4722   if (VM_Version::has_mfdscr()) {
4723     load_const_optimized(t0, VM_Version::_dscr_val);
4724     mtdscr(t0);
4725   }
4726 
4727   vspltisb(zeroes, 0);
4728 
4729   // Combine to 64 bit result.
4730   vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
4731 
4732   // Reduce to 32 bit CRC: Remainder by multiply-high.
4733   lvx(Vtmp, barretConstants);
4734   vsldoi(Vtmp2, zeroes, VCRC, 12);  // Extract high 32 bit.
4735   vpmsumd(Vtmp2, Vtmp2, Vtmp);      // Multiply by inverse long poly.
4736   vsldoi(Vtmp2, zeroes, Vtmp2, 12); // Extract high 32 bit.
4737   vsldoi(Vtmp, zeroes, Vtmp, 8);
4738   vpmsumd(Vtmp2, Vtmp2, Vtmp);      // Multiply quotient by long poly.
4739   vxor(VCRC, VCRC, Vtmp2);          // Remainder fits into 32 bit.
4740 
4741   // Move result. len is already updated.
4742   vsldoi(VCRC, VCRC, zeroes, 8);
4743   mfvrd(crc, VCRC);
4744 
4745   // Restore non-volatile Vector registers (frameless).
4746   offsetInt = 0;
4747   offsetInt -= 16; li(offset, offsetInt); lvx(VR20, offset, R1_SP);
4748   offsetInt -= 16; li(offset, offsetInt); lvx(VR21, offset, R1_SP);
4749   offsetInt -= 16; li(offset, offsetInt); lvx(VR22, offset, R1_SP);
4750   offsetInt -= 16; li(offset, offsetInt); lvx(VR23, offset, R1_SP);
4751   offsetInt -= 16; li(offset, offsetInt); lvx(VR24, offset, R1_SP);
4752   offsetInt -= 16; li(offset, offsetInt); lvx(VR25, offset, R1_SP);
4753 #ifndef VM_LITTLE_ENDIAN
4754   offsetInt -= 16; li(offset, offsetInt); lvx(VR26, offset, R1_SP);
4755 #endif
4756   offsetInt -= 8;  ld(R14, offsetInt, R1_SP);
4757   offsetInt -= 8;  ld(R15, offsetInt, R1_SP);
4758   offsetInt -= 8;  ld(R16, offsetInt, R1_SP);
4759   offsetInt -= 8;  ld(R17, offsetInt, R1_SP);
4760 }
4761 
4762 void MacroAssembler::kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp, bool invertCRC) {
4763   assert_different_registers(crc, buf, /* len,  not used!! */ table, tmp);
4764 
4765   BLOCK_COMMENT("kernel_crc32_singleByte:");
4766   if (invertCRC) {
4767     nand(crc, crc, crc);                // 1s complement of crc
4768   }
4769 
4770   lbz(tmp, 0, buf);                     // Byte from buffer, zero-extended.
4771   update_byte_crc32(crc, tmp, table);
4772 
4773   if (invertCRC) {
4774     nand(crc, crc, crc);                // 1s complement of crc
4775   }
4776 }
4777 
4778 void MacroAssembler::kernel_crc32_singleByteReg(Register crc, Register val, Register table, bool invertCRC) {
4779   assert_different_registers(crc, val, table);
4780 
4781   BLOCK_COMMENT("kernel_crc32_singleByteReg:");
4782   if (invertCRC) {
4783     nand(crc, crc, crc);                // 1s complement of crc
4784   }
4785 
4786   update_byte_crc32(crc, val, table);
4787 
4788   if (invertCRC) {
4789     nand(crc, crc, crc);                // 1s complement of crc
4790   }
4791 }
4792 
4793 // dest_lo += src1 + src2
4794 // dest_hi += carry1 + carry2
4795 void MacroAssembler::add2_with_carry(Register dest_hi,
4796                                      Register dest_lo,
4797                                      Register src1, Register src2) {
4798   li(R0, 0);
4799   addc(dest_lo, dest_lo, src1);
4800   adde(dest_hi, dest_hi, R0);
4801   addc(dest_lo, dest_lo, src2);
4802   adde(dest_hi, dest_hi, R0);
4803 }
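
// Illustrative only (not part of the emitted code): the 128-bit accumulation performed
// above, as a plain C sketch using a compiler-provided unsigned __int128:
//
//   void add2_with_carry_ref(uint64_t* dest_hi, uint64_t* dest_lo,
//                            uint64_t src1, uint64_t src2) {
//     unsigned __int128 sum = ((unsigned __int128)*dest_hi << 64) | *dest_lo;
//     sum += src1;
//     sum += src2;
//     *dest_lo = (uint64_t)sum;
//     *dest_hi = (uint64_t)(sum >> 64);
//   }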
4804 
4805 // Multiply 64 bit by 64 bit first loop.
4806 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart,
4807                                            Register x_xstart,
4808                                            Register y, Register y_idx,
4809                                            Register z,
4810                                            Register carry,
4811                                            Register product_high, Register product,
4812                                            Register idx, Register kdx,
4813                                            Register tmp) {
4814   //  jlong carry, x[], y[], z[];
4815   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
4816   //    huge_128 product = y[idx] * x[xstart] + carry;
4817   //    z[kdx] = (jlong)product;
4818   //    carry  = (jlong)(product >>> 64);
4819   //  }
4820   //  z[xstart] = carry;
4821 
4822   Label L_first_loop, L_first_loop_exit;
4823   Label L_one_x, L_one_y, L_multiply;
4824 
4825   addic_(xstart, xstart, -1);
4826   blt(CCR0, L_one_x);   // Special case: length of x is 1.
4827 
4828   // Load next two integers of x.
4829   sldi(tmp, xstart, LogBytesPerInt);
4830   ldx(x_xstart, x, tmp);
4831 #ifdef VM_LITTLE_ENDIAN
4832   rldicl(x_xstart, x_xstart, 32, 0);
4833 #endif
4834 
4835   align(32, 16);
4836   bind(L_first_loop);
4837 
4838   cmpdi(CCR0, idx, 1);
4839   blt(CCR0, L_first_loop_exit);
4840   addi(idx, idx, -2);
4841   beq(CCR0, L_one_y);
4842 
4843   // Load next two integers of y.
4844   sldi(tmp, idx, LogBytesPerInt);
4845   ldx(y_idx, y, tmp);
4846 #ifdef VM_LITTLE_ENDIAN
4847   rldicl(y_idx, y_idx, 32, 0);
4848 #endif
4849 
4850 
4851   bind(L_multiply);
4852   multiply64(product_high, product, x_xstart, y_idx);
4853 
4854   li(tmp, 0);
4855   addc(product, product, carry);         // Add carry to result.
4856   adde(product_high, product_high, tmp); // Add carry of the last addition.
4857   addi(kdx, kdx, -2);
4858 
4859   // Store result.
4860 #ifdef VM_LITTLE_ENDIAN
4861   rldicl(product, product, 32, 0);
4862 #endif
4863   sldi(tmp, kdx, LogBytesPerInt);
4864   stdx(product, z, tmp);
4865   mr_if_needed(carry, product_high);
4866   b(L_first_loop);
4867 
4868 
4869   bind(L_one_y); // Load one 32 bit portion of y as (0,value).
4870 
4871   lwz(y_idx, 0, y);
4872   b(L_multiply);
4873 
4874 
4875   bind(L_one_x); // Load one 32 bit portion of x as (0,value).
4876 
4877   lwz(x_xstart, 0, x);
4878   b(L_first_loop);
4879 
4880   bind(L_first_loop_exit);
4881 }
4882 
4883 // Multiply 64 bit by 64 bit and add 128 bit.
4884 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y,
4885                                             Register z, Register yz_idx,
4886                                             Register idx, Register carry,
4887                                             Register product_high, Register product,
4888                                             Register tmp, int offset) {
4889 
4890   //  huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
4891   //  z[kdx] = (jlong)product;
4892 
4893   sldi(tmp, idx, LogBytesPerInt);
4894   if (offset) {
4895     addi(tmp, tmp, offset);
4896   }
4897   ldx(yz_idx, y, tmp);
4898 #ifdef VM_LITTLE_ENDIAN
4899   rldicl(yz_idx, yz_idx, 32, 0);
4900 #endif
4901 
4902   multiply64(product_high, product, x_xstart, yz_idx);
4903   ldx(yz_idx, z, tmp);
4904 #ifdef VM_LITTLE_ENDIAN
4905   rldicl(yz_idx, yz_idx, 32, 0);
4906 #endif
4907 
4908   add2_with_carry(product_high, product, carry, yz_idx);
4909 
4910   sldi(tmp, idx, LogBytesPerInt);
4911   if (offset) {
4912     addi(tmp, tmp, offset);
4913   }
4914 #ifdef VM_LITTLE_ENDIAN
4915   rldicl(product, product, 32, 0);
4916 #endif
4917   stdx(product, z, tmp);
4918 }
4919 
4920 // Multiply 128 bit by 128 bit. Unrolled inner loop.
4921 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart,
4922                                              Register y, Register z,
4923                                              Register yz_idx, Register idx, Register carry,
4924                                              Register product_high, Register product,
4925                                              Register carry2, Register tmp) {
4926 
4927   //  jlong carry, x[], y[], z[];
4928   //  int kdx = ystart+1;
4929   //  for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
4930   //    huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
4931   //    z[kdx+idx+1] = (jlong)product;
4932   //    jlong carry2 = (jlong)(product >>> 64);
4933   //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
4934   //    z[kdx+idx] = (jlong)product;
4935   //    carry = (jlong)(product >>> 64);
4936   //  }
4937   //  idx += 2;
4938   //  if (idx > 0) {
4939   //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
4940   //    z[kdx+idx] = (jlong)product;
4941   //    carry = (jlong)(product >>> 64);
4942   //  }
4943 
4944   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
4945   const Register jdx = R0;
4946 
4947   // Scale the index.
4948   srdi_(jdx, idx, 2);
4949   beq(CCR0, L_third_loop_exit);
4950   mtctr(jdx);
4951 
4952   align(32, 16);
4953   bind(L_third_loop);
4954 
4955   addi(idx, idx, -4);
4956 
4957   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8);
4958   mr_if_needed(carry2, product_high);
4959 
4960   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0);
4961   mr_if_needed(carry, product_high);
4962   bdnz(L_third_loop);
4963 
4964   bind(L_third_loop_exit);  // Handle any left-over operand parts.
4965 
4966   andi_(idx, idx, 0x3);
4967   beq(CCR0, L_post_third_loop_done);
4968 
4969   Label L_check_1;
4970 
4971   addic_(idx, idx, -2);
4972   blt(CCR0, L_check_1);
4973 
4974   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0);
4975   mr_if_needed(carry, product_high);
4976 
4977   bind(L_check_1);
4978 
4979   addi(idx, idx, 0x2);
4980   andi_(idx, idx, 0x1);
4981   addic_(idx, idx, -1);
4982   blt(CCR0, L_post_third_loop_done);
4983 
4984   sldi(tmp, idx, LogBytesPerInt);
4985   lwzx(yz_idx, y, tmp);
4986   multiply64(product_high, product, x_xstart, yz_idx);
4987   lwzx(yz_idx, z, tmp);
4988 
4989   add2_with_carry(product_high, product, yz_idx, carry);
4990 
4991   sldi(tmp, idx, LogBytesPerInt);
4992   stwx(product, z, tmp);
4993   srdi(product, product, 32);
4994 
4995   sldi(product_high, product_high, 32);
4996   orr(product, product, product_high);
4997   mr_if_needed(carry, product);
4998 
4999   bind(L_post_third_loop_done);
5000 }   // multiply_128_x_128_loop
5001 
5002 void MacroAssembler::muladd(Register out, Register in,
5003                             Register offset, Register len, Register k,
5004                             Register tmp1, Register tmp2, Register carry) {
5005 
5006   // Labels
5007   Label LOOP, SKIP;
5008 
5009   // Make sure length is positive.
5010   cmpdi  (CCR0,    len,     0);
5011 
5012   // Prepare variables
5013   subi   (offset,  offset,  4);
5014   li     (carry,   0);
5015   ble    (CCR0,    SKIP);
5016 
5017   mtctr  (len);
5018   subi   (len,     len,     1    );
5019   sldi   (len,     len,     2    );
5020 
5021   // Main loop
5022   bind(LOOP);
5023   lwzx   (tmp1,    len,     in   );
5024   lwzx   (tmp2,    offset,  out  );
5025   mulld  (tmp1,    tmp1,    k    );
5026   add    (tmp2,    carry,   tmp2 );
5027   add    (tmp2,    tmp1,    tmp2 );
5028   stwx   (tmp2,    offset,  out  );
5029   srdi   (carry,   tmp2,    32   );
5030   subi   (offset,  offset,  4    );
5031   subi   (len,     len,     4    );
5032   bdnz   (LOOP);
5033   bind(SKIP);
5034 }
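
// Illustrative only (not part of the emitted code): the loop above corresponds to a
// BigInteger-style mulAdd, sketched here in Java-like pseudocode. The emitted code works
// with byte offsets in registers; the sketch uses element indices for readability.
//
//   long kLong = k & 0xffffffffL, carry = 0;
//   for (int j = len - 1; j >= 0; j--) {
//     long product = (in[j] & 0xffffffffL) * kLong + (out[offset] & 0xffffffffL) + carry;
//     out[offset--] = (int)product;
//     carry = product >>> 32;
//   }
//   // The final carry is left in the 'carry' register.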
5035 
5036 void MacroAssembler::multiply_to_len(Register x, Register xlen,
5037                                      Register y, Register ylen,
5038                                      Register z, Register zlen,
5039                                      Register tmp1, Register tmp2,
5040                                      Register tmp3, Register tmp4,
5041                                      Register tmp5, Register tmp6,
5042                                      Register tmp7, Register tmp8,
5043                                      Register tmp9, Register tmp10,
5044                                      Register tmp11, Register tmp12,
5045                                      Register tmp13) {
5046 
5047   ShortBranchVerifier sbv(this);
5048 
5049   assert_different_registers(x, xlen, y, ylen, z, zlen,
5050                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
5051   assert_different_registers(x, xlen, y, ylen, z, zlen,
5052                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp7);
5053   assert_different_registers(x, xlen, y, ylen, z, zlen,
5054                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp8);
5055 
5056   const Register idx = tmp1;
5057   const Register kdx = tmp2;
5058   const Register xstart = tmp3;
5059 
5060   const Register y_idx = tmp4;
5061   const Register carry = tmp5;
5062   const Register product = tmp6;
5063   const Register product_high = tmp7;
5064   const Register x_xstart = tmp8;
5065   const Register tmp = tmp9;
5066 
5067   // First Loop.
5068   //
5069   //  final static long LONG_MASK = 0xffffffffL;
5070   //  int xstart = xlen - 1;
5071   //  int ystart = ylen - 1;
5072   //  long carry = 0;
5073   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
5074   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
5075   //    z[kdx] = (int)product;
5076   //    carry = product >>> 32;
5077   //  }
5078   //  z[xstart] = (int)carry;
5079 
5080   mr_if_needed(idx, ylen);        // idx = ylen
5081   mr_if_needed(kdx, zlen);        // kdx = xlen + ylen
5082   li(carry, 0);                   // carry = 0
5083 
5084   Label L_done;
5085 
5086   addic_(xstart, xlen, -1);
5087   blt(CCR0, L_done);
5088 
5089   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z,
5090                         carry, product_high, product, idx, kdx, tmp);
5091 
5092   Label L_second_loop;
5093 
5094   cmpdi(CCR0, kdx, 0);
5095   beq(CCR0, L_second_loop);
5096 
5097   Label L_carry;
5098 
5099   addic_(kdx, kdx, -1);
5100   beq(CCR0, L_carry);
5101 
5102   // Store lower 32 bits of carry.
5103   sldi(tmp, kdx, LogBytesPerInt);
5104   stwx(carry, z, tmp);
5105   srdi(carry, carry, 32);
5106   addi(kdx, kdx, -1);
5107 
5108 
5109   bind(L_carry);
5110 
5111   // Store upper 32 bits of carry.
5112   sldi(tmp, kdx, LogBytesPerInt);
5113   stwx(carry, z, tmp);
5114 
5115   // Second and third (nested) loops.
5116   //
5117   //  for (int i = xstart-1; i >= 0; i--) { // Second loop
5118   //    carry = 0;
5119   //    for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
5120   //      long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
5121   //                     (z[k] & LONG_MASK) + carry;
5122   //      z[k] = (int)product;
5123   //      carry = product >>> 32;
5124   //    }
5125   //    z[i] = (int)carry;
5126   //  }
5127   //
5128   //  i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx
5129 
5130   bind(L_second_loop);
5131 
5132   li(carry, 0);                   // carry = 0;
5133 
5134   addic_(xstart, xstart, -1);     // i = xstart-1;
5135   blt(CCR0, L_done);
5136 
5137   Register zsave = tmp10;
5138 
5139   mr(zsave, z);
5140 
5141 
5142   Label L_last_x;
5143 
5144   sldi(tmp, xstart, LogBytesPerInt);
5145   add(z, z, tmp);                 // z = z + k - j
5146   addi(z, z, 4);
5147   addic_(xstart, xstart, -1);     // i = xstart-1;
5148   blt(CCR0, L_last_x);
5149 
5150   sldi(tmp, xstart, LogBytesPerInt);
5151   ldx(x_xstart, x, tmp);
5152 #ifdef VM_LITTLE_ENDIAN
5153   rldicl(x_xstart, x_xstart, 32, 0);
5154 #endif
5155 
5156 
5157   Label L_third_loop_prologue;
5158 
5159   bind(L_third_loop_prologue);
5160 
5161   Register xsave = tmp11;
5162   Register xlensave = tmp12;
5163   Register ylensave = tmp13;
5164 
5165   mr(xsave, x);
5166   mr(xlensave, xstart);
5167   mr(ylensave, ylen);
5168 
5169 
5170   multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen,
5171                           carry, product_high, product, x, tmp);
5172 
5173   mr(z, zsave);
5174   mr(x, xsave);
5175   mr(xlen, xlensave);   // This is the decrement of the loop counter!
5176   mr(ylen, ylensave);
5177 
5178   addi(tmp3, xlen, 1);
5179   sldi(tmp, tmp3, LogBytesPerInt);
5180   stwx(carry, z, tmp);
5181   addic_(tmp3, tmp3, -1);
5182   blt(CCR0, L_done);
5183 
5184   srdi(carry, carry, 32);
5185   sldi(tmp, tmp3, LogBytesPerInt);
5186   stwx(carry, z, tmp);
5187   b(L_second_loop);
5188 
5189   // Infrequently executed code is moved outside the loops.
5190   bind(L_last_x);
5191 
5192   lwz(x_xstart, 0, x);
5193   b(L_third_loop_prologue);
5194 
5195   bind(L_done);
5196 }   // multiply_to_len
5197 
5198 void MacroAssembler::asm_assert(bool check_equal, const char *msg, int id) {
5199 #ifdef ASSERT
5200   Label ok;
5201   if (check_equal) {
5202     beq(CCR0, ok);
5203   } else {
5204     bne(CCR0, ok);
5205   }
5206   stop(msg, id);
5207   bind(ok);
5208 #endif
5209 }
5210 
5211 void MacroAssembler::asm_assert_mems_zero(bool check_equal, int size, int mem_offset,
5212                                           Register mem_base, const char* msg, int id) {
5213 #ifdef ASSERT
5214   switch (size) {
5215     case 4:
5216       lwz(R0, mem_offset, mem_base);
5217       cmpwi(CCR0, R0, 0);
5218       break;
5219     case 8:
5220       ld(R0, mem_offset, mem_base);
5221       cmpdi(CCR0, R0, 0);
5222       break;
5223     default:
5224       ShouldNotReachHere();
5225   }
5226   asm_assert(check_equal, msg, id);
5227 #endif // ASSERT
5228 }
5229 
5230 void MacroAssembler::verify_thread() {
5231   if (VerifyThread) {
5232     unimplemented("'VerifyThread' currently not implemented on PPC");
5233   }
5234 }
5235 
5236 // Reads: oop. Kills: R0. May clobber volatile floating-point registers.
5237 void MacroAssembler::verify_oop(Register oop, const char* msg) {
5238   if (!VerifyOops) {
5239     return;
5240   }
5241 
5242   address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
5243   const Register tmp = R11; // Will be preserved.
5244   const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
5245   save_volatile_gprs(R1_SP, -nbytes_save); // except R0
5246 
5247   mr_if_needed(R4_ARG2, oop);
5248   save_LR_CR(tmp); // save in old frame
5249   push_frame_reg_args(nbytes_save, tmp);
5250   // load FunctionDescriptor** / entry_address *
5251   load_const_optimized(tmp, fd, R0);
5252   // load FunctionDescriptor* / entry_address
5253   ld(tmp, 0, tmp);
5254   load_const_optimized(R3_ARG1, (address)msg, R0);
5255   // Call destination for its side effect.
5256   call_c(tmp);
5257 
5258   pop_frame();
5259   restore_LR_CR(tmp);
5260   restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
5261 }
5262 
5263 void MacroAssembler::verify_oop_addr(RegisterOrConstant offs, Register base, const char* msg) {
5264   if (!VerifyOops) {
5265     return;
5266   }
5267 
5268   address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
5269   const Register tmp = R11; // Will be preserved.
5270   const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
5271   save_volatile_gprs(R1_SP, -nbytes_save); // except R0
5272 
5273   ld(R4_ARG2, offs, base);
5274   save_LR_CR(tmp); // save in old frame
5275   push_frame_reg_args(nbytes_save, tmp);
5276   // load FunctionDescriptor** / entry_address *
5277   load_const_optimized(tmp, fd, R0);
5278   // load FunctionDescriptor* / entry_address
5279   ld(tmp, 0, tmp);
5280   load_const_optimized(R3_ARG1, (address)msg, R0);
5281   // Call destination for its side effect.
5282   call_c(tmp);
5283 
5284   pop_frame();
5285   restore_LR_CR(tmp);
5286   restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
5287 }
5288 
5289 const char* stop_types[] = {
5290   "stop",
5291   "untested",
5292   "unimplemented",
5293   "shouldnotreachhere"
5294 };
5295 
5296 static void stop_on_request(int tp, const char* msg) {
5297   tty->print("PPC assembly code requires stop: (%s) %s\n", stop_types[tp%/*stop_end*/4], msg);
5298   guarantee(false, "PPC assembly code requires stop: %s", msg);
5299 }
5300 
5301 // Call a C-function that prints output.
5302 void MacroAssembler::stop(int type, const char* msg, int id) {
5303 #ifndef PRODUCT
5304   block_comment(err_msg("stop: %s %s {", stop_types[type%stop_end], msg));
5305 #else
5306   block_comment("stop {");
5307 #endif
5308 
5309   // setup arguments
5310   load_const_optimized(R3_ARG1, type);
5311   load_const_optimized(R4_ARG2, (void *)msg, /*tmp=*/R0);
5312   call_VM_leaf(CAST_FROM_FN_PTR(address, stop_on_request), R3_ARG1, R4_ARG2);
5313   illtrap();
5314   emit_int32(id);
5315   block_comment("} stop;");
5316 }
5317 
5318 #ifndef PRODUCT
5319 // Write pattern 0x0101010101010101 in memory region [low-before, high+after].
5320 // Val, addr are temp registers.
5321 // If low == addr, addr is killed.
5322 // High is preserved.
5323 void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) {
5324   if (!ZapMemory) return;
5325 
5326   assert_different_registers(low, val);
5327 
5328   BLOCK_COMMENT("zap memory region {");
5329   load_const_optimized(val, 0x0101010101010101);
5330   int size = before + after;
5331   if (low == high && size < 5 && size > 0) {
5332     int offset = -before*BytesPerWord;
5333     for (int i = 0; i < size; ++i) {
5334       std(val, offset, low);
5335       offset += (1*BytesPerWord);
5336     }
5337   } else {
5338     addi(addr, low, -before*BytesPerWord);
5339     assert_different_registers(high, val);
5340     if (after) addi(high, high, after * BytesPerWord);
5341     Label loop;
5342     bind(loop);
5343     std(val, 0, addr);
5344     addi(addr, addr, 8);
5345     cmpd(CCR6, addr, high);
5346     ble(CCR6, loop);
5347     if (after) addi(high, high, -after * BytesPerWord);  // Correct back to old value.
5348   }
5349   BLOCK_COMMENT("} zap memory region");
5350 }
5351 
5352 #endif // !PRODUCT
5353 
5354 void SkipIfEqualZero::skip_to_label_if_equal_zero(MacroAssembler* masm, Register temp,
5355                                                   const bool* flag_addr, Label& label) {
5356   int simm16_offset = masm->load_const_optimized(temp, (address)flag_addr, R0, true);
5357   assert(sizeof(bool) == 1, "PowerPC ABI");
5358   masm->lbz(temp, simm16_offset, temp);
5359   masm->cmpwi(CCR0, temp, 0);
5360   masm->beq(CCR0, label);
5361 }
5362 
5363 SkipIfEqualZero::SkipIfEqualZero(MacroAssembler* masm, Register temp, const bool* flag_addr) : _masm(masm), _label() {
5364   skip_to_label_if_equal_zero(masm, temp, flag_addr, _label);
5365 }
5366 
5367 SkipIfEqualZero::~SkipIfEqualZero() {
5368   _masm->bind(_label);
5369 }