1 /*
   2  * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2012, 2017, SAP SE. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.inline.hpp"
  28 #include "compiler/disassembler.hpp"
  29 #include "gc/shared/cardTable.hpp"
  30 #include "gc/shared/cardTableModRefBS.hpp"
  31 #include "gc/shared/collectedHeap.inline.hpp"
  32 #include "interpreter/interpreter.hpp"
  33 #include "memory/resourceArea.hpp"
  34 #include "nativeInst_ppc.hpp"
  35 #include "prims/methodHandles.hpp"
  36 #include "runtime/biasedLocking.hpp"
  37 #include "runtime/icache.hpp"
  38 #include "runtime/interfaceSupport.hpp"
  39 #include "runtime/objectMonitor.hpp"
  40 #include "runtime/os.hpp"
  41 #include "runtime/safepoint.hpp"
  42 #include "runtime/safepointMechanism.hpp"
  43 #include "runtime/sharedRuntime.hpp"
  44 #include "runtime/stubRoutines.hpp"
  45 #include "utilities/macros.hpp"
  46 #if INCLUDE_ALL_GCS
  47 #include "gc/g1/g1CardTable.hpp"
  48 #include "gc/g1/g1CollectedHeap.inline.hpp"
  49 #include "gc/g1/g1SATBCardTableModRefBS.hpp"
  50 #include "gc/g1/heapRegion.hpp"
  51 #endif // INCLUDE_ALL_GCS
  52 #ifdef COMPILER2
  53 #include "opto/intrinsicnode.hpp"
  54 #endif
  55 
// Convenience macros: BLOCK_COMMENT emits an annotation into the assembler
// output in debug builds only (product builds compile it away); BIND binds
// a label and annotates the binding site with the label's own name.
#ifdef PRODUCT
#define BLOCK_COMMENT(str) // nothing
#else
#define BLOCK_COMMENT(str) block_comment(str)
#endif
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  62 
#ifdef ASSERT
// On RISC, there's no benefit to verifying instruction boundaries.
// (PPC instructions have a fixed width of BytesPerInstWord, so marks
// cannot land mid-instruction.)
bool AbstractAssembler::pd_check_instruction_mark() { return false; }
#endif
  67 
// Load a 64-bit word from a + si31, where si31 is a non-negative offset
// encodable in 31 bits. If the offset fits a signed 16-bit immediate a
// single ld suffices; emit_filler_nop then optionally pads with a nop so
// both paths emit a sequence of the same length. Otherwise the offset is
// split into hi/lo 16-bit halves and emitted as addis + ld.
// "Unchecked": d == a is permitted — the addis consumes a before the ld
// clobbers d.
void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) {
  assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range");
  if (Assembler::is_simm(si31, 16)) {
    ld(d, si31, a);
    if (emit_filler_nop) nop();
  } else {
    const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31);
    const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31);
    addis(d, a, hi);
    ld(d, lo, d);
  }
}
  80 
// Same as ld_largeoffset_unchecked, but additionally asserts (in debug
// builds) that d and a are distinct registers.
void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) {
  assert_different_registers(d, a);
  ld_largeoffset_unchecked(d, si31, a, emit_filler_nop);
}
  85 
  86 void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base,
  87                                       size_t size_in_bytes, bool is_signed) {
  88   switch (size_in_bytes) {
  89   case  8:              ld(dst, offs, base);                         break;
  90   case  4:  is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break;
  91   case  2:  is_signed ? lha(dst, offs, base) : lhz(dst, offs, base); break;
  92   case  1:  lbz(dst, offs, base); if (is_signed) extsb(dst, dst);    break; // lba doesn't exist :(
  93   default:  ShouldNotReachHere();
  94   }
  95 }
  96 
  97 void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base,
  98                                        size_t size_in_bytes) {
  99   switch (size_in_bytes) {
 100   case  8:  std(dst, offs, base); break;
 101   case  4:  stw(dst, offs, base); break;
 102   case  2:  sth(dst, offs, base); break;
 103   case  1:  stb(dst, offs, base); break;
 104   default:  ShouldNotReachHere();
 105   }
 106 }
 107 
 108 void MacroAssembler::align(int modulus, int max, int rem) {
 109   int padding = (rem + modulus - (offset() % modulus)) % modulus;
 110   if (padding > max) return;
 111   for (int c = (padding >> 2); c > 0; --c) { nop(); }
 112 }
 113 
// Issue instructions that calculate given TOC from global TOC.
// Emits (up to) an addis/addi pair computing dst = R29_TOC + offset(addr).
// hi16/lo16 select which half of the pair to emit, so callers may emit the
// two halves separately. With emit_dummy_addr, or when addr is the special
// value (address)-1, a placeholder offset is encoded that is patched later
// via patch_calculate_address_from_global_toc_at.
void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16,
                                                       bool add_relocation, bool emit_dummy_addr) {
  int offset = -1;
  if (emit_dummy_addr) {
    offset = -128; // dummy address
  } else if (addr != (address)(intptr_t)-1) {
    offset = MacroAssembler::offset_to_global_toc(addr);
  }

  if (hi16) {
    addis(dst, R29_TOC, MacroAssembler::largeoffset_si16_si16_hi(offset));
  }
  if (lo16) {
    if (add_relocation) {
      // Relocate at the addi to avoid confusion with a load from the method's TOC.
      relocate(internal_word_Relocation::spec(addr));
    }
    addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset));
  }
}
 135 
// Patch an addis/addi pair emitted by calculate_address_from_global_toc so
// that it computes 'addr'. 'a' points at the addi (the relocated
// instruction); the matching addis is searched backwards, not past 'bound'.
// Returns the address of the addis. Does not flush the instruction cache
// and is not mt-safe; callers must handle both.
address MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) {
  const int offset = MacroAssembler::offset_to_global_toc(addr);

  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the addi,
  // and the addi reads and writes the same register dst.
  const int dst = inv_rt_field(inst2);
  assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");

  // Now, find the preceding addis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  while (inst1_addr >= bound) {
    inst1 = *(int *) inst1_addr;
    if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
      // Stop, found the addis which writes dst.
      break;
    }
    inst1_addr -= BytesPerInstWord;
  }

  assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
  // Rewrite the two 16-bit immediates with the new hi/lo halves.
  set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset));
  set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset));
  return inst1_addr;
}
 164 
// Inverse of patch_calculate_address_from_global_toc_at: decode the address
// computed by an addis/addi pair. 'a' points at the addi; the matching
// addis is searched backwards, not past 'bound'. An encoded offset of -1 is
// the "unset" placeholder (see calculate_address_from_global_toc) and is
// returned as (address)-1.
address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) {
  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the addi,
  // and the addi reads and writes the same register dst.
  const int dst = inv_rt_field(inst2);
  assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");

  // Now, find the preceding addis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  while (inst1_addr >= bound) {
    inst1 = *(int *) inst1_addr;
    if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
      // stop, found the addis which writes dst
      break;
    }
    inst1_addr -= BytesPerInstWord;
  }

  assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");

  // Reassemble the signed 32-bit offset from the two 16-bit immediates.
  int offset = (get_imm(inst1_addr, 0) << 16) + get_imm(inst2_addr, 0);
  // -1 is a special case
  if (offset == -1) {
    return (address)(intptr_t)-1;
  } else {
    return global_toc() + offset;
  }
}
 196 
 197 #ifdef _LP64
 198 // Patch compressed oops or klass constants.
 199 // Assembler sequence is
 200 // 1) compressed oops:
 201 //    lis  rx = const.hi
 202 //    ori rx = rx | const.lo
 203 // 2) compressed klass:
 204 //    lis  rx = const.hi
 205 //    clrldi rx = rx & 0xFFFFffff // clearMS32b, optional
 206 //    ori rx = rx | const.lo
 207 // Clrldi will be passed by.
// Patch a lis/ori pair (see the sequence comment above) so that it
// materializes the narrow oop/klass constant 'data'. 'a' points at the ori
// (the relocated instruction); the matching lis is searched backwards, not
// past 'bound'. Returns the address of the lis. Does not flush the
// instruction cache and is not mt-safe.
address MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) {
  assert(UseCompressedOops, "Should only patch compressed oops");

  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the ori,
  // and the ori reads and writes the same register dst.
  const int dst = inv_rta_field(inst2);
  assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
  // Now, find the preceding addis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  bool inst1_found = false;
  while (inst1_addr >= bound) {
    inst1 = *(int *)inst1_addr;
    if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
    inst1_addr -= BytesPerInstWord;
  }
  assert(inst1_found, "inst is not lis");

  // Split the 32-bit narrow constant into its two 16-bit halves.
  int xc = (data >> 16) & 0xffff;
  int xd = (data >>  0) & 0xffff;

  set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo
  set_imm((int *)inst2_addr,        (xd)); // unsigned int
  return inst1_addr;
}
 236 
// Get compressed oop or klass constant.
// Inverse of patch_set_narrow_oop: decode the 32-bit narrow constant
// materialized by a lis/ori pair. 'a' points at the ori; the matching lis
// is searched backwards, not past 'bound'.
narrowOop MacroAssembler::get_narrow_oop(address a, address bound) {
  assert(UseCompressedOops, "Should only patch compressed oops");

  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the ori,
  // and the ori reads and writes the same register dst.
  const int dst = inv_rta_field(inst2);
  assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
  // Now, find the preceding lis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  bool inst1_found = false;

  while (inst1_addr >= bound) {
    inst1 = *(int *) inst1_addr;
    if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break;}
    inst1_addr -= BytesPerInstWord;
  }
  assert(inst1_found, "inst is not lis");

  // Recombine the two 16-bit immediates into the 32-bit constant.
  uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff));
  uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16);

  return (int) (xl | xh);
}
 265 #endif // _LP64
 266 
// Returns true if successful.
// Allocates a constant-pool (method TOC) entry holding a.value(), relocates
// the load site with a.rspec(), and emits a load of the entry into dst via
// the toc register. fixed_size forces a constant-length load sequence (a
// filler nop is emitted on the short path, see ld_largeoffset_unchecked).
// Returns false if the constant pool entry could not be allocated.
bool MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a,
                                                Register toc, bool fixed_size) {
  int toc_offset = 0;
  // Use RelocationHolder::none for the constant pool entry, otherwise
  // we will end up with a failing NativeCall::verify(x) where x is
  // the address of the constant pool entry.
  // FIXME: We should insert relocation information for oops at the constant
  // pool entries instead of inserting it at the loads; patching of a constant
  // pool entry should be less expensive.
  address const_address = address_constant((address)a.value(), RelocationHolder::none);
  if (const_address == NULL) { return false; } // allocation failure
  // Relocate at the pc of the load.
  relocate(a.rspec());
  toc_offset = (int)(const_address - code()->consts()->start());
  ld_largeoffset_unchecked(dst, toc_offset, toc, fixed_size);
  return true;
}
 285 
 286 bool MacroAssembler::is_load_const_from_method_toc_at(address a) {
 287   const address inst1_addr = a;
 288   const int inst1 = *(int *)inst1_addr;
 289 
 290    // The relocation points to the ld or the addis.
 291    return (is_ld(inst1)) ||
 292           (is_addis(inst1) && inv_ra_field(inst1) != 0);
 293 }
 294 
// Decode the TOC offset encoded by a load_const_from_method_toc sequence at
// 'a'. For a plain ld the offset is its displacement; for an addis/ld pair
// the matching ld is searched forwards and the two 16-bit halves are
// recombined.
int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) {
  assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc");

  const address inst1_addr = a;
  const int inst1 = *(int *)inst1_addr;

  if (is_ld(inst1)) {
    // Short form: single ld, offset fits in 16 bits.
    return inv_d1_field(inst1);
  } else if (is_addis(inst1)) {
    const int dst = inv_rt_field(inst1);

    // Now, find the succeeding ld which reads and writes to dst.
    address inst2_addr = inst1_addr + BytesPerInstWord;
    int inst2 = 0;
    while (true) {
      inst2 = *(int *) inst2_addr;
      if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) {
        // Stop, found the ld which reads and writes dst.
        break;
      }
      inst2_addr += BytesPerInstWord;
    }
    return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2);
  }
  ShouldNotReachHere();
  return 0;
}
 322 
// Get the constant from a `load_const' sequence.
// Extracts the full 64-bit constant from the 16-bit immediates of the
// sequence starting at 'a'. Two encodings are supported, distinguished by
// the second instruction (ori vs. lis); the immediate slots differ between
// the two (compare patch_const below).
long MacroAssembler::get_const(address a) {
  assert(is_load_const_at(a), "not a load of a constant");
  const int *p = (const int*) a;
  // Highest 16 bits always come from the first instruction.
  unsigned long x = (((unsigned long) (get_imm(a,0) & 0xffff)) << 48);
  if (is_ori(*(p+1))) {
    x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 32);
    x |= (((unsigned long) (get_imm(a,3) & 0xffff)) << 16);
    x |= (((unsigned long) (get_imm(a,4) & 0xffff)));
  } else if (is_lis(*(p+1))) {
    x |= (((unsigned long) (get_imm(a,2) & 0xffff)) << 32);
    x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 16);
    x |= (((unsigned long) (get_imm(a,3) & 0xffff)));
  } else {
    ShouldNotReachHere();
    return (long) 0;
  }
  return (long) x;
}
 342 
// Patch the 64 bit constant of a `load_const' sequence. This is a low
// level procedure. It neither flushes the instruction cache nor is it
// mt safe.
// Inverse of get_const: rewrites the four 16-bit immediates in place.
// The immediate slot layout depends on the encoding, distinguished by the
// second instruction (ori vs. lis).
void MacroAssembler::patch_const(address a, long x) {
  assert(is_load_const_at(a), "not a load of a constant");
  int *p = (int*) a;
  if (is_ori(*(p+1))) {
    set_imm(0 + p, (x >> 48) & 0xffff);
    set_imm(1 + p, (x >> 32) & 0xffff);
    set_imm(3 + p, (x >> 16) & 0xffff);
    set_imm(4 + p, x & 0xffff);
  } else if (is_lis(*(p+1))) {
    set_imm(0 + p, (x >> 48) & 0xffff);
    set_imm(2 + p, (x >> 32) & 0xffff);
    set_imm(1 + p, (x >> 16) & 0xffff);
    set_imm(3 + p, x & 0xffff);
  } else {
    ShouldNotReachHere();
  }
}
 363 
 364 AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) {
 365   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
 366   int index = oop_recorder()->allocate_metadata_index(obj);
 367   RelocationHolder rspec = metadata_Relocation::spec(index);
 368   return AddressLiteral((address)obj, rspec);
 369 }
 370 
 371 AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) {
 372   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
 373   int index = oop_recorder()->find_index(obj);
 374   RelocationHolder rspec = metadata_Relocation::spec(index);
 375   return AddressLiteral((address)obj, rspec);
 376 }
 377 
 378 AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) {
 379   assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
 380   int oop_index = oop_recorder()->allocate_oop_index(obj);
 381   return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
 382 }
 383 
 384 AddressLiteral MacroAssembler::constant_oop_address(jobject obj) {
 385   assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
 386   int oop_index = oop_recorder()->find_index(obj);
 387   return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
 388 }
 389 
// Return (*delayed_value_addr + offset) either as an immediate constant
// (when the delayed value has already been filled in, i.e. is non-zero) or
// as code that loads it indirectly into tmp at run time.
RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
                                                      Register tmp, int offset) {
  intptr_t value = *delayed_value_addr;
  if (value != 0) {
    // Value is known at code-generation time: fold it into a constant.
    return RegisterOrConstant(value + offset);
  }

  // Load indirectly to solve generation ordering problem.
  // static address, no relocation
  int simm16_offset = load_const_optimized(tmp, delayed_value_addr, noreg, true);
  ld(tmp, simm16_offset, tmp); // must be aligned ((xa & 3) == 0)

  if (offset != 0) {
    addi(tmp, tmp, offset);
  }

  return RegisterOrConstant(tmp);
}
 408 
#ifndef PRODUCT
// Debug helper for printing a patched branch; not yet implemented for PPC.
void MacroAssembler::pd_print_patched_instruction(address branch) {
  Unimplemented(); // TODO: PPC port
}
#endif // ndef PRODUCT
 414 
// Conditional far branch for destinations encodable in 24+2 bits.
// Emits a fixed two-instruction sequence (variant 2): a short conditional
// branch with the inverted condition that skips over an unconditional
// branch to the real destination. With bc_far_optimize_on_relocate the
// site is relocated as a runtime call so it can later be rewritten (see
// set_dest_of_bc_far_at).
void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) {

  // If requested by flag optimize, relocate the bc_far as a
  // runtime_call and prepare for optimizing it when the code gets
  // relocated.
  if (optimize == bc_far_optimize_on_relocate) {
    relocate(relocInfo::runtime_call_type);
  }

  // variant 2:
  //
  //    b!cxx SKIP
  //    bxx   DEST
  //  SKIP:
  //

  // Invert condition and branch hint so the short branch jumps around
  // the far branch exactly when the original condition does NOT hold.
  const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
                                                opposite_bcond(inv_boint_bcond(boint)));

  // We emit two branches.
  // First, a conditional branch which jumps around the far branch.
  const address not_taken_pc = pc() + 2 * BytesPerInstWord;
  const address bc_pc        = pc();
  bc(opposite_boint, biint, not_taken_pc);

  // Debug-build sanity checks: decode the just-emitted bc and verify the
  // encoding round-trips to the requested fields.
  const int bc_instr = *(int*)bc_pc;
  assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition");
  assert(opposite_boint == inv_bo_field(bc_instr), "postcondition");
  assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))),
                                     opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))),
         "postcondition");
  assert(biint == inv_bi_field(bc_instr), "postcondition");

  // Second, an unconditional far branch which jumps to dest.
  // Note: target(dest) remembers the current pc (see CodeSection::target)
  //       and returns the current pc if the label is not bound yet; when
  //       the label gets bound, the unconditional far branch will be patched.
  const address target_pc = target(dest);
  const address b_pc  = pc();
  b(target_pc);

  assert(not_taken_pc == pc(),                     "postcondition");
  assert(dest.is_bound() || target_pc == b_pc, "postcondition");
}
 460 
 461 // 1 or 2 instructions
 462 void MacroAssembler::bc_far_optimized(int boint, int biint, Label& dest) {
 463   if (dest.is_bound() && is_within_range_of_bcxx(target(dest), pc())) {
 464     bc(boint, biint, dest);
 465   } else {
 466     bc_far(boint, biint, dest, MacroAssembler::bc_far_optimize_on_relocate);
 467   }
 468 }
 469 
 470 bool MacroAssembler::is_bc_far_at(address instruction_addr) {
 471   return is_bc_far_variant1_at(instruction_addr) ||
 472          is_bc_far_variant2_at(instruction_addr) ||
 473          is_bc_far_variant3_at(instruction_addr);
 474 }
 475 
// Decode the destination of a bc_far site, whichever of its three layouts
// it has been patched into (see set_dest_of_bc_far_at).
address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) {
  if (is_bc_far_variant1_at(instruction_addr)) {
    // variant 1: destination is encoded in the first (conditional) branch.
    const address instruction_1_addr = instruction_addr;
    const int instruction_1 = *(int*)instruction_1_addr;
    return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr);
  } else if (is_bc_far_variant2_at(instruction_addr)) {
    // variant 2: destination is encoded in the second (unconditional) branch.
    const address instruction_2_addr = instruction_addr + 4;
    return bxx_destination(instruction_2_addr);
  } else if (is_bc_far_variant3_at(instruction_addr)) {
    // variant 3: patched to nop/endgroup — falls through to the
    // instruction right after the two-instruction sequence.
    return instruction_addr + 8;
  }
  // variant 4 ???
  ShouldNotReachHere();
  return NULL;
}
// Repatch a bc_far site to branch to 'dest', choosing the tightest layout:
// variant 3 (nop/endgroup) when the destination is the next instruction,
// variant 1 (single bc + nop) when dest is in short-branch range, and
// variant 2 (inverted bc + b) otherwise. Flushes the instruction cache for
// the rewritten two-instruction sequence.
void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) {

  if (is_bc_far_variant3_at(instruction_addr)) {
    // variant 3, far cond branch to the next instruction, already patched to nops:
    //
    //    nop
    //    endgroup
    //  SKIP/DEST:
    //
    return;
  }

  // first, extract boint and biint from the current branch
  int boint = 0;
  int biint = 0;

  ResourceMark rm;
  const int code_size = 2 * BytesPerInstWord;
  // Assemble the replacement sequence directly over the existing one.
  CodeBuffer buf(instruction_addr, code_size);
  MacroAssembler masm(&buf);
  if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) {
    // Far branch to next instruction: Optimize it by patching nops (produce variant 3).
    masm.nop();
    masm.endgroup();
  } else {
    if (is_bc_far_variant1_at(instruction_addr)) {
      // variant 1, the 1st instruction contains the destination address:
      //
      //    bcxx  DEST
      //    nop
      //
      const int instruction_1 = *(int*)(instruction_addr);
      boint = inv_bo_field(instruction_1);
      biint = inv_bi_field(instruction_1);
    } else if (is_bc_far_variant2_at(instruction_addr)) {
      // variant 2, the 2nd instruction contains the destination address:
      //
      //    b!cxx SKIP
      //    bxx   DEST
      //  SKIP:
      //
      // The emitted condition is the inverse of the original, so invert
      // it back to recover the caller's boint.
      const int instruction_1 = *(int*)(instruction_addr);
      boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))),
          opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1))));
      biint = inv_bi_field(instruction_1);
    } else {
      // variant 4???
      ShouldNotReachHere();
    }

    // second, set the new branch destination and optimize the code
    if (dest != instruction_addr + 4 && // the bc_far is still unbound!
        masm.is_within_range_of_bcxx(dest, instruction_addr)) {
      // variant 1:
      //
      //    bcxx  DEST
      //    nop
      //
      masm.bc(boint, biint, dest);
      masm.nop();
    } else {
      // variant 2:
      //
      //    b!cxx SKIP
      //    bxx   DEST
      //  SKIP:
      //
      const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
                                                    opposite_bcond(inv_boint_bcond(boint)));
      const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord;
      masm.bc(opposite_boint, biint, not_taken_pc);
      masm.b(dest);
    }
  }
  ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
}
 567 
// Emit a NOT mt-safe patchable 64 bit absolute call/jump.
// Both variants occupy seven instruction slots (indices 0..6, see the
// is_bxx64_patchable_variant*_at predicates) so a site can be repatched
// between the pc-relative form (variant 2) and the TOC-relative form
// (variant 1b) in place.
void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) {
  // get current pc
  uint64_t start_pc = (uint64_t) pc();

  const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last
  const address pc_of_b  = (address) (start_pc + (0*BytesPerInstWord)); // b is first

  // relocate here
  if (rt != relocInfo::none) {
    relocate(rt);
  }

  if ( ReoptimizeCallSequences &&
       (( link && is_within_range_of_b(dest, pc_of_bl)) ||
        (!link && is_within_range_of_b(dest, pc_of_b)))) {
    // variant 2:
    // Emit an optimized, pc-relative call/jump.

    if (link) {
      // some padding
      nop();
      nop();
      nop();
      nop();
      nop();
      nop();

      // do the call
      assert(pc() == pc_of_bl, "just checking");
      bl(dest, relocInfo::none);
    } else {
      // do the jump
      assert(pc() == pc_of_b, "just checking");
      b(dest, relocInfo::none);

      // some padding
      nop();
      nop();
      nop();
      nop();
      nop();
      nop();
    }

    // Assert that we can identify the emitted call/jump.
    assert(is_bxx64_patchable_variant2_at((address)start_pc, link),
           "can't identify emitted call");
  } else {
    // variant 1:
    mr(R0, R11);  // spill R11 -> R0.

    // Load the destination address into CTR,
    // calculate destination relative to global toc.
    calculate_address_from_global_toc(R11, dest, true, true, false);

    mtctr(R11);
    mr(R11, R0);  // spill R11 <- R0.
    nop();

    // do the call/jump
    if (link) {
      bctrl();
    } else{
      bctr();
    }
    // Assert that we can identify the emitted call/jump.
    assert(is_bxx64_patchable_variant1b_at((address)start_pc, link),
           "can't identify emitted call");
  }

  // Assert that we can identify the emitted call/jump.
  assert(is_bxx64_patchable_at((address)start_pc, link),
         "can't identify emitted call");
  assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest,
         "wrong encoding of dest address");
}
 645 
 646 // Identify a bxx64_patchable instruction.
 647 bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) {
 648   return is_bxx64_patchable_variant1b_at(instruction_addr, link)
 649     //|| is_bxx64_patchable_variant1_at(instruction_addr, link)
 650       || is_bxx64_patchable_variant2_at(instruction_addr, link);
 651 }
 652 
 653 // Does the call64_patchable instruction use a pc-relative encoding of
 654 // the call destination?
 655 bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) {
 656   // variant 2 is pc-relative
 657   return is_bxx64_patchable_variant2_at(instruction_addr, link);
 658 }
 659 
 660 // Identify variant 1.
 661 bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) {
 662   unsigned int* instr = (unsigned int*) instruction_addr;
 663   return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
 664       && is_mtctr(instr[5]) // mtctr
 665     && is_load_const_at(instruction_addr);
 666 }
 667 
 668 // Identify variant 1b: load destination relative to global toc.
 669 bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) {
 670   unsigned int* instr = (unsigned int*) instruction_addr;
 671   return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
 672     && is_mtctr(instr[3]) // mtctr
 673     && is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr);
 674 }
 675 
 676 // Identify variant 2.
 677 bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) {
 678   unsigned int* instr = (unsigned int*) instruction_addr;
 679   if (link) {
 680     return is_bl (instr[6])  // bl dest is last
 681       && is_nop(instr[0])  // nop
 682       && is_nop(instr[1])  // nop
 683       && is_nop(instr[2])  // nop
 684       && is_nop(instr[3])  // nop
 685       && is_nop(instr[4])  // nop
 686       && is_nop(instr[5]); // nop
 687   } else {
 688     return is_b  (instr[0])  // b  dest is first
 689       && is_nop(instr[1])  // nop
 690       && is_nop(instr[2])  // nop
 691       && is_nop(instr[3])  // nop
 692       && is_nop(instr[4])  // nop
 693       && is_nop(instr[5])  // nop
 694       && is_nop(instr[6]); // nop
 695   }
 696 }
 697 
// Set dest address of a bxx64_patchable instruction.
// Re-emits the whole bxx64_patchable sequence in place over the old one
// (choosing the best variant for the new dest) and flushes the instruction
// cache. Like the emitter, this is NOT mt-safe.
void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) {
  ResourceMark rm;
  int code_size = MacroAssembler::bxx64_patchable_size;
  CodeBuffer buf(instruction_addr, code_size);
  MacroAssembler masm(&buf);
  masm.bxx64_patchable(dest, relocInfo::none, link);
  ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
}
 707 
// Get dest address of a bxx64_patchable instruction.
// Decodes the destination according to whichever variant is present at
// instruction_addr (see the is_bxx64_patchable_variant*_at predicates).
address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) {
  if (is_bxx64_patchable_variant1_at(instruction_addr, link)) {
    // variant 1: the destination is the 64-bit constant materialized by
    // the load_const sequence.
    return (address) (unsigned long) get_const(instruction_addr);
  } else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) {
    // variant 2: decode the pc-relative displacement of the b/bl.
    unsigned int* instr = (unsigned int*) instruction_addr;
    if (link) {
      const int instr_idx = 6; // bl is last
      int branchoffset = branch_destination(instr[instr_idx], 0);
      return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
    } else {
      const int instr_idx = 0; // b is first
      int branchoffset = branch_destination(instr[instr_idx], 0);
      return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
    }
  // Load dest relative to global toc.
  } else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) {
    return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord,
                                                               instruction_addr);
  } else {
    ShouldNotReachHere();
    return NULL;
  }
}
 732 
 733 // Uses ordering which corresponds to ABI:
 734 //    _savegpr0_14:  std  r14,-144(r1)
 735 //    _savegpr0_15:  std  r15,-136(r1)
 736 //    _savegpr0_16:  std  r16,-128(r1)
void MacroAssembler::save_nonvolatile_gprs(Register dst, int offset) {
  // Save the nonvolatile GP registers R14..R31, then the nonvolatile FP
  // registers F14..F31, to consecutive 8-byte slots starting at
  // dst + offset (ordering matches the ABI save routines, see above).
  std(R14, offset, dst);   offset += 8;
  std(R15, offset, dst);   offset += 8;
  std(R16, offset, dst);   offset += 8;
  std(R17, offset, dst);   offset += 8;
  std(R18, offset, dst);   offset += 8;
  std(R19, offset, dst);   offset += 8;
  std(R20, offset, dst);   offset += 8;
  std(R21, offset, dst);   offset += 8;
  std(R22, offset, dst);   offset += 8;
  std(R23, offset, dst);   offset += 8;
  std(R24, offset, dst);   offset += 8;
  std(R25, offset, dst);   offset += 8;
  std(R26, offset, dst);   offset += 8;
  std(R27, offset, dst);   offset += 8;
  std(R28, offset, dst);   offset += 8;
  std(R29, offset, dst);   offset += 8;
  std(R30, offset, dst);   offset += 8;
  std(R31, offset, dst);   offset += 8;

  // FP registers
  stfd(F14, offset, dst);   offset += 8;
  stfd(F15, offset, dst);   offset += 8;
  stfd(F16, offset, dst);   offset += 8;
  stfd(F17, offset, dst);   offset += 8;
  stfd(F18, offset, dst);   offset += 8;
  stfd(F19, offset, dst);   offset += 8;
  stfd(F20, offset, dst);   offset += 8;
  stfd(F21, offset, dst);   offset += 8;
  stfd(F22, offset, dst);   offset += 8;
  stfd(F23, offset, dst);   offset += 8;
  stfd(F24, offset, dst);   offset += 8;
  stfd(F25, offset, dst);   offset += 8;
  stfd(F26, offset, dst);   offset += 8;
  stfd(F27, offset, dst);   offset += 8;
  stfd(F28, offset, dst);   offset += 8;
  stfd(F29, offset, dst);   offset += 8;
  stfd(F30, offset, dst);   offset += 8;
  stfd(F31, offset, dst);
}
 776 
 777 // Uses ordering which corresponds to ABI:
 778 //    _restgpr0_14:  ld   r14,-144(r1)
 779 //    _restgpr0_15:  ld   r15,-136(r1)
 780 //    _restgpr0_16:  ld   r16,-128(r1)
void MacroAssembler::restore_nonvolatile_gprs(Register src, int offset) {
  // Reload the non-volatile GPRs R14..R31 from consecutive 8-byte slots
  // starting at [src + offset]; layout must match save_nonvolatile_gprs().
  ld(R14, offset, src);   offset += 8;
  ld(R15, offset, src);   offset += 8;
  ld(R16, offset, src);   offset += 8;
  ld(R17, offset, src);   offset += 8;
  ld(R18, offset, src);   offset += 8;
  ld(R19, offset, src);   offset += 8;
  ld(R20, offset, src);   offset += 8;
  ld(R21, offset, src);   offset += 8;
  ld(R22, offset, src);   offset += 8;
  ld(R23, offset, src);   offset += 8;
  ld(R24, offset, src);   offset += 8;
  ld(R25, offset, src);   offset += 8;
  ld(R26, offset, src);   offset += 8;
  ld(R27, offset, src);   offset += 8;
  ld(R28, offset, src);   offset += 8;
  ld(R29, offset, src);   offset += 8;
  ld(R30, offset, src);   offset += 8;
  ld(R31, offset, src);   offset += 8;

  // FP registers: reload the non-volatile FPRs F14..F31 stored after the GPRs.
  lfd(F14, offset, src);   offset += 8;
  lfd(F15, offset, src);   offset += 8;
  lfd(F16, offset, src);   offset += 8;
  lfd(F17, offset, src);   offset += 8;
  lfd(F18, offset, src);   offset += 8;
  lfd(F19, offset, src);   offset += 8;
  lfd(F20, offset, src);   offset += 8;
  lfd(F21, offset, src);   offset += 8;
  lfd(F22, offset, src);   offset += 8;
  lfd(F23, offset, src);   offset += 8;
  lfd(F24, offset, src);   offset += 8;
  lfd(F25, offset, src);   offset += 8;
  lfd(F26, offset, src);   offset += 8;
  lfd(F27, offset, src);   offset += 8;
  lfd(F28, offset, src);   offset += 8;
  lfd(F29, offset, src);   offset += 8;
  lfd(F30, offset, src);   offset += 8;
  lfd(F31, offset, src);
}
 821 
 822 // For verify_oops.
void MacroAssembler::save_volatile_gprs(Register dst, int offset) {
  // Spill the volatile GPRs R2..R12 (R0, R1_SP and R13 are not saved here)
  // and the volatile FPRs F0..F13 to 8-byte slots starting at [dst + offset].
  std(R2,  offset, dst);   offset += 8;
  std(R3,  offset, dst);   offset += 8;
  std(R4,  offset, dst);   offset += 8;
  std(R5,  offset, dst);   offset += 8;
  std(R6,  offset, dst);   offset += 8;
  std(R7,  offset, dst);   offset += 8;
  std(R8,  offset, dst);   offset += 8;
  std(R9,  offset, dst);   offset += 8;
  std(R10, offset, dst);   offset += 8;
  std(R11, offset, dst);   offset += 8;
  std(R12, offset, dst);   offset += 8;

  // Volatile FP registers.
  stfd(F0, offset, dst);   offset += 8;
  stfd(F1, offset, dst);   offset += 8;
  stfd(F2, offset, dst);   offset += 8;
  stfd(F3, offset, dst);   offset += 8;
  stfd(F4, offset, dst);   offset += 8;
  stfd(F5, offset, dst);   offset += 8;
  stfd(F6, offset, dst);   offset += 8;
  stfd(F7, offset, dst);   offset += 8;
  stfd(F8, offset, dst);   offset += 8;
  stfd(F9, offset, dst);   offset += 8;
  stfd(F10, offset, dst);  offset += 8;
  stfd(F11, offset, dst);  offset += 8;
  stfd(F12, offset, dst);  offset += 8;
  stfd(F13, offset, dst);
}
 851 
 852 // For verify_oops.
void MacroAssembler::restore_volatile_gprs(Register src, int offset) {
  // Reload the volatile GPRs R2..R12 and FPRs F0..F13 from 8-byte slots
  // starting at [src + offset]; layout must match save_volatile_gprs().
  ld(R2,  offset, src);   offset += 8;
  ld(R3,  offset, src);   offset += 8;
  ld(R4,  offset, src);   offset += 8;
  ld(R5,  offset, src);   offset += 8;
  ld(R6,  offset, src);   offset += 8;
  ld(R7,  offset, src);   offset += 8;
  ld(R8,  offset, src);   offset += 8;
  ld(R9,  offset, src);   offset += 8;
  ld(R10, offset, src);   offset += 8;
  ld(R11, offset, src);   offset += 8;
  ld(R12, offset, src);   offset += 8;

  // Volatile FP registers.
  lfd(F0, offset, src);   offset += 8;
  lfd(F1, offset, src);   offset += 8;
  lfd(F2, offset, src);   offset += 8;
  lfd(F3, offset, src);   offset += 8;
  lfd(F4, offset, src);   offset += 8;
  lfd(F5, offset, src);   offset += 8;
  lfd(F6, offset, src);   offset += 8;
  lfd(F7, offset, src);   offset += 8;
  lfd(F8, offset, src);   offset += 8;
  lfd(F9, offset, src);   offset += 8;
  lfd(F10, offset, src);  offset += 8;
  lfd(F11, offset, src);  offset += 8;
  lfd(F12, offset, src);  offset += 8;
  lfd(F13, offset, src);
}
 881 
// Store CR and LR into their ABI save slots of the frame at R1_SP,
// using tmp as scratch.
void MacroAssembler::save_LR_CR(Register tmp) {
  mfcr(tmp);
  std(tmp, _abi(cr), R1_SP);
  mflr(tmp);
  std(tmp, _abi(lr), R1_SP);
  // Tmp must contain lr on exit! (see return_addr and prolog in ppc64.ad)
}
 889 
// Reload LR and CR from their ABI save slots at R1_SP; tmp is clobbered
// (and must not be R1_SP, which is still needed to address the slots).
void MacroAssembler::restore_LR_CR(Register tmp) {
  assert(tmp != R1_SP, "must be distinct");
  ld(tmp, _abi(lr), R1_SP);
  mtlr(tmp);
  ld(tmp, _abi(cr), R1_SP);
  mtcr(tmp);
}
 897 
// Load the current PC into 'result' by branch-and-link to the next
// instruction and reading LR. LR is trashed. Returns the assembler-time
// address of the instruction after the bl (the value LR will hold).
address MacroAssembler::get_PC_trash_LR(Register result) {
  Label L;
  bl(L);
  bind(L);
  address lr_pc = pc();
  mflr(result);
  return lr_pc;
}
 906 
// Resize the current frame by the frame-aligned byte offset in 'offset',
// atomically keeping the back link to the caller's SP. tmp is clobbered.
void MacroAssembler::resize_frame(Register offset, Register tmp) {
#ifdef ASSERT
  assert_different_registers(offset, tmp, R1_SP);
  andi_(tmp, offset, frame::alignment_in_bytes-1);
  asm_assert_eq("resize_frame: unaligned", 0x204);
#endif

  // tmp <- *(SP)
  ld(tmp, _abi(callers_sp), R1_SP);
  // addr <- SP + offset;
  // *(addr) <- tmp;
  // SP <- addr
  stdux(tmp, R1_SP, offset);
}
 921 
// Immediate-offset variant of resize_frame: 'offset' must be a
// frame-aligned simm16. tmp is clobbered.
void MacroAssembler::resize_frame(int offset, Register tmp) {
  assert(is_simm(offset, 16), "too big an offset");
  assert_different_registers(tmp, R1_SP);
  assert((offset & (frame::alignment_in_bytes-1))==0, "resize_frame: unaligned");
  // tmp <- *(SP)
  ld(tmp, _abi(callers_sp), R1_SP);
  // addr <- SP + offset;
  // *(addr) <- tmp;
  // SP <- addr
  stdu(tmp, offset, R1_SP);
}
 933 
// Resize the frame so that SP becomes 'addr': compute the offset relative
// to the current SP and delegate to the register form of resize_frame.
void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) {
  // (addr == tmp1) || (addr == tmp2) is allowed here!
  assert(tmp1 != tmp2, "must be distinct");

  // compute offset w.r.t. current stack pointer
  // tmp_1 <- addr - SP (!)
  subf(tmp1, R1_SP, addr);

  // atomically update SP keeping back link.
  resize_frame(tmp1/* offset */, tmp2/* tmp */);
}
 945 
// Push a frame of (frame-aligned) size 'bytes'; the old SP is stored as
// back link at the new top of stack. tmp is clobbered.
void MacroAssembler::push_frame(Register bytes, Register tmp) {
#ifdef ASSERT
  assert(bytes != R0, "r0 not allowed here");
  andi_(R0, bytes, frame::alignment_in_bytes-1);
  asm_assert_eq("push_frame(Reg, Reg): unaligned", 0x203);
#endif
  neg(tmp, bytes);
  stdux(R1_SP, R1_SP, tmp);
}
 955 
 956 // Push a frame of size `bytes'.
void MacroAssembler::push_frame(unsigned int bytes, Register tmp) {
  // Align the requested size; use a single stdu when the negated size fits
  // a signed 16-bit displacement, otherwise materialize it in tmp.
  long offset = align_addr(bytes, frame::alignment_in_bytes);
  if (is_simm(-offset, 16)) {
    stdu(R1_SP, -offset, R1_SP);
  } else {
    load_const_optimized(tmp, -offset);
    stdux(R1_SP, R1_SP, tmp);
  }
}
 966 
 967 // Push a frame of size `bytes' plus abi_reg_args on top.
void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) {
  // Adds an ABI register-argument save area on top of the payload.
  push_frame(bytes + frame::abi_reg_args_size, tmp);
}
 971 
// Set up a new C frame with a spill area for non-volatile GPRs and
// additional space for local variables.
void MacroAssembler::push_frame_reg_args_nonvolatiles(unsigned int bytes,
                                                      Register tmp) {
  // Reserve abi_reg_args plus the non-volatile register spill area.
  push_frame(bytes + frame::abi_reg_args_size + frame::spill_nonvolatiles_size, tmp);
}
 978 
 979 // Pop current C frame.
void MacroAssembler::pop_frame() {
  // Restore SP from the back link stored in the current frame.
  ld(R1_SP, _abi(callers_sp), R1_SP);
}
 983 
 984 #if defined(ABI_ELFv2)
// Branch (bctr) or call (bctrl) via CTR to the code at r_function_entry.
// The entry address is first copied to R12 (see TODO below). Updates and
// returns _last_calls_return_pc.
address MacroAssembler::branch_to(Register r_function_entry, bool and_link) {
  // TODO(asmundak): make sure the caller uses R12 as function descriptor
  // most of the times.
  if (R12 != r_function_entry) {
    mr(R12, r_function_entry);
  }
  mtctr(R12);
  // Do a call or a branch.
  if (and_link) {
    bctrl();
  } else {
    bctr();
  }
  _last_calls_return_pc = pc();

  return _last_calls_return_pc;
}
1002 
1003 // Call a C function via a function descriptor and use full C
1004 // calling conventions. Updates and returns _last_calls_return_pc.
address MacroAssembler::call_c(Register r_function_entry) {
  // Emits bctrl, so the callee returns here.
  return branch_to(r_function_entry, /*and_link=*/true);
}
1008 
1009 // For tail calls: only branch, don't link, so callee returns to caller of this function.
address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) {
  // Emits bctr only; the callee returns to this function's caller.
  return branch_to(r_function_entry, /*and_link=*/false);
}
1013 
address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) {
  // Materialize the entry address in R12 (R0 as scratch) and call via CTR.
  // NOTE(review): 'rt' is not used by this ELFv2 variant.
  load_const(R12, function_entry, R0);
  return branch_to(R12,  /*and_link=*/true);
}
1018 
1019 #else
1020 // Generic version of a call to C function via a function descriptor
1021 // with variable support for C calling conventions (TOC, ENV, etc.).
1022 // Updates and returns _last_calls_return_pc.
address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call,
                                  bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) {
  // we emit standard ptrgl glue code here
  assert((function_descriptor != R0), "function_descriptor cannot be R0");
  // NOTE(review): save_toc_before_call and restore_toc_after_call are
  // accepted but not used by this implementation.

  // retrieve necessary entries from the function descriptor
  ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor);
  mtctr(R0);

  // Optionally load the callee's TOC and environment pointer from the descriptor.
  if (load_toc_of_callee) {
    ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor);
  }
  if (load_env_of_callee) {
    ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor);
  } else if (load_toc_of_callee) {
    // New TOC loaded but no env requested: clear R11 (NOTE(review):
    // presumably to avoid passing a stale environment pointer).
    li(R11, 0);
  }

  // do a call or a branch
  if (and_link) {
    bctrl();
  } else {
    bctr();
  }
  _last_calls_return_pc = pc();

  return _last_calls_return_pc;
}
1051 
1052 // Call a C function via a function descriptor and use full C calling
1053 // conventions.
1054 // We don't use the TOC in generated code, so there is no need to save
1055 // and restore its value.
address MacroAssembler::call_c(Register fd) {
  // Full descriptor-based call: load entry, TOC and env from fd.
  return branch_to(fd, /*and_link=*/true,
                       /*save toc=*/false,
                       /*restore toc=*/false,
                       /*load toc=*/true,
                       /*load env=*/true);
}
1063 
address MacroAssembler::call_c_and_return_to_caller(Register fd) {
  // Tail-call variant: branch only, so the callee returns to our caller.
  return branch_to(fd, /*and_link=*/false,
                       /*save toc=*/false,
                       /*restore toc=*/false,
                       /*load toc=*/true,
                       /*load env=*/true);
}
1071 
// Call via function descriptor 'fd'. For relocatable calls to friend
// functions a fixed-size patchable call sequence is emitted instead of the
// full ptrgl glue; nops keep both variants the same size. Updates and
// returns _last_calls_return_pc.
address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) {
  if (rt != relocInfo::none) {
    // this call needs to be relocatable
    // NOTE(review): the 'rt != relocInfo::none' term below is always true
    // here (rt != none was just checked above).
    if (!ReoptimizeCallSequences
        || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
        || fd == NULL   // support code-size estimation
        || !fd->is_friend_function()
        || fd->entry() == NULL) {
      // it's not a friend function as defined by class FunctionDescriptor,
      // so do a full call-c here.
      load_const(R11, (address)fd, R0);

      bool has_env = (fd != NULL && fd->env() != NULL);
      return branch_to(R11, /*and_link=*/true,
                            /*save toc=*/false,
                            /*restore toc=*/false,
                            /*load toc=*/true,
                            /*load env=*/has_env);
    } else {
      // It's a friend function. Load the entry point and don't care about
      // toc and env. Use an optimizable call instruction, but ensure the
      // same code-size as in the case of a non-friend function.
      nop();
      nop();
      nop();
      bl64_patchable(fd->entry(), rt);
      _last_calls_return_pc = pc();
      return _last_calls_return_pc;
    }
  } else {
    // This call does not need to be relocatable, do more aggressive
    // optimizations.
    if (!ReoptimizeCallSequences
      || !fd->is_friend_function()) {
      // It's not a friend function as defined by class FunctionDescriptor,
      // so do a full call-c here.
      load_const(R11, (address)fd, R0);
      return branch_to(R11, /*and_link=*/true,
                            /*save toc=*/false,
                            /*restore toc=*/false,
                            /*load toc=*/true,
                            /*load env=*/true);
    } else {
      // it's a friend function, load the entry point and don't care about
      // toc and env.
      address dest = fd->entry();
      // Use a short relative bl when the target is in range, else the
      // 64-bit patchable form.
      if (is_within_range_of_b(dest, pc())) {
        bl(dest);
      } else {
        bl64_patchable(dest, rt);
      }
      _last_calls_return_pc = pc();
      return _last_calls_return_pc;
    }
  }
}
1128 
1129 // Call a C function.  All constants needed reside in TOC.
1130 //
1131 // Read the address to call from the TOC.
1132 // Read env from TOC, if fd specifies an env.
1133 // Read new TOC from TOC.
address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd,
                                         relocInfo::relocType rt, Register toc) {
  // Returns NULL if a required TOC constant could not be allocated,
  // otherwise the return PC of the emitted call.
  if (!ReoptimizeCallSequences
    || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
    || !fd->is_friend_function()) {
    // It's not a friend function as defined by class FunctionDescriptor,
    // so do a full call-c here.
    assert(fd->entry() != NULL, "function must be linked");

    AddressLiteral fd_entry(fd->entry());
    bool success = load_const_from_method_toc(R11, fd_entry, toc, /*fixed_size*/ true);
    mtctr(R11);
    if (fd->env() == NULL) {
      // No env: clear R11; the nop keeps the sequence size fixed versus
      // the env-loading branch below.
      li(R11, 0);
      nop();
    } else {
      AddressLiteral fd_env(fd->env());
      success = success && load_const_from_method_toc(R11, fd_env, toc, /*fixed_size*/ true);
    }
    AddressLiteral fd_toc(fd->toc());
    // Set R2_TOC (load from toc)
    success = success && load_const_from_method_toc(R2_TOC, fd_toc, toc, /*fixed_size*/ true);
    bctrl();
    _last_calls_return_pc = pc();
    if (!success) { return NULL; }
  } else {
    // It's a friend function, load the entry point and don't care about
    // toc and env. Use an optimizable call instruction, but ensure the
    // same code-size as in the case of a non-friend function.
    nop();
    bl64_patchable(fd->entry(), rt);
    _last_calls_return_pc = pc();
  }
  return _last_calls_return_pc;
}
1169 #endif // ABI_ELFv2
1170 
// Common helper for call_VM: set up the last Java frame, call the VM
// entry point with the current thread as first argument, then tear down
// the frame state and fetch an oop result if requested.
// NOTE(review): callers must pass check_exceptions == false - exception
// checking is not implemented on this path (see ShouldNotReachHere below).
void MacroAssembler::call_VM_base(Register oop_result,
                                  Register last_java_sp,
                                  address  entry_point,
                                  bool     check_exceptions) {
  BLOCK_COMMENT("call_VM {");
  // Determine last_java_sp register.
  if (!last_java_sp->is_valid()) {
    last_java_sp = R1_SP;
  }
  set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1);

  // ARG1 must hold thread address.
  mr(R3_ARG1, R16_thread);
#if defined(ABI_ELFv2)
  address return_pc = call_c(entry_point, relocInfo::none);
#else
  address return_pc = call_c((FunctionDescriptor*)entry_point, relocInfo::none);
#endif

  reset_last_Java_frame();

  // Check for pending exceptions.
  if (check_exceptions) {
    // We don't check for exceptions here.
    ShouldNotReachHere();
  }

  // Get oop result if there is one and reset the value in the thread.
  if (oop_result->is_valid()) {
    get_vm_result(oop_result);
  }

  _last_calls_return_pc = return_pc;
  BLOCK_COMMENT("} call_VM");
}
1206 
void MacroAssembler::call_VM_leaf_base(address entry_point) {
  // Leaf call into the VM: plain C call, no last-Java-frame bookkeeping.
  BLOCK_COMMENT("call_VM_leaf {");
#if defined(ABI_ELFv2)
  call_c(entry_point, relocInfo::none);
#else
  call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, entry_point), relocInfo::none);
#endif
  BLOCK_COMMENT("} call_VM_leaf");
}
1216 
void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions) {
  // No-argument variant; the thread argument is supplied by call_VM_base.
  call_VM_base(oop_result, noreg, entry_point, check_exceptions);
}
1220 
void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1,
                             bool check_exceptions) {
  // One-argument variant; the argument is moved into R4_ARG2.
  // R3_ARG1 is reserved for the thread.
  mr_if_needed(R4_ARG2, arg_1);
  call_VM(oop_result, entry_point, check_exceptions);
}
1227 
void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2,
                             bool check_exceptions) {
  // Two-argument variant; the asserts guard against a later move
  // clobbering an argument register that was already filled.
  // R3_ARG1 is reserved for the thread
  mr_if_needed(R4_ARG2, arg_1);
  assert(arg_2 != R4_ARG2, "smashed argument");
  mr_if_needed(R5_ARG3, arg_2);
  call_VM(oop_result, entry_point, check_exceptions);
}
1236 
void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3,
                             bool check_exceptions) {
  // Three-argument variant; see the two-argument form for the shuffling
  // discipline. NOTE(review): no assert that arg_3 != R5_ARG3 here.
  // R3_ARG1 is reserved for the thread
  mr_if_needed(R4_ARG2, arg_1);
  assert(arg_2 != R4_ARG2, "smashed argument");
  mr_if_needed(R5_ARG3, arg_2);
  mr_if_needed(R6_ARG4, arg_3);
  call_VM(oop_result, entry_point, check_exceptions);
}
1246 
void MacroAssembler::call_VM_leaf(address entry_point) {
  // No-argument leaf call.
  call_VM_leaf_base(entry_point);
}
1250 
void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) {
  // One-argument leaf call; leaf calls use R3_ARG1 directly (no thread arg).
  mr_if_needed(R3_ARG1, arg_1);
  call_VM_leaf(entry_point);
}
1255 
void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) {
  // Two-argument leaf call; asserts guard against clobbering during shuffling.
  mr_if_needed(R3_ARG1, arg_1);
  assert(arg_2 != R3_ARG1, "smashed argument");
  mr_if_needed(R4_ARG2, arg_2);
  call_VM_leaf(entry_point);
}
1262 
void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) {
  // Three-argument leaf call; asserts guard against clobbering during shuffling.
  mr_if_needed(R3_ARG1, arg_1);
  assert(arg_2 != R3_ARG1, "smashed argument");
  mr_if_needed(R4_ARG2, arg_2);
  assert(arg_3 != R3_ARG1 && arg_3 != R4_ARG2, "smashed argument");
  mr_if_needed(R5_ARG3, arg_3);
  call_VM_leaf(entry_point);
}
1271 
1272 // Check whether instruction is a read access to the polling page
1273 // which was emitted by load_from_polling_page(..).
1274 bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext,
1275                                                address* polling_address_ptr) {
1276   if (!is_ld(instruction))
1277     return false; // It's not a ld. Fail.
1278 
1279   int rt = inv_rt_field(instruction);
1280   int ra = inv_ra_field(instruction);
1281   int ds = inv_ds_field(instruction);
1282   if (!(ds == 0 && ra != 0 && rt == 0)) {
1283     return false; // It's not a ld(r0, X, ra). Fail.
1284   }
1285 
1286   if (!ucontext) {
1287     // Set polling address.
1288     if (polling_address_ptr != NULL) {
1289       *polling_address_ptr = NULL;
1290     }
1291     return true; // No ucontext given. Can't check value of ra. Assume true.
1292   }
1293 
1294 #ifdef LINUX
1295   // Ucontext given. Check that register ra contains the address of
1296   // the safepoing polling page.
1297   ucontext_t* uc = (ucontext_t*) ucontext;
1298   // Set polling address.
1299   address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds;
1300   if (polling_address_ptr != NULL) {
1301     *polling_address_ptr = addr;
1302   }
1303   return os::is_poll_address(addr);
1304 #else
1305   // Not on Linux, ucontext must be NULL.
1306   ShouldNotReachHere();
1307   return false;
1308 #endif
1309 }
1310 
1311 bool MacroAssembler::is_memory_serialization(int instruction, JavaThread* thread, void* ucontext) {
1312 #ifdef LINUX
1313   ucontext_t* uc = (ucontext_t*) ucontext;
1314 
1315   if (is_stwx(instruction) || is_stwux(instruction)) {
1316     int ra = inv_ra_field(instruction);
1317     int rb = inv_rb_field(instruction);
1318 
1319     // look up content of ra and rb in ucontext
1320     address ra_val=(address)uc->uc_mcontext.regs->gpr[ra];
1321     long rb_val=(long)uc->uc_mcontext.regs->gpr[rb];
1322     return os::is_memory_serialize_page(thread, ra_val+rb_val);
1323   } else if (is_stw(instruction) || is_stwu(instruction)) {
1324     int ra = inv_ra_field(instruction);
1325     int d1 = inv_d1_field(instruction);
1326 
1327     // look up content of ra in ucontext
1328     address ra_val=(address)uc->uc_mcontext.regs->gpr[ra];
1329     return os::is_memory_serialize_page(thread, ra_val+d1);
1330   } else {
1331     return false;
1332   }
1333 #else
1334   // workaround not needed on !LINUX :-)
1335   ShouldNotCallThis();
1336   return false;
1337 #endif
1338 }
1339 
// Touch the stack page at SP - offset with a load or store (selected by
// UseLoadInstructionsForStackBangingPPC64). Clobbers R0; for offsets that
// need more than 16 bits, also clobbers R11.
void MacroAssembler::bang_stack_with_offset(int offset) {
  // When increasing the stack, the old stack pointer will be written
  // to the new top of stack according to the PPC64 abi.
  // Therefore, stack banging is not necessary when increasing
  // the stack by <= os::vm_page_size() bytes.
  // When increasing the stack by a larger amount, this method is
  // called repeatedly to bang the intermediate pages.

  // Stack grows down, caller passes positive offset.
  assert(offset > 0, "must bang with positive offset");

  long stdoffset = -offset;

  if (is_simm(stdoffset, 16)) {
    // Signed 16 bit offset, a simple std is ok.
    if (UseLoadInstructionsForStackBangingPPC64) {
      ld(R0, (int)(signed short)stdoffset, R1_SP);
    } else {
      std(R0,(int)(signed short)stdoffset, R1_SP);
    }
  } else if (is_simm(stdoffset, 31)) {
    // Split the offset into a high addis part and a low 16-bit displacement.
    const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset);
    const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset);

    Register tmp = R11;
    addis(tmp, R1_SP, hi);
    if (UseLoadInstructionsForStackBangingPPC64) {
      ld(R0,  lo, tmp);
    } else {
      std(R0, lo, tmp);
    }
  } else {
    // Offsets of 31+ bits are not supported.
    ShouldNotReachHere();
  }
}
1375 
1376 // If instruction is a stack bang of the form
1377 //    std    R0,    x(Ry),       (see bang_stack_with_offset())
1378 //    stdu   R1_SP, x(R1_SP),    (see push_frame(), resize_frame())
1379 // or stdux  R1_SP, Rx, R1_SP    (see push_frame(), resize_frame())
1380 // return the banged address. Otherwise, return 0.
1381 address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) {
1382 #ifdef LINUX
1383   ucontext_t* uc = (ucontext_t*) ucontext;
1384   int rs = inv_rs_field(instruction);
1385   int ra = inv_ra_field(instruction);
1386   if (   (is_ld(instruction)   && rs == 0 &&  UseLoadInstructionsForStackBangingPPC64)
1387       || (is_std(instruction)  && rs == 0 && !UseLoadInstructionsForStackBangingPPC64)
1388       || (is_stdu(instruction) && rs == 1)) {
1389     int ds = inv_ds_field(instruction);
1390     // return banged address
1391     return ds+(address)uc->uc_mcontext.regs->gpr[ra];
1392   } else if (is_stdux(instruction) && rs == 1) {
1393     int rb = inv_rb_field(instruction);
1394     address sp = (address)uc->uc_mcontext.regs->gpr[1];
1395     long rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
1396     return ra != 1 || rb_val >= 0 ? NULL         // not a stack bang
1397                                   : sp + rb_val; // banged address
1398   }
1399   return NULL; // not a stack bang
1400 #else
1401   // workaround not needed on !LINUX :-)
1402   ShouldNotCallThis();
1403   return NULL;
1404 #endif
1405 }
1406 
// Check whether SP has entered the reserved stack zone; if so, re-enable
// the zone via the VM and tail-jump (with return_pc restored to LR) to the
// delayed StackOverflowError stub. Clobbers R0.
void MacroAssembler::reserved_stack_check(Register return_pc) {
  // Test if reserved zone needs to be enabled.
  Label no_reserved_zone_enabling;

  // Compare SP (unsigned) against the reserved-stack activation watermark.
  ld_ptr(R0, JavaThread::reserved_stack_activation_offset(), R16_thread);
  cmpld(CCR0, R1_SP, R0);
  blt_predict_taken(CCR0, no_reserved_zone_enabling);

  // Enable reserved zone again, throw stack overflow exception.
  push_frame_reg_args(0, R0);
  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), R16_thread);
  pop_frame();
  mtlr(return_pc);
  // Tail-jump to the throw stub via CTR.
  load_const_optimized(R0, StubRoutines::throw_delayed_StackOverflowError_entry());
  mtctr(R0);
  bctr();

  should_not_reach_here();

  bind(no_reserved_zone_enabling);
}
1428 
// Atomic 64-bit exchange: dest_current_value <- *addr_base and
// *addr_base <- exchange_value, via a ldarx/stdcx_ retry loop.
void MacroAssembler::getandsetd(Register dest_current_value, Register exchange_value, Register addr_base,
                                bool cmpxchgx_hint) {
  Label retry;
  bind(retry);
  ldarx(dest_current_value, addr_base, cmpxchgx_hint);
  stdcx_(exchange_value, addr_base);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
  } else {
    bne(                  CCR0, retry); // StXcx_ sets CCR0.
  }
}
1441 
// Atomic 64-bit fetch-and-add: dest_current_value <- *addr_base and
// *addr_base += inc_value, via a ldarx/stdcx_ retry loop.
// tmp holds the incremented value that is stored back.
void MacroAssembler::getandaddd(Register dest_current_value, Register inc_value, Register addr_base,
                                Register tmp, bool cmpxchgx_hint) {
  Label retry;
  bind(retry);
  ldarx(dest_current_value, addr_base, cmpxchgx_hint);
  add(tmp, dest_current_value, inc_value);
  stdcx_(tmp, addr_base);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
  } else {
    bne(                  CCR0, retry); // StXcx_ sets CCR0.
  }
}
1455 
1456 // Word/sub-word atomic helper functions
1457 
1458 // Temps and addr_base are killed if size < 4 and processor does not support respective instructions.
1459 // Only signed types are supported with size < 4.
1460 // Atomic add always kills tmp1.
void MacroAssembler::atomic_get_and_modify_generic(Register dest_current_value, Register exchange_value,
                                                   Register addr_base, Register tmp1, Register tmp2, Register tmp3,
                                                   bool cmpxchgx_hint, bool is_add, int size) {
  // dest_current_value receives the old (sign-extended) value. If is_add,
  // *addr_base += exchange_value, otherwise *addr_base = exchange_value.
  // Sub-word instructions are available since Power 8.
  // For older processors, instruction_type != size holds, and we
  // emulate the sub-word instructions by constructing a 4-byte value
  // that leaves the other bytes unchanged.
  const int instruction_type = VM_Version::has_lqarx() ? size : 4;

  Label retry;
  Register shift_amount = noreg,
           val32 = dest_current_value,
           modval = is_add ? tmp1 : exchange_value;

  if (instruction_type != size) {
    assert_different_registers(tmp1, tmp2, tmp3, dest_current_value, exchange_value, addr_base);
    modval = tmp1;
    shift_amount = tmp2;
    val32 = tmp3;
    // Need some preparation: Compute shift amount, align address. Note: shorts must be 2 byte aligned.
#ifdef VM_LITTLE_ENDIAN
    rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
    clrrdi(addr_base, addr_base, 2);
#else
    xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
    clrrdi(addr_base, addr_base, 2);
    rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
#endif
  }

  // atomic emulation loop
  bind(retry);

  // Load-reserve the (possibly widened) word.
  switch (instruction_type) {
    case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
    case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
    case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
    default: ShouldNotReachHere();
  }

  if (instruction_type != size) {
    // Extract the addressed sub-word from the loaded 4-byte value.
    srw(dest_current_value, val32, shift_amount);
  }

  if (is_add) { add(modval, dest_current_value, exchange_value); }

  if (instruction_type != size) {
    // Transform exchange value such that the replacement can be done by one xor instruction.
    xorr(modval, dest_current_value, is_add ? modval : exchange_value);
    clrldi(modval, modval, (size == 1) ? 56 : 48);
    slw(modval, modval, shift_amount);
    xorr(modval, val32, modval);
  }

  // Store-conditional; retry on reservation loss.
  switch (instruction_type) {
    case 4: stwcx_(modval, addr_base); break;
    case 2: sthcx_(modval, addr_base); break;
    case 1: stbcx_(modval, addr_base); break;
    default: ShouldNotReachHere();
  }

  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
  } else {
    bne(                  CCR0, retry); // StXcx_ sets CCR0.
  }

  // l?arx zero-extends, but Java wants byte/short values sign-extended.
  if (size == 1) {
    extsb(dest_current_value, dest_current_value);
  } else if (size == 2) {
    extsh(dest_current_value, dest_current_value);
  };
}
1535 
1536 // Temps, addr_base and exchange_value are killed if size < 4 and processor does not support respective instructions.
1537 // Only signed types are supported with size < 4.
// Emits the compare-and-exchange retry loop body. On the compare miss path
// it branches to 'failed' with 'flag' set ne; on a lost reservation the
// caller's code after the loop observes CCR0 ne from the stXcx_.
// dest_current_value receives the old (sign-extended) value.
void MacroAssembler::cmpxchg_loop_body(ConditionRegister flag, Register dest_current_value,
                                       Register compare_value, Register exchange_value,
                                       Register addr_base, Register tmp1, Register tmp2,
                                       Label &retry, Label &failed, bool cmpxchgx_hint, int size) {
  // Sub-word instructions are available since Power 8.
  // For older processors, instruction_type != size holds, and we
  // emulate the sub-word instructions by constructing a 4-byte value
  // that leaves the other bytes unchanged.
  const int instruction_type = VM_Version::has_lqarx() ? size : 4;

  Register shift_amount = noreg,
           val32 = dest_current_value,
           modval = exchange_value;

  if (instruction_type != size) {
    assert_different_registers(tmp1, tmp2, dest_current_value, compare_value, exchange_value, addr_base);
    shift_amount = tmp1;
    val32 = tmp2;
    modval = tmp2;
    // Need some preparation: Compute shift amount, align address. Note: shorts must be 2 byte aligned.
#ifdef VM_LITTLE_ENDIAN
    rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
    clrrdi(addr_base, addr_base, 2);
#else
    xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
    clrrdi(addr_base, addr_base, 2);
    rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
#endif
    // Transform exchange value such that the replacement can be done by one xor instruction.
    xorr(exchange_value, compare_value, exchange_value);
    clrldi(exchange_value, exchange_value, (size == 1) ? 56 : 48);
    slw(exchange_value, exchange_value, shift_amount);
  }

  // atomic emulation loop
  bind(retry);

  // Load-reserve the (possibly widened) word.
  switch (instruction_type) {
    case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
    case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
    case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
    default: ShouldNotReachHere();
  }

  if (instruction_type != size) {
    // Extract the addressed sub-word from the loaded 4-byte value.
    srw(dest_current_value, val32, shift_amount);
  }
  // l?arx zero-extends, but Java wants byte/short values sign-extended.
  if (size == 1) {
    extsb(dest_current_value, dest_current_value);
  } else if (size == 2) {
    extsh(dest_current_value, dest_current_value);
  };

  cmpw(flag, dest_current_value, compare_value);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(flag, failed);
  } else {
    bne(                  flag, failed);
  }
  // branch to done  => (flag == ne), (dest_current_value != compare_value)
  // fall through    => (flag == eq), (dest_current_value == compare_value)

  if (instruction_type != size) {
    // Merge the new sub-word into the loaded word with a single xor.
    xorr(modval, val32, exchange_value);
  }

  // Store-conditional; the caller branches back to 'retry' on failure.
  switch (instruction_type) {
    case 4: stwcx_(modval, addr_base); break;
    case 2: sthcx_(modval, addr_base); break;
    case 1: stbcx_(modval, addr_base); break;
    default: ShouldNotReachHere();
  }
}
1611 
1612 // CmpxchgX sets condition register to cmpX(current, compare).
// Generic compare-and-exchange for 1-, 2- and 4-byte values.
// Emits the full CAS sequence around cmpxchg_loop_body: optional pre-check
// (contention_hint), release/acquire fences per 'semantics', retry loop
// ('weak' skips the retry and treats stXcx_ failure as CAS failure), and an
// optional 0/1 result in int_flag_success.
void MacroAssembler::cmpxchg_generic(ConditionRegister flag, Register dest_current_value,
                                     Register compare_value, Register exchange_value,
                                     Register addr_base, Register tmp1, Register tmp2,
                                     int semantics, bool cmpxchgx_hint,
                                     Register int_flag_success, bool contention_hint, bool weak, int size) {
  Label retry;
  Label failed;
  Label done;

  // Save one branch if result is returned via register and
  // result register is different from the other ones.
  bool use_result_reg    = (int_flag_success != noreg);
  bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value &&
                            int_flag_success != exchange_value && int_flag_success != addr_base &&
                            int_flag_success != tmp1 && int_flag_success != tmp2);
  assert(!weak || flag == CCR0, "weak only supported with CCR0");
  assert(size == 1 || size == 2 || size == 4, "unsupported");

  if (use_result_reg && preset_result_reg) {
    li(int_flag_success, 0); // preset (assume cas failed)
  }

  // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
  if (contention_hint) { // Don't try to reserve if cmp fails.
    switch (size) {
      // Plain (non-reserving) load; sub-word values are sign-extended to
      // match what cmpxchg_loop_body compares.
      case 1: lbz(dest_current_value, 0, addr_base); extsb(dest_current_value, dest_current_value); break;
      case 2: lha(dest_current_value, 0, addr_base); break;
      case 4: lwz(dest_current_value, 0, addr_base); break;
      default: ShouldNotReachHere();
    }
    cmpw(flag, dest_current_value, compare_value);
    bne(flag, failed);
  }

  // release/fence semantics
  if (semantics & MemBarRel) {
    release();
  }

  cmpxchg_loop_body(flag, dest_current_value, compare_value, exchange_value, addr_base, tmp1, tmp2,
                    retry, failed, cmpxchgx_hint, size);
  // Weak CAS: a failed store-conditional goes to 'failed' instead of retrying.
  // With neither retry nor result register needed, the branch can be elided.
  if (!weak || use_result_reg) {
    if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
      bne_predict_not_taken(CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
    } else {
      bne(                  CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
    }
  }
  // fall through    => (flag == eq), (dest_current_value == compare_value), (swapped)

  // Result in register (must do this at the end because int_flag_success can be the
  // same register as one above).
  if (use_result_reg) {
    li(int_flag_success, 1);
  }

  if (semantics & MemBarFenceAfter) {
    fence();
  } else if (semantics & MemBarAcq) {
    isync();
  }

  if (use_result_reg && !preset_result_reg) {
    b(done);
  }

  bind(failed);
  if (use_result_reg && !preset_result_reg) {
    li(int_flag_success, 0);
  }

  bind(done);
  // (flag == ne) => (dest_current_value != compare_value), (!swapped)
  // (flag == eq) => (dest_current_value == compare_value), ( swapped)
}
1688 
// Performs atomic compare exchange:
1690 //   if (compare_value == *addr_base)
1691 //     *addr_base = exchange_value
1692 //     int_flag_success = 1;
1693 //   else
1694 //     int_flag_success = 0;
1695 //
1696 // ConditionRegister flag       = cmp(compare_value, *addr_base)
1697 // Register dest_current_value  = *addr_base
1698 // Register compare_value       Used to compare with value in memory
1699 // Register exchange_value      Written to memory if compare_value == *addr_base
1700 // Register addr_base           The memory location to compareXChange
1701 // Register int_flag_success    Set to 1 if exchange_value was written to *addr_base
1702 //
1703 // To avoid the costly compare exchange the value is tested beforehand.
1704 // Several special cases exist to avoid that unnecessary information is generated.
1705 //
// 8-byte variant (ldarx/stdcx_). 'failed_ext' lets the caller supply an
// external failure label instead of the internal fallthrough one; it is
// mutually exclusive with int_flag_success (asserted below).
void MacroAssembler::cmpxchgd(ConditionRegister flag,
                              Register dest_current_value, RegisterOrConstant compare_value, Register exchange_value,
                              Register addr_base, int semantics, bool cmpxchgx_hint,
                              Register int_flag_success, Label* failed_ext, bool contention_hint, bool weak) {
  Label retry;
  Label failed_int;
  Label& failed = (failed_ext != NULL) ? *failed_ext : failed_int;
  Label done;

  // Save one branch if result is returned via register and result register is different from the other ones.
  bool use_result_reg    = (int_flag_success!=noreg);
  bool preset_result_reg = (int_flag_success!=dest_current_value && int_flag_success!=compare_value.register_or_noreg() &&
                            int_flag_success!=exchange_value && int_flag_success!=addr_base);
  assert(!weak || flag == CCR0, "weak only supported with CCR0");
  assert(int_flag_success == noreg || failed_ext == NULL, "cannot have both");

  if (use_result_reg && preset_result_reg) {
    li(int_flag_success, 0); // preset (assume cas failed)
  }

  // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
  if (contention_hint) { // Don't try to reserve if cmp fails.
    ld(dest_current_value, 0, addr_base);
    cmpd(flag, compare_value, dest_current_value);
    bne(flag, failed);
  }

  // release/fence semantics
  if (semantics & MemBarRel) {
    release();
  }

  // atomic emulation loop
  bind(retry);

  ldarx(dest_current_value, addr_base, cmpxchgx_hint);
  cmpd(flag, compare_value, dest_current_value);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(flag, failed);
  } else {
    bne(                  flag, failed);
  }

  stdcx_(exchange_value, addr_base);
  // Weak CAS: don't retry on reservation loss; still need the branch when a
  // result register or an external failure label must observe the outcome.
  if (!weak || use_result_reg || failed_ext) {
    if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
      bne_predict_not_taken(CCR0, weak ? failed : retry); // stXcx_ sets CCR0
    } else {
      bne(                  CCR0, weak ? failed : retry); // stXcx_ sets CCR0
    }
  }

  // result in register (must do this at the end because int_flag_success can be the same register as one above)
  if (use_result_reg) {
    li(int_flag_success, 1);
  }

  if (semantics & MemBarFenceAfter) {
    fence();
  } else if (semantics & MemBarAcq) {
    isync();
  }

  if (use_result_reg && !preset_result_reg) {
    b(done);
  }

  bind(failed_int);
  if (use_result_reg && !preset_result_reg) {
    li(int_flag_success, 0);
  }

  bind(done);
  // (flag == ne) => (dest_current_value != compare_value), (!swapped)
  // (flag == eq) => (dest_current_value == compare_value), ( swapped)
}
1782 
1783 // Look up the method for a megamorphic invokeinterface call.
1784 // The target method is determined by <intf_klass, itable_index>.
1785 // The receiver klass is in recv_klass.
1786 // On success, the result will be in method_result, and execution falls through.
1787 // On failure, execution transfers to the given label.
void MacroAssembler::lookup_interface_method(Register recv_klass,
                                             Register intf_klass,
                                             RegisterOrConstant itable_index,
                                             Register method_result,
                                             Register scan_temp,
                                             Register temp2,
                                             Label& L_no_such_interface,
                                             bool return_method) {
  assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);

  // Compute start of first itableOffsetEntry (which is at the end of the vtable).
  int vtable_base = in_bytes(Klass::vtable_start_offset());
  int itentry_off = itableMethodEntry::method_offset_in_bytes();
  int logMEsize   = exact_log2(itableMethodEntry::size() * wordSize);
  int scan_step   = itableOffsetEntry::size() * wordSize;
  int log_vte_size= exact_log2(vtableEntry::size_in_bytes());

  lwz(scan_temp, in_bytes(Klass::vtable_length_offset()), recv_klass);
  // %%% We should store the aligned, prescaled offset in the klassoop.
  // Then the next several instructions would fold away.

  // scan_temp = recv_klass + vtable_base + vtable_length * vte_size
  // (address of the first itableOffsetEntry).
  sldi(scan_temp, scan_temp, log_vte_size);
  addi(scan_temp, scan_temp, vtable_base);
  add(scan_temp, recv_klass, scan_temp);

  // Adjust recv_klass by scaled itable_index, so we can free itable_index.
  if (return_method) {
    if (itable_index.is_register()) {
      Register itable_offset = itable_index.as_register();
      sldi(method_result, itable_offset, logMEsize);
      if (itentry_off) { addi(method_result, method_result, itentry_off); }
      add(method_result, method_result, recv_klass);
    } else {
      long itable_offset = (long)itable_index.as_constant();
      // static address, no relocation
      add_const_optimized(method_result, recv_klass, (itable_offset << logMEsize) + itentry_off, temp2);
    }
  }

  // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
  //   if (scan->interface() == intf) {
  //     result = (klass + scan->offset() + itable_index);
  //   }
  // }
  Label search, found_method;

  // First iteration (peel == 1) is peeled: its null/miss handling becomes the
  // shared loop tail, so the common hit-on-first-entry case takes one branch.
  for (int peel = 1; peel >= 0; peel--) {
    // %%%% Could load both offset and interface in one ldx, if they were
    // in the opposite order. This would save a load.
    ld(temp2, itableOffsetEntry::interface_offset_in_bytes(), scan_temp);

    // Check that this entry is non-null. A null entry means that
    // the receiver class doesn't implement the interface, and wasn't the
    // same as when the caller was compiled.
    cmpd(CCR0, temp2, intf_klass);

    if (peel) {
      beq(CCR0, found_method);
    } else {
      bne(CCR0, search);
      // (invert the test to fall through to found_method...)
    }

    if (!peel) break;

    bind(search);

    // Null entry terminates the itable: interface not implemented.
    cmpdi(CCR0, temp2, 0);
    beq(CCR0, L_no_such_interface);
    addi(scan_temp, scan_temp, scan_step);
  }

  bind(found_method);

  // Got a hit.
  if (return_method) {
    // method_result = recv_klass + itable_offset (from matched entry)
    //                 + scaled itable_index (precomputed above).
    int ito_offset = itableOffsetEntry::offset_offset_in_bytes();
    lwz(scan_temp, ito_offset, scan_temp);
    ldx(method_result, scan_temp, method_result);
  }
}
1869 
1870 // virtual method calling
// virtual method calling
// Loads the Method* for 'vtable_index' from recv_klass's vtable.
// Clobbers recv_klass (and vtable_index's register, if any).
// NOTE(review): the final load targets hard-coded R19_method rather than the
// method_result parameter — presumably all callers pass R19_method as
// method_result; confirm against call sites before relying on method_result.
void MacroAssembler::lookup_virtual_method(Register recv_klass,
                                           RegisterOrConstant vtable_index,
                                           Register method_result) {

  assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg());

  const int base = in_bytes(Klass::vtable_start_offset());
  assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");

  // recv_klass += vtable_index * wordSize (scaled slot offset).
  if (vtable_index.is_register()) {
    sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord);
    add(recv_klass, vtable_index.as_register(), recv_klass);
  } else {
    addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord);
  }
  ld(R19_method, base + vtableEntry::method_offset_in_bytes(), recv_klass);
}
1888 
1889 /////////////////////////////////////////// subtype checking ////////////////////////////////////////////
/////////////////////////////////////////// subtype checking ////////////////////////////////////////////
// Fast-path subtype check: identity test, then the super-check-offset probe.
// Exactly one outcome label may be NULL (it becomes a fallthrough). The slow
// path (secondary supers scan) is reached via L_slow_path when the probe hits
// the secondary_super_cache slot and cannot decide.
void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp1_reg,
                                                   Register temp2_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   Label* L_slow_path,
                                                   RegisterOrConstant super_check_offset) {

  const Register check_cache_offset = temp1_reg;
  const Register cached_super       = temp2_reg;

  assert_different_registers(sub_klass, super_klass, check_cache_offset, cached_super);

  int sco_offset = in_bytes(Klass::super_check_offset_offset());
  int sc_offset  = in_bytes(Klass::secondary_super_cache_offset());

  // -1 (the default constant) means: offset unknown, load it from super_klass.
  bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
  bool need_slow_path = (must_load_sco || super_check_offset.constant_or_zero() == sco_offset);

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1 ||
         (L_slow_path == &L_fallthrough && label_nulls <= 2 && !need_slow_path),
         "at most one NULL in the batch, usually");

  // If the pointers are equal, we are done (e.g., String[] elements).
  // This self-check enables sharing of secondary supertype arrays among
  // non-primary types such as array-of-interface. Otherwise, each such
  // type would need its own customized SSA.
  // We move this check to the front of the fast path because many
  // type checks are in fact trivially successful in this manner,
  // so we get a nicely predicted branch right at the start of the check.
  cmpd(CCR0, sub_klass, super_klass);
  beq(CCR0, *L_success);

  // Check the supertype display:
  if (must_load_sco) {
    // The super check offset is always positive...
    lwz(check_cache_offset, sco_offset, super_klass);
    super_check_offset = RegisterOrConstant(check_cache_offset);
    // super_check_offset is register.
    assert_different_registers(sub_klass, super_klass, cached_super, super_check_offset.as_register());
  }
  // The loaded value is the offset from KlassOopDesc.

  ld(cached_super, super_check_offset, sub_klass);
  cmpd(CCR0, cached_super, super_klass);

  // This check has worked decisively for primary supers.
  // Secondary supers are sought in the super_cache ('super_cache_addr').
  // (Secondary supers are interfaces and very deeply nested subtypes.)
  // This works in the same check above because of a tricky aliasing
  // between the super_cache and the primary super display elements.
  // (The 'super_check_addr' can address either, as the case requires.)
  // Note that the cache is updated below if it does not help us find
  // what we need immediately.
  // So if it was a primary super, we can just fail immediately.
  // Otherwise, it's the slow path for us (no success at this point).

// Branch to 'label' only when it isn't the fallthrough (saves a dead jump).
#define FINAL_JUMP(label) if (&(label) != &L_fallthrough) { b(label); }

  if (super_check_offset.is_register()) {
    // Offset known only at runtime: test it against sc_offset to decide
    // between hard failure and the slow path.
    beq(CCR0, *L_success);
    cmpwi(CCR0, super_check_offset.as_register(), sc_offset);
    if (L_failure == &L_fallthrough) {
      beq(CCR0, *L_slow_path);
    } else {
      bne(CCR0, *L_failure);
      FINAL_JUMP(*L_slow_path);
    }
  } else {
    if (super_check_offset.as_constant() == sc_offset) {
      // Need a slow path; fast failure is impossible.
      if (L_slow_path == &L_fallthrough) {
        beq(CCR0, *L_success);
      } else {
        bne(CCR0, *L_slow_path);
        FINAL_JUMP(*L_success);
      }
    } else {
      // No slow path; it's a fast decision.
      if (L_failure == &L_fallthrough) {
        beq(CCR0, *L_success);
      } else {
        bne(CCR0, *L_failure);
        FINAL_JUMP(*L_success);
      }
    }
  }

  bind(L_fallthrough);
#undef FINAL_JUMP
}
1987 
// Slow-path subtype check: linear scan of sub_klass's secondary supers array.
// On a hit, super_klass is stored into the secondary_super_cache and either
// L_success is taken, result_reg is set to 0, or (with neither) the routine
// returns with CR0.eq. On a miss, result_reg (if given) is set to 1 and
// execution falls through.
void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp1_reg,
                                                   Register temp2_reg,
                                                   Label* L_success,
                                                   Register result_reg) {
  const Register array_ptr = temp1_reg; // current value from cache array
  const Register temp      = temp2_reg;

  assert_different_registers(sub_klass, super_klass, array_ptr, temp);

  int source_offset = in_bytes(Klass::secondary_supers_offset());
  int target_offset = in_bytes(Klass::secondary_super_cache_offset());

  int length_offset = Array<Klass*>::length_offset_in_bytes();
  int base_offset   = Array<Klass*>::base_offset_in_bytes();

  Label hit, loop, failure, fallthru;

  ld(array_ptr, source_offset, sub_klass);

  // TODO: PPC port: assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated.");
  lwz(temp, length_offset, array_ptr);
  cmpwi(CCR0, temp, 0);
  beq(CCR0, result_reg!=noreg ? failure : fallthru); // length 0

  mtctr(temp); // load ctr

  bind(loop);
  // Oops in table are NO MORE compressed.
  ld(temp, base_offset, array_ptr);
  cmpd(CCR0, temp, super_klass);
  beq(CCR0, hit);
  addi(array_ptr, array_ptr, BytesPerWord);
  bdnz(loop); // decrement ctr, loop while elements remain

  bind(failure);
  if (result_reg!=noreg) li(result_reg, 1); // load non-zero result (indicates a miss)
  b(fallthru);

  bind(hit);
  std(super_klass, target_offset, sub_klass); // save result to cache
  if (result_reg != noreg) { li(result_reg, 0); } // load zero result (indicates a hit)
  if (L_success != NULL) { b(*L_success); }
  else if (result_reg == noreg) { blr(); } // return with CR0.eq if neither label nor result reg provided

  bind(fallthru);
}
2036 
2037 // Try fast path, then go to slow one if not successful
2038 void MacroAssembler::check_klass_subtype(Register sub_klass,
2039                          Register super_klass,
2040                          Register temp1_reg,
2041                          Register temp2_reg,
2042                          Label& L_success) {
2043   Label L_failure;
2044   check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success, &L_failure);
2045   check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success);
2046   bind(L_failure); // Fallthru if not successful.
2047 }
2048 
// Loads the MethodType of the method handle in mh_reg (offset resolved via
// delayed_value) and branches to wrong_method_type unless it equals mtype_reg.
// Clobbers temp_reg.
void MacroAssembler::check_method_handle_type(Register mtype_reg, Register mh_reg,
                                              Register temp_reg,
                                              Label& wrong_method_type) {
  assert_different_registers(mtype_reg, mh_reg, temp_reg);
  // Compare method type against that of the receiver.
  load_heap_oop_not_null(temp_reg, delayed_value(java_lang_invoke_MethodHandle::type_offset_in_bytes, temp_reg), mh_reg);
  cmpd(CCR0, temp_reg, mtype_reg);
  bne(CCR0, wrong_method_type);
}
2058 
2059 RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot,
2060                                                    Register temp_reg,
2061                                                    int extra_slot_offset) {
2062   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
2063   int stackElementSize = Interpreter::stackElementSize;
2064   int offset = extra_slot_offset * stackElementSize;
2065   if (arg_slot.is_constant()) {
2066     offset += arg_slot.as_constant() * stackElementSize;
2067     return offset;
2068   } else {
2069     assert(temp_reg != noreg, "must specify");
2070     sldi(temp_reg, arg_slot.as_register(), exact_log2(stackElementSize));
2071     if (offset != 0)
2072       addi(temp_reg, temp_reg, offset);
2073     return temp_reg;
2074   }
2075 }
2076 
2077 // Supports temp2_reg = R0.
2078 void MacroAssembler::biased_locking_enter(ConditionRegister cr_reg, Register obj_reg,
2079                                           Register mark_reg, Register temp_reg,
2080                                           Register temp2_reg, Label& done, Label* slow_case) {
2081   assert(UseBiasedLocking, "why call this otherwise?");
2082 
2083 #ifdef ASSERT
2084   assert_different_registers(obj_reg, mark_reg, temp_reg, temp2_reg);
2085 #endif
2086 
2087   Label cas_label;
2088 
2089   // Branch to done if fast path fails and no slow_case provided.
2090   Label *slow_case_int = (slow_case != NULL) ? slow_case : &done;
2091 
2092   // Biased locking
2093   // See whether the lock is currently biased toward our thread and
2094   // whether the epoch is still valid
2095   // Note that the runtime guarantees sufficient alignment of JavaThread
2096   // pointers to allow age to be placed into low bits
2097   assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits,
2098          "biased locking makes assumptions about bit layout");
2099 
2100   if (PrintBiasedLockingStatistics) {
2101     load_const(temp2_reg, (address) BiasedLocking::total_entry_count_addr(), temp_reg);
2102     lwzx(temp_reg, temp2_reg);
2103     addi(temp_reg, temp_reg, 1);
2104     stwx(temp_reg, temp2_reg);
2105   }
2106 
2107   andi(temp_reg, mark_reg, markOopDesc::biased_lock_mask_in_place);
2108   cmpwi(cr_reg, temp_reg, markOopDesc::biased_lock_pattern);
2109   bne(cr_reg, cas_label);
2110 
2111   load_klass(temp_reg, obj_reg);
2112 
2113   load_const_optimized(temp2_reg, ~((int) markOopDesc::age_mask_in_place));
2114   ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
2115   orr(temp_reg, R16_thread, temp_reg);
2116   xorr(temp_reg, mark_reg, temp_reg);
2117   andr(temp_reg, temp_reg, temp2_reg);
2118   cmpdi(cr_reg, temp_reg, 0);
2119   if (PrintBiasedLockingStatistics) {
2120     Label l;
2121     bne(cr_reg, l);
2122     load_const(temp2_reg, (address) BiasedLocking::biased_lock_entry_count_addr());
2123     lwzx(mark_reg, temp2_reg);
2124     addi(mark_reg, mark_reg, 1);
2125     stwx(mark_reg, temp2_reg);
2126     // restore mark_reg
2127     ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg);
2128     bind(l);
2129   }
2130   beq(cr_reg, done);
2131 
2132   Label try_revoke_bias;
2133   Label try_rebias;
2134 
2135   // At this point we know that the header has the bias pattern and
2136   // that we are not the bias owner in the current epoch. We need to
2137   // figure out more details about the state of the header in order to
2138   // know what operations can be legally performed on the object's
2139   // header.
2140 
2141   // If the low three bits in the xor result aren't clear, that means
2142   // the prototype header is no longer biased and we have to revoke
2143   // the bias on this object.
2144   andi(temp2_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
2145   cmpwi(cr_reg, temp2_reg, 0);
2146   bne(cr_reg, try_revoke_bias);
2147 
2148   // Biasing is still enabled for this data type. See whether the
2149   // epoch of the current bias is still valid, meaning that the epoch
2150   // bits of the mark word are equal to the epoch bits of the
2151   // prototype header. (Note that the prototype header's epoch bits
2152   // only change at a safepoint.) If not, attempt to rebias the object
2153   // toward the current thread. Note that we must be absolutely sure
2154   // that the current epoch is invalid in order to do this because
2155   // otherwise the manipulations it performs on the mark word are
2156   // illegal.
2157 
2158   int shift_amount = 64 - markOopDesc::epoch_shift;
2159   // rotate epoch bits to right (little) end and set other bits to 0
2160   // [ big part | epoch | little part ] -> [ 0..0 | epoch ]
2161   rldicl_(temp2_reg, temp_reg, shift_amount, 64 - markOopDesc::epoch_bits);
2162   // branch if epoch bits are != 0, i.e. they differ, because the epoch has been incremented
2163   bne(CCR0, try_rebias);
2164 
2165   // The epoch of the current bias is still valid but we know nothing
2166   // about the owner; it might be set or it might be clear. Try to
2167   // acquire the bias of the object using an atomic operation. If this
2168   // fails we will go in to the runtime to revoke the object's bias.
2169   // Note that we first construct the presumed unbiased header so we
2170   // don't accidentally blow away another thread's valid bias.
2171   andi(mark_reg, mark_reg, (markOopDesc::biased_lock_mask_in_place |
2172                                 markOopDesc::age_mask_in_place |
2173                                 markOopDesc::epoch_mask_in_place));
2174   orr(temp_reg, R16_thread, mark_reg);
2175 
2176   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2177 
2178   // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg).
2179   cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
2180            /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
2181            /*where=*/obj_reg,
2182            MacroAssembler::MemBarAcq,
2183            MacroAssembler::cmpxchgx_hint_acquire_lock(),
2184            noreg, slow_case_int); // bail out if failed
2185 
2186   // If the biasing toward our thread failed, this means that
2187   // another thread succeeded in biasing it toward itself and we
2188   // need to revoke that bias. The revocation will occur in the
2189   // interpreter runtime in the slow case.
2190   if (PrintBiasedLockingStatistics) {
2191     load_const(temp2_reg, (address) BiasedLocking::anonymously_biased_lock_entry_count_addr(), temp_reg);
2192     lwzx(temp_reg, temp2_reg);
2193     addi(temp_reg, temp_reg, 1);
2194     stwx(temp_reg, temp2_reg);
2195   }
2196   b(done);
2197 
2198   bind(try_rebias);
2199   // At this point we know the epoch has expired, meaning that the
2200   // current "bias owner", if any, is actually invalid. Under these
2201   // circumstances _only_, we are allowed to use the current header's
2202   // value as the comparison value when doing the cas to acquire the
2203   // bias in the current epoch. In other words, we allow transfer of
2204   // the bias from one thread to another directly in this situation.
2205   load_klass(temp_reg, obj_reg);
2206   andi(temp2_reg, mark_reg, markOopDesc::age_mask_in_place);
2207   orr(temp2_reg, R16_thread, temp2_reg);
2208   ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
2209   orr(temp_reg, temp2_reg, temp_reg);
2210 
2211   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2212 
2213   cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
2214                  /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
2215                  /*where=*/obj_reg,
2216                  MacroAssembler::MemBarAcq,
2217                  MacroAssembler::cmpxchgx_hint_acquire_lock(),
2218                  noreg, slow_case_int); // bail out if failed
2219 
2220   // If the biasing toward our thread failed, this means that
2221   // another thread succeeded in biasing it toward itself and we
2222   // need to revoke that bias. The revocation will occur in the
2223   // interpreter runtime in the slow case.
2224   if (PrintBiasedLockingStatistics) {
2225     load_const(temp2_reg, (address) BiasedLocking::rebiased_lock_entry_count_addr(), temp_reg);
2226     lwzx(temp_reg, temp2_reg);
2227     addi(temp_reg, temp_reg, 1);
2228     stwx(temp_reg, temp2_reg);
2229   }
2230   b(done);
2231 
2232   bind(try_revoke_bias);
2233   // The prototype mark in the klass doesn't have the bias bit set any
2234   // more, indicating that objects of this data type are not supposed
2235   // to be biased any more. We are going to try to reset the mark of
2236   // this object to the prototype value and fall through to the
2237   // CAS-based locking scheme. Note that if our CAS fails, it means
2238   // that another thread raced us for the privilege of revoking the
2239   // bias of this particular object, so it's okay to continue in the
2240   // normal locking code.
2241   load_klass(temp_reg, obj_reg);
2242   ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
2243   andi(temp2_reg, mark_reg, markOopDesc::age_mask_in_place);
2244   orr(temp_reg, temp_reg, temp2_reg);
2245 
2246   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2247 
2248   // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg).
2249   cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
2250                  /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
2251                  /*where=*/obj_reg,
2252                  MacroAssembler::MemBarAcq,
2253                  MacroAssembler::cmpxchgx_hint_acquire_lock());
2254 
2255   // reload markOop in mark_reg before continuing with lightweight locking
2256   ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg);
2257 
2258   // Fall through to the normal CAS-based lock, because no matter what
2259   // the result of the above CAS, some thread must have succeeded in
2260   // removing the bias bit from the object's header.
2261   if (PrintBiasedLockingStatistics) {
2262     Label l;
2263     bne(cr_reg, l);
2264     load_const(temp2_reg, (address) BiasedLocking::revoked_lock_entry_count_addr(), temp_reg);
2265     lwzx(temp_reg, temp2_reg);
2266     addi(temp_reg, temp_reg, 1);
2267     stwx(temp_reg, temp2_reg);
2268     bind(l);
2269   }
2270 
2271   bind(cas_label);
2272 }
2273 
// Fast-path unlock check under biased locking.
// Loads the mark word through mark_addr and, if the object still carries the
// biased-lock bit pattern, branches to 'done' (unlocking a biased object is a
// no-op). Otherwise falls through so the caller performs the regular unlock.
//   cr_reg    - condition register receiving the compare result (EQ => biased)
//   mark_addr - register holding the ADDRESS of the object's mark word
//   temp_reg  - scratch register, clobbered
void MacroAssembler::biased_locking_exit (ConditionRegister cr_reg, Register mark_addr, Register temp_reg, Label& done) {
  // Check for biased locking unlock case, which is a no-op
  // Note: we do not have to check the thread ID for two reasons.
  // First, the interpreter checks for IllegalMonitorStateException at
  // a higher level. Second, if the bias was revoked while we held the
  // lock, the object could not be rebiased toward another thread, so
  // the bias bit would be clear.

  ld(temp_reg, 0, mark_addr);                                       // load mark word
  andi(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place); // isolate the lock bits

  cmpwi(cr_reg, temp_reg, markOopDesc::biased_lock_pattern);        // still biased?
  beq(cr_reg, done);                                                // yes => nothing to do
}
2288 
// allocation (for C1)
// Eden (shared bump-pointer) fast-path allocation is not provided on this
// platform: unconditionally branch to 'slow_case' so the caller falls back
// to the runtime allocation path. All register arguments are ignored.
void MacroAssembler::eden_allocate(
  Register obj,                      // result: pointer to object after successful allocation
  Register var_size_in_bytes,        // object size in bytes if unknown at compile time; invalid otherwise
  int      con_size_in_bytes,        // object size in bytes if   known at compile time
  Register t1,                       // temp register
  Register t2,                       // temp register
  Label&   slow_case                 // continuation point if fast allocation fails
) {
  b(slow_case);
}
2300 
// Fast-path allocation from the current thread's TLAB.
// On success, 'obj' holds the address of the newly allocated storage and the
// thread's tlab_top has been bumped past it; on overflow of the TLAB, control
// transfers to 'slow_case' with tlab_top unchanged.
// Exactly one size form is used: pass var_size_in_bytes == noreg to allocate
// the compile-time constant con_size_in_bytes, otherwise the register value.
// Clobbers R0 and t1. Does not initialize the allocated memory.
void MacroAssembler::tlab_allocate(
  Register obj,                      // result: pointer to object after successful allocation
  Register var_size_in_bytes,        // object size in bytes if unknown at compile time; invalid otherwise
  int      con_size_in_bytes,        // object size in bytes if   known at compile time
  Register t1,                       // temp register
  Label&   slow_case                 // continuation point if fast allocation fails
) {
  // make sure arguments make sense
  assert_different_registers(obj, var_size_in_bytes, t1);
  assert(0 <= con_size_in_bytes && is_simm13(con_size_in_bytes), "illegal object size");
  assert((con_size_in_bytes & MinObjAlignmentInBytesMask) == 0, "object size is not multiple of alignment");

  const Register new_top = t1;
  //verify_tlab(); not implemented

  ld(obj, in_bytes(JavaThread::tlab_top_offset()), R16_thread); // obj = current TLAB top
  ld(R0, in_bytes(JavaThread::tlab_end_offset()), R16_thread);  // R0  = TLAB end
  if (var_size_in_bytes == noreg) {
    addi(new_top, obj, con_size_in_bytes);                      // new_top = top + constant size
  } else {
    add(new_top, obj, var_size_in_bytes);                       // new_top = top + variable size
  }
  cmpld(CCR0, new_top, R0);
  // Branch (far-capable) to the slow path if new_top > end, i.e. the TLAB is exhausted.
  bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_case);

#ifdef ASSERT
  // make sure new free pointer is properly aligned
  {
    Label L;
    andi_(R0, new_top, MinObjAlignmentInBytesMask);
    beq(CCR0, L);
    stop("updated TLAB free is not properly aligned", 0x934);
    bind(L);
  }
#endif // ASSERT

  // update the tlab top pointer
  std(new_top, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
  //verify_tlab(); not implemented
}
// Bump the per-thread allocated-bytes counter. Not implemented on this
// platform; reaching this emits an unimplemented trap/guarantee.
void MacroAssembler::incr_allocated_bytes(RegisterOrConstant size_in_bytes, Register t1, Register t2) {
  unimplemented("incr_allocated_bytes");
}
2344 
// Emit a trampoline stub used to reach call targets that are out of range of
// a direct branch. The stub loads the real destination from the constant pool
// (TOC entry at destination_toc_offset) into CTR and branches through it.
//   destination_toc_offset       - offset of the target address within the TOC
//   insts_call_instruction_offset- offset of the associated call instruction in
//                                  the instructions section (recorded via a
//                                  trampoline_stub relocation)
//   Rtoc                         - register holding the TOC base, or noreg to
//                                  have the global TOC computed into scratch
// Returns the stub's start address, or NULL if the code cache is full.
address MacroAssembler::emit_trampoline_stub(int destination_toc_offset,
                                             int insts_call_instruction_offset, Register Rtoc) {
  // Start the stub.
  address stub = start_a_stub(64);
  if (stub == NULL) { return NULL; } // CodeCache full: bail out

  // Create a trampoline stub relocation which relates this trampoline stub
  // with the call instruction at insts_call_instruction_offset in the
  // instructions code-section.
  relocate(trampoline_stub_Relocation::spec(code()->insts()->start() + insts_call_instruction_offset));
  const int stub_start_offset = offset();

  // For java_to_interp stubs we use R11_scratch1 as scratch register
  // and in call trampoline stubs we use R12_scratch2. This way we
  // can distinguish them (see is_NativeCallTrampolineStub_at()).
  Register reg_scratch = R12_scratch2;

  // Now, create the trampoline stub's code:
  // - load the TOC
  // - load the call target from the constant pool
  // - call
  if (Rtoc == noreg) {
    calculate_address_from_global_toc(reg_scratch, method_toc());
    Rtoc = reg_scratch;
  }

  ld_largeoffset_unchecked(reg_scratch, destination_toc_offset, Rtoc, false);
  mtctr(reg_scratch);
  bctr();

  const address stub_start_addr = addr_at(stub_start_offset);

  // Assert that the encoded destination_toc_offset can be identified and that it is correct.
  assert(destination_toc_offset == NativeCallTrampolineStub_at(stub_start_addr)->destination_toc_offset(),
         "encoded offset into the constant pool must match");
  // Trampoline_stub_size should be good.
  assert((uint)(offset() - stub_start_offset) <= trampoline_stub_size, "should be good size");
  assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");

  // End the stub.
  end_a_stub();
  return stub;
}
2388 
// TM on PPC64.

// Atomically add the immediate simm16 to the 64-bit value at 'addr' using a
// ldarx/stdcx. (load-reserve / store-conditional) retry loop.
// On exit, 'result' holds the updated value. CCR0 is clobbered.
void MacroAssembler::atomic_inc_ptr(Register addr, Register result, int simm16) {
  Label retry;
  bind(retry);
  ldarx(result, addr, /*hint*/ false);  // load with reservation
  addi(result, result, simm16);
  stdcx_(result, addr);                 // store iff reservation still held
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
  } else {
    bne(                  CCR0, retry); // stXcx_ sets CCR0
  }
}
2402 
// Atomically OR the immediate uimm16 into the 32-bit value at 'addr' using a
// lwarx/stwcx. retry loop. On exit, 'result' holds the updated value.
// CCR0 is clobbered.
void MacroAssembler::atomic_ori_int(Register addr, Register result, int uimm16) {
  Label retry;
  bind(retry);
  lwarx(result, addr, /*hint*/ false);  // load with reservation
  ori(result, result, uimm16);
  stwcx_(result, addr);                 // store iff reservation still held
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
  } else {
    bne(                  CCR0, retry); // stXcx_ sets CCR0
  }
}
2415 
2416 #if INCLUDE_RTM_OPT
2417 
// Update rtm_counters based on abort status
// input: abort_status
//        rtm_counters (RTMLockingCounters*)
// Increments the total abort counter and, if PrintPreciseRTMLockingStatistics
// is set, one per-cause abort counter selected by testing the TEXASR failure
// bits in abort_status. Counter updates are plain load/add/store, i.e. NOT
// atomic (statistics may under-count under contention — accepted trade-off).
// rtm_counters_Reg is used as a temp and restored before returning; R0 is
// clobbered.
void MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters_Reg) {
  // Mapping to keep PreciseRTMLockingStatistics similar to x86.
  // x86 ppc (! means inverted, ? means not the same)
  //  0   31  Set if abort caused by XABORT instruction.
  //  1  ! 7  If set, the transaction may succeed on a retry. This bit is always clear if bit 0 is set.
  //  2   13  Set if another logical processor conflicted with a memory address that was part of the transaction that aborted.
  //  3   10  Set if an internal buffer overflowed.
  //  4  ?12  Set if a debug breakpoint was hit.
  //  5  ?32  Set if an abort occurred during execution of a nested transaction.
  const  int tm_failure_bit[] = {Assembler::tm_tabort, // Note: Seems like signal handler sets this, too.
                                 Assembler::tm_failure_persistent, // inverted: transient
                                 Assembler::tm_trans_cf,
                                 Assembler::tm_footprint_of,
                                 Assembler::tm_non_trans_cf,
                                 Assembler::tm_suspended};
  const bool tm_failure_inv[] = {false, true, false, false, false, false};
  assert(sizeof(tm_failure_bit)/sizeof(int) == RTMLockingCounters::ABORT_STATUS_LIMIT, "adapt mapping!");

  const Register addr_Reg = R0;
  // Keep track of offset to where rtm_counters_Reg had pointed to.
  int counters_offs = RTMLockingCounters::abort_count_offset();
  addi(addr_Reg, rtm_counters_Reg, counters_offs);
  const Register temp_Reg = rtm_counters_Reg;

  //atomic_inc_ptr(addr_Reg, temp_Reg); We don't increment atomically
  ldx(temp_Reg, addr_Reg);
  addi(temp_Reg, temp_Reg, 1);
  stdx(temp_Reg, addr_Reg);

  if (PrintPreciseRTMLockingStatistics) {
    int counters_offs_delta = RTMLockingCounters::abortX_count_offset() - counters_offs;

    //mftexasr(abort_status); done by caller
    for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
      counters_offs += counters_offs_delta;
      li(temp_Reg, counters_offs_delta); // can't use addi with R0
      add(addr_Reg, addr_Reg, temp_Reg); // point to next counter
      counters_offs_delta = sizeof(uintx); // subsequent counters are adjacent

      Label check_abort;
      // Isolate TEXASR failure bit i; CCR0.EQ iff the bit is clear.
      rldicr_(temp_Reg, abort_status, tm_failure_bit[i], 0);
      if (tm_failure_inv[i]) {
        bne(CCR0, check_abort); // inverted sense: skip increment if bit set
      } else {
        beq(CCR0, check_abort); // skip increment if bit clear
      }
      //atomic_inc_ptr(addr_Reg, temp_Reg); We don't increment atomically
      ldx(temp_Reg, addr_Reg);
      addi(temp_Reg, temp_Reg, 1);
      stdx(temp_Reg, addr_Reg);
      bind(check_abort);
    }
  }
  li(temp_Reg, -counters_offs); // can't use addi with R0
  add(rtm_counters_Reg, addr_Reg, temp_Reg); // restore
}
2477 
// Branch if (random & (count-1) != 0), count is 2^n
// tmp and CR0 are killed
// The time base register (mftb) serves as a cheap pseudo-random source,
// so the fall-through path is taken roughly once every 'count' calls.
void MacroAssembler::branch_on_random_using_tb(Register tmp, int count, Label& brLabel) {
  mftb(tmp);
  andi_(tmp, tmp, count-1);
  bne(CCR0, brLabel);
}
2485 
// Perform abort ratio calculation, set no_rtm bit if high ratio.
// input:  rtm_counters_Reg (RTMLockingCounters* address) - KILLED
// Compares aborted vs. total transactions against RTMAbortRatio and writes
// the resulting rtm_state (NoRTM or UseRTM) into the MDO via an atomic OR,
// when method_data is available. R0 and CCR0 are clobbered.
void MacroAssembler::rtm_abort_ratio_calculation(Register rtm_counters_Reg,
                                                 RTMLockingCounters* rtm_counters,
                                                 Metadata* method_data) {
  Label L_done, L_check_always_rtm1, L_check_always_rtm2;

  if (RTMLockingCalculationDelay > 0) {
    // Delay calculation.
    // Skip everything until the delayed-calculation flag has been set.
    ld(rtm_counters_Reg, (RegisterOrConstant)(intptr_t)RTMLockingCounters::rtm_calculation_flag_addr());
    cmpdi(CCR0, rtm_counters_Reg, 0);
    beq(CCR0, L_done);
    load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload
  }
  // Abort ratio calculation only if abort_count > RTMAbortThreshold.
  //   Aborted transactions = abort_count * 100
  //   All transactions = total_count *  RTMTotalCountIncrRate
  //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
  ld(R0, RTMLockingCounters::abort_count_offset(), rtm_counters_Reg);
  if (is_simm(RTMAbortThreshold, 16)) {   // cmpdi can handle 16bit immediate only.
    cmpdi(CCR0, R0, RTMAbortThreshold);
    blt(CCR0, L_check_always_rtm2);  // reload of rtm_counters_Reg not necessary
  } else {
    // Threshold does not fit the immediate: materialize it in a register
    // (destroys rtm_counters_Reg, so the branch target must reload it).
    load_const_optimized(rtm_counters_Reg, RTMAbortThreshold);
    cmpd(CCR0, R0, rtm_counters_Reg);
    blt(CCR0, L_check_always_rtm1);  // reload of rtm_counters_Reg required
  }
  mulli(R0, R0, 100); // R0 = aborted transactions scaled to percent

  const Register tmpReg = rtm_counters_Reg;
  ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg);
  mulli(tmpReg, tmpReg, RTMTotalCountIncrRate); // allowable range: int16
  mulli(tmpReg, tmpReg, RTMAbortRatio);         // allowable range: int16
  cmpd(CCR0, R0, tmpReg);
  blt(CCR0, L_check_always_rtm1); // jump to reload
  if (method_data != NULL) {
    // Set rtm_state to "no rtm" in MDO.
    // Not using a metadata relocation. Method and Class Loader are kept alive anyway.
    // (See nmethod::metadata_do and CodeBuffer::finalize_oop_references.)
    load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg);
    atomic_ori_int(R0, tmpReg, NoRTM);
  }
  b(L_done);

  bind(L_check_always_rtm1);
  load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload
  bind(L_check_always_rtm2);
  // Abort ratio is acceptable: switch to "always rtm" once enough
  // transactions (RTMLockingThreshold) have been observed.
  ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg);
  int64_t thresholdValue = RTMLockingThreshold / RTMTotalCountIncrRate;
  if (is_simm(thresholdValue, 16)) {   // cmpdi can handle 16bit immediate only.
    cmpdi(CCR0, tmpReg, thresholdValue);
  } else {
    load_const_optimized(R0, thresholdValue);
    cmpd(CCR0, tmpReg, R0);
  }
  blt(CCR0, L_done);
  if (method_data != NULL) {
    // Set rtm_state to "always rtm" in MDO.
    // Not using a metadata relocation. See above.
    load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg);
    atomic_ori_int(R0, tmpReg, UseRTM);
  }
  bind(L_done);
}
2550 
// Update counters and perform abort ratio calculation.
// input: abort_status_Reg
// Convenience wrapper: loads the counters address, updates the per-cause
// abort counters from abort_status_Reg, and (when profile_rtm) re-evaluates
// the abort ratio to possibly flip the method's rtm_state in the MDO.
// temp_Reg and R0 are clobbered.
void MacroAssembler::rtm_profiling(Register abort_status_Reg, Register temp_Reg,
                                   RTMLockingCounters* rtm_counters,
                                   Metadata* method_data,
                                   bool profile_rtm) {

  assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
  // Update rtm counters based on state at abort.
  // Reads abort_status_Reg, updates flags.
  assert_different_registers(abort_status_Reg, temp_Reg);
  load_const_optimized(temp_Reg, (address)rtm_counters, R0);
  rtm_counters_update(abort_status_Reg, temp_Reg);
  if (profile_rtm) {
    assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
    rtm_abort_ratio_calculation(temp_Reg, rtm_counters, method_data);
  }
}
2569 
// Retry on abort if abort's status indicates non-persistent failure.
// inputs: retry_count_Reg
//       : abort_status_Reg
// output: retry_count_Reg decremented by 1
// Branches to retryLabel while retries remain and the TEXASR persistent-
// failure bit is clear; otherwise falls through. If checkRetry is non-NULL
// it is bound before the count check so callers can jump straight to the
// "retries left?" test. R0 and CCR0 are clobbered.
void MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg,
                                             Label& retryLabel, Label* checkRetry) {
  Label doneRetry;
  // Persistent failure? Then retrying is pointless.
  rldicr_(R0, abort_status_Reg, tm_failure_persistent, 0);
  bne(CCR0, doneRetry);
  if (checkRetry) { bind(*checkRetry); }
  addic_(retry_count_Reg, retry_count_Reg, -1); // consume one retry
  blt(CCR0, doneRetry);                         // retries exhausted
  smt_yield(); // Can't use wait(). No permission (SIGILL).
  b(retryLabel);
  bind(doneRetry);
}
2586 
// Spin and retry if lock is busy.
// inputs: owner_addr_Reg (monitor address)
//       : retry_count_Reg
// output: retry_count_Reg decremented by 1
// CTR is killed
// Spins (up to RTMSpinLoopCount iterations, yielding each time) until the
// monitor's owner field reads 0 or the spin budget is used up, then branches
// to retryLabel; falls through when no retries remain. R0 is clobbered.
void MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register owner_addr_Reg, Label& retryLabel) {
  Label SpinLoop, doneRetry;
  addic_(retry_count_Reg, retry_count_Reg, -1); // consume one retry
  blt(CCR0, doneRetry);                         // retries exhausted

  if (RTMSpinLoopCount > 1) {
    li(R0, RTMSpinLoopCount); // spin budget in CTR
    mtctr(R0);
  }

  bind(SpinLoop);
  smt_yield(); // Can't use waitrsv(). No permission (SIGILL).

  if (RTMSpinLoopCount > 1) {
    bdz(retryLabel);            // spin budget exhausted => retry anyway
    ld(R0, 0, owner_addr_Reg);  // peek at owner field
    cmpdi(CCR0, R0, 0);
    bne(CCR0, SpinLoop);        // still owned => keep spinning
  }

  b(retryLabel);

  bind(doneRetry);
}
2616 
// Use RTM for normal stack locks.
// Input: objReg (object to lock)
// Starts a hardware transaction (tbegin.) and, inside it, checks that the
// object is unlocked; if so, jumps to DONE_LABEL with the transaction open
// (lock is held transactionally). Inflated marks divert to IsInflated; aborts
// go through profiling and are retried up to RTMRetryCount times.
// mark_word and tmp are clobbered; flag holds the success condition.
void MacroAssembler::rtm_stack_locking(ConditionRegister flag,
                                       Register obj, Register mark_word, Register tmp,
                                       Register retry_on_abort_count_Reg,
                                       RTMLockingCounters* stack_rtm_counters,
                                       Metadata* method_data, bool profile_rtm,
                                       Label& DONE_LABEL, Label& IsInflated) {
  assert(UseRTMForStackLocks, "why call this otherwise?");
  assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
  Label L_rtm_retry, L_decrement_retry, L_on_abort;

  if (RTMRetryCount > 0) {
    load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
    bind(L_rtm_retry);
  }
  andi_(R0, mark_word, markOopDesc::monitor_value);  // inflated vs stack-locked|neutral|biased
  bne(CCR0, IsInflated);

  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    Label L_noincrement;
    if (RTMTotalCountIncrRate > 1) {
      // Only sample roughly 1 in RTMTotalCountIncrRate executions.
      branch_on_random_using_tb(tmp, RTMTotalCountIncrRate, L_noincrement);
    }
    assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
    load_const_optimized(tmp, (address)stack_rtm_counters->total_count_addr(), R0);
    //atomic_inc_ptr(tmp, /*temp, will be reloaded*/mark_word); We don't increment atomically
    ldx(mark_word, tmp);
    addi(mark_word, mark_word, 1);
    stdx(mark_word, tmp);
    bind(L_noincrement);
  }
  tbegin_();                // start transaction; CCR0.EQ set on abort path
  beq(CCR0, L_on_abort);
  ld(mark_word, oopDesc::mark_offset_in_bytes(), obj);         // Reload in transaction, conflicts need to be tracked.
  andi(R0, mark_word, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
  cmpwi(flag, R0, markOopDesc::unlocked_value);                // bits = 001 unlocked
  beq(flag, DONE_LABEL);                                       // all done if unlocked

  // Object is locked by someone: leave the transaction.
  if (UseRTMXendForLockBusy) {
    tend_();               // commit the (empty) transaction ...
    b(L_decrement_retry);  // ... and spin/retry on busy
  } else {
    tabort_();             // abort; control resumes after tbegin_ with EQ set
  }
  bind(L_on_abort);
  const Register abort_status_Reg = tmp;
  mftexasr(abort_status_Reg); // fetch abort cause bits
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    rtm_profiling(abort_status_Reg, /*temp*/mark_word, stack_rtm_counters, method_data, profile_rtm);
  }
  ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // reload
  if (RTMRetryCount > 0) {
    // Retry on lock abort if abort status is not permanent.
    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry, &L_decrement_retry);
  } else {
    bind(L_decrement_retry);
  }
}
2676 
// Use RTM for inflating locks
// inputs: obj       (object to lock)
//         mark_word (current header - KILLED)
//         boxReg    (on-stack box address (displaced header location) - KILLED)
// Attempts to elide an inflated-monitor lock: inside a hardware transaction,
// observing _owner == NULL counts as holding the lock (jump to DONE_LABEL
// with the transaction open). On lock-busy it spins/retries; on abort it
// profiles and retries; finally it falls back to a CAS on _owner.
// flag holds the success condition on exit.
void MacroAssembler::rtm_inflated_locking(ConditionRegister flag,
                                          Register obj, Register mark_word, Register boxReg,
                                          Register retry_on_busy_count_Reg, Register retry_on_abort_count_Reg,
                                          RTMLockingCounters* rtm_counters,
                                          Metadata* method_data, bool profile_rtm,
                                          Label& DONE_LABEL) {
  assert(UseRTMLocking, "why call this otherwise?");
  Label L_rtm_retry, L_decrement_retry, L_on_abort;
  // Clean monitor_value bit to get valid pointer.
  int owner_offset = ObjectMonitor::owner_offset_in_bytes() - markOopDesc::monitor_value;

  // Store non-null, using boxReg instead of (intptr_t)markOopDesc::unused_mark().
  std(boxReg, BasicLock::displaced_header_offset_in_bytes(), boxReg);
  const Register tmpReg = boxReg;
  const Register owner_addr_Reg = mark_word;
  addi(owner_addr_Reg, mark_word, owner_offset); // address of monitor->_owner

  if (RTMRetryCount > 0) {
    load_const_optimized(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy.
    load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort.
    bind(L_rtm_retry);
  }
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    Label L_noincrement;
    if (RTMTotalCountIncrRate > 1) {
      // Only sample roughly 1 in RTMTotalCountIncrRate executions.
      branch_on_random_using_tb(R0, RTMTotalCountIncrRate, L_noincrement);
    }
    assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
    load_const(R0, (address)rtm_counters->total_count_addr(), tmpReg);
    //atomic_inc_ptr(R0, tmpReg); We don't increment atomically
    ldx(tmpReg, R0);
    addi(tmpReg, tmpReg, 1);
    stdx(tmpReg, R0);
    bind(L_noincrement);
  }
  tbegin_();                 // start transaction; CCR0.EQ set on abort path
  beq(CCR0, L_on_abort);
  // We don't reload mark word. Will only be reset at safepoint.
  ld(R0, 0, owner_addr_Reg); // Load in transaction, conflicts need to be tracked.
  cmpdi(flag, R0, 0);
  beq(flag, DONE_LABEL);     // unowned => lock elided, keep transaction open

  // Monitor is owned: leave the transaction.
  if (UseRTMXendForLockBusy) {
    tend_();                 // commit the (empty) transaction ...
    b(L_decrement_retry);    // ... and spin/retry on busy
  } else {
    tabort_();               // abort; control resumes after tbegin_ with EQ set
  }
  bind(L_on_abort);
  const Register abort_status_Reg = tmpReg;
  mftexasr(abort_status_Reg); // fetch abort cause bits
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    rtm_profiling(abort_status_Reg, /*temp*/ owner_addr_Reg, rtm_counters, method_data, profile_rtm);
    // Restore owner_addr_Reg
    ld(mark_word, oopDesc::mark_offset_in_bytes(), obj);
#ifdef ASSERT
    andi_(R0, mark_word, markOopDesc::monitor_value);
    asm_assert_ne("must be inflated", 0xa754); // Deflating only allowed at safepoint.
#endif
    addi(owner_addr_Reg, mark_word, owner_offset);
  }
  if (RTMRetryCount > 0) {
    // Retry on lock abort if abort status is not permanent.
    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
  }

  // Appears unlocked - try to swing _owner from null to non-null.
  cmpxchgd(flag, /*current val*/ R0, (intptr_t)0, /*new val*/ R16_thread, owner_addr_Reg,
           MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
           MacroAssembler::cmpxchgx_hint_acquire_lock(), noreg, &L_decrement_retry, true);

  if (RTMRetryCount > 0) {
    // success done else retry
    b(DONE_LABEL);
    bind(L_decrement_retry);
    // Spin and retry if lock is busy.
    rtm_retry_lock_on_busy(retry_on_busy_count_Reg, owner_addr_Reg, L_rtm_retry);
  } else {
    bind(L_decrement_retry);
  }
}
2762 
2763 #endif //  INCLUDE_RTM_OPT
2764 
// "The box" is the space on the stack where we copy the object mark.
// Fast-path monitorenter for compiled code. On exit, 'flag' is EQ on success
// (lock acquired) and NE on failure (caller must enter the runtime).
// Tries, in order: biased locking (try_bias), RTM stack locking (use_rtm),
// stack-locking via CAS of the mark word, recursive stack-lock detection,
// and finally the inflated-monitor path (RTM or CAS of _owner).
// temp, displaced_header and current_header are clobbered.
void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register oop, Register box,
                                               Register temp, Register displaced_header, Register current_header,
                                               bool try_bias,
                                               RTMLockingCounters* rtm_counters,
                                               RTMLockingCounters* stack_rtm_counters,
                                               Metadata* method_data,
                                               bool use_rtm, bool profile_rtm) {
  assert_different_registers(oop, box, temp, displaced_header, current_header);
  assert(flag != CCR0, "bad condition register");
  Label cont;
  Label object_has_monitor;
  Label cas_failed;

  // Load markOop from object into displaced_header.
  ld(displaced_header, oopDesc::mark_offset_in_bytes(), oop);


  // Always do locking in runtime.
  if (EmitSync & 0x01) {
    cmpdi(flag, oop, 0); // Oop can't be 0 here => always false.
    return;
  }

  if (try_bias) {
    biased_locking_enter(flag, oop, displaced_header, temp, current_header, cont);
  }

#if INCLUDE_RTM_OPT
  if (UseRTMForStackLocks && use_rtm) {
    rtm_stack_locking(flag, oop, displaced_header, temp, /*temp*/ current_header,
                      stack_rtm_counters, method_data, profile_rtm,
                      cont, object_has_monitor);
  }
#endif // INCLUDE_RTM_OPT

  // Handle existing monitor.
  if ((EmitSync & 0x02) == 0) {
    // The object has an existing monitor iff (mark & monitor_value) != 0.
    andi_(temp, displaced_header, markOopDesc::monitor_value);
    bne(CCR0, object_has_monitor);
  }

  // Set displaced_header to be (markOop of object | UNLOCK_VALUE).
  ori(displaced_header, displaced_header, markOopDesc::unlocked_value);

  // Load Compare Value application register.

  // Initialize the box. (Must happen before we update the object mark!)
  std(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);

  // Must fence, otherwise, preceding store(s) may float below cmpxchg.
  // Compare object markOop with mark and if equal exchange scratch1 with object markOop.
  cmpxchgd(/*flag=*/flag,
           /*current_value=*/current_header,
           /*compare_value=*/displaced_header,
           /*exchange_value=*/box,
           /*where=*/oop,
           MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
           MacroAssembler::cmpxchgx_hint_acquire_lock(),
           noreg,
           &cas_failed,
           /*check without membar and ldarx first*/true);
  assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

  // If the compare-and-exchange succeeded, then we found an unlocked
  // object and we have now locked it.
  b(cont);

  bind(cas_failed);
  // We did not see an unlocked object so try the fast recursive case.

  // Check if the owner is self by comparing the value in the markOop of object
  // (current_header) with the stack pointer.
  sub(current_header, current_header, R1_SP);
  // Mask: anything outside the current stack page, plus the lock bits, must be 0
  // for this to be a recursive stack lock owned by this frame's thread.
  load_const_optimized(temp, ~(os::vm_page_size()-1) | markOopDesc::lock_mask_in_place);

  and_(R0/*==0?*/, current_header, temp);
  // If condition is true we are cont and hence we can store 0 as the
  // displaced header in the box, which indicates that it is a recursive lock.
  mcrf(flag,CCR0); // propagate recursive-lock test result into 'flag'
  std(R0/*==0, perhaps*/, BasicLock::displaced_header_offset_in_bytes(), box);

  // Handle existing monitor.
  if ((EmitSync & 0x02) == 0) {
    b(cont);

    bind(object_has_monitor);
    // The object's monitor m is unlocked iff m->owner == NULL,
    // otherwise m->owner may contain a thread or a stack address.

#if INCLUDE_RTM_OPT
    // Use the same RTM locking code in 32- and 64-bit VM.
    if (use_rtm) {
      rtm_inflated_locking(flag, oop, displaced_header, box, temp, /*temp*/ current_header,
                           rtm_counters, method_data, profile_rtm, cont);
    } else {
#endif // INCLUDE_RTM_OPT

    // Try to CAS m->owner from NULL to current thread.
    // displaced_header still holds the mark word; subtracting monitor_value
    // yields the real ObjectMonitor*, then add the _owner field offset.
    addi(temp, displaced_header, ObjectMonitor::owner_offset_in_bytes()-markOopDesc::monitor_value);
    cmpxchgd(/*flag=*/flag,
             /*current_value=*/current_header,
             /*compare_value=*/(intptr_t)0,
             /*exchange_value=*/R16_thread,
             /*where=*/temp,
             MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
             MacroAssembler::cmpxchgx_hint_acquire_lock());

    // Store a non-null value into the box.
    std(box, BasicLock::displaced_header_offset_in_bytes(), box);

#   ifdef ASSERT
    bne(flag, cont);
    // We have acquired the monitor, check some invariants.
    addi(/*monitor=*/temp, temp, -ObjectMonitor::owner_offset_in_bytes());
    // Invariant 1: _recursions should be 0.
    //assert(ObjectMonitor::recursions_size_in_bytes() == 8, "unexpected size");
    asm_assert_mem8_is_zero(ObjectMonitor::recursions_offset_in_bytes(), temp,
                            "monitor->_recursions should be 0", -1);
#   endif

#if INCLUDE_RTM_OPT
    } // use_rtm()
#endif
  }

  bind(cont);
  // flag == EQ indicates success
  // flag == NE indicates failure
}
2896 
// Fast-path monitorexit for compiled code, mirroring compiler_fast_lock_object.
// On exit, 'flag' is EQ on success and NE on failure (caller must enter the
// runtime). Handles biased unlock (try_bias), RTM transaction commit
// (use_rtm), recursive stack unlock, CAS-based stack unlock, and the
// inflated-monitor exit (only when uncontended: no recursions, empty
// EntryList and cxq). temp, displaced_header and current_header are clobbered.
void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register oop, Register box,
                                                 Register temp, Register displaced_header, Register current_header,
                                                 bool try_bias, bool use_rtm) {
  assert_different_registers(oop, box, temp, displaced_header, current_header);
  assert(flag != CCR0, "bad condition register");
  Label cont;
  Label object_has_monitor;

  // Always do locking in runtime.
  if (EmitSync & 0x01) {
    cmpdi(flag, oop, 0); // Oop can't be 0 here => always false.
    return;
  }

  if (try_bias) {
    biased_locking_exit(flag, oop, current_header, cont);
  }

#if INCLUDE_RTM_OPT
  if (UseRTMForStackLocks && use_rtm) {
    assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
    Label L_regular_unlock;
    ld(current_header, oopDesc::mark_offset_in_bytes(), oop);         // fetch markword
    andi(R0, current_header, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
    cmpwi(flag, R0, markOopDesc::unlocked_value);                     // bits = 001 unlocked
    bne(flag, L_regular_unlock);                                      // else RegularLock
    tend_();                                                          // otherwise end...
    b(cont);                                                          // ... and we're done
    bind(L_regular_unlock);
  }
#endif

  // Find the lock address and load the displaced header from the stack.
  ld(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);

  // If the displaced header is 0, we have a recursive unlock.
  cmpdi(flag, displaced_header, 0);
  beq(flag, cont);

  // Handle existing monitor.
  if ((EmitSync & 0x02) == 0) {
    // The object has an existing monitor iff (mark & monitor_value) != 0.
    RTM_OPT_ONLY( if (!(UseRTMForStackLocks && use_rtm)) ) // skip load if already done
    ld(current_header, oopDesc::mark_offset_in_bytes(), oop);
    andi_(R0, current_header, markOopDesc::monitor_value);
    bne(CCR0, object_has_monitor);
  }

  // Check if it is still a light weight lock, this is is true if we see
  // the stack address of the basicLock in the markOop of the object.
  // Cmpxchg sets flag to cmpd(current_header, box).
  cmpxchgd(/*flag=*/flag,
           /*current_value=*/current_header,
           /*compare_value=*/box,
           /*exchange_value=*/displaced_header,
           /*where=*/oop,
           MacroAssembler::MemBarRel,
           MacroAssembler::cmpxchgx_hint_release_lock(),
           noreg,
           &cont);

  assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

  // Handle existing monitor.
  if ((EmitSync & 0x02) == 0) {
    b(cont);

    bind(object_has_monitor);
    addi(current_header, current_header, -markOopDesc::monitor_value); // monitor
    ld(temp,             ObjectMonitor::owner_offset_in_bytes(), current_header);

    // It's inflated.
#if INCLUDE_RTM_OPT
    if (use_rtm) {
      Label L_regular_inflated_unlock;
      // Clean monitor_value bit to get valid pointer
      cmpdi(flag, temp, 0);                 // owner == NULL => lock was elided in a transaction
      bne(flag, L_regular_inflated_unlock);
      tend_();                              // commit the transaction
      b(cont);
      bind(L_regular_inflated_unlock);
    }
#endif

    ld(displaced_header, ObjectMonitor::recursions_offset_in_bytes(), current_header);
    xorr(temp, R16_thread, temp);      // Will be 0 if we are the owner.
    orr(temp, temp, displaced_header); // Will be 0 if there are 0 recursions.
    cmpdi(flag, temp, 0);
    bne(flag, cont);                   // not owner or recursive => slow path

    ld(temp,             ObjectMonitor::EntryList_offset_in_bytes(), current_header);
    ld(displaced_header, ObjectMonitor::cxq_offset_in_bytes(), current_header);
    orr(temp, temp, displaced_header); // Will be 0 if both are 0.
    cmpdi(flag, temp, 0);
    bne(flag, cont);                   // waiters present => slow path
    release();                         // order prior accesses before releasing the lock
    std(temp, ObjectMonitor::owner_offset_in_bytes(), current_header); // owner = NULL (temp is 0 here)
  }

  bind(cont);
  // flag == EQ indicates success
  // flag == NE indicates failure
}
3000 
// Write serialization page so VM thread can do a pseudo remote membar.
// We use the current thread pointer to calculate a thread specific
// offset to write to within the page. This minimizes bus traffic
// due to cache line collision.
// Kills tmp1 and tmp2. R0 supplies the (don't care) store value.
void MacroAssembler::serialize_memory(Register thread, Register tmp1, Register tmp2) {
  // tmp2 = thread-specific offset derived from the thread pointer.
  srdi(tmp2, thread, os::get_serialize_page_shift_count());

  // Constrain the offset to lie within the page, int-aligned.
  int mask = os::vm_page_size() - sizeof(int);
  if (Assembler::is_simm(mask, 16)) {
    andi(tmp2, tmp2, mask);
  } else {
    // Mask does not fit into a 16-bit immediate: materialize it in tmp1 first.
    lis(tmp1, (int)((signed short) (mask >> 16)));
    ori(tmp1, tmp1, mask & 0x0000ffff);
    andr(tmp2, tmp2, tmp1);
  }

  load_const(tmp1, (long) os::get_memory_serialize_page());
  release(); // Order preceding accesses before the serializing store.
  stwx(R0, tmp1, tmp2);
}
3021 
// Emit a safepoint poll; branches to slow_path if a safepoint is pending.
// Kills temp_reg and CCR0.
void MacroAssembler::safepoint_poll(Label& slow_path, Register temp_reg) {
  if (SafepointMechanism::uses_thread_local_poll()) {
    // Thread-local poll: test the thread's polling word.
    ld(temp_reg, in_bytes(Thread::polling_page_offset()), R16_thread);
    // Armed page has poll_bit set.
    andi_(temp_reg, temp_reg, SafepointMechanism::poll_bit());
  } else {
    // Global poll: compare safepoint state against 'not synchronized'.
    lwz(temp_reg, (RegisterOrConstant)(intptr_t)SafepointSynchronize::address_of_state());
    cmpwi(CCR0, temp_reg, SafepointSynchronize::_not_synchronized);
  }
  bne(CCR0, slow_path);
}
3033 
3034 
3035 // GC barrier helper macros
3036 
// Write the card table byte if needed.
// Post-barrier for a reference store: dirties the card covering Rstore_addr.
// Rnew_val is only read (debug builds assert it is non-null). Kills Rtmp,
// and Rstore_addr is shifted in place by card_table_write.
void MacroAssembler::card_write_barrier_post(Register Rstore_addr, Register Rnew_val, Register Rtmp) {
  CardTableModRefBS* bs =
    barrier_set_cast<CardTableModRefBS>(Universe::heap()->barrier_set());
  assert(bs->kind() == BarrierSet::CardTableModRef, "wrong barrier");
  CardTable* ct = bs->card_table();
#ifdef ASSERT
  cmpdi(CCR0, Rnew_val, 0);
  asm_assert_ne("null oop not allowed", 0x321);
#endif
  card_table_write(ct->byte_map_base(), Rtmp, Rstore_addr);
}
3049 
// Write the card table byte.
// Marks the card for the address in Robj dirty (0). Kills Robj (overwritten
// with the card index), Rtmp (card table base) and R0 (dirty value).
void MacroAssembler::card_table_write(jbyte* byte_map_base, Register Rtmp, Register Robj) {
  assert_different_registers(Robj, Rtmp, R0);
  load_const_optimized(Rtmp, (address)byte_map_base, R0);
  srdi(Robj, Robj, CardTable::card_shift);
  li(R0, 0); // dirty
  // Extra ordering for CMS only — presumably because CMS reads cards
  // concurrently with mutator stores; confirm against CMS card scanning.
  if (UseConcMarkSweepGC) membar(Assembler::StoreStore);
  stbx(R0, Rtmp, Robj);
}
3059 
// Resolve a jobject (JNI handle) in 'value' to the oop it refers to.
// NULL is passed through unchanged. Under G1 a jweak-tagged handle
// additionally runs the SATB pre-barrier on the loaded referent.
// Kills tmp1, tmp2. Kills R31 if value is a volatile register
// (saved/restored across the C call inside g1_write_barrier_pre).
void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2, bool needs_frame) {
  Label done;
  cmpdi(CCR0, value, 0);
  beq(CCR0, done);         // Use NULL as-is.

  clrrdi(tmp1, value, JNIHandles::weak_tag_size); // Untag the handle.
#if INCLUDE_ALL_GCS
  // Record the weak tag in CCR0; the ld below does not touch CR,
  // so the beq(not_weak) further down still tests this result.
  if (UseG1GC) { andi_(tmp2, value, JNIHandles::weak_tag_mask); }
#endif
  ld(value, 0, tmp1);      // Resolve (untagged) jobject.

#if INCLUDE_ALL_GCS
  if (UseG1GC) {
    Label not_weak;
    beq(CCR0, not_weak);   // Test for jweak tag.
    verify_oop(value);
    // Weak handle: enqueue the referent to satisfy the SATB invariant.
    g1_write_barrier_pre(noreg, // obj
                         noreg, // offset
                         value, // pre_val
                         tmp1, tmp2, needs_frame);
    bind(not_weak);
  }
#endif // INCLUDE_ALL_GCS
  verify_oop(value);
  bind(done);
}
3087 
3088 #if INCLUDE_ALL_GCS
// General G1 pre-barrier generator.
// Goal: record the previous value if it is not null.
// If Robj != noreg the previous value is loaded from Robj+offset into
// Rpre_val; otherwise the caller must have preloaded Rpre_val.
// Kills Rtmp1, Rtmp2. Kills R31 when Rpre_val is volatile and preloaded.
void MacroAssembler::g1_write_barrier_pre(Register Robj, RegisterOrConstant offset, Register Rpre_val,
                                          Register Rtmp1, Register Rtmp2, bool needs_frame) {
  Label runtime, filtered;

  // Is marking active? Skip the whole barrier if not.
  if (in_bytes(SATBMarkQueue::byte_width_of_active()) == 4) {
    lwz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_active()), R16_thread);
  } else {
    guarantee(in_bytes(SATBMarkQueue::byte_width_of_active()) == 1, "Assumption");
    lbz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_active()), R16_thread);
  }
  cmpdi(CCR0, Rtmp1, 0);
  beq(CCR0, filtered);

  // Do we need to load the previous value?
  if (Robj != noreg) {
    // Load the previous value...
    if (UseCompressedOops) {
      lwz(Rpre_val, offset, Robj);
    } else {
      ld(Rpre_val, offset, Robj);
    }
    // Previous value has been loaded into Rpre_val.
  }
  assert(Rpre_val != noreg, "must have a real register");

  // Is the previous value null? Null needs no recording.
  cmpdi(CCR0, Rpre_val, 0);
  beq(CCR0, filtered);

  if (Robj != noreg && UseCompressedOops) {
    decode_heap_oop_not_null(Rpre_val);
  }

  // Not filtered: enqueue the previous value. Fast path stores into the
  // thread-local SATB buffer; slow path calls into the runtime.
  // (NOTE(review): an earlier comment here referred to SPARC G/O registers;
  // on PPC the relevant distinction is whether Rpre_val is volatile,
  // handled at the call_VM_leaf below.)

  // Can we store original value in the thread's buffer?
  // Is index == 0?
  // (The index field is typed as size_t.)
  const Register Rbuffer = Rtmp1, Rindex = Rtmp2;

  ld(Rindex, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_index()), R16_thread);
  cmpdi(CCR0, Rindex, 0);
  beq(CCR0, runtime); // If index == 0 (buffer full), goto runtime.
  ld(Rbuffer, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_buf()), R16_thread);

  addi(Rindex, Rindex, -wordSize); // Decrement index.
  std(Rindex, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_index()), R16_thread);

  // Record the previous value.
  stdx(Rpre_val, Rbuffer, Rindex);
  b(filtered);

  bind(runtime);

  // May need to preserve LR. Also needed if current frame is not compatible with C calling convention.
  if (needs_frame) {
    save_LR_CR(Rtmp1);
    push_frame_reg_args(0, Rtmp2);
  }

  if (Rpre_val->is_volatile() && Robj == noreg) mr(R31, Rpre_val); // Save pre_val across C call if it was preloaded.
  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), Rpre_val, R16_thread);
  if (Rpre_val->is_volatile() && Robj == noreg) mr(Rpre_val, R31); // restore

  if (needs_frame) {
    pop_frame();
    restore_LR_CR(Rtmp1);
  }

  bind(filtered);
}
3166 
// General G1 post-barrier generator
// Store cross-region card.
// Dirties and enqueues the card for Rstore_addr unless filtered out
// (same region, young card, or already dirty). If filtered_ext is given,
// filtered-out paths branch there instead of falling through.
// Kills Rtmp1, Rtmp2, Rtmp3. Rnew_val is only read.
void MacroAssembler::g1_write_barrier_post(Register Rstore_addr, Register Rnew_val, Register Rtmp1, Register Rtmp2, Register Rtmp3, Label *filtered_ext) {
  Label runtime, filtered_int;
  Label& filtered = (filtered_ext != NULL) ? *filtered_ext : filtered_int;
  assert_different_registers(Rstore_addr, Rnew_val, Rtmp1, Rtmp2);

  G1SATBCardTableLoggingModRefBS* bs =
    barrier_set_cast<G1SATBCardTableLoggingModRefBS>(Universe::heap()->barrier_set());
  CardTable* ct = bs->card_table();

  // Does store cross heap regions? Same-region stores need no card.
  if (G1RSBarrierRegionFilter) {
    xorr(Rtmp1, Rstore_addr, Rnew_val);
    srdi_(Rtmp1, Rtmp1, HeapRegion::LogOfHRGrainBytes);
    beq(CCR0, filtered);
  }

  // Crosses regions, storing NULL?
#ifdef ASSERT
  cmpdi(CCR0, Rnew_val, 0);
  asm_assert_ne("null oop not allowed (G1)", 0x322); // Checked by caller on PPC64, so following branch is obsolete:
  //beq(CCR0, filtered);
#endif

  // Storing region crossing non-NULL, is card already dirty?
  assert(sizeof(*ct->byte_map_base()) == sizeof(jbyte), "adjust this code");
  const Register Rcard_addr = Rtmp1;
  Register Rbase = Rtmp2;
  load_const_optimized(Rbase, (address)ct->byte_map_base(), /*temp*/ Rtmp3);

  srdi(Rcard_addr, Rstore_addr, CardTable::card_shift);

  // Get the address of the card.
  lbzx(/*card value*/ Rtmp3, Rbase, Rcard_addr);
  cmpwi(CCR0, Rtmp3, (int)G1CardTable::g1_young_card_val());
  beq(CCR0, filtered); // Young cards need no dirtying.

  membar(Assembler::StoreLoad); // Order oop store before card re-check.
  lbzx(/*card value*/ Rtmp3, Rbase, Rcard_addr);  // Reload after membar.
  cmpwi(CCR0, Rtmp3 /* card value */, CardTable::dirty_card_val());
  beq(CCR0, filtered); // Already dirty: nothing to log.

  // Storing a region crossing, non-NULL oop, card is clean.
  // Dirty card and log.
  li(Rtmp3, CardTable::dirty_card_val());
  //release(); // G1: oops are allowed to get visible after dirty marking.
  stbx(Rtmp3, Rbase, Rcard_addr);

  add(Rcard_addr, Rbase, Rcard_addr); // This is the address which needs to get enqueued.
  Rbase = noreg; // end of lifetime

  // Try to enqueue the card address into the thread-local dirty card queue.
  const Register Rqueue_index = Rtmp2,
                 Rqueue_buf   = Rtmp3;
  ld(Rqueue_index, in_bytes(JavaThread::dirty_card_queue_offset() + DirtyCardQueue::byte_offset_of_index()), R16_thread);
  cmpdi(CCR0, Rqueue_index, 0);
  beq(CCR0, runtime); // index == 0 (buffer full) then jump to runtime
  ld(Rqueue_buf, in_bytes(JavaThread::dirty_card_queue_offset() + DirtyCardQueue::byte_offset_of_buf()), R16_thread);

  addi(Rqueue_index, Rqueue_index, -wordSize); // decrement index
  std(Rqueue_index, in_bytes(JavaThread::dirty_card_queue_offset() + DirtyCardQueue::byte_offset_of_index()), R16_thread);

  stdx(Rcard_addr, Rqueue_buf, Rqueue_index); // store card
  b(filtered);

  bind(runtime);

  // Save the live input values.
  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), Rcard_addr, R16_thread);

  bind(filtered_int);
}
3239 #endif // INCLUDE_ALL_GCS
3240 
// Values for last_Java_pc, and last_Java_sp must comply to the rules
// in frame_ppc.hpp.
// Publishes the frame anchor in the current JavaThread. last_Java_pc may be
// noreg, in which case only last_Java_sp is stored.
void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) {
  // Always set last_Java_pc and flags first because once last_Java_sp
  // is visible has_last_Java_frame is true and users will look at the
  // rest of the fields. (Note: flags should always be zero before we
  // get here so doesn't need to be set.)

  // Verify that last_Java_pc was zeroed on return to Java
  asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread,
                          "last_Java_pc not zeroed before leaving Java", 0x200);

  // When returning from calling out from Java mode the frame anchor's
  // last_Java_pc will always be set to NULL. It is set here so that
  // if we are doing a call to native (not VM) that we capture the
  // known pc and don't have to rely on the native call having a
  // standard frame linkage where we can find the pc.
  if (last_Java_pc != noreg)
    std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);

  // Set last_Java_sp last.
  std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
}
3264 
// Clear the frame anchor (last_Java_sp and last_Java_pc) in the current
// JavaThread. Asserts the anchor was actually set. Kills R0.
void MacroAssembler::reset_last_Java_frame(void) {
  asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()),
                             R16_thread, "SP was not set, still zero", 0x202);

  BLOCK_COMMENT("reset_last_Java_frame {");
  li(R0, 0);

  // _last_Java_sp = 0
  std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);

  // _last_Java_pc = 0
  std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
  BLOCK_COMMENT("} reset_last_Java_frame");
}
3279 
// Publish sp (which must point to a TOP_IJAVA_FRAME) as the last Java frame,
// using the current emission pc as last_Java_pc. Kills tmp1.
void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1) {
  assert_different_registers(sp, tmp1);

  // sp points to a TOP_IJAVA_FRAME, retrieve frame's PC via
  // TOP_IJAVA_FRAME_ABI.
  // FIXME: assert that we really have a TOP_IJAVA_FRAME here!
  address entry = pc();
  load_const_optimized(tmp1, entry);

  set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1);
}
3291 
// Load the thread's vm_result oop into oop_result and clear the field.
void MacroAssembler::get_vm_result(Register oop_result) {
  // Read:
  //   R16_thread
  //   R16_thread->in_bytes(JavaThread::vm_result_offset())
  //
  // Updated:
  //   oop_result
  //   R16_thread->in_bytes(JavaThread::vm_result_offset())

  verify_thread();

  ld(oop_result, in_bytes(JavaThread::vm_result_offset()), R16_thread);
  // Clear the field so the value is consumed exactly once.
  li(R0, 0);
  std(R0, in_bytes(JavaThread::vm_result_offset()), R16_thread);

  verify_oop(oop_result);
}
3309 
// Load the thread's vm_result_2 (a Metadata*) into metadata_result and
// clear the field. Kills R0.
void MacroAssembler::get_vm_result_2(Register metadata_result) {
  // Read:
  //   R16_thread
  //   R16_thread->in_bytes(JavaThread::vm_result_2_offset())
  //
  // Updated:
  //   metadata_result
  //   R16_thread->in_bytes(JavaThread::vm_result_2_offset())

  ld(metadata_result, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
  // Clear the field so the value is consumed exactly once.
  li(R0, 0);
  std(R0, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
}
3323 
// Compress the Klass pointer in src (or dst if src == noreg), which must be
// non-null. Returns the register that holds the compressed value: dst if any
// arithmetic was emitted, otherwise the unmodified input register.
// Kills R0 when a base must be subtracted.
Register MacroAssembler::encode_klass_not_null(Register dst, Register src) {
  Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided.
  if (Universe::narrow_klass_base() != 0) {
    // Use dst as temp if it is free.
    sub_const_optimized(dst, current, Universe::narrow_klass_base(), R0);
    current = dst;
  }
  if (Universe::narrow_klass_shift() != 0) {
    srdi(dst, current, Universe::narrow_klass_shift());
    current = dst;
  }
  return current;
}
3337 
// Store the klass of an object: compressed (32-bit) if
// UseCompressedClassPointers, else the full 64-bit pointer.
// ck is a scratch register for the compressed value; klass itself
// is preserved.
void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) {
  if (UseCompressedClassPointers) {
    Register compressedKlass = encode_klass_not_null(ck, klass);
    stw(compressedKlass, oopDesc::klass_offset_in_bytes(), dst_oop);
  } else {
    std(klass, oopDesc::klass_offset_in_bytes(), dst_oop);
  }
}
3346 
// Zero (or store val into) the 32-bit klass gap that exists in the object
// header when compressed class pointers are in use. No-op otherwise.
// May use R0 when val == noreg.
void MacroAssembler::store_klass_gap(Register dst_oop, Register val) {
  if (UseCompressedClassPointers) {
    if (val == noreg) {
      val = R0;
      li(val, 0);
    }
    stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop); // klass gap if compressed
  }
}
3356 
3357 int MacroAssembler::instr_size_for_decode_klass_not_null() {
3358   if (!UseCompressedClassPointers) return 0;
3359   int num_instrs = 1;  // shift or move
3360   if (Universe::narrow_klass_base() != 0) num_instrs = 7;  // shift + load const + add
3361   return num_instrs * BytesPerInstWord;
3362 }
3363 
3364 void MacroAssembler::decode_klass_not_null(Register dst, Register src) {
3365   assert(dst != R0, "Dst reg may not be R0, as R0 is used here.");
3366   if (src == noreg) src = dst;
3367   Register shifted_src = src;
3368   if (Universe::narrow_klass_shift() != 0 ||
3369       Universe::narrow_klass_base() == 0 && src != dst) {  // Move required.
3370     shifted_src = dst;
3371     sldi(shifted_src, src, Universe::narrow_klass_shift());
3372   }
3373   if (Universe::narrow_klass_base() != 0) {
3374     add_const_optimized(dst, shifted_src, Universe::narrow_klass_base(), R0);
3375   }
3376 }
3377 
// Load the klass of the object in src into dst, decompressing if
// compressed class pointers are in use. src must not be null.
void MacroAssembler::load_klass(Register dst, Register src) {
  if (UseCompressedClassPointers) {
    lwz(dst, oopDesc::klass_offset_in_bytes(), src);
    // Attention: no null check here!
    decode_klass_not_null(dst, dst);
  } else {
    ld(dst, oopDesc::klass_offset_in_bytes(), src);
  }
}
3387 
// ((OopHandle)result).resolve();
// Replaces the OopHandle in 'result' with the oop it points to.
void MacroAssembler::resolve_oop_handle(Register result) {
  // OopHandle::resolve is an indirection.
  ld(result, 0, result);
}
3393 
// Load the java mirror of the class owning const_method:
// ConstMethod -> ConstantPool -> pool holder Klass -> mirror OopHandle -> oop.
// The intermediate values all pass through 'mirror'.
void MacroAssembler::load_mirror_from_const_method(Register mirror, Register const_method) {
  ld(mirror, in_bytes(ConstMethod::constants_offset()), const_method);      // ConstantPool*
  ld(mirror, ConstantPool::pool_holder_offset_in_bytes(), mirror);          // Klass*
  ld(mirror, in_bytes(Klass::java_mirror_offset()), mirror);                // OopHandle
  resolve_oop_handle(mirror);                                               // oop
}
3400 
3401 // Clear Array
3402 // For very short arrays. tmp == R0 is allowed.
3403 void MacroAssembler::clear_memory_unrolled(Register base_ptr, int cnt_dwords, Register tmp, int offset) {
3404   if (cnt_dwords > 0) { li(tmp, 0); }
3405   for (int i = 0; i < cnt_dwords; ++i) { std(tmp, offset + i * 8, base_ptr); }
3406 }
3407 
// Version for constant short array length. Kills base_ptr. tmp == R0 is allowed.
// Small counts are fully unrolled; larger ones use a 2x-unrolled ctr loop.
void MacroAssembler::clear_memory_constlen(Register base_ptr, int cnt_dwords, Register tmp) {
  if (cnt_dwords < 8) {
    clear_memory_unrolled(base_ptr, cnt_dwords, tmp);
    return;
  }

  Label loop;
  const long loopcnt   = cnt_dwords >> 1,  // Iterations of the 16-byte loop.
             remainder = cnt_dwords & 1;   // One trailing dword if count is odd.

  li(tmp, loopcnt);
  mtctr(tmp);
  li(tmp, 0);      // Zero to be stored.
  bind(loop);
    std(tmp, 0, base_ptr);
    std(tmp, 8, base_ptr);
    addi(base_ptr, base_ptr, 16);
    bdnz(loop);
  if (remainder) { std(tmp, 0, base_ptr); }
}
3429 
// Kills both input registers. tmp == R0 is allowed.
// Clears cnt_dwords (runtime register, or compile-time const_cnt if >= 0)
// doublewords starting at base_ptr. Large counts use dcbz on whole cache
// lines, with scalar pre/post loops for the unaligned head and the tail.
void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp, long const_cnt) {
  // Procedure for large arrays (uses data cache block zero instruction).
    Label startloop, fast, fastloop, small_rest, restloop, done;
    const int cl_size         = VM_Version::L1_data_cache_line_size(),
              cl_dwords       = cl_size >> 3,                // Dwords per cache line.
              cl_dw_addr_bits = exact_log2(cl_dwords),
              dcbz_min        = 1,  // Min count of dcbz executions, needs to be >0.
              min_cnt         = ((dcbz_min + 1) << cl_dw_addr_bits) - 1;

  if (const_cnt >= 0) {
    // Constant case.
    if (const_cnt < min_cnt) {
      // Too small for the dcbz path: fall back to the short-array version.
      clear_memory_constlen(base_ptr, const_cnt, tmp);
      return;
    }
    load_const_optimized(cnt_dwords, const_cnt, tmp);
  } else {
    // cnt_dwords already loaded in register. Need to check size.
    cmpdi(CCR1, cnt_dwords, min_cnt); // Big enough? (ensure >= dcbz_min lines included).
    blt(CCR1, small_rest);
  }
    rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits); // Extract dword offset within first cache line.
    beq(CCR0, fast);                                  // Already 128byte aligned.

    subfic(tmp, tmp, cl_dwords);
    mtctr(tmp);                        // Set ctr to hit 128byte boundary (0<ctr<cl_dwords).
    subf(cnt_dwords, tmp, cnt_dwords); // rest.
    li(tmp, 0);                        // Zero to be stored.

  bind(startloop);                     // Clear at the beginning to reach 128byte boundary.
    std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
    addi(base_ptr, base_ptr, 8);
    bdnz(startloop);

  bind(fast);                                  // Clear 128byte blocks.
    srdi(tmp, cnt_dwords, cl_dw_addr_bits);    // Loop count for 128byte loop (>0).
    andi(cnt_dwords, cnt_dwords, cl_dwords-1); // Rest in dwords.
    mtctr(tmp);                                // Load counter.

  bind(fastloop);
    dcbz(base_ptr);                    // Clear 128byte aligned block.
    addi(base_ptr, base_ptr, cl_size);
    bdnz(fastloop);

  bind(small_rest);
    cmpdi(CCR0, cnt_dwords, 0);        // size 0?
    beq(CCR0, done);                   // rest == 0
    li(tmp, 0);
    mtctr(cnt_dwords);                 // Load counter.

  bind(restloop);                      // Clear rest.
    std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
    addi(base_ptr, base_ptr, 8);
    bdnz(restloop);

  bind(done);
}
3488 
3489 /////////////////////////////////////////// String intrinsics ////////////////////////////////////////////
3490 
3491 #ifdef COMPILER2
3492 // Intrinsics for CompactStrings
3493 
// Compress char[] to byte[] by compressing 16 bytes at once.
// Processes floor(cnt/8) chars in a 2x-unrolled loop; any remainder falls
// through at Lslow for the caller's scalar follow-up (see string_compress).
// Branches to Lfailure if any char is not latin1 (high byte set); in that
// case cnt chars starting at the current src may be partially processed.
// Kills tmp0..tmp5 and CTR; advances src/dst past the compressed portion.
void MacroAssembler::string_compress_16(Register src, Register dst, Register cnt,
                                        Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5,
                                        Label& Lfailure) {

  const Register tmp0 = R0;
  assert_different_registers(src, dst, cnt, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5);
  Label Lloop, Lslow;

  // Check if cnt >= 8 (= 16 bytes)
  lis(tmp1, 0xFF);                // tmp1 = 0x00FF00FF00FF00FF (high-byte mask, built below)
  srwi_(tmp2, cnt, 3);
  beq(CCR0, Lslow);               // Fewer than 8 chars: nothing for the fast loop.
  ori(tmp1, tmp1, 0xFF);
  rldimi(tmp1, tmp1, 32, 0);      // Replicate mask into the upper word.
  mtctr(tmp2);

  // 2x unrolled loop
  bind(Lloop);
  ld(tmp2, 0, src);               // _0_1_2_3 (Big Endian)
  ld(tmp4, 8, src);               // _4_5_6_7

  orr(tmp0, tmp2, tmp4);          // OR all 8 chars for the latin1 check below.
  rldicl(tmp3, tmp2, 6*8, 64-24); // _____1_2
  rldimi(tmp2, tmp2, 2*8, 2*8);   // _0_2_3_3
  rldicl(tmp5, tmp4, 6*8, 64-24); // _____5_6
  rldimi(tmp4, tmp4, 2*8, 2*8);   // _4_6_7_7

  andc_(tmp0, tmp0, tmp1);        // Any bits outside the low bytes?
  bne(CCR0, Lfailure);            // Not latin1.
  addi(src, src, 16);

  rlwimi(tmp3, tmp2, 0*8, 24, 31);// _____1_3
  srdi(tmp2, tmp2, 3*8);          // ____0_2_
  rlwimi(tmp5, tmp4, 0*8, 24, 31);// _____5_7
  srdi(tmp4, tmp4, 3*8);          // ____4_6_

  orr(tmp2, tmp2, tmp3);          // ____0123
  orr(tmp4, tmp4, tmp5);          // ____4567

  stw(tmp2, 0, dst);
  stw(tmp4, 4, dst);
  addi(dst, dst, 8);
  bdnz(Lloop);

  bind(Lslow);                    // Fallback to slow version
}
3541 
// Compress char[] to byte[]. cnt must be positive int.
// Scalar loop: copies cnt chars from src to dst one byte each, branching to
// Lfailure on the first char > 0xFF (not latin1). Kills tmp, CTR, CCR0;
// advances src and dst.
void MacroAssembler::string_compress(Register src, Register dst, Register cnt, Register tmp, Label& Lfailure) {
  Label Lloop;
  mtctr(cnt);

  bind(Lloop);
  lhz(tmp, 0, src);
  cmplwi(CCR0, tmp, 0xff);
  bgt(CCR0, Lfailure);            // Not latin1.
  addi(src, src, 2);
  stb(tmp, 0, dst);
  addi(dst, dst, 1);
  bdnz(Lloop);
}
3556 
// Inflate byte[] to char[] by inflating 16 bytes at once.
// Processes floor(cnt/8) bytes in a 2x-unrolled loop; the remainder falls
// through at Lslow for the caller's scalar follow-up (see string_inflate).
// Kills tmp0..tmp5 and CTR; advances src/dst past the inflated portion.
void MacroAssembler::string_inflate_16(Register src, Register dst, Register cnt,
                                       Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) {
  const Register tmp0 = R0;
  assert_different_registers(src, dst, cnt, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5);
  Label Lloop, Lslow;

  // Check if cnt >= 8
  srwi_(tmp2, cnt, 3);
  beq(CCR0, Lslow);               // Fewer than 8 bytes: nothing for the fast loop.
  lis(tmp1, 0xFF);                // tmp1 = 0x00FF00FF (byte-select mask)
  ori(tmp1, tmp1, 0xFF);
  mtctr(tmp2);

  // 2x unrolled loop
  bind(Lloop);
  lwz(tmp2, 0, src);              // ____0123 (Big Endian)
  lwz(tmp4, 4, src);              // ____4567
  addi(src, src, 8);

  rldicl(tmp3, tmp2, 7*8, 64-8);  // _______2
  rlwimi(tmp2, tmp2, 3*8, 16, 23);// ____0113
  rldicl(tmp5, tmp4, 7*8, 64-8);  // _______6
  rlwimi(tmp4, tmp4, 3*8, 16, 23);// ____4557

  andc(tmp0, tmp2, tmp1);         // ____0_1_
  rlwimi(tmp2, tmp3, 2*8, 0, 23); // _____2_3
  andc(tmp3, tmp4, tmp1);         // ____4_5_
  rlwimi(tmp4, tmp5, 2*8, 0, 23); // _____6_7

  rldimi(tmp2, tmp0, 3*8, 0*8);   // _0_1_2_3
  rldimi(tmp4, tmp3, 3*8, 0*8);   // _4_5_6_7

  std(tmp2, 0, dst);
  std(tmp4, 8, dst);
  addi(dst, dst, 16);
  bdnz(Lloop);

  bind(Lslow);                    // Fallback to slow version
}
3597 
// Inflate byte[] to char[]. cnt must be positive int.
// Scalar loop: zero-extends cnt bytes from src into chars at dst.
// Kills tmp and CTR; advances src and dst.
void MacroAssembler::string_inflate(Register src, Register dst, Register cnt, Register tmp) {
  Label Lloop;
  mtctr(cnt);

  bind(Lloop);
  lbz(tmp, 0, src);
  addi(src, src, 1);
  sth(tmp, 0, dst);
  addi(dst, dst, 2);
  bdnz(Lloop);
}
3610 
// Intrinsic for String.compareTo. ae encodes the two strings' encodings
// (StrIntrinsicNode::LL/LU/UL/UU). cnt1/cnt2 are in bytes for UTF16 inputs
// (halved below). result receives the signed comparison value.
// Kills str1, str2, cnt1, cnt2, tmp1, tmp0(R0), CTR, CCR0.
void MacroAssembler::string_compare(Register str1, Register str2,
                                    Register cnt1, Register cnt2,
                                    Register tmp1, Register result, int ae) {
  const Register tmp0 = R0,
                 diff = tmp1;

  assert_different_registers(str1, str2, cnt1, cnt2, tmp0, tmp1, result);
  Label Ldone, Lslow, Lloop, Lreturn_diff;

  // Note: Making use of the fact that compareTo(a, b) == -compareTo(b, a)
  // we interchange str1 and str2 in the UL case and negate the result.
  // Like this, str1 is always latin1 encoded, except for the UU case.
  // In addition, we need 0 (or sign which is 0) extend.

  // Convert byte counts to char counts (UTF16) or just zero-extend (latin1).
  if (ae == StrIntrinsicNode::UU) {
    srwi(cnt1, cnt1, 1);
  } else {
    clrldi(cnt1, cnt1, 32);
  }

  if (ae != StrIntrinsicNode::LL) {
    srwi(cnt2, cnt2, 1);
  } else {
    clrldi(cnt2, cnt2, 32);
  }

  // See if the lengths are different, and calculate min in cnt1.
  // Save diff in case we need it for a tie-breaker.
  subf_(diff, cnt2, cnt1); // diff = cnt1 - cnt2
  // if (diff > 0) { cnt1 = cnt2; }
  if (VM_Version::has_isel()) {
    isel(cnt1, CCR0, Assembler::greater, /*invert*/ false, cnt2);
  } else {
    Label Lskip;
    blt(CCR0, Lskip);
    mr(cnt1, cnt2);
    bind(Lskip);
  }

  // Rename registers
  Register chr1 = result;
  Register chr2 = tmp0;

  // Compare multiple characters in fast loop (only implemented for same encoding).
  int stride1 = 8, stride2 = 8;
  if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
    int log2_chars_per_iter = (ae == StrIntrinsicNode::LL) ? 3 : 2;
    Label Lfastloop, Lskipfast;

    srwi_(tmp0, cnt1, log2_chars_per_iter); // Iterations of the 8-byte loop.
    beq(CCR0, Lskipfast);
    rldicl(cnt2, cnt1, 0, 64 - log2_chars_per_iter); // Remaining characters.
    li(cnt1, 1 << log2_chars_per_iter); // Initialize for failure case: Rescan characters from current iteration.
    mtctr(tmp0);

    bind(Lfastloop);
    ld(chr1, 0, str1);
    ld(chr2, 0, str2);
    cmpd(CCR0, chr1, chr2);
    bne(CCR0, Lslow);   // Mismatch somewhere in these 8 bytes: rescan slowly.
    addi(str1, str1, stride1);
    addi(str2, str2, stride2);
    bdnz(Lfastloop);
    mr(cnt1, cnt2); // Remaining characters.
    bind(Lskipfast);
  }

  // Loop which searches the first difference character by character.
  cmpwi(CCR0, cnt1, 0);
  beq(CCR0, Lreturn_diff);
  bind(Lslow);
  mtctr(cnt1);

  // Per-character strides in bytes for each encoding combination.
  switch (ae) {
    case StrIntrinsicNode::LL: stride1 = 1; stride2 = 1; break;
    case StrIntrinsicNode::UL: // fallthru (see comment above)
    case StrIntrinsicNode::LU: stride1 = 1; stride2 = 2; break;
    case StrIntrinsicNode::UU: stride1 = 2; stride2 = 2; break;
    default: ShouldNotReachHere(); break;
  }

  bind(Lloop);
  if (stride1 == 1) { lbz(chr1, 0, str1); } else { lhz(chr1, 0, str1); }
  if (stride2 == 1) { lbz(chr2, 0, str2); } else { lhz(chr2, 0, str2); }
  subf_(result, chr2, chr1); // result = chr1 - chr2
  bne(CCR0, Ldone);
  addi(str1, str1, stride1);
  addi(str2, str2, stride2);
  bdnz(Lloop);

  // If strings are equal up to min length, return the length difference.
  bind(Lreturn_diff);
  mr(result, diff);

  // Otherwise, return the difference between the first mismatched chars.
  bind(Ldone);
  if (ae == StrIntrinsicNode::UL) {
    neg(result, result); // Negate result (see note above).
  }
}
3711 
// Intrinsic for Arrays.equals (is_array_equ) or StringUTF16/Latin1 region
// equality. Sets result to 1 if equal, 0 otherwise. For is_array_equ the
// inputs are array oops (lengths and base offsets handled here); otherwise
// they are data pointers and 'limit' is a byte count (shifted for UTF16).
// Kills ary1, ary2, limit, tmp0(R0), tmp1, CTR, CCR0 (and CCR1 for arrays).
void MacroAssembler::array_equals(bool is_array_equ, Register ary1, Register ary2,
                                  Register limit, Register tmp1, Register result, bool is_byte) {
  const Register tmp0 = R0;
  assert_different_registers(ary1, ary2, limit, tmp0, tmp1, result);
  Label Ldone, Lskiploop, Lloop, Lfastloop, Lskipfast;
  bool limit_needs_shift = false;

  if (is_array_equ) {
    const int length_offset = arrayOopDesc::length_offset_in_bytes();
    const int base_offset   = arrayOopDesc::base_offset_in_bytes(is_byte ? T_BYTE : T_CHAR);

    // Return true if the same array.
    cmpd(CCR0, ary1, ary2);
    beq(CCR0, Lskiploop);

    // Return false if one of them is NULL.
    cmpdi(CCR0, ary1, 0);
    cmpdi(CCR1, ary2, 0);
    li(result, 0);
    cror(CCR0, Assembler::equal, CCR1, Assembler::equal); // Either null?
    beq(CCR0, Ldone);

    // Load the lengths of arrays.
    lwz(limit, length_offset, ary1);
    lwz(tmp0, length_offset, ary2);

    // Return false if the two arrays are not equal length.
    cmpw(CCR0, limit, tmp0);
    bne(CCR0, Ldone);

    // Load array addresses.
    addi(ary1, ary1, base_offset);
    addi(ary2, ary2, base_offset);
  } else {
    // Raw pointers: 'limit' is a byte count, convert to chars if UTF16.
    limit_needs_shift = !is_byte;
    li(result, 0); // Assume not equal.
  }

  // Rename registers
  Register chr1 = tmp0;
  Register chr2 = tmp1;

  // Compare 8 bytes per iteration in fast loop.
  const int log2_chars_per_iter = is_byte ? 3 : 2;

  srwi_(tmp0, limit, log2_chars_per_iter + (limit_needs_shift ? 1 : 0));
  beq(CCR0, Lskipfast); // Not enough elements for the 8-byte loop.
  mtctr(tmp0);

  bind(Lfastloop);
  ld(chr1, 0, ary1);
  ld(chr2, 0, ary2);
  addi(ary1, ary1, 8);
  addi(ary2, ary2, 8);
  cmpd(CCR0, chr1, chr2);
  bne(CCR0, Ldone);     // Mismatch: result is already 0.
  bdnz(Lfastloop);

  bind(Lskipfast);
  rldicl_(limit, limit, limit_needs_shift ? 64 - 1 : 0, 64 - log2_chars_per_iter); // Remaining characters.
  beq(CCR0, Lskiploop); // No remainder: all equal.
  mtctr(limit);

  // Character by character.
  bind(Lloop);
  if (is_byte) {
    lbz(chr1, 0, ary1);
    lbz(chr2, 0, ary2);
    addi(ary1, ary1, 1);
    addi(ary2, ary2, 1);
  } else {
    lhz(chr1, 0, ary1);
    lhz(chr2, 0, ary2);
    addi(ary1, ary1, 2);
    addi(ary2, ary2, 2);
  }
  cmpw(CCR0, chr1, chr2);
  bne(CCR0, Ldone);     // Mismatch: result is already 0.
  bdnz(Lloop);

  bind(Lskiploop);
  li(result, 1); // All characters are equal.
  bind(Ldone);
}
3796 
// Emit code implementing String.indexOf: search 'haystack' for the first
// occurrence of 'needle' and leave its character index in 'result' (-1 if
// not found). The main loop is optimized for needle counts >= 2; a variable
// needle count of 1 is routed to the special-case path (L_TooShort) below.
//
//   result        - match index relative to haystack start, or -1 (output)
//   haystack      - address of haystack characters
//   haycnt        - number of haystack characters (killed, reused as last_addr)
//   needle        - address of needle characters
//   needle_values - not used by this implementation
//   needlecnt     - number of needle characters (killed)
//   needlecntval  - needle count if known at compile time, 0 otherwise
//   tmp1..tmp4    - scratch registers (killed)
//   ae            - argument encodings (StrIntrinsicNode::LL/UU/UL); LU is not supported
void MacroAssembler::string_indexof(Register result, Register haystack, Register haycnt,
                                    Register needle, ciTypeArray* needle_values, Register needlecnt, int needlecntval,
                                    Register tmp1, Register tmp2, Register tmp3, Register tmp4, int ae) {

  // Ensure 0<needlecnt<=haycnt in ideal graph as prerequisite!
  Label L_TooShort, L_Found, L_NotFound, L_End;
  Register last_addr = haycnt, // Kill haycnt at the beginning.
  addr      = tmp1,
  n_start   = tmp2,
  ch1       = tmp3,
  ch2       = R0;

  assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
  // Character sizes in bytes: the haystack is compressed (1 byte) only for LL,
  // the needle is uncompressed (2 bytes) only for UU.
  const int h_csize = (ae == StrIntrinsicNode::LL) ? 1 : 2;
  const int n_csize = (ae == StrIntrinsicNode::UU) ? 2 : 1;

  // **************************************************************************************************
  // Prepare for main loop: optimized for needle count >=2, bail out otherwise.
  // **************************************************************************************************

  // Compute last haystack addr to use if no match gets found.
  clrldi(haycnt, haycnt, 32);         // Ensure positive int is valid as 64 bit value.
  addi(addr, haystack, -h_csize);     // Accesses use pre-increment.
  if (needlecntval == 0) { // variable needlecnt
   cmpwi(CCR6, needlecnt, 2);
   clrldi(needlecnt, needlecnt, 32);  // Ensure positive int is valid as 64 bit value.
   blt(CCR6, L_TooShort);             // Variable needlecnt: handle short needle separately.
  }

  if (n_csize == 2) { lwz(n_start, 0, needle); } else { lhz(n_start, 0, needle); } // Load first 2 characters of needle.

  if (needlecntval == 0) { // variable needlecnt
   subf(ch1, needlecnt, haycnt);      // Last character index to compare is haycnt-needlecnt.
   addi(needlecnt, needlecnt, -2);    // Rest of needle.
  } else { // constant needlecnt
  guarantee(needlecntval != 1, "IndexOf with single-character needle must be handled separately");
  assert((needlecntval & 0x7fff) == needlecntval, "wrong immediate");
   addi(ch1, haycnt, -needlecntval);  // Last character index to compare is haycnt-needlecnt.
   if (needlecntval > 3) { li(needlecnt, needlecntval - 2); } // Rest of needle.
  }

  if (h_csize == 2) { slwi(ch1, ch1, 1); } // Scale to number of bytes.

  if (ae ==StrIntrinsicNode::UL) {
   // Latin1 needle vs. UTF-16 haystack: widen the first two Latin1 needle bytes
   // in n_start into two 16-bit characters so that they can be compared
   // directly against 2 haystack characters loaded as a word.
   srwi(tmp4, n_start, 1*8);          // ___0
   rlwimi(n_start, tmp4, 2*8, 0, 23); // _0_1
  }

  add(last_addr, haystack, ch1);      // Point to last address to compare (haystack+2*(haycnt-needlecnt)).

  // Main Loop (now we have at least 2 characters).
  Label L_OuterLoop, L_InnerLoop, L_FinalCheck, L_Comp1, L_Comp2;
  bind(L_OuterLoop); // Search for 1st 2 characters.
  Register addr_diff = tmp4;
   subf(addr_diff, addr, last_addr);  // Difference between already checked address and last address to check.
   addi(addr, addr, h_csize);         // This is the new address we want to use for comparing.
   srdi_(ch2, addr_diff, h_csize);
   beq(CCR0, L_FinalCheck);           // 2 characters left?
   mtctr(ch2);                        // num of characters / 2
  bind(L_InnerLoop);                  // Main work horse (2x unrolled search loop)
   if (h_csize == 2) {                // Load 2 characters of haystack (ignore alignment).
    lwz(ch1, 0, addr);
    lwz(ch2, 2, addr);
   } else {
    lhz(ch1, 0, addr);
    lhz(ch2, 1, addr);
   }
   cmpw(CCR0, ch1, n_start);          // Compare 2 characters (1 would be sufficient but try to reduce branches to CompLoop).
   cmpw(CCR1, ch2, n_start);
   beq(CCR0, L_Comp1);                // Did we find the needle start?
   beq(CCR1, L_Comp2);
   addi(addr, addr, 2 * h_csize);
   bdnz(L_InnerLoop);
  bind(L_FinalCheck);
   andi_(addr_diff, addr_diff, h_csize); // Remaining characters not covered by InnerLoop: (num of characters) & 1.
   beq(CCR0, L_NotFound);
   if (h_csize == 2) { lwz(ch1, 0, addr); } else { lhz(ch1, 0, addr); } // One position left at which we have to compare.
   cmpw(CCR1, ch1, n_start);
   beq(CCR1, L_Comp1);
  bind(L_NotFound);
   li(result, -1);                    // not found
   b(L_End);

   // **************************************************************************************************
   // Special Case: unfortunately, the variable needle case can be called with needlecnt<2
   // **************************************************************************************************
  if (needlecntval == 0) {           // We have to handle these cases separately.
  Label L_OneCharLoop;
  bind(L_TooShort);
   mtctr(haycnt);
   if (n_csize == 2) { lhz(n_start, 0, needle); } else { lbz(n_start, 0, needle); } // First character of needle
  bind(L_OneCharLoop);
   if (h_csize == 2) { lhzu(ch1, 2, addr); } else { lbzu(ch1, 1, addr); }
   cmpw(CCR1, ch1, n_start);
   beq(CCR1, L_Found);               // Did we find the one character needle?
   bdnz(L_OneCharLoop);
   li(result, -1);                   // Not found.
   b(L_End);
  }

  // **************************************************************************************************
  // Regular Case Part II: compare rest of needle (first 2 characters have been compared already)
  // **************************************************************************************************

  // Compare the rest
  bind(L_Comp2);
   addi(addr, addr, h_csize);        // First comparison has failed, 2nd one hit.
  bind(L_Comp1);                     // Addr points to possible needle start.
  if (needlecntval != 2) {           // Const needlecnt==2?
   if (needlecntval != 3) {
    if (needlecntval == 0) { beq(CCR6, L_Found); } // Variable needlecnt==2?
    Register n_ind = tmp4,
             h_ind = n_ind;
    li(n_ind, 2 * n_csize);          // First 2 characters are already compared, use index 2.
    mtctr(needlecnt);                // Decremented by 2, still > 0.
   Label L_CompLoop;
   bind(L_CompLoop);
    if (ae ==StrIntrinsicNode::UL) {
      h_ind = ch1;
      sldi(h_ind, n_ind, 1);         // Haystack byte index is twice the needle byte index (1-byte vs 2-byte chars).
    }
    if (n_csize == 2) { lhzx(ch2, needle, n_ind); } else { lbzx(ch2, needle, n_ind); }
    if (h_csize == 2) { lhzx(ch1, addr, h_ind); } else { lbzx(ch1, addr, h_ind); }
    cmpw(CCR1, ch1, ch2);
    bne(CCR1, L_OuterLoop);
    addi(n_ind, n_ind, n_csize);
    bdnz(L_CompLoop);
   } else { // No loop required if there's only one needle character left.
    if (n_csize == 2) { lhz(ch2, 2 * 2, needle); } else { lbz(ch2, 2 * 1, needle); }
    if (h_csize == 2) { lhz(ch1, 2 * 2, addr); } else { lbz(ch1, 2 * 1, addr); }
    cmpw(CCR1, ch1, ch2);
    bne(CCR1, L_OuterLoop);
   }
  }
  // Return index ...
  bind(L_Found);
   subf(result, haystack, addr);     // relative to haystack, ...
   if (h_csize == 2) { srdi(result, result, 1); } // in characters.
  bind(L_End);
} // string_indexof
3937 
// Emit code to find the first occurrence of a single character in a string.
//
//   result     - match index relative to haystack start, or -1 (output)
//   haystack   - address of haystack characters
//   haycnt     - number of haystack characters
//   needle     - register holding the character to search for; if needle == R0,
//                the immediate 'needleChar' is used instead
//   needleChar - constant search character (only used when needle == R0)
//   tmp1, tmp2 - scratch registers (killed)
//   is_byte    - true for a compressed (1-byte) haystack, false for UTF-16
//
// The numeric comments (//4:, //8:, ...) mark emitted instruction counts.
void MacroAssembler::string_indexof_char(Register result, Register haystack, Register haycnt,
                                         Register needle, jchar needleChar, Register tmp1, Register tmp2, bool is_byte) {
  assert_different_registers(haystack, haycnt, needle, tmp1, tmp2);

  Label L_InnerLoop, L_FinalCheck, L_Found1, L_Found2, L_NotFound, L_End;
  Register addr = tmp1,
           ch1 = tmp2,
           ch2 = R0;

  const int h_csize = is_byte ? 1 : 2;

//4:
   srwi_(tmp2, haycnt, 1);   // Shift right by exact_log2(UNROLL_FACTOR).
   mr(addr, haystack);
   beq(CCR0, L_FinalCheck);  // Less than 2 characters left? Skip the unrolled loop.
   mtctr(tmp2);              // Move to count register.
//8:
  bind(L_InnerLoop);         // Main work horse (2x unrolled search loop).
   if (!is_byte) {
    lhz(ch1, 0, addr);
    lhz(ch2, 2, addr);
   } else {
    lbz(ch1, 0, addr);
    lbz(ch2, 1, addr);
   }
   // Compare against the register needle, or the immediate if needle == R0.
   (needle != R0) ? cmpw(CCR0, ch1, needle) : cmplwi(CCR0, ch1, (unsigned int)needleChar);
   (needle != R0) ? cmpw(CCR1, ch2, needle) : cmplwi(CCR1, ch2, (unsigned int)needleChar);
   beq(CCR0, L_Found1);      // Did we find the needle?
   beq(CCR1, L_Found2);
   addi(addr, addr, 2 * h_csize);
   bdnz(L_InnerLoop);
//16:
  bind(L_FinalCheck);
   andi_(R0, haycnt, 1);     // Odd character count: one position not covered by the 2x loop.
   beq(CCR0, L_NotFound);
   if (!is_byte) { lhz(ch1, 0, addr); } else { lbz(ch1, 0, addr); } // One position left at which we have to compare.
   (needle != R0) ? cmpw(CCR1, ch1, needle) : cmplwi(CCR1, ch1, (unsigned int)needleChar);
   beq(CCR1, L_Found1);
//21:
  bind(L_NotFound);
   li(result, -1);           // Not found.
   b(L_End);

  bind(L_Found2);
   addi(addr, addr, h_csize); // Match was at the second position of the unrolled pair.
//24:
  bind(L_Found1);            // Return index ...
   subf(result, haystack, addr); // relative to haystack, ...
   if (!is_byte) { srdi(result, result, 1); } // in characters.
  bind(L_End);
} // string_indexof_char
3989 
3990 
// Emit code that scans the 'cnt' bytes at 'src' and sets 'result' to 1 if any
// byte has its most significant bit set (i.e. is negative when interpreted as
// a signed byte), and to 0 otherwise.
//
//   src    - start address of the byte buffer (killed: advanced past the data)
//   cnt    - number of bytes to scan
//   result - 1 if a negative byte was found, 0 otherwise (output)
//   tmp1, tmp2 - scratch registers (killed)
void MacroAssembler::has_negatives(Register src, Register cnt, Register result,
                                   Register tmp1, Register tmp2) {
  const Register tmp0 = R0;
  assert_different_registers(src, result, cnt, tmp0, tmp1, tmp2);
  Label Lfastloop, Lslow, Lloop, Lnoneg, Ldone;

  // Check if cnt >= 16 (one 2x unrolled fast-loop iteration handles 16 bytes).
  lis(tmp1, (int)(short)0x8080);  // Start building the sign-bit mask; it is only
  srwi_(tmp2, cnt, 4);            // completed below, on the fast path.
  li(result, 1);                  // Assume there's a negative byte.
  beq(CCR0, Lslow);
  ori(tmp1, tmp1, 0x8080);
  rldimi(tmp1, tmp1, 32, 0);      // tmp1 = 0x8080808080808080 (MSB of every byte).
  mtctr(tmp2);

  // 2x unrolled loop
  bind(Lfastloop);
  ld(tmp2, 0, src);
  ld(tmp0, 8, src);

  orr(tmp0, tmp2, tmp0);          // Combine both words; any sign bit survives the OR.

  and_(tmp0, tmp0, tmp1);
  bne(CCR0, Ldone);               // Found negative byte.
  addi(src, src, 16);

  bdnz(Lfastloop);

  bind(Lslow);                    // Fallback to slow version
  rldicl_(tmp0, cnt, 0, 64-4);    // Remaining bytes: cnt & 15.
  beq(CCR0, Lnoneg);
  mtctr(tmp0);
  bind(Lloop);                    // Byte-by-byte tail loop.
  lbz(tmp0, 0, src);
  addi(src, src, 1);
  andi_(tmp0, tmp0, 0x80);        // Test the sign bit of the single byte.
  bne(CCR0, Ldone);               // Found negative byte.
  bdnz(Lloop);
  bind(Lnoneg);
  li(result, 0);                  // No negative byte found.

  bind(Ldone);
}
4034 
4035 #endif // Compiler2
4036 
4037 // Helpers for Intrinsic Emitters
4038 //
4039 // Revert the byte order of a 32bit value in a register
4040 //   src: 0x44556677
4041 //   dst: 0x77665544
4042 // Three steps to obtain the result:
4043 //  1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word
4044 //     into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared.
4045 //     This value initializes dst.
4046 //  2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost
4047 //     byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go.
4048 //     This value is mask inserted into dst with a [0..23] mask of 1s.
4049 //  3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position.
4050 //     This value is mask inserted into dst with a [8..15] mask of 1s.
4051 void MacroAssembler::load_reverse_32(Register dst, Register src) {
4052   assert_different_registers(dst, src);
4053 
4054   rldicl(dst, src, (4+1)*8, 56);       // Rotate byte 4 into position 7 (rightmost), clear all to the left.
4055   rlwimi(dst, src,     3*8,  0, 23);   // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone.
4056   rlwimi(dst, src,     1*8,  8, 15);   // Insert byte 6 into position 5, leave the rest alone.
4057 }
4058 
4059 // Calculate the column addresses of the crc32 lookup table into distinct registers.
4060 // This loop-invariant calculation is moved out of the loop body, reducing the loop
4061 // body size from 20 to 16 instructions.
4062 // Returns the offset that was used to calculate the address of column tc3.
4063 // Due to register shortage, setting tc3 may overwrite table. With the return offset
4064 // at hand, the original table address can be easily reconstructed.
4065 int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) {
4066 
4067 #ifdef VM_LITTLE_ENDIAN
4068   // This is what we implement (the DOLIT4 part):
4069   // ========================================================================= */
4070   // #define DOLIT4 c ^= *buf4++; \
4071   //         c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^ \
4072   //             crc_table[1][(c >> 16) & 0xff] ^ crc_table[0][c >> 24]
4073   // #define DOLIT32 DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4
4074   // ========================================================================= */
4075   const int ix0 = 3*(4*CRC32_COLUMN_SIZE);
4076   const int ix1 = 2*(4*CRC32_COLUMN_SIZE);
4077   const int ix2 = 1*(4*CRC32_COLUMN_SIZE);
4078   const int ix3 = 0*(4*CRC32_COLUMN_SIZE);
4079 #else
4080   // This is what we implement (the DOBIG4 part):
4081   // =========================================================================
4082   // #define DOBIG4 c ^= *++buf4; \
4083   //         c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \
4084   //             crc_table[6][(c >> 16) & 0xff] ^ crc_table[7][c >> 24]
4085   // #define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4
4086   // =========================================================================
4087   const int ix0 = 4*(4*CRC32_COLUMN_SIZE);
4088   const int ix1 = 5*(4*CRC32_COLUMN_SIZE);
4089   const int ix2 = 6*(4*CRC32_COLUMN_SIZE);
4090   const int ix3 = 7*(4*CRC32_COLUMN_SIZE);
4091 #endif
4092   assert_different_registers(table, tc0, tc1, tc2);
4093   assert(table == tc3, "must be!");
4094 
4095   addi(tc0, table, ix0);
4096   addi(tc1, table, ix1);
4097   addi(tc2, table, ix2);
4098   if (ix3 != 0) addi(tc3, table, ix3);
4099 
4100   return ix3;
4101 }
4102 
4103 /**
4104  * uint32_t crc;
4105  * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
4106  */
4107 void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) {
4108   assert_different_registers(crc, table, tmp);
4109   assert_different_registers(val, table);
4110 
4111   if (crc == val) {                   // Must rotate first to use the unmodified value.
4112     rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
4113                                       // As we use a word (4-byte) instruction, we have to adapt the mask bit positions.
4114     srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
4115   } else {
4116     srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
4117     rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
4118   }
4119   lwzx(tmp, table, tmp);
4120   xorr(crc, crc, tmp);
4121 }
4122 
4123 /**
4124  * uint32_t crc;
4125  * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
4126  */
4127 void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
4128   fold_byte_crc32(crc, crc, table, tmp);
4129 }
4130 
4131 /**
4132  * Emits code to update CRC-32 with a byte value according to constants in table.
4133  *
4134  * @param [in,out]crc   Register containing the crc.
4135  * @param [in]val       Register containing the byte to fold into the CRC.
4136  * @param [in]table     Register containing the table of crc constants.
4137  *
4138  * uint32_t crc;
4139  * val = crc_table[(val ^ crc) & 0xFF];
4140  * crc = val ^ (crc >> 8);
4141  */
4142 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
4143   BLOCK_COMMENT("update_byte_crc32:");
4144   xorr(val, val, crc);
4145   fold_byte_crc32(crc, val, table, val);
4146 }
4147 
4148 /**
4149  * @param crc   register containing existing CRC (32-bit)
4150  * @param buf   register pointing to input byte buffer (byte*)
4151  * @param len   register containing number of bytes
4152  * @param table register pointing to CRC table
4153  */
4154 void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table,
4155                                            Register data, bool loopAlignment) {
4156   assert_different_registers(crc, buf, len, table, data);
4157 
4158   Label L_mainLoop, L_done;
4159   const int mainLoop_stepping  = 1;
4160   const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4;
4161 
4162   // Process all bytes in a single-byte loop.
4163   clrldi_(len, len, 32);                         // Enforce 32 bit. Anything to do?
4164   beq(CCR0, L_done);
4165 
4166   mtctr(len);
4167   align(mainLoop_alignment);
4168   BIND(L_mainLoop);
4169     lbz(data, 0, buf);                           // Byte from buffer, zero-extended.
4170     addi(buf, buf, mainLoop_stepping);           // Advance buffer position.
4171     update_byte_crc32(crc, data, table);
4172     bdnz(L_mainLoop);                            // Iterate.
4173 
4174   bind(L_done);
4175 }
4176 
4177 /**
4178  * Emits code to update CRC-32 with a 4-byte value according to constants in table
4179  * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c
4180  */
4181 // A not on the lookup table address(es):
4182 // The lookup table consists of two sets of four columns each.
4183 // The columns {0..3} are used for little-endian machines.
4184 // The columns {4..7} are used for big-endian machines.
4185 // To save the effort of adding the column offset to the table address each time
4186 // a table element is looked up, it is possible to pass the pre-calculated
4187 // column addresses.
4188 // Uses R9..R12 as work register. Must be saved/restored by caller, if necessary.
4189 void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc,
4190                                         Register t0,  Register t1,  Register t2,  Register t3,
4191                                         Register tc0, Register tc1, Register tc2, Register tc3) {
4192   assert_different_registers(crc, t3);
4193 
4194   // XOR crc with next four bytes of buffer.
4195   lwz(t3, bufDisp, buf);
4196   if (bufInc != 0) {
4197     addi(buf, buf, bufInc);
4198   }
4199   xorr(t3, t3, crc);
4200 
4201   // Chop crc into 4 single-byte pieces, shifted left 2 bits, to form the table indices.
4202   rlwinm(t0, t3,  2,         24-2, 31-2);  // ((t1 >>  0) & 0xff) << 2
4203   rlwinm(t1, t3,  32+(2- 8), 24-2, 31-2);  // ((t1 >>  8) & 0xff) << 2
4204   rlwinm(t2, t3,  32+(2-16), 24-2, 31-2);  // ((t1 >> 16) & 0xff) << 2
4205   rlwinm(t3, t3,  32+(2-24), 24-2, 31-2);  // ((t1 >> 24) & 0xff) << 2
4206 
4207   // Use the pre-calculated column addresses.
4208   // Load pre-calculated table values.
4209   lwzx(t0, tc0, t0);
4210   lwzx(t1, tc1, t1);
4211   lwzx(t2, tc2, t2);
4212   lwzx(t3, tc3, t3);
4213 
4214   // Calculate new crc from table values.
4215   xorr(t0,  t0, t1);
4216   xorr(t2,  t2, t3);
4217   xorr(crc, t0, t2);  // Now crc contains the final checksum value.
4218 }
4219 
4220 /**
4221  * @param crc   register containing existing CRC (32-bit)
4222  * @param buf   register pointing to input byte buffer (byte*)
4223  * @param len   register containing number of bytes
4224  * @param table register pointing to CRC table
4225  *
4226  * Uses R9..R12 as work register. Must be saved/restored by caller!
4227  */
4228 void MacroAssembler::kernel_crc32_2word(Register crc, Register buf, Register len, Register table,
4229                                         Register t0,  Register t1,  Register t2,  Register t3,
4230                                         Register tc0, Register tc1, Register tc2, Register tc3,
4231                                         bool invertCRC) {
4232   assert_different_registers(crc, buf, len, table);
4233 
4234   Label L_mainLoop, L_tail;
4235   Register  tmp  = t0;
4236   Register  data = t0;
4237   Register  tmp2 = t1;
4238   const int mainLoop_stepping  = 8;
4239   const int tailLoop_stepping  = 1;
4240   const int log_stepping       = exact_log2(mainLoop_stepping);
4241   const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
4242   const int complexThreshold   = 2*mainLoop_stepping;
4243 
4244   // Don't test for len <= 0 here. This pathological case should not occur anyway.
4245   // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles
4246   // for all well-behaved cases. The situation itself is detected and handled correctly
4247   // within update_byteLoop_crc32.
4248   assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
4249 
4250   BLOCK_COMMENT("kernel_crc32_2word {");
4251 
4252   if (invertCRC) {
4253     nand(crc, crc, crc);                      // 1s complement of crc
4254   }
4255 
4256   // Check for short (<mainLoop_stepping) buffer.
4257   cmpdi(CCR0, len, complexThreshold);
4258   blt(CCR0, L_tail);
4259 
4260   // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
4261   // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
4262   {
4263     // Align buf addr to mainLoop_stepping boundary.
4264     neg(tmp2, buf);                           // Calculate # preLoop iterations for alignment.
4265     rldicl(tmp2, tmp2, 0, 64-log_stepping);   // Rotate tmp2 0 bits, insert into tmp2, anding with mask with 1s from 62..63.
4266 
4267     if (complexThreshold > mainLoop_stepping) {
4268       sub(len, len, tmp2);                       // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4269     } else {
4270       sub(tmp, len, tmp2);                       // Remaining bytes for main loop.
4271       cmpdi(CCR0, tmp, mainLoop_stepping);
4272       blt(CCR0, L_tail);                         // For less than one mainloop_stepping left, do only tail processing
4273       mr(len, tmp);                              // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4274     }
4275     update_byteLoop_crc32(crc, buf, tmp2, table, data, false);
4276   }
4277 
4278   srdi(tmp2, len, log_stepping);                 // #iterations for mainLoop
4279   andi(len, len, mainLoop_stepping-1);           // remaining bytes for tailLoop
4280   mtctr(tmp2);
4281 
4282 #ifdef VM_LITTLE_ENDIAN
4283   Register crc_rv = crc;
4284 #else
4285   Register crc_rv = tmp;                         // Load_reverse needs separate registers to work on.
4286                                                  // Occupies tmp, but frees up crc.
4287   load_reverse_32(crc_rv, crc);                  // Revert byte order because we are dealing with big-endian data.
4288   tmp = crc;
4289 #endif
4290 
4291   int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
4292 
4293   align(mainLoop_alignment);                     // Octoword-aligned loop address. Shows 2% improvement.
4294   BIND(L_mainLoop);
4295     update_1word_crc32(crc_rv, buf, table, 0, 0, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
4296     update_1word_crc32(crc_rv, buf, table, 4, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
4297     bdnz(L_mainLoop);
4298 
4299 #ifndef VM_LITTLE_ENDIAN
4300   load_reverse_32(crc, crc_rv);                  // Revert byte order because we are dealing with big-endian data.
4301   tmp = crc_rv;                                  // Tmp uses it's original register again.
4302 #endif
4303 
4304   // Restore original table address for tailLoop.
4305   if (reconstructTableOffset != 0) {
4306     addi(table, table, -reconstructTableOffset);
4307   }
4308 
4309   // Process last few (<complexThreshold) bytes of buffer.
4310   BIND(L_tail);
4311   update_byteLoop_crc32(crc, buf, len, table, data, false);
4312 
4313   if (invertCRC) {
4314     nand(crc, crc, crc);                      // 1s complement of crc
4315   }
4316   BLOCK_COMMENT("} kernel_crc32_2word");
4317 }
4318 
4319 /**
4320  * @param crc   register containing existing CRC (32-bit)
4321  * @param buf   register pointing to input byte buffer (byte*)
4322  * @param len   register containing number of bytes
4323  * @param table register pointing to CRC table
4324  *
4325  * uses R9..R12 as work register. Must be saved/restored by caller!
4326  */
4327 void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table,
4328                                         Register t0,  Register t1,  Register t2,  Register t3,
4329                                         Register tc0, Register tc1, Register tc2, Register tc3,
4330                                         bool invertCRC) {
4331   assert_different_registers(crc, buf, len, table);
4332 
4333   Label L_mainLoop, L_tail;
4334   Register  tmp          = t0;
4335   Register  data         = t0;
4336   Register  tmp2         = t1;
4337   const int mainLoop_stepping  = 4;
4338   const int tailLoop_stepping  = 1;
4339   const int log_stepping       = exact_log2(mainLoop_stepping);
4340   const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
4341   const int complexThreshold   = 2*mainLoop_stepping;
4342 
4343   // Don't test for len <= 0 here. This pathological case should not occur anyway.
4344   // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles
4345   // for all well-behaved cases. The situation itself is detected and handled correctly
4346   // within update_byteLoop_crc32.
4347   assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
4348 
4349   BLOCK_COMMENT("kernel_crc32_1word {");
4350 
4351   if (invertCRC) {
4352     nand(crc, crc, crc);                      // 1s complement of crc
4353   }
4354 
4355   // Check for short (<mainLoop_stepping) buffer.
4356   cmpdi(CCR0, len, complexThreshold);
4357   blt(CCR0, L_tail);
4358 
4359   // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
4360   // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
4361   {
4362     // Align buf addr to mainLoop_stepping boundary.
4363     neg(tmp2, buf);                              // Calculate # preLoop iterations for alignment.
4364     rldicl(tmp2, tmp2, 0, 64-log_stepping);      // Rotate tmp2 0 bits, insert into tmp2, anding with mask with 1s from 62..63.
4365 
4366     if (complexThreshold > mainLoop_stepping) {
4367       sub(len, len, tmp2);                       // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4368     } else {
4369       sub(tmp, len, tmp2);                       // Remaining bytes for main loop.
4370       cmpdi(CCR0, tmp, mainLoop_stepping);
4371       blt(CCR0, L_tail);                         // For less than one mainloop_stepping left, do only tail processing
4372       mr(len, tmp);                              // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4373     }
4374     update_byteLoop_crc32(crc, buf, tmp2, table, data, false);
4375   }
4376 
4377   srdi(tmp2, len, log_stepping);                 // #iterations for mainLoop
4378   andi(len, len, mainLoop_stepping-1);           // remaining bytes for tailLoop
4379   mtctr(tmp2);
4380 
4381 #ifdef VM_LITTLE_ENDIAN
4382   Register crc_rv = crc;
4383 #else
4384   Register crc_rv = tmp;                         // Load_reverse needs separate registers to work on.
4385                                                  // Occupies tmp, but frees up crc.
4386   load_reverse_32(crc_rv, crc);                  // Revert byte order because we are dealing with big-endian data.
4387   tmp = crc;
4388 #endif
4389 
4390   int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
4391 
4392   align(mainLoop_alignment);                     // Octoword-aligned loop address. Shows 2% improvement.
4393   BIND(L_mainLoop);
4394     update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
4395     bdnz(L_mainLoop);
4396 
4397 #ifndef VM_LITTLE_ENDIAN
4398   load_reverse_32(crc, crc_rv);                  // Revert byte order because we are dealing with big-endian data.
4399   tmp = crc_rv;                                  // Tmp uses it's original register again.
4400 #endif
4401 
4402   // Restore original table address for tailLoop.
4403   if (reconstructTableOffset != 0) {
4404     addi(table, table, -reconstructTableOffset);
4405   }
4406 
4407   // Process last few (<complexThreshold) bytes of buffer.
4408   BIND(L_tail);
4409   update_byteLoop_crc32(crc, buf, len, table, data, false);
4410 
4411   if (invertCRC) {
4412     nand(crc, crc, crc);                      // 1s complement of crc
4413   }
4414   BLOCK_COMMENT("} kernel_crc32_1word");
4415 }
4416 
4417 /**
4418  * @param crc   register containing existing CRC (32-bit)
4419  * @param buf   register pointing to input byte buffer (byte*)
4420  * @param len   register containing number of bytes
4421  * @param table register pointing to CRC table
4422  *
4423  * Uses R7_ARG5, R8_ARG6 as work registers.
4424  */
4425 void MacroAssembler::kernel_crc32_1byte(Register crc, Register buf, Register len, Register table,
4426                                         Register t0,  Register t1,  Register t2,  Register t3,
4427                                         bool invertCRC) {
4428   assert_different_registers(crc, buf, len, table);
4429 
4430   Register  data = t0;                   // Holds the current byte to be folded into crc.
4431 
4432   BLOCK_COMMENT("kernel_crc32_1byte {");
4433 
4434   if (invertCRC) {
4435     nand(crc, crc, crc);                      // 1s complement of crc
4436   }
4437 
4438   // Process all bytes in a single-byte loop.
4439   update_byteLoop_crc32(crc, buf, len, table, data, true);
4440 
4441   if (invertCRC) {
4442     nand(crc, crc, crc);                      // 1s complement of crc
4443   }
4444   BLOCK_COMMENT("} kernel_crc32_1byte");
4445 }
4446 
4447 /**
4448  * @param crc             register containing existing CRC (32-bit)
4449  * @param buf             register pointing to input byte buffer (byte*)
4450  * @param len             register containing number of bytes
4451  * @param table           register pointing to CRC table
4452  * @param constants       register pointing to CRC table for 128-bit aligned memory
4453  * @param barretConstants register pointing to table for barrett reduction
4454  * @param t0              volatile register
4455  * @param t1              volatile register
4456  * @param t2              volatile register
4457  * @param t3              volatile register
4458  */
4459 void MacroAssembler::kernel_crc32_1word_vpmsumd(Register crc, Register buf, Register len, Register table,
4460                                                 Register constants,  Register barretConstants,
4461                                                 Register t0,  Register t1, Register t2, Register t3, Register t4,
4462                                                 bool invertCRC) {
4463   assert_different_registers(crc, buf, len, table);
4464 
4465   Label L_alignedHead, L_tail, L_alignTail, L_start, L_end;
4466 
4467   Register  prealign     = t0;
4468   Register  postalign    = t0;
4469 
4470   BLOCK_COMMENT("kernel_crc32_1word_vpmsumb {");
4471 
4472   // 1. use kernel_crc32_1word for shorter than 384bit
4473   clrldi(len, len, 32);
4474   cmpdi(CCR0, len, 384);
4475   bge(CCR0, L_start);
4476 
4477     Register tc0 = t4;
4478     Register tc1 = constants;
4479     Register tc2 = barretConstants;
4480     kernel_crc32_1word(crc, buf, len, table,t0, t1, t2, t3, tc0, tc1, tc2, table, invertCRC);
4481     b(L_end);
4482 
4483   BIND(L_start);
4484 
4485     // 2. ~c
4486     if (invertCRC) {
4487       nand(crc, crc, crc);                      // 1s complement of crc
4488     }
4489 
4490     // 3. calculate from 0 to first 128bit-aligned address
4491     clrldi_(prealign, buf, 57);
4492     beq(CCR0, L_alignedHead);
4493 
4494     subfic(prealign, prealign, 128);
4495 
4496     subf(len, prealign, len);
4497     update_byteLoop_crc32(crc, buf, prealign, table, t2, false);
4498 
4499     // 4. calculate from first 128bit-aligned address to last 128bit-aligned address
4500     BIND(L_alignedHead);
4501 
4502     clrldi(postalign, len, 57);
4503     subf(len, postalign, len);
4504 
4505     // len must be more than 256bit
4506     kernel_crc32_1word_aligned(crc, buf, len, constants, barretConstants, t1, t2, t3);
4507 
4508     // 5. calculate remaining
4509     cmpdi(CCR0, postalign, 0);
4510     beq(CCR0, L_tail);
4511 
4512     update_byteLoop_crc32(crc, buf, postalign, table, t2, false);
4513 
4514     BIND(L_tail);
4515 
4516     // 6. ~c
4517     if (invertCRC) {
4518       nand(crc, crc, crc);                      // 1s complement of crc
4519     }
4520 
4521   BIND(L_end);
4522 
4523   BLOCK_COMMENT("} kernel_crc32_1word_vpmsumb");
4524 }
4525 
4526 /**
4527  * @param crc             register containing existing CRC (32-bit)
4528  * @param buf             register pointing to input byte buffer (byte*)
4529  * @param len             register containing number of bytes
4530  * @param constants       register pointing to CRC table for 128-bit aligned memory
4531  * @param barretConstants register pointing to table for barrett reduction
4532  * @param t0              volatile register
4533  * @param t1              volatile register
4534  * @param t2              volatile register
4535  */
4536 void MacroAssembler::kernel_crc32_1word_aligned(Register crc, Register buf, Register len,
4537     Register constants, Register barretConstants, Register t0, Register t1, Register t2) {
4538   Label L_mainLoop, L_tail, L_alignTail, L_barrett_reduction, L_end, L_first_warm_up_done, L_first_cool_down, L_second_cool_down, L_XOR, L_test;
4539   Label L_lv0, L_lv1, L_lv2, L_lv3, L_lv4, L_lv5, L_lv6, L_lv7, L_lv8, L_lv9, L_lv10, L_lv11, L_lv12, L_lv13, L_lv14, L_lv15;
4540   Label L_1, L_2, L_3, L_4;
4541 
4542   Register  rLoaded      = t0;
4543   Register  rTmp1        = t1;
4544   Register  rTmp2        = t2;
4545   Register  off16        = R22;
4546   Register  off32        = R23;
4547   Register  off48        = R24;
4548   Register  off64        = R25;
4549   Register  off80        = R26;
4550   Register  off96        = R27;
4551   Register  off112       = R28;
4552   Register  rIdx         = R29;
4553   Register  rMax         = R30;
4554   Register  constantsPos = R31;
4555 
4556   VectorRegister mask_32bit = VR24;
4557   VectorRegister mask_64bit = VR25;
4558   VectorRegister zeroes     = VR26;
4559   VectorRegister const1     = VR27;
4560   VectorRegister const2     = VR28;
4561 
4562   // Save non-volatile vector registers (frameless).
4563   Register offset = t1;   int offsetInt = 0;
4564   offsetInt -= 16; li(offset, -16);           stvx(VR20, offset, R1_SP);
4565   offsetInt -= 16; addi(offset, offset, -16); stvx(VR21, offset, R1_SP);
4566   offsetInt -= 16; addi(offset, offset, -16); stvx(VR22, offset, R1_SP);
4567   offsetInt -= 16; addi(offset, offset, -16); stvx(VR23, offset, R1_SP);
4568   offsetInt -= 16; addi(offset, offset, -16); stvx(VR24, offset, R1_SP);
4569   offsetInt -= 16; addi(offset, offset, -16); stvx(VR25, offset, R1_SP);
4570   offsetInt -= 16; addi(offset, offset, -16); stvx(VR26, offset, R1_SP);
4571   offsetInt -= 16; addi(offset, offset, -16); stvx(VR27, offset, R1_SP);
4572   offsetInt -= 16; addi(offset, offset, -16); stvx(VR28, offset, R1_SP);
4573   offsetInt -= 8; std(R22, offsetInt, R1_SP);
4574   offsetInt -= 8; std(R23, offsetInt, R1_SP);
4575   offsetInt -= 8; std(R24, offsetInt, R1_SP);
4576   offsetInt -= 8; std(R25, offsetInt, R1_SP);
4577   offsetInt -= 8; std(R26, offsetInt, R1_SP);
4578   offsetInt -= 8; std(R27, offsetInt, R1_SP);
4579   offsetInt -= 8; std(R28, offsetInt, R1_SP);
4580   offsetInt -= 8; std(R29, offsetInt, R1_SP);
4581   offsetInt -= 8; std(R30, offsetInt, R1_SP);
4582   offsetInt -= 8; std(R31, offsetInt, R1_SP);
4583 
4584   // Set constants
4585   li(off16, 16);
4586   li(off32, 32);
4587   li(off48, 48);
4588   li(off64, 64);
4589   li(off80, 80);
4590   li(off96, 96);
4591   li(off112, 112);
4592 
4593   clrldi(crc, crc, 32);
4594 
4595   vxor(zeroes, zeroes, zeroes);
4596   vspltisw(VR0, -1);
4597 
4598   vsldoi(mask_32bit, zeroes, VR0, 4);
4599   vsldoi(mask_64bit, zeroes, VR0, 8);
4600 
4601   // Get the initial value into v8
4602   vxor(VR8, VR8, VR8);
4603   mtvrd(VR8, crc);
4604   vsldoi(VR8, zeroes, VR8, 8); // shift into bottom 32 bits
4605 
4606   li (rLoaded, 0);
4607 
4608   rldicr(rIdx, len, 0, 56);
4609 
4610   {
4611     BIND(L_1);
4612     // Checksum in blocks of MAX_SIZE (32768)
4613     lis(rMax, 0);
4614     ori(rMax, rMax, 32768);
4615     mr(rTmp2, rMax);
4616     cmpd(CCR0, rIdx, rMax);
4617     bgt(CCR0, L_2);
4618     mr(rMax, rIdx);
4619 
4620     BIND(L_2);
4621     subf(rIdx, rMax, rIdx);
4622 
4623     // our main loop does 128 bytes at a time
4624     srdi(rMax, rMax, 7);
4625 
4626     /*
4627      * Work out the offset into the constants table to start at. Each
4628      * constant is 16 bytes, and it is used against 128 bytes of input
4629      * data - 128 / 16 = 8
4630      */
4631     sldi(rTmp1, rMax, 4);
4632     srdi(rTmp2, rTmp2, 3);
4633     subf(rTmp1, rTmp1, rTmp2);
4634 
4635     // We reduce our final 128 bytes in a separate step
4636     addi(rMax, rMax, -1);
4637     mtctr(rMax);
4638 
4639     // Find the start of our constants
4640     add(constantsPos, constants, rTmp1);
4641 
4642     // zero VR0-v7 which will contain our checksums
4643     vxor(VR0, VR0, VR0);
4644     vxor(VR1, VR1, VR1);
4645     vxor(VR2, VR2, VR2);
4646     vxor(VR3, VR3, VR3);
4647     vxor(VR4, VR4, VR4);
4648     vxor(VR5, VR5, VR5);
4649     vxor(VR6, VR6, VR6);
4650     vxor(VR7, VR7, VR7);
4651 
4652     lvx(const1, constantsPos);
4653 
4654     /*
4655      * If we are looping back to consume more data we use the values
4656      * already in VR16-v23.
4657      */
4658     cmpdi(CCR0, rLoaded, 1);
4659     beq(CCR0, L_3);
4660     {
4661 
4662       // First warm up pass
4663       lvx(VR16, buf);
4664       lvx(VR17, off16, buf);
4665       lvx(VR18, off32, buf);
4666       lvx(VR19, off48, buf);
4667       lvx(VR20, off64, buf);
4668       lvx(VR21, off80, buf);
4669       lvx(VR22, off96, buf);
4670       lvx(VR23, off112, buf);
4671       addi(buf, buf, 8*16);
4672 
4673       // xor in initial value
4674       vxor(VR16, VR16, VR8);
4675     }
4676 
4677     BIND(L_3);
4678     bdz(L_first_warm_up_done);
4679 
4680     addi(constantsPos, constantsPos, 16);
4681     lvx(const2, constantsPos);
4682 
4683     // Second warm up pass
4684     vpmsumd(VR8, VR16, const1);
4685     lvx(VR16, buf);
4686 
4687     vpmsumd(VR9, VR17, const1);
4688     lvx(VR17, off16, buf);
4689 
4690     vpmsumd(VR10, VR18, const1);
4691     lvx(VR18, off32, buf);
4692 
4693     vpmsumd(VR11, VR19, const1);
4694     lvx(VR19, off48, buf);
4695 
4696     vpmsumd(VR12, VR20, const1);
4697     lvx(VR20, off64, buf);
4698 
4699     vpmsumd(VR13, VR21, const1);
4700     lvx(VR21, off80, buf);
4701 
4702     vpmsumd(VR14, VR22, const1);
4703     lvx(VR22, off96, buf);
4704 
4705     vpmsumd(VR15, VR23, const1);
4706     lvx(VR23, off112, buf);
4707 
4708     addi(buf, buf, 8 * 16);
4709 
4710     bdz(L_first_cool_down);
4711 
4712     /*
4713      * main loop. We modulo schedule it such that it takes three iterations
4714      * to complete - first iteration load, second iteration vpmsum, third
4715      * iteration xor.
4716      */
4717     {
4718       BIND(L_4);
4719       lvx(const1, constantsPos); addi(constantsPos, constantsPos, 16);
4720 
4721       vxor(VR0, VR0, VR8);
4722       vpmsumd(VR8, VR16, const2);
4723       lvx(VR16, buf);
4724 
4725       vxor(VR1, VR1, VR9);
4726       vpmsumd(VR9, VR17, const2);
4727       lvx(VR17, off16, buf);
4728 
4729       vxor(VR2, VR2, VR10);
4730       vpmsumd(VR10, VR18, const2);
4731       lvx(VR18, off32, buf);
4732 
4733       vxor(VR3, VR3, VR11);
4734       vpmsumd(VR11, VR19, const2);
4735       lvx(VR19, off48, buf);
4736       lvx(const2, constantsPos);
4737 
4738       vxor(VR4, VR4, VR12);
4739       vpmsumd(VR12, VR20, const1);
4740       lvx(VR20, off64, buf);
4741 
4742       vxor(VR5, VR5, VR13);
4743       vpmsumd(VR13, VR21, const1);
4744       lvx(VR21, off80, buf);
4745 
4746       vxor(VR6, VR6, VR14);
4747       vpmsumd(VR14, VR22, const1);
4748       lvx(VR22, off96, buf);
4749 
4750       vxor(VR7, VR7, VR15);
4751       vpmsumd(VR15, VR23, const1);
4752       lvx(VR23, off112, buf);
4753 
4754       addi(buf, buf, 8 * 16);
4755 
4756       bdnz(L_4);
4757     }
4758 
4759     BIND(L_first_cool_down);
4760 
4761     // First cool down pass
4762     lvx(const1, constantsPos);
4763     addi(constantsPos, constantsPos, 16);
4764 
4765     vxor(VR0, VR0, VR8);
4766     vpmsumd(VR8, VR16, const1);
4767 
4768     vxor(VR1, VR1, VR9);
4769     vpmsumd(VR9, VR17, const1);
4770 
4771     vxor(VR2, VR2, VR10);
4772     vpmsumd(VR10, VR18, const1);
4773 
4774     vxor(VR3, VR3, VR11);
4775     vpmsumd(VR11, VR19, const1);
4776 
4777     vxor(VR4, VR4, VR12);
4778     vpmsumd(VR12, VR20, const1);
4779 
4780     vxor(VR5, VR5, VR13);
4781     vpmsumd(VR13, VR21, const1);
4782 
4783     vxor(VR6, VR6, VR14);
4784     vpmsumd(VR14, VR22, const1);
4785 
4786     vxor(VR7, VR7, VR15);
4787     vpmsumd(VR15, VR23, const1);
4788 
4789     BIND(L_second_cool_down);
4790     // Second cool down pass
4791     vxor(VR0, VR0, VR8);
4792     vxor(VR1, VR1, VR9);
4793     vxor(VR2, VR2, VR10);
4794     vxor(VR3, VR3, VR11);
4795     vxor(VR4, VR4, VR12);
4796     vxor(VR5, VR5, VR13);
4797     vxor(VR6, VR6, VR14);
4798     vxor(VR7, VR7, VR15);
4799 
4800     /*
4801      * vpmsumd produces a 96 bit result in the least significant bits
4802      * of the register. Since we are bit reflected we have to shift it
4803      * left 32 bits so it occupies the least significant bits in the
4804      * bit reflected domain.
4805      */
4806     vsldoi(VR0, VR0, zeroes, 4);
4807     vsldoi(VR1, VR1, zeroes, 4);
4808     vsldoi(VR2, VR2, zeroes, 4);
4809     vsldoi(VR3, VR3, zeroes, 4);
4810     vsldoi(VR4, VR4, zeroes, 4);
4811     vsldoi(VR5, VR5, zeroes, 4);
4812     vsldoi(VR6, VR6, zeroes, 4);
4813     vsldoi(VR7, VR7, zeroes, 4);
4814 
4815     // xor with last 1024 bits
4816     lvx(VR8, buf);
4817     lvx(VR9, off16, buf);
4818     lvx(VR10, off32, buf);
4819     lvx(VR11, off48, buf);
4820     lvx(VR12, off64, buf);
4821     lvx(VR13, off80, buf);
4822     lvx(VR14, off96, buf);
4823     lvx(VR15, off112, buf);
4824     addi(buf, buf, 8 * 16);
4825 
4826     vxor(VR16, VR0, VR8);
4827     vxor(VR17, VR1, VR9);
4828     vxor(VR18, VR2, VR10);
4829     vxor(VR19, VR3, VR11);
4830     vxor(VR20, VR4, VR12);
4831     vxor(VR21, VR5, VR13);
4832     vxor(VR22, VR6, VR14);
4833     vxor(VR23, VR7, VR15);
4834 
4835     li(rLoaded, 1);
4836     cmpdi(CCR0, rIdx, 0);
4837     addi(rIdx, rIdx, 128);
4838     bne(CCR0, L_1);
4839   }
4840 
4841   // Work out how many bytes we have left
4842   andi_(len, len, 127);
4843 
4844   // Calculate where in the constant table we need to start
4845   subfic(rTmp1, len, 128);
4846   add(constantsPos, constantsPos, rTmp1);
4847 
4848   // How many 16 byte chunks are in the tail
4849   srdi(rIdx, len, 4);
4850   mtctr(rIdx);
4851 
4852   /*
4853    * Reduce the previously calculated 1024 bits to 64 bits, shifting
4854    * 32 bits to include the trailing 32 bits of zeros
4855    */
4856   lvx(VR0, constantsPos);
4857   lvx(VR1, off16, constantsPos);
4858   lvx(VR2, off32, constantsPos);
4859   lvx(VR3, off48, constantsPos);
4860   lvx(VR4, off64, constantsPos);
4861   lvx(VR5, off80, constantsPos);
4862   lvx(VR6, off96, constantsPos);
4863   lvx(VR7, off112, constantsPos);
4864   addi(constantsPos, constantsPos, 8 * 16);
4865 
4866   vpmsumw(VR0, VR16, VR0);
4867   vpmsumw(VR1, VR17, VR1);
4868   vpmsumw(VR2, VR18, VR2);
4869   vpmsumw(VR3, VR19, VR3);
4870   vpmsumw(VR4, VR20, VR4);
4871   vpmsumw(VR5, VR21, VR5);
4872   vpmsumw(VR6, VR22, VR6);
4873   vpmsumw(VR7, VR23, VR7);
4874 
4875   // Now reduce the tail (0 - 112 bytes)
4876   cmpdi(CCR0, rIdx, 0);
4877   beq(CCR0, L_XOR);
4878 
4879   lvx(VR16, buf); addi(buf, buf, 16);
4880   lvx(VR17, constantsPos);
4881   vpmsumw(VR16, VR16, VR17);
4882   vxor(VR0, VR0, VR16);
4883   beq(CCR0, L_XOR);
4884 
4885   lvx(VR16, buf); addi(buf, buf, 16);
4886   lvx(VR17, off16, constantsPos);
4887   vpmsumw(VR16, VR16, VR17);
4888   vxor(VR0, VR0, VR16);
4889   beq(CCR0, L_XOR);
4890 
4891   lvx(VR16, buf); addi(buf, buf, 16);
4892   lvx(VR17, off32, constantsPos);
4893   vpmsumw(VR16, VR16, VR17);
4894   vxor(VR0, VR0, VR16);
4895   beq(CCR0, L_XOR);
4896 
4897   lvx(VR16, buf); addi(buf, buf, 16);
4898   lvx(VR17, off48,constantsPos);
4899   vpmsumw(VR16, VR16, VR17);
4900   vxor(VR0, VR0, VR16);
4901   beq(CCR0, L_XOR);
4902 
4903   lvx(VR16, buf); addi(buf, buf, 16);
4904   lvx(VR17, off64, constantsPos);
4905   vpmsumw(VR16, VR16, VR17);
4906   vxor(VR0, VR0, VR16);
4907   beq(CCR0, L_XOR);
4908 
4909   lvx(VR16, buf); addi(buf, buf, 16);
4910   lvx(VR17, off80, constantsPos);
4911   vpmsumw(VR16, VR16, VR17);
4912   vxor(VR0, VR0, VR16);
4913   beq(CCR0, L_XOR);
4914 
4915   lvx(VR16, buf); addi(buf, buf, 16);
4916   lvx(VR17, off96, constantsPos);
4917   vpmsumw(VR16, VR16, VR17);
4918   vxor(VR0, VR0, VR16);
4919 
4920   // Now xor all the parallel chunks together
4921   BIND(L_XOR);
4922   vxor(VR0, VR0, VR1);
4923   vxor(VR2, VR2, VR3);
4924   vxor(VR4, VR4, VR5);
4925   vxor(VR6, VR6, VR7);
4926 
4927   vxor(VR0, VR0, VR2);
4928   vxor(VR4, VR4, VR6);
4929 
4930   vxor(VR0, VR0, VR4);
4931 
4932   b(L_barrett_reduction);
4933 
4934   BIND(L_first_warm_up_done);
4935   lvx(const1, constantsPos);
4936   addi(constantsPos, constantsPos, 16);
4937   vpmsumd(VR8,  VR16, const1);
4938   vpmsumd(VR9,  VR17, const1);
4939   vpmsumd(VR10, VR18, const1);
4940   vpmsumd(VR11, VR19, const1);
4941   vpmsumd(VR12, VR20, const1);
4942   vpmsumd(VR13, VR21, const1);
4943   vpmsumd(VR14, VR22, const1);
4944   vpmsumd(VR15, VR23, const1);
4945   b(L_second_cool_down);
4946 
4947   BIND(L_barrett_reduction);
4948 
4949   lvx(const1, barretConstants);
4950   addi(barretConstants, barretConstants, 16);
4951   lvx(const2, barretConstants);
4952 
4953   vsldoi(VR1, VR0, VR0, 8);
4954   vxor(VR0, VR0, VR1);    // xor two 64 bit results together
4955 
4956   // shift left one bit
4957   vspltisb(VR1, 1);
4958   vsl(VR0, VR0, VR1);
4959 
4960   vand(VR0, VR0, mask_64bit);
4961 
4962   /*
4963    * The reflected version of Barrett reduction. Instead of bit
4964    * reflecting our data (which is expensive to do), we bit reflect our
4965    * constants and our algorithm, which means the intermediate data in
4966    * our vector registers goes from 0-63 instead of 63-0. We can reflect
4967    * the algorithm because we don't carry in mod 2 arithmetic.
4968    */
4969   vand(VR1, VR0, mask_32bit);  // bottom 32 bits of a
4970   vpmsumd(VR1, VR1, const1);   // ma
4971   vand(VR1, VR1, mask_32bit);  // bottom 32bits of ma
4972   vpmsumd(VR1, VR1, const2);   // qn */
4973   vxor(VR0, VR0, VR1);         // a - qn, subtraction is xor in GF(2)
4974 
4975   /*
4976    * Since we are bit reflected, the result (ie the low 32 bits) is in
4977    * the high 32 bits. We just need to shift it left 4 bytes
4978    * V0 [ 0 1 X 3 ]
4979    * V0 [ 0 X 2 3 ]
4980    */
4981   vsldoi(VR0, VR0, zeroes, 4);    // shift result into top 64 bits of
4982 
4983   // Get it into r3
4984   mfvrd(crc, VR0);
4985 
4986   BIND(L_end);
4987 
4988   offsetInt = 0;
4989   // Restore non-volatile Vector registers (frameless).
4990   offsetInt -= 16; li(offset, -16);           lvx(VR20, offset, R1_SP);
4991   offsetInt -= 16; addi(offset, offset, -16); lvx(VR21, offset, R1_SP);
4992   offsetInt -= 16; addi(offset, offset, -16); lvx(VR22, offset, R1_SP);
4993   offsetInt -= 16; addi(offset, offset, -16); lvx(VR23, offset, R1_SP);
4994   offsetInt -= 16; addi(offset, offset, -16); lvx(VR24, offset, R1_SP);
4995   offsetInt -= 16; addi(offset, offset, -16); lvx(VR25, offset, R1_SP);
4996   offsetInt -= 16; addi(offset, offset, -16); lvx(VR26, offset, R1_SP);
4997   offsetInt -= 16; addi(offset, offset, -16); lvx(VR27, offset, R1_SP);
4998   offsetInt -= 16; addi(offset, offset, -16); lvx(VR28, offset, R1_SP);
4999   offsetInt -= 8;  ld(R22, offsetInt, R1_SP);
5000   offsetInt -= 8;  ld(R23, offsetInt, R1_SP);
5001   offsetInt -= 8;  ld(R24, offsetInt, R1_SP);
5002   offsetInt -= 8;  ld(R25, offsetInt, R1_SP);
5003   offsetInt -= 8;  ld(R26, offsetInt, R1_SP);
5004   offsetInt -= 8;  ld(R27, offsetInt, R1_SP);
5005   offsetInt -= 8;  ld(R28, offsetInt, R1_SP);
5006   offsetInt -= 8;  ld(R29, offsetInt, R1_SP);
5007   offsetInt -= 8;  ld(R30, offsetInt, R1_SP);
5008   offsetInt -= 8;  ld(R31, offsetInt, R1_SP);
5009 }
5010 
// Fold exactly one byte loaded from 'buf' into 'crc' via update_byte_crc32.
// 'len' is intentionally unused; 'tmp' is clobbered. If invertCRC is set,
// crc is one's-complemented before and after the update (for callers that
// keep the crc in non-inverted form).
void MacroAssembler::kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp, bool invertCRC) {
  assert_different_registers(crc, buf, /* len,  not used!! */ table, tmp);

  BLOCK_COMMENT("kernel_crc32_singleByte:");
  if (invertCRC) {
    nand(crc, crc, crc);                // 1s complement of crc
  }

  lbz(tmp, 0, buf);                     // Byte from buffer, zero-extended.
  update_byte_crc32(crc, tmp, table);

  if (invertCRC) {
    nand(crc, crc, crc);                // 1s complement of crc
  }
}
5026 
// Fold the byte already held in register 'val' into 'crc' via update_byte_crc32.
// Same as kernel_crc32_singleByte but without the memory load. If invertCRC is
// set, crc is one's-complemented before and after the update.
void MacroAssembler::kernel_crc32_singleByteReg(Register crc, Register val, Register table, bool invertCRC) {
  assert_different_registers(crc, val, table);

  BLOCK_COMMENT("kernel_crc32_singleByteReg:");
  if (invertCRC) {
    nand(crc, crc, crc);                // 1s complement of crc
  }

  update_byte_crc32(crc, val, table);

  if (invertCRC) {
    nand(crc, crc, crc);                // 1s complement of crc
  }
}
5041 
// dest_lo += src1 + src2
// dest_hi += carry-out of each of the two additions above
// 128-bit accumulate: add src1 and src2 into the low word, absorbing the
// carry-out of each addition into the high word. Clobbers R0 and XER.CA.
void MacroAssembler::add2_with_carry(Register dest_hi,
                                     Register dest_lo,
                                     Register src1, Register src2) {
  li(R0, 0);
  addc(dest_lo, dest_lo, src1);   // dest_lo += src1, carry-out -> XER.CA
  adde(dest_hi, dest_hi, R0);     // dest_hi += carry
  addc(dest_lo, dest_lo, src2);   // dest_lo += src2, carry-out -> XER.CA
  adde(dest_hi, dest_hi, R0);     // dest_hi += carry
}
5053 
5054 // Multiply 64 bit by 64 bit first loop.
void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart,
                                           Register x_xstart,
                                           Register y, Register y_idx,
                                           Register z,
                                           Register carry,
                                           Register product_high, Register product,
                                           Register idx, Register kdx,
                                           Register tmp) {
  //  jlong carry, x[], y[], z[];
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //    huge_128 product = y[idx] * x[xstart] + carry;
  //    z[kdx] = (jlong)product;
  //    carry  = (jlong)(product >>> 64);
  //  }
  //  z[xstart] = carry;

  Label L_first_loop, L_first_loop_exit;
  Label L_one_x, L_one_y, L_multiply;

  addic_(xstart, xstart, -1);
  blt(CCR0, L_one_x);   // Special case: length of x is 1.

  // Load next two integers of x.
  sldi(tmp, xstart, LogBytesPerInt);
  ldx(x_xstart, x, tmp);
#ifdef VM_LITTLE_ENDIAN
  // Swap the two 32-bit halves so the int pair matches array (big-endian) order.
  rldicl(x_xstart, x_xstart, 32, 0);
#endif

  align(32, 16);
  bind(L_first_loop);

  cmpdi(CCR0, idx, 1);
  blt(CCR0, L_first_loop_exit);   // All digits of y consumed.
  addi(idx, idx, -2);
  beq(CCR0, L_one_y);             // Exactly one digit left (CR0 still from cmpdi; addi does not touch it).

  // Load next two integers of y.
  sldi(tmp, idx, LogBytesPerInt);
  ldx(y_idx, y, tmp);
#ifdef VM_LITTLE_ENDIAN
  rldicl(y_idx, y_idx, 32, 0);
#endif


  bind(L_multiply);
  // 128-bit product = x_xstart * y_idx (high half in product_high).
  multiply64(product_high, product, x_xstart, y_idx);

  li(tmp, 0);
  addc(product, product, carry);         // Add carry to result.
  adde(product_high, product_high, tmp); // Add carry of the last addition.
  addi(kdx, kdx, -2);

  // Store result.
#ifdef VM_LITTLE_ENDIAN
  rldicl(product, product, 32, 0);
#endif
  sldi(tmp, kdx, LogBytesPerInt);
  stdx(product, z, tmp);
  mr_if_needed(carry, product_high);     // Propagate the high half as next carry.
  b(L_first_loop);


  bind(L_one_y); // Load one 32 bit portion of y as (0,value).

  lwz(y_idx, 0, y);
  b(L_multiply);


  bind(L_one_x); // Load one 32 bit portion of x as (0,value).

  lwz(x_xstart, 0, x);
  b(L_first_loop);

  bind(L_first_loop_exit);
}
5131 
5132 // Multiply 64 bit by 64 bit and add 128 bit.
void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y,
                                            Register z, Register yz_idx,
                                            Register idx, Register carry,
                                            Register product_high, Register product,
                                            Register tmp, int offset) {

  //  huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
  //  z[kdx] = (jlong)product;

  // Compute the byte offset of y[idx] (plus the caller-supplied bias).
  sldi(tmp, idx, LogBytesPerInt);
  if (offset) {
    addi(tmp, tmp, offset);
  }
  ldx(yz_idx, y, tmp);
#ifdef VM_LITTLE_ENDIAN
  // Swap 32-bit halves so the digit pair matches array (big-endian) order.
  rldicl(yz_idx, yz_idx, 32, 0);
#endif

  // 128-bit product = x_xstart * y[idx], then reuse yz_idx for z[kdx].
  multiply64(product_high, product, x_xstart, yz_idx);
  ldx(yz_idx, z, tmp);
#ifdef VM_LITTLE_ENDIAN
  rldicl(yz_idx, yz_idx, 32, 0);
#endif

  // product_high:product += carry + z[kdx].
  add2_with_carry(product_high, product, carry, yz_idx);

  // Recompute the offset of z[kdx] for the store.
  sldi(tmp, idx, LogBytesPerInt);
  if (offset) {
    addi(tmp, tmp, offset);
  }
#ifdef VM_LITTLE_ENDIAN
  rldicl(product, product, 32, 0);
#endif
  stdx(product, z, tmp);
}
5168 
5169 // Multiply 128 bit by 128 bit. Unrolled inner loop.
void MacroAssembler::multiply_128_x_128_loop(Register x_xstart,
                                             Register y, Register z,
                                             Register yz_idx, Register idx, Register carry,
                                             Register product_high, Register product,
                                             Register carry2, Register tmp) {

  //  jlong carry, x[], y[], z[];
  //  int kdx = ystart+1;
  //  for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
  //    huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
  //    z[kdx+idx+1] = (jlong)product;
  //    jlong carry2 = (jlong)(product >>> 64);
  //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
  //    z[kdx+idx] = (jlong)product;
  //    carry = (jlong)(product >>> 64);
  //  }
  //  idx += 2;
  //  if (idx > 0) {
  //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
  //    z[kdx+idx] = (jlong)product;
  //    carry = (jlong)(product >>> 64);
  //  }

  Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
  const Register jdx = R0;

  // Scale the index: each unrolled iteration consumes four 32-bit digits.
  srdi_(jdx, idx, 2);
  beq(CCR0, L_third_loop_exit);
  mtctr(jdx);

  align(32, 16);
  bind(L_third_loop);

  addi(idx, idx, -4);

  // Two 64x64 multiply-adds per iteration; carry alternates through carry2.
  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8);
  mr_if_needed(carry2, product_high);

  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0);
  mr_if_needed(carry, product_high);
  bdnz(L_third_loop);

  bind(L_third_loop_exit);  // Handle any left-over operand parts.

  andi_(idx, idx, 0x3);
  beq(CCR0, L_post_third_loop_done);

  Label L_check_1;

  addic_(idx, idx, -2);
  blt(CCR0, L_check_1);

  // One more full 64-bit (two-digit) step.
  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0);
  mr_if_needed(carry, product_high);

  bind(L_check_1);

  addi(idx, idx, 0x2);
  andi_(idx, idx, 0x1);
  addic_(idx, idx, -1);
  blt(CCR0, L_post_third_loop_done);

  // Final single 32-bit digit: multiply, accumulate, store low word.
  sldi(tmp, idx, LogBytesPerInt);
  lwzx(yz_idx, y, tmp);
  multiply64(product_high, product, x_xstart, yz_idx);
  lwzx(yz_idx, z, tmp);

  add2_with_carry(product_high, product, yz_idx, carry);

  sldi(tmp, idx, LogBytesPerInt);
  stwx(product, z, tmp);
  srdi(product, product, 32);

  // Reassemble the 64-bit carry from the high 32 bits of product and product_high.
  sldi(product_high, product_high, 32);
  orr(product, product, product_high);
  mr_if_needed(carry, product);

  bind(L_post_third_loop_done);
}   // multiply_128_x_128_loop
5250 
5251 void MacroAssembler::muladd(Register out, Register in,
5252                             Register offset, Register len, Register k,
5253                             Register tmp1, Register tmp2, Register carry) {
5254 
5255   // Labels
5256   Label LOOP, SKIP;
5257 
5258   // Make sure length is positive.
5259   cmpdi  (CCR0,    len,     0);
5260 
5261   // Prepare variables
5262   subi   (offset,  offset,  4);
5263   li     (carry,   0);
5264   ble    (CCR0,    SKIP);
5265 
5266   mtctr  (len);
5267   subi   (len,     len,     1    );
5268   sldi   (len,     len,     2    );
5269 
5270   // Main loop
5271   bind(LOOP);
5272   lwzx   (tmp1,    len,     in   );
5273   lwzx   (tmp2,    offset,  out  );
5274   mulld  (tmp1,    tmp1,    k    );
5275   add    (tmp2,    carry,   tmp2 );
5276   add    (tmp2,    tmp1,    tmp2 );
5277   stwx   (tmp2,    offset,  out  );
5278   srdi   (carry,   tmp2,    32   );
5279   subi   (offset,  offset,  4    );
5280   subi   (len,     len,     4    );
5281   bdnz   (LOOP);
5282   bind(SKIP);
5283 }
5284 
void MacroAssembler::multiply_to_len(Register x, Register xlen,
                                     Register y, Register ylen,
                                     Register z, Register zlen,
                                     Register tmp1, Register tmp2,
                                     Register tmp3, Register tmp4,
                                     Register tmp5, Register tmp6,
                                     Register tmp7, Register tmp8,
                                     Register tmp9, Register tmp10,
                                     Register tmp11, Register tmp12,
                                     Register tmp13) {

  ShortBranchVerifier sbv(this);

  assert_different_registers(x, xlen, y, ylen, z, zlen,
                             tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
  assert_different_registers(x, xlen, y, ylen, z, zlen,
                             tmp1, tmp2, tmp3, tmp4, tmp5, tmp7);
  assert_different_registers(x, xlen, y, ylen, z, zlen,
                             tmp1, tmp2, tmp3, tmp4, tmp5, tmp8);

  // Register roles for the schoolbook multiplication below.
  const Register idx = tmp1;
  const Register kdx = tmp2;
  const Register xstart = tmp3;

  const Register y_idx = tmp4;
  const Register carry = tmp5;
  const Register product = tmp6;
  const Register product_high = tmp7;
  const Register x_xstart = tmp8;
  const Register tmp = tmp9;

  // First Loop.
  //
  //  final static long LONG_MASK = 0xffffffffL;
  //  int xstart = xlen - 1;
  //  int ystart = ylen - 1;
  //  long carry = 0;
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
  //    z[kdx] = (int)product;
  //    carry = product >>> 32;
  //  }
  //  z[xstart] = (int)carry;

  mr_if_needed(idx, ylen);        // idx = ylen
  mr_if_needed(kdx, zlen);        // kdx = xlen + ylen
  li(carry, 0);                   // carry = 0

  Label L_done;

  addic_(xstart, xlen, -1);
  blt(CCR0, L_done);              // Nothing to do for an empty x.

  multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z,
                        carry, product_high, product, idx, kdx, tmp);

  Label L_second_loop;

  cmpdi(CCR0, kdx, 0);
  beq(CCR0, L_second_loop);

  Label L_carry;

  addic_(kdx, kdx, -1);
  beq(CCR0, L_carry);

  // Store lower 32 bits of carry.
  sldi(tmp, kdx, LogBytesPerInt);
  stwx(carry, z, tmp);
  srdi(carry, carry, 32);
  addi(kdx, kdx, -1);


  bind(L_carry);

  // Store upper 32 bits of carry.
  sldi(tmp, kdx, LogBytesPerInt);
  stwx(carry, z, tmp);

  // Second and third (nested) loops.
  //
  //  for (int i = xstart-1; i >= 0; i--) { // Second loop
  //    carry = 0;
  //    for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
  //      long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
  //                     (z[k] & LONG_MASK) + carry;
  //      z[k] = (int)product;
  //      carry = product >>> 32;
  //    }
  //    z[i] = (int)carry;
  //  }
  //
  //  i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx

  bind(L_second_loop);

  li(carry, 0);                   // carry = 0;

  addic_(xstart, xstart, -1);     // i = xstart-1;
  blt(CCR0, L_done);

  Register zsave = tmp10;         // Preserve z across the inner loop.

  mr(zsave, z);


  Label L_last_x;

  sldi(tmp, xstart, LogBytesPerInt);
  add(z, z, tmp);                 // z = z + k - j
  addi(z, z, 4);
  addic_(xstart, xstart, -1);     // i = xstart-1;
  blt(CCR0, L_last_x);            // Only one 32-bit digit of x remains.

  sldi(tmp, xstart, LogBytesPerInt);
  ldx(x_xstart, x, tmp);
#ifdef VM_LITTLE_ENDIAN
  // Swap 32-bit halves so the digit pair matches array (big-endian) order.
  rldicl(x_xstart, x_xstart, 32, 0);
#endif


  Label L_third_loop_prologue;

  bind(L_third_loop_prologue);

  // Preserve x, xstart and ylen; the inner loop clobbers their registers.
  Register xsave = tmp11;
  Register xlensave = tmp12;
  Register ylensave = tmp13;

  mr(xsave, x);
  mr(xlensave, xstart);
  mr(ylensave, ylen);


  multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen,
                          carry, product_high, product, x, tmp);

  mr(z, zsave);
  mr(x, xsave);
  mr(xlen, xlensave);   // This is the decrement of the loop counter!
  mr(ylen, ylensave);

  addi(tmp3, xlen, 1);
  sldi(tmp, tmp3, LogBytesPerInt);
  stwx(carry, z, tmp);
  addic_(tmp3, tmp3, -1);
  blt(CCR0, L_done);

  srdi(carry, carry, 32);
  sldi(tmp, tmp3, LogBytesPerInt);
  stwx(carry, z, tmp);
  b(L_second_loop);

  // Next infrequent code is moved outside loops.
  bind(L_last_x);

  lwz(x_xstart, 0, x);
  b(L_third_loop_prologue);

  bind(L_done);
}   // multiply_to_len
5446 
5447 void MacroAssembler::asm_assert(bool check_equal, const char *msg, int id) {
5448 #ifdef ASSERT
5449   Label ok;
5450   if (check_equal) {
5451     beq(CCR0, ok);
5452   } else {
5453     bne(CCR0, ok);
5454   }
5455   stop(msg, id);
5456   bind(ok);
5457 #endif
5458 }
5459 
5460 void MacroAssembler::asm_assert_mems_zero(bool check_equal, int size, int mem_offset,
5461                                           Register mem_base, const char* msg, int id) {
5462 #ifdef ASSERT
5463   switch (size) {
5464     case 4:
5465       lwz(R0, mem_offset, mem_base);
5466       cmpwi(CCR0, R0, 0);
5467       break;
5468     case 8:
5469       ld(R0, mem_offset, mem_base);
5470       cmpdi(CCR0, R0, 0);
5471       break;
5472     default:
5473       ShouldNotReachHere();
5474   }
5475   asm_assert(check_equal, msg, id);
5476 #endif // ASSERT
5477 }
5478 
5479 void MacroAssembler::verify_thread() {
5480   if (VerifyThread) {
5481     unimplemented("'VerifyThread' currently not implemented on PPC");
5482   }
5483 }
5484 
5485 // READ: oop. KILL: R0. Volatile floats perhaps.
5486 void MacroAssembler::verify_oop(Register oop, const char* msg) {
5487   if (!VerifyOops) {
5488     return;
5489   }
5490 
5491   address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
5492   const Register tmp = R11; // Will be preserved.
5493   const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
5494   save_volatile_gprs(R1_SP, -nbytes_save); // except R0
5495 
5496   mr_if_needed(R4_ARG2, oop);
5497   save_LR_CR(tmp); // save in old frame
5498   push_frame_reg_args(nbytes_save, tmp);
5499   // load FunctionDescriptor** / entry_address *
5500   load_const_optimized(tmp, fd, R0);
5501   // load FunctionDescriptor* / entry_address
5502   ld(tmp, 0, tmp);
5503   load_const_optimized(R3_ARG1, (address)msg, R0);
5504   // Call destination for its side effect.
5505   call_c(tmp);
5506 
5507   pop_frame();
5508   restore_LR_CR(tmp);
5509   restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
5510 }
5511 
5512 void MacroAssembler::verify_oop_addr(RegisterOrConstant offs, Register base, const char* msg) {
5513   if (!VerifyOops) {
5514     return;
5515   }
5516 
5517   address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
5518   const Register tmp = R11; // Will be preserved.
5519   const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
5520   save_volatile_gprs(R1_SP, -nbytes_save); // except R0
5521 
5522   ld(R4_ARG2, offs, base);
5523   save_LR_CR(tmp); // save in old frame
5524   push_frame_reg_args(nbytes_save, tmp);
5525   // load FunctionDescriptor** / entry_address *
5526   load_const_optimized(tmp, fd, R0);
5527   // load FunctionDescriptor* / entry_address
5528   ld(tmp, 0, tmp);
5529   load_const_optimized(R3_ARG1, (address)msg, R0);
5530   // Call destination for its side effect.
5531   call_c(tmp);
5532 
5533   pop_frame();
5534   restore_LR_CR(tmp);
5535   restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
5536 }
5537 
// Human-readable names for the stop types passed to MacroAssembler::stop();
// indexed by type (modulo the table length) in stop_on_request().
const char* stop_types[] = {
  "stop",
  "untested",
  "unimplemented",
  "shouldnotreachhere"
};
5544 
5545 static void stop_on_request(int tp, const char* msg) {
5546   tty->print("PPC assembly code requires stop: (%s) %s\n", stop_types[tp%/*stop_end*/4], msg);
5547   guarantee(false, "PPC assembly code requires stop: %s", msg);
5548 }
5549 
// Call a C-function that prints output.
// Emits a call to stop_on_request(type, msg), followed by an illegal
// instruction and the 32-bit 'id' embedded in the code stream so the
// stop site can be identified when debugging the generated code.
void MacroAssembler::stop(int type, const char* msg, int id) {
#ifndef PRODUCT
  block_comment(err_msg("stop: %s %s {", stop_types[type%stop_end], msg));
#else
  block_comment("stop {");
#endif

  // setup arguments
  load_const_optimized(R3_ARG1, type);
  load_const_optimized(R4_ARG2, (void *)msg, /*tmp=*/R0);
  call_VM_leaf(CAST_FROM_FN_PTR(address, stop_on_request), R3_ARG1, R4_ARG2);
  illtrap();      // Trap in case the VM call ever returns.
  emit_int32(id); // Marker word, not executed.
  block_comment("} stop;");
}
5566 
5567 #ifndef PRODUCT
5568 // Write pattern 0x0101010101010101 in memory region [low-before, high+after].
5569 // Val, addr are temp registers.
5570 // If low == addr, addr is killed.
5571 // High is preserved.
5572 void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) {
5573   if (!ZapMemory) return;
5574 
5575   assert_different_registers(low, val);
5576 
5577   BLOCK_COMMENT("zap memory region {");
5578   load_const_optimized(val, 0x0101010101010101);
5579   int size = before + after;
5580   if (low == high && size < 5 && size > 0) {
5581     int offset = -before*BytesPerWord;
5582     for (int i = 0; i < size; ++i) {
5583       std(val, offset, low);
5584       offset += (1*BytesPerWord);
5585     }
5586   } else {
5587     addi(addr, low, -before*BytesPerWord);
5588     assert_different_registers(high, val);
5589     if (after) addi(high, high, after * BytesPerWord);
5590     Label loop;
5591     bind(loop);
5592     std(val, 0, addr);
5593     addi(addr, addr, 8);
5594     cmpd(CCR6, addr, high);
5595     ble(CCR6, loop);
5596     if (after) addi(high, high, -after * BytesPerWord);  // Correct back to old value.
5597   }
5598   BLOCK_COMMENT("} zap memory region");
5599 }
5600 
5601 #endif // !PRODUCT
5602 
5603 void SkipIfEqualZero::skip_to_label_if_equal_zero(MacroAssembler* masm, Register temp,
5604                                                   const bool* flag_addr, Label& label) {
5605   int simm16_offset = masm->load_const_optimized(temp, (address)flag_addr, R0, true);
5606   assert(sizeof(bool) == 1, "PowerPC ABI");
5607   masm->lbz(temp, simm16_offset, temp);
5608   masm->cmpwi(CCR0, temp, 0);
5609   masm->beq(CCR0, label);
5610 }
5611 
// Scoped guard: emits a branch over the enclosed code when *flag_addr is
// zero; the branch target is bound by the destructor. 'temp' is clobbered.
SkipIfEqualZero::SkipIfEqualZero(MacroAssembler* masm, Register temp, const bool* flag_addr) : _masm(masm), _label() {
  skip_to_label_if_equal_zero(masm, temp, flag_addr, _label);
}
5615 
// End of the guarded region: bind the label the constructor branched to.
SkipIfEqualZero::~SkipIfEqualZero() {
  _masm->bind(_label);
}