1 /*
   2  * Copyright (c) 1997, 2020, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2012, 2019, SAP SE. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.inline.hpp"
  28 #include "compiler/disassembler.hpp"
  29 #include "gc/shared/collectedHeap.inline.hpp"
  30 #include "gc/shared/barrierSet.hpp"
  31 #include "gc/shared/barrierSetAssembler.hpp"
  32 #include "interpreter/interpreter.hpp"
  33 #include "memory/resourceArea.hpp"
  34 #include "nativeInst_ppc.hpp"
  35 #include "oops/klass.inline.hpp"
  36 #include "oops/methodData.hpp"
  37 #include "prims/methodHandles.hpp"
  38 #include "runtime/biasedLocking.hpp"
  39 #include "runtime/icache.hpp"
  40 #include "runtime/interfaceSupport.inline.hpp"
  41 #include "runtime/objectMonitor.hpp"
  42 #include "runtime/os.hpp"
  43 #include "runtime/safepoint.hpp"
  44 #include "runtime/safepointMechanism.hpp"
  45 #include "runtime/sharedRuntime.hpp"
  46 #include "runtime/stubRoutines.hpp"
  47 #include "utilities/macros.hpp"
  48 #include "utilities/powerOfTwo.hpp"
  49 
  50 #ifdef PRODUCT
  51 #define BLOCK_COMMENT(str) // nothing
  52 #else
  53 #define BLOCK_COMMENT(str) block_comment(str)
  54 #endif
  55 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  56 
  57 #ifdef ASSERT
  58 // On RISC, there's no benefit to verifying instruction boundaries.
  59 bool AbstractAssembler::pd_check_instruction_mark() { return false; }
  60 #endif
  61 
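      // Load a doubleword from a[si31], where si31 is a non-negative 31-bit offset.
      // If the offset fits into 16 bits, a single ld (plus an optional filler nop)
      // is emitted; otherwise an addis/ld pair is used.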
  62 void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) {
  63   assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range");
  64   if (Assembler::is_simm(si31, 16)) {
  65     ld(d, si31, a);
  66     if (emit_filler_nop) nop();
  67   } else {
  68     const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31);
  69     const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31);
  70     addis(d, a, hi);
  71     ld(d, lo, d);
  72   }
  73 }
  74 
  75 void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) {
  76   assert_different_registers(d, a);
  77   ld_largeoffset_unchecked(d, si31, a, emit_filler_nop);
  78 }
  79 
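      // Load a 1, 2, 4 or 8 byte value and sign- or zero-extend it to 64 bits.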
  80 void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base,
  81                                       size_t size_in_bytes, bool is_signed) {
  82   switch (size_in_bytes) {
  83   case  8:              ld(dst, offs, base);                         break;
  84   case  4:  is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break;
  85   case  2:  is_signed ? lha(dst, offs, base) : lhz(dst, offs, base); break;
  86   case  1:  lbz(dst, offs, base); if (is_signed) extsb(dst, dst);    break; // lba doesn't exist :(
  87   default:  ShouldNotReachHere();
  88   }
  89 }
  90 
  91 void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base,
  92                                        size_t size_in_bytes) {
  93   switch (size_in_bytes) {
  94   case  8:  std(dst, offs, base); break;
  95   case  4:  stw(dst, offs, base); break;
  96   case  2:  sth(dst, offs, base); break;
  97   case  1:  stb(dst, offs, base); break;
  98   default:  ShouldNotReachHere();
  99   }
 100 }
 101 
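      // Align offset() to 'modulus' with remainder 'rem' by emitting nops,
      // but only if at most 'max' bytes of padding are required.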
 102 void MacroAssembler::align(int modulus, int max, int rem) {
 103   int padding = (rem + modulus - (offset() % modulus)) % modulus;
 104   if (padding > max) return;
 105   for (int c = (padding >> 2); c > 0; --c) { nop(); }
 106 }
 107 
  108 // Issue instructions that calculate the given address from the global TOC.
 109 void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16,
 110                                                        bool add_relocation, bool emit_dummy_addr) {
 111   int offset = -1;
 112   if (emit_dummy_addr) {
 113     offset = -128; // dummy address
 114   } else if (addr != (address)(intptr_t)-1) {
 115     offset = MacroAssembler::offset_to_global_toc(addr);
 116   }
 117 
 118   if (hi16) {
 119     addis(dst, R29_TOC, MacroAssembler::largeoffset_si16_si16_hi(offset));
 120   }
 121   if (lo16) {
 122     if (add_relocation) {
 123       // Relocate at the addi to avoid confusion with a load from the method's TOC.
 124       relocate(internal_word_Relocation::spec(addr));
 125     }
 126     addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset));
 127   }
 128 }
 129 
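      // Patch the addis/addi sequence emitted by calculate_address_from_global_toc
      // so that it materializes 'addr'. 'a' points to the addi; the matching addis is
      // searched backwards, but not below 'bound'. Returns the address of the addis.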
 130 address MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) {
 131   const int offset = MacroAssembler::offset_to_global_toc(addr);
 132 
 133   const address inst2_addr = a;
 134   const int inst2 = *(int *)inst2_addr;
 135 
 136   // The relocation points to the second instruction, the addi,
 137   // and the addi reads and writes the same register dst.
 138   const int dst = inv_rt_field(inst2);
 139   assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");
 140 
 141   // Now, find the preceding addis which writes to dst.
 142   int inst1 = 0;
 143   address inst1_addr = inst2_addr - BytesPerInstWord;
 144   while (inst1_addr >= bound) {
 145     inst1 = *(int *) inst1_addr;
 146     if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
 147       // Stop, found the addis which writes dst.
 148       break;
 149     }
 150     inst1_addr -= BytesPerInstWord;
 151   }
 152 
 153   assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
 154   set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset));
 155   set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset));
 156   return inst1_addr;
 157 }
 158 
 159 address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) {
 160   const address inst2_addr = a;
 161   const int inst2 = *(int *)inst2_addr;
 162 
 163   // The relocation points to the second instruction, the addi,
 164   // and the addi reads and writes the same register dst.
 165   const int dst = inv_rt_field(inst2);
 166   assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");
 167 
 168   // Now, find the preceding addis which writes to dst.
 169   int inst1 = 0;
 170   address inst1_addr = inst2_addr - BytesPerInstWord;
 171   while (inst1_addr >= bound) {
 172     inst1 = *(int *) inst1_addr;
 173     if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
 174       // stop, found the addis which writes dst
 175       break;
 176     }
 177     inst1_addr -= BytesPerInstWord;
 178   }
 179 
 180   assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
 181 
 182   int offset = (get_imm(inst1_addr, 0) << 16) + get_imm(inst2_addr, 0);
 183   // -1 is a special case
 184   if (offset == -1) {
 185     return (address)(intptr_t)-1;
 186   } else {
 187     return global_toc() + offset;
 188   }
 189 }
 190 
 191 #ifdef _LP64
 192 // Patch compressed oops or klass constants.
 193 // Assembler sequence is
 194 // 1) compressed oops:
 195 //    lis  rx = const.hi
 196 //    ori rx = rx | const.lo
 197 // 2) compressed klass:
 198 //    lis  rx = const.hi
 199 //    clrldi rx = rx & 0xFFFFffff // clearMS32b, optional
 200 //    ori rx = rx | const.lo
  201 // The clrldi, if present, is skipped over when patching.
 202 address MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) {
 203   assert(UseCompressedOops, "Should only patch compressed oops");
 204 
 205   const address inst2_addr = a;
 206   const int inst2 = *(int *)inst2_addr;
 207 
 208   // The relocation points to the second instruction, the ori,
 209   // and the ori reads and writes the same register dst.
 210   const int dst = inv_rta_field(inst2);
 211   assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
 212   // Now, find the preceding addis which writes to dst.
 213   int inst1 = 0;
 214   address inst1_addr = inst2_addr - BytesPerInstWord;
 215   bool inst1_found = false;
 216   while (inst1_addr >= bound) {
 217     inst1 = *(int *)inst1_addr;
 218     if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
 219     inst1_addr -= BytesPerInstWord;
 220   }
 221   assert(inst1_found, "inst is not lis");
 222 
 223   int xc = (data >> 16) & 0xffff;
 224   int xd = (data >>  0) & 0xffff;
 225 
 226   set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo
 227   set_imm((int *)inst2_addr,        (xd)); // unsigned int
 228   return inst1_addr;
 229 }
 230 
 231 // Get compressed oop or klass constant.
 232 narrowOop MacroAssembler::get_narrow_oop(address a, address bound) {
 233   assert(UseCompressedOops, "Should only patch compressed oops");
 234 
 235   const address inst2_addr = a;
 236   const int inst2 = *(int *)inst2_addr;
 237 
 238   // The relocation points to the second instruction, the ori,
 239   // and the ori reads and writes the same register dst.
 240   const int dst = inv_rta_field(inst2);
 241   assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
 242   // Now, find the preceding lis which writes to dst.
 243   int inst1 = 0;
 244   address inst1_addr = inst2_addr - BytesPerInstWord;
 245   bool inst1_found = false;
 246 
 247   while (inst1_addr >= bound) {
 248     inst1 = *(int *) inst1_addr;
 249     if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break;}
 250     inst1_addr -= BytesPerInstWord;
 251   }
 252   assert(inst1_found, "inst is not lis");
 253 
 254   uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff));
 255   uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16);
 256 
 257   return (int) (xl | xh);
 258 }
 259 #endif // _LP64
 260 
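      // Emit a load of the constant a.value() from the method's TOC (constant pool)
      // into 'dst'. The load is relocated with a.rspec().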
 261 // Returns true if successful.
 262 bool MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a,
 263                                                 Register toc, bool fixed_size) {
 264   int toc_offset = 0;
 265   // Use RelocationHolder::none for the constant pool entry, otherwise
 266   // we will end up with a failing NativeCall::verify(x) where x is
 267   // the address of the constant pool entry.
 268   // FIXME: We should insert relocation information for oops at the constant
 269   // pool entries instead of inserting it at the loads; patching of a constant
 270   // pool entry should be less expensive.
 271   address const_address = address_constant((address)a.value(), RelocationHolder::none);
 272   if (const_address == NULL) { return false; } // allocation failure
 273   // Relocate at the pc of the load.
 274   relocate(a.rspec());
 275   toc_offset = (int)(const_address - code()->consts()->start());
 276   ld_largeoffset_unchecked(dst, toc_offset, toc, fixed_size);
 277   return true;
 278 }
 279 
 280 bool MacroAssembler::is_load_const_from_method_toc_at(address a) {
 281   const address inst1_addr = a;
 282   const int inst1 = *(int *)inst1_addr;
 283 
  284   // The relocation points to the ld or the addis.
  285   return (is_ld(inst1)) ||
  286          (is_addis(inst1) && inv_ra_field(inst1) != 0);
 287 }
 288 
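      // Return the TOC offset encoded in a load_const_from_method_toc sequence,
      // either from a single ld or from an addis/ld pair.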
 289 int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) {
 290   assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc");
 291 
 292   const address inst1_addr = a;
 293   const int inst1 = *(int *)inst1_addr;
 294 
 295   if (is_ld(inst1)) {
 296     return inv_d1_field(inst1);
 297   } else if (is_addis(inst1)) {
 298     const int dst = inv_rt_field(inst1);
 299 
 300     // Now, find the succeeding ld which reads and writes to dst.
 301     address inst2_addr = inst1_addr + BytesPerInstWord;
 302     int inst2 = 0;
 303     while (true) {
 304       inst2 = *(int *) inst2_addr;
 305       if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) {
 306         // Stop, found the ld which reads and writes dst.
 307         break;
 308       }
 309       inst2_addr += BytesPerInstWord;
 310     }
 311     return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2);
 312   }
 313   ShouldNotReachHere();
 314   return 0;
 315 }
 316 
 317 // Get the constant from a `load_const' sequence.
 318 long MacroAssembler::get_const(address a) {
 319   assert(is_load_const_at(a), "not a load of a constant");
 320   const int *p = (const int*) a;
 321   unsigned long x = (((unsigned long) (get_imm(a,0) & 0xffff)) << 48);
 322   if (is_ori(*(p+1))) {
 323     x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 32);
 324     x |= (((unsigned long) (get_imm(a,3) & 0xffff)) << 16);
 325     x |= (((unsigned long) (get_imm(a,4) & 0xffff)));
 326   } else if (is_lis(*(p+1))) {
 327     x |= (((unsigned long) (get_imm(a,2) & 0xffff)) << 32);
 328     x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 16);
 329     x |= (((unsigned long) (get_imm(a,3) & 0xffff)));
 330   } else {
 331     ShouldNotReachHere();
 332     return (long) 0;
 333   }
 334   return (long) x;
 335 }
 336 
  337 // Patch the 64-bit constant of a `load_const' sequence. This is a
  338 // low-level procedure. It neither flushes the instruction cache nor
  339 // is it MT-safe.
 340 void MacroAssembler::patch_const(address a, long x) {
 341   assert(is_load_const_at(a), "not a load of a constant");
 342   int *p = (int*) a;
 343   if (is_ori(*(p+1))) {
 344     set_imm(0 + p, (x >> 48) & 0xffff);
 345     set_imm(1 + p, (x >> 32) & 0xffff);
 346     set_imm(3 + p, (x >> 16) & 0xffff);
 347     set_imm(4 + p, x & 0xffff);
 348   } else if (is_lis(*(p+1))) {
 349     set_imm(0 + p, (x >> 48) & 0xffff);
 350     set_imm(2 + p, (x >> 32) & 0xffff);
 351     set_imm(1 + p, (x >> 16) & 0xffff);
 352     set_imm(3 + p, x & 0xffff);
 353   } else {
 354     ShouldNotReachHere();
 355   }
 356 }
 357 
 358 AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) {
 359   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
 360   int index = oop_recorder()->allocate_metadata_index(obj);
 361   RelocationHolder rspec = metadata_Relocation::spec(index);
 362   return AddressLiteral((address)obj, rspec);
 363 }
 364 
 365 AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) {
 366   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
 367   int index = oop_recorder()->find_index(obj);
 368   RelocationHolder rspec = metadata_Relocation::spec(index);
 369   return AddressLiteral((address)obj, rspec);
 370 }
 371 
 372 AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) {
 373   assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
 374   int oop_index = oop_recorder()->allocate_oop_index(obj);
 375   return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
 376 }
 377 
 378 AddressLiteral MacroAssembler::constant_oop_address(jobject obj) {
 379   assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
 380   int oop_index = oop_recorder()->find_index(obj);
 381   return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
 382 }
 383 
 384 RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
 385                                                       Register tmp, int offset) {
 386   intptr_t value = *delayed_value_addr;
 387   if (value != 0) {
 388     return RegisterOrConstant(value + offset);
 389   }
 390 
 391   // Load indirectly to solve generation ordering problem.
 392   // static address, no relocation
 393   int simm16_offset = load_const_optimized(tmp, delayed_value_addr, noreg, true);
 394   ld(tmp, simm16_offset, tmp); // must be aligned ((xa & 3) == 0)
 395 
 396   if (offset != 0) {
 397     addi(tmp, tmp, offset);
 398   }
 399 
 400   return RegisterOrConstant(tmp);
 401 }
 402 
 403 #ifndef PRODUCT
 404 void MacroAssembler::pd_print_patched_instruction(address branch) {
 405   Unimplemented(); // TODO: PPC port
 406 }
 407 #endif // ndef PRODUCT
 408 
 409 // Conditional far branch for destinations encodable in 24+2 bits.
 410 void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) {
 411 
 412   // If requested by flag optimize, relocate the bc_far as a
 413   // runtime_call and prepare for optimizing it when the code gets
 414   // relocated.
 415   if (optimize == bc_far_optimize_on_relocate) {
 416     relocate(relocInfo::runtime_call_type);
 417   }
 418 
 419   // variant 2:
 420   //
 421   //    b!cxx SKIP
 422   //    bxx   DEST
 423   //  SKIP:
 424   //
 425 
 426   const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
 427                                                 opposite_bcond(inv_boint_bcond(boint)));
 428 
 429   // We emit two branches.
 430   // First, a conditional branch which jumps around the far branch.
 431   const address not_taken_pc = pc() + 2 * BytesPerInstWord;
 432   const address bc_pc        = pc();
 433   bc(opposite_boint, biint, not_taken_pc);
 434 
 435   const int bc_instr = *(int*)bc_pc;
 436   assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition");
 437   assert(opposite_boint == inv_bo_field(bc_instr), "postcondition");
 438   assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))),
 439                                      opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))),
 440          "postcondition");
 441   assert(biint == inv_bi_field(bc_instr), "postcondition");
 442 
 443   // Second, an unconditional far branch which jumps to dest.
 444   // Note: target(dest) remembers the current pc (see CodeSection::target)
 445   //       and returns the current pc if the label is not bound yet; when
 446   //       the label gets bound, the unconditional far branch will be patched.
 447   const address target_pc = target(dest);
 448   const address b_pc  = pc();
 449   b(target_pc);
 450 
 451   assert(not_taken_pc == pc(),                     "postcondition");
 452   assert(dest.is_bound() || target_pc == b_pc, "postcondition");
 453 }
 454 
  455 // Emits 1 or 2 instructions: a short bcxx if the bound destination is in range, otherwise a bc_far.
 456 void MacroAssembler::bc_far_optimized(int boint, int biint, Label& dest) {
 457   if (dest.is_bound() && is_within_range_of_bcxx(target(dest), pc())) {
 458     bc(boint, biint, dest);
 459   } else {
 460     bc_far(boint, biint, dest, MacroAssembler::bc_far_optimize_on_relocate);
 461   }
 462 }
 463 
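      // A bc_far sequence occupies two instructions and comes in three variants:
      //   variant 1:  bcxx  DEST; nop
      //   variant 2:  b!cxx SKIP; bxx DEST; SKIP:
      //   variant 3:  nop; endgroup   (branch to the next instruction, patched away)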
 464 bool MacroAssembler::is_bc_far_at(address instruction_addr) {
 465   return is_bc_far_variant1_at(instruction_addr) ||
 466          is_bc_far_variant2_at(instruction_addr) ||
 467          is_bc_far_variant3_at(instruction_addr);
 468 }
 469 
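      // Return the branch destination encoded in a bc_far sequence.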
 470 address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) {
 471   if (is_bc_far_variant1_at(instruction_addr)) {
 472     const address instruction_1_addr = instruction_addr;
 473     const int instruction_1 = *(int*)instruction_1_addr;
 474     return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr);
 475   } else if (is_bc_far_variant2_at(instruction_addr)) {
 476     const address instruction_2_addr = instruction_addr + 4;
 477     return bxx_destination(instruction_2_addr);
 478   } else if (is_bc_far_variant3_at(instruction_addr)) {
 479     return instruction_addr + 8;
 480   }
 481   // variant 4 ???
 482   ShouldNotReachHere();
 483   return NULL;
 484 }
 485 void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) {
 486 
 487   if (is_bc_far_variant3_at(instruction_addr)) {
 488     // variant 3, far cond branch to the next instruction, already patched to nops:
 489     //
 490     //    nop
 491     //    endgroup
 492     //  SKIP/DEST:
 493     //
 494     return;
 495   }
 496 
 497   // first, extract boint and biint from the current branch
 498   int boint = 0;
 499   int biint = 0;
 500 
 501   ResourceMark rm;
 502   const int code_size = 2 * BytesPerInstWord;
 503   CodeBuffer buf(instruction_addr, code_size);
 504   MacroAssembler masm(&buf);
 505   if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) {
 506     // Far branch to next instruction: Optimize it by patching nops (produce variant 3).
 507     masm.nop();
 508     masm.endgroup();
 509   } else {
 510     if (is_bc_far_variant1_at(instruction_addr)) {
 511       // variant 1, the 1st instruction contains the destination address:
 512       //
 513       //    bcxx  DEST
 514       //    nop
 515       //
 516       const int instruction_1 = *(int*)(instruction_addr);
 517       boint = inv_bo_field(instruction_1);
 518       biint = inv_bi_field(instruction_1);
 519     } else if (is_bc_far_variant2_at(instruction_addr)) {
 520       // variant 2, the 2nd instruction contains the destination address:
 521       //
 522       //    b!cxx SKIP
 523       //    bxx   DEST
 524       //  SKIP:
 525       //
 526       const int instruction_1 = *(int*)(instruction_addr);
 527       boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))),
 528           opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1))));
 529       biint = inv_bi_field(instruction_1);
 530     } else {
 531       // variant 4???
 532       ShouldNotReachHere();
 533     }
 534 
 535     // second, set the new branch destination and optimize the code
 536     if (dest != instruction_addr + 4 && // the bc_far is still unbound!
 537         masm.is_within_range_of_bcxx(dest, instruction_addr)) {
 538       // variant 1:
 539       //
 540       //    bcxx  DEST
 541       //    nop
 542       //
 543       masm.bc(boint, biint, dest);
 544       masm.nop();
 545     } else {
 546       // variant 2:
 547       //
 548       //    b!cxx SKIP
 549       //    bxx   DEST
 550       //  SKIP:
 551       //
 552       const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
 553                                                     opposite_bcond(inv_boint_bcond(boint)));
 554       const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord;
 555       masm.bc(opposite_boint, biint, not_taken_pc);
 556       masm.b(dest);
 557     }
 558   }
 559   ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
 560 }
 561 
  562 // Emit a patchable (but NOT MT-safe) 64-bit absolute call/jump.
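      // Both variants occupy 7 instructions (bxx64_patchable_size):
      //   variant 1b (TOC-relative): mr R0,R11; addis/addi R11 from R29_TOC; mtctr R11; mr R11,R0; nop; bctr[l]
      //   variant 2  (pc-relative):  b dest followed by 6 nops, or 6 nops followed by bl dest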
 563 void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) {
 564   // get current pc
 565   uint64_t start_pc = (uint64_t) pc();
 566 
 567   const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last
 568   const address pc_of_b  = (address) (start_pc + (0*BytesPerInstWord)); // b is first
 569 
 570   // relocate here
 571   if (rt != relocInfo::none) {
 572     relocate(rt);
 573   }
 574 
 575   if ( ReoptimizeCallSequences &&
 576        (( link && is_within_range_of_b(dest, pc_of_bl)) ||
 577         (!link && is_within_range_of_b(dest, pc_of_b)))) {
 578     // variant 2:
 579     // Emit an optimized, pc-relative call/jump.
 580 
 581     if (link) {
 582       // some padding
 583       nop();
 584       nop();
 585       nop();
 586       nop();
 587       nop();
 588       nop();
 589 
 590       // do the call
 591       assert(pc() == pc_of_bl, "just checking");
 592       bl(dest, relocInfo::none);
 593     } else {
 594       // do the jump
 595       assert(pc() == pc_of_b, "just checking");
 596       b(dest, relocInfo::none);
 597 
 598       // some padding
 599       nop();
 600       nop();
 601       nop();
 602       nop();
 603       nop();
 604       nop();
 605     }
 606 
 607     // Assert that we can identify the emitted call/jump.
 608     assert(is_bxx64_patchable_variant2_at((address)start_pc, link),
 609            "can't identify emitted call");
 610   } else {
 611     // variant 1:
 612     mr(R0, R11);  // spill R11 -> R0.
 613 
  614     // Calculate the destination address relative to the global TOC
  615     // and load it into CTR.
 616     calculate_address_from_global_toc(R11, dest, true, true, false);
 617 
 618     mtctr(R11);
 619     mr(R11, R0);  // spill R11 <- R0.
 620     nop();
 621 
 622     // do the call/jump
 623     if (link) {
 624       bctrl();
  625     } else {
 626       bctr();
 627     }
 628     // Assert that we can identify the emitted call/jump.
 629     assert(is_bxx64_patchable_variant1b_at((address)start_pc, link),
 630            "can't identify emitted call");
 631   }
 632 
 633   // Assert that we can identify the emitted call/jump.
 634   assert(is_bxx64_patchable_at((address)start_pc, link),
 635          "can't identify emitted call");
 636   assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest,
 637          "wrong encoding of dest address");
 638 }
 639 
 640 // Identify a bxx64_patchable instruction.
 641 bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) {
 642   return is_bxx64_patchable_variant1b_at(instruction_addr, link)
 643     //|| is_bxx64_patchable_variant1_at(instruction_addr, link)
 644       || is_bxx64_patchable_variant2_at(instruction_addr, link);
 645 }
 646 
  647 // Does the bxx64_patchable instruction use a pc-relative encoding of
 648 // the call destination?
 649 bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) {
 650   // variant 2 is pc-relative
 651   return is_bxx64_patchable_variant2_at(instruction_addr, link);
 652 }
 653 
 654 // Identify variant 1.
 655 bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) {
 656   unsigned int* instr = (unsigned int*) instruction_addr;
 657   return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
  658     && is_mtctr(instr[5]) // mtctr
 659     && is_load_const_at(instruction_addr);
 660 }
 661 
 662 // Identify variant 1b: load destination relative to global toc.
 663 bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) {
 664   unsigned int* instr = (unsigned int*) instruction_addr;
 665   return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
 666     && is_mtctr(instr[3]) // mtctr
 667     && is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr);
 668 }
 669 
 670 // Identify variant 2.
 671 bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) {
 672   unsigned int* instr = (unsigned int*) instruction_addr;
 673   if (link) {
 674     return is_bl (instr[6])  // bl dest is last
 675       && is_nop(instr[0])  // nop
 676       && is_nop(instr[1])  // nop
 677       && is_nop(instr[2])  // nop
 678       && is_nop(instr[3])  // nop
 679       && is_nop(instr[4])  // nop
 680       && is_nop(instr[5]); // nop
 681   } else {
 682     return is_b  (instr[0])  // b  dest is first
 683       && is_nop(instr[1])  // nop
 684       && is_nop(instr[2])  // nop
 685       && is_nop(instr[3])  // nop
 686       && is_nop(instr[4])  // nop
 687       && is_nop(instr[5])  // nop
 688       && is_nop(instr[6]); // nop
 689   }
 690 }
 691 
 692 // Set dest address of a bxx64_patchable instruction.
 693 void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) {
 694   ResourceMark rm;
 695   int code_size = MacroAssembler::bxx64_patchable_size;
 696   CodeBuffer buf(instruction_addr, code_size);
 697   MacroAssembler masm(&buf);
 698   masm.bxx64_patchable(dest, relocInfo::none, link);
 699   ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
 700 }
 701 
 702 // Get dest address of a bxx64_patchable instruction.
 703 address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) {
 704   if (is_bxx64_patchable_variant1_at(instruction_addr, link)) {
 705     return (address) (unsigned long) get_const(instruction_addr);
 706   } else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) {
 707     unsigned int* instr = (unsigned int*) instruction_addr;
 708     if (link) {
 709       const int instr_idx = 6; // bl is last
 710       int branchoffset = branch_destination(instr[instr_idx], 0);
 711       return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
 712     } else {
 713       const int instr_idx = 0; // b is first
 714       int branchoffset = branch_destination(instr[instr_idx], 0);
 715       return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
 716     }
 717   // Load dest relative to global toc.
 718   } else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) {
 719     return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord,
 720                                                                instruction_addr);
 721   } else {
 722     ShouldNotReachHere();
 723     return NULL;
 724   }
 725 }
 726 
 727 // Uses ordering which corresponds to ABI:
 728 //    _savegpr0_14:  std  r14,-144(r1)
 729 //    _savegpr0_15:  std  r15,-136(r1)
 730 //    _savegpr0_16:  std  r16,-128(r1)
 731 void MacroAssembler::save_nonvolatile_gprs(Register dst, int offset) {
 732   std(R14, offset, dst);   offset += 8;
 733   std(R15, offset, dst);   offset += 8;
 734   std(R16, offset, dst);   offset += 8;
 735   std(R17, offset, dst);   offset += 8;
 736   std(R18, offset, dst);   offset += 8;
 737   std(R19, offset, dst);   offset += 8;
 738   std(R20, offset, dst);   offset += 8;
 739   std(R21, offset, dst);   offset += 8;
 740   std(R22, offset, dst);   offset += 8;
 741   std(R23, offset, dst);   offset += 8;
 742   std(R24, offset, dst);   offset += 8;
 743   std(R25, offset, dst);   offset += 8;
 744   std(R26, offset, dst);   offset += 8;
 745   std(R27, offset, dst);   offset += 8;
 746   std(R28, offset, dst);   offset += 8;
 747   std(R29, offset, dst);   offset += 8;
 748   std(R30, offset, dst);   offset += 8;
 749   std(R31, offset, dst);   offset += 8;
 750 
 751   stfd(F14, offset, dst);   offset += 8;
 752   stfd(F15, offset, dst);   offset += 8;
 753   stfd(F16, offset, dst);   offset += 8;
 754   stfd(F17, offset, dst);   offset += 8;
 755   stfd(F18, offset, dst);   offset += 8;
 756   stfd(F19, offset, dst);   offset += 8;
 757   stfd(F20, offset, dst);   offset += 8;
 758   stfd(F21, offset, dst);   offset += 8;
 759   stfd(F22, offset, dst);   offset += 8;
 760   stfd(F23, offset, dst);   offset += 8;
 761   stfd(F24, offset, dst);   offset += 8;
 762   stfd(F25, offset, dst);   offset += 8;
 763   stfd(F26, offset, dst);   offset += 8;
 764   stfd(F27, offset, dst);   offset += 8;
 765   stfd(F28, offset, dst);   offset += 8;
 766   stfd(F29, offset, dst);   offset += 8;
 767   stfd(F30, offset, dst);   offset += 8;
 768   stfd(F31, offset, dst);
 769 }
 770 
 771 // Uses ordering which corresponds to ABI:
 772 //    _restgpr0_14:  ld   r14,-144(r1)
 773 //    _restgpr0_15:  ld   r15,-136(r1)
 774 //    _restgpr0_16:  ld   r16,-128(r1)
 775 void MacroAssembler::restore_nonvolatile_gprs(Register src, int offset) {
 776   ld(R14, offset, src);   offset += 8;
 777   ld(R15, offset, src);   offset += 8;
 778   ld(R16, offset, src);   offset += 8;
 779   ld(R17, offset, src);   offset += 8;
 780   ld(R18, offset, src);   offset += 8;
 781   ld(R19, offset, src);   offset += 8;
 782   ld(R20, offset, src);   offset += 8;
 783   ld(R21, offset, src);   offset += 8;
 784   ld(R22, offset, src);   offset += 8;
 785   ld(R23, offset, src);   offset += 8;
 786   ld(R24, offset, src);   offset += 8;
 787   ld(R25, offset, src);   offset += 8;
 788   ld(R26, offset, src);   offset += 8;
 789   ld(R27, offset, src);   offset += 8;
 790   ld(R28, offset, src);   offset += 8;
 791   ld(R29, offset, src);   offset += 8;
 792   ld(R30, offset, src);   offset += 8;
 793   ld(R31, offset, src);   offset += 8;
 794 
 795   // FP registers
 796   lfd(F14, offset, src);   offset += 8;
 797   lfd(F15, offset, src);   offset += 8;
 798   lfd(F16, offset, src);   offset += 8;
 799   lfd(F17, offset, src);   offset += 8;
 800   lfd(F18, offset, src);   offset += 8;
 801   lfd(F19, offset, src);   offset += 8;
 802   lfd(F20, offset, src);   offset += 8;
 803   lfd(F21, offset, src);   offset += 8;
 804   lfd(F22, offset, src);   offset += 8;
 805   lfd(F23, offset, src);   offset += 8;
 806   lfd(F24, offset, src);   offset += 8;
 807   lfd(F25, offset, src);   offset += 8;
 808   lfd(F26, offset, src);   offset += 8;
 809   lfd(F27, offset, src);   offset += 8;
 810   lfd(F28, offset, src);   offset += 8;
 811   lfd(F29, offset, src);   offset += 8;
 812   lfd(F30, offset, src);   offset += 8;
 813   lfd(F31, offset, src);
 814 }
 815 
 816 // For verify_oops.
 817 void MacroAssembler::save_volatile_gprs(Register dst, int offset) {
 818   std(R2,  offset, dst);   offset += 8;
 819   std(R3,  offset, dst);   offset += 8;
 820   std(R4,  offset, dst);   offset += 8;
 821   std(R5,  offset, dst);   offset += 8;
 822   std(R6,  offset, dst);   offset += 8;
 823   std(R7,  offset, dst);   offset += 8;
 824   std(R8,  offset, dst);   offset += 8;
 825   std(R9,  offset, dst);   offset += 8;
 826   std(R10, offset, dst);   offset += 8;
 827   std(R11, offset, dst);   offset += 8;
 828   std(R12, offset, dst);   offset += 8;
 829 
 830   stfd(F0, offset, dst);   offset += 8;
 831   stfd(F1, offset, dst);   offset += 8;
 832   stfd(F2, offset, dst);   offset += 8;
 833   stfd(F3, offset, dst);   offset += 8;
 834   stfd(F4, offset, dst);   offset += 8;
 835   stfd(F5, offset, dst);   offset += 8;
 836   stfd(F6, offset, dst);   offset += 8;
 837   stfd(F7, offset, dst);   offset += 8;
 838   stfd(F8, offset, dst);   offset += 8;
 839   stfd(F9, offset, dst);   offset += 8;
 840   stfd(F10, offset, dst);  offset += 8;
 841   stfd(F11, offset, dst);  offset += 8;
 842   stfd(F12, offset, dst);  offset += 8;
 843   stfd(F13, offset, dst);
 844 }
 845 
 846 // For verify_oops.
 847 void MacroAssembler::restore_volatile_gprs(Register src, int offset) {
 848   ld(R2,  offset, src);   offset += 8;
 849   ld(R3,  offset, src);   offset += 8;
 850   ld(R4,  offset, src);   offset += 8;
 851   ld(R5,  offset, src);   offset += 8;
 852   ld(R6,  offset, src);   offset += 8;
 853   ld(R7,  offset, src);   offset += 8;
 854   ld(R8,  offset, src);   offset += 8;
 855   ld(R9,  offset, src);   offset += 8;
 856   ld(R10, offset, src);   offset += 8;
 857   ld(R11, offset, src);   offset += 8;
 858   ld(R12, offset, src);   offset += 8;
 859 
 860   lfd(F0, offset, src);   offset += 8;
 861   lfd(F1, offset, src);   offset += 8;
 862   lfd(F2, offset, src);   offset += 8;
 863   lfd(F3, offset, src);   offset += 8;
 864   lfd(F4, offset, src);   offset += 8;
 865   lfd(F5, offset, src);   offset += 8;
 866   lfd(F6, offset, src);   offset += 8;
 867   lfd(F7, offset, src);   offset += 8;
 868   lfd(F8, offset, src);   offset += 8;
 869   lfd(F9, offset, src);   offset += 8;
 870   lfd(F10, offset, src);  offset += 8;
 871   lfd(F11, offset, src);  offset += 8;
 872   lfd(F12, offset, src);  offset += 8;
 873   lfd(F13, offset, src);
 874 }
 875 
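      // Save CR and LR to their ABI save slots relative to R1_SP.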
 876 void MacroAssembler::save_LR_CR(Register tmp) {
 877   mfcr(tmp);
 878   std(tmp, _abi(cr), R1_SP);
 879   mflr(tmp);
 880   std(tmp, _abi(lr), R1_SP);
 881   // Tmp must contain lr on exit! (see return_addr and prolog in ppc64.ad)
 882 }
 883 
 884 void MacroAssembler::restore_LR_CR(Register tmp) {
 885   assert(tmp != R1_SP, "must be distinct");
 886   ld(tmp, _abi(lr), R1_SP);
 887   mtlr(tmp);
 888   ld(tmp, _abi(cr), R1_SP);
 889   mtcr(tmp);
 890 }
 891 
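      // Get the current PC into 'result' via a bl to the next instruction; clobbers LR.
      // Returns the PC that ends up in LR.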
 892 address MacroAssembler::get_PC_trash_LR(Register result) {
 893   Label L;
 894   bl(L);
 895   bind(L);
 896   address lr_pc = pc();
 897   mflr(result);
 898   return lr_pc;
 899 }
 900 
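      // Resize the current frame by 'offset' bytes while preserving the back link
      // (caller's SP) at the new SP. 'offset' must be frame-aligned.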
 901 void MacroAssembler::resize_frame(Register offset, Register tmp) {
 902 #ifdef ASSERT
 903   assert_different_registers(offset, tmp, R1_SP);
 904   andi_(tmp, offset, frame::alignment_in_bytes-1);
 905   asm_assert_eq("resize_frame: unaligned");
 906 #endif
 907 
 908   // tmp <- *(SP)
 909   ld(tmp, _abi(callers_sp), R1_SP);
 910   // addr <- SP + offset;
 911   // *(addr) <- tmp;
 912   // SP <- addr
 913   stdux(tmp, R1_SP, offset);
 914 }
 915 
 916 void MacroAssembler::resize_frame(int offset, Register tmp) {
 917   assert(is_simm(offset, 16), "too big an offset");
 918   assert_different_registers(tmp, R1_SP);
 919   assert((offset & (frame::alignment_in_bytes-1))==0, "resize_frame: unaligned");
 920   // tmp <- *(SP)
 921   ld(tmp, _abi(callers_sp), R1_SP);
 922   // addr <- SP + offset;
 923   // *(addr) <- tmp;
 924   // SP <- addr
 925   stdu(tmp, offset, R1_SP);
 926 }
 927 
 928 void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) {
 929   // (addr == tmp1) || (addr == tmp2) is allowed here!
 930   assert(tmp1 != tmp2, "must be distinct");
 931 
  932   // Compute the offset w.r.t. the current stack pointer:
  933   // tmp1 <- addr - SP (!)
 934   subf(tmp1, R1_SP, addr);
 935 
 936   // atomically update SP keeping back link.
 937   resize_frame(tmp1/* offset */, tmp2/* tmp */);
 938 }
 939 
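      // Push a frame of 'bytes' bytes (must be frame-aligned), storing the old SP
      // as back link at the new SP.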
 940 void MacroAssembler::push_frame(Register bytes, Register tmp) {
 941 #ifdef ASSERT
 942   assert(bytes != R0, "r0 not allowed here");
 943   andi_(R0, bytes, frame::alignment_in_bytes-1);
 944   asm_assert_eq("push_frame(Reg, Reg): unaligned");
 945 #endif
 946   neg(tmp, bytes);
 947   stdux(R1_SP, R1_SP, tmp);
 948 }
 949 
 950 // Push a frame of size `bytes'.
 951 void MacroAssembler::push_frame(unsigned int bytes, Register tmp) {
 952   long offset = align_addr(bytes, frame::alignment_in_bytes);
 953   if (is_simm(-offset, 16)) {
 954     stdu(R1_SP, -offset, R1_SP);
 955   } else {
 956     load_const_optimized(tmp, -offset);
 957     stdux(R1_SP, R1_SP, tmp);
 958   }
 959 }
 960 
 961 // Push a frame of size `bytes' plus abi_reg_args on top.
 962 void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) {
 963   push_frame(bytes + frame::abi_reg_args_size, tmp);
 964 }
 965 
  966 // Set up a new C frame with a spill area for non-volatile GPRs and
 967 // additional space for local variables.
 968 void MacroAssembler::push_frame_reg_args_nonvolatiles(unsigned int bytes,
 969                                                       Register tmp) {
 970   push_frame(bytes + frame::abi_reg_args_size + frame::spill_nonvolatiles_size, tmp);
 971 }
 972 
 973 // Pop current C frame.
 974 void MacroAssembler::pop_frame() {
 975   ld(R1_SP, _abi(callers_sp), R1_SP);
 976 }
 977 
 978 #if defined(ABI_ELFv2)
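      // Branch (and optionally link) to the code at r_function_entry via CTR.
      // Under the ELFv2 ABI, R12 is expected to hold the entry address at the call site.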
 979 address MacroAssembler::branch_to(Register r_function_entry, bool and_link) {
 980   // TODO(asmundak): make sure the caller uses R12 as function descriptor
  981   // most of the time.
 982   if (R12 != r_function_entry) {
 983     mr(R12, r_function_entry);
 984   }
 985   mtctr(R12);
 986   // Do a call or a branch.
 987   if (and_link) {
 988     bctrl();
 989   } else {
 990     bctr();
 991   }
 992   _last_calls_return_pc = pc();
 993 
 994   return _last_calls_return_pc;
 995 }
 996 
 997 // Call a C function via a function descriptor and use full C
 998 // calling conventions. Updates and returns _last_calls_return_pc.
 999 address MacroAssembler::call_c(Register r_function_entry) {
1000   return branch_to(r_function_entry, /*and_link=*/true);
1001 }
1002 
1003 // For tail calls: only branch, don't link, so callee returns to caller of this function.
1004 address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) {
1005   return branch_to(r_function_entry, /*and_link=*/false);
1006 }
1007 
1008 address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) {
1009   load_const(R12, function_entry, R0);
1010   return branch_to(R12,  /*and_link=*/true);
1011 }
1012 
1013 #else
 1014 // Generic version of a call to a C function via a function descriptor
1015 // with variable support for C calling conventions (TOC, ENV, etc.).
1016 // Updates and returns _last_calls_return_pc.
1017 address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call,
1018                                   bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) {
 1019   // We emit standard ptrgl glue code here.
1020   assert((function_descriptor != R0), "function_descriptor cannot be R0");
1021 
1022   // retrieve necessary entries from the function descriptor
1023   ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor);
1024   mtctr(R0);
1025 
1026   if (load_toc_of_callee) {
1027     ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor);
1028   }
1029   if (load_env_of_callee) {
1030     ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor);
1031   } else if (load_toc_of_callee) {
1032     li(R11, 0);
1033   }
1034 
1035   // do a call or a branch
1036   if (and_link) {
1037     bctrl();
1038   } else {
1039     bctr();
1040   }
1041   _last_calls_return_pc = pc();
1042 
1043   return _last_calls_return_pc;
1044 }
1045 
1046 // Call a C function via a function descriptor and use full C calling
1047 // conventions.
1048 // We don't use the TOC in generated code, so there is no need to save
1049 // and restore its value.
1050 address MacroAssembler::call_c(Register fd) {
1051   return branch_to(fd, /*and_link=*/true,
1052                        /*save toc=*/false,
1053                        /*restore toc=*/false,
1054                        /*load toc=*/true,
1055                        /*load env=*/true);
1056 }
1057 
1058 address MacroAssembler::call_c_and_return_to_caller(Register fd) {
1059   return branch_to(fd, /*and_link=*/false,
1060                        /*save toc=*/false,
1061                        /*restore toc=*/false,
1062                        /*load toc=*/true,
1063                        /*load env=*/true);
1064 }
1065 
1066 address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) {
1067   if (rt != relocInfo::none) {
1068     // this call needs to be relocatable
1069     if (!ReoptimizeCallSequences
1070         || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
1071         || fd == NULL   // support code-size estimation
1072         || !fd->is_friend_function()
1073         || fd->entry() == NULL) {
1074       // it's not a friend function as defined by class FunctionDescriptor,
1075       // so do a full call-c here.
1076       load_const(R11, (address)fd, R0);
1077 
1078       bool has_env = (fd != NULL && fd->env() != NULL);
1079       return branch_to(R11, /*and_link=*/true,
1080                             /*save toc=*/false,
1081                             /*restore toc=*/false,
1082                             /*load toc=*/true,
1083                             /*load env=*/has_env);
1084     } else {
1085       // It's a friend function. Load the entry point and don't care about
1086       // toc and env. Use an optimizable call instruction, but ensure the
1087       // same code-size as in the case of a non-friend function.
1088       nop();
1089       nop();
1090       nop();
1091       bl64_patchable(fd->entry(), rt);
1092       _last_calls_return_pc = pc();
1093       return _last_calls_return_pc;
1094     }
1095   } else {
 1096     // This call does not need to be relocatable, so do more aggressive
1097     // optimizations.
1098     if (!ReoptimizeCallSequences
1099       || !fd->is_friend_function()) {
1100       // It's not a friend function as defined by class FunctionDescriptor,
1101       // so do a full call-c here.
1102       load_const(R11, (address)fd, R0);
1103       return branch_to(R11, /*and_link=*/true,
1104                             /*save toc=*/false,
1105                             /*restore toc=*/false,
1106                             /*load toc=*/true,
1107                             /*load env=*/true);
1108     } else {
 1109       // It's a friend function. Load the entry point and don't care about
1110       // toc and env.
1111       address dest = fd->entry();
1112       if (is_within_range_of_b(dest, pc())) {
1113         bl(dest);
1114       } else {
1115         bl64_patchable(dest, rt);
1116       }
1117       _last_calls_return_pc = pc();
1118       return _last_calls_return_pc;
1119     }
1120   }
1121 }
1122 
1123 // Call a C function.  All constants needed reside in TOC.
1124 //
1125 // Read the address to call from the TOC.
1126 // Read env from TOC, if fd specifies an env.
1127 // Read new TOC from TOC.
1128 address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd,
1129                                          relocInfo::relocType rt, Register toc) {
1130   if (!ReoptimizeCallSequences
1131     || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
1132     || !fd->is_friend_function()) {
1133     // It's not a friend function as defined by class FunctionDescriptor,
1134     // so do a full call-c here.
1135     assert(fd->entry() != NULL, "function must be linked");
1136 
1137     AddressLiteral fd_entry(fd->entry());
1138     bool success = load_const_from_method_toc(R11, fd_entry, toc, /*fixed_size*/ true);
1139     mtctr(R11);
1140     if (fd->env() == NULL) {
1141       li(R11, 0);
1142       nop();
1143     } else {
1144       AddressLiteral fd_env(fd->env());
1145       success = success && load_const_from_method_toc(R11, fd_env, toc, /*fixed_size*/ true);
1146     }
1147     AddressLiteral fd_toc(fd->toc());
1148     // Set R2_TOC (load from toc)
1149     success = success && load_const_from_method_toc(R2_TOC, fd_toc, toc, /*fixed_size*/ true);
1150     bctrl();
1151     _last_calls_return_pc = pc();
1152     if (!success) { return NULL; }
1153   } else {
1154     // It's a friend function, load the entry point and don't care about
1155     // toc and env. Use an optimizable call instruction, but ensure the
1156     // same code-size as in the case of a non-friend function.
1157     nop();
1158     bl64_patchable(fd->entry(), rt);
1159     _last_calls_return_pc = pc();
1160   }
1161   return _last_calls_return_pc;
1162 }
1163 #endif // ABI_ELFv2
1164 
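      // Call the VM at 'entry_point': record the last Java frame, pass the current
      // thread in R3_ARG1, perform the C call, and fetch the oop result if requested.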
1165 void MacroAssembler::call_VM_base(Register oop_result,
1166                                   Register last_java_sp,
1167                                   address  entry_point,
1168                                   bool     check_exceptions) {
1169   BLOCK_COMMENT("call_VM {");
1170   // Determine last_java_sp register.
1171   if (!last_java_sp->is_valid()) {
1172     last_java_sp = R1_SP;
1173   }
1174   set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1);
1175 
1176   // ARG1 must hold thread address.
1177   mr(R3_ARG1, R16_thread);
1178 #if defined(ABI_ELFv2)
1179   address return_pc = call_c(entry_point, relocInfo::none);
1180 #else
1181   address return_pc = call_c((FunctionDescriptor*)entry_point, relocInfo::none);
1182 #endif
1183 
1184   reset_last_Java_frame();
1185 
1186   // Check for pending exceptions.
1187   if (check_exceptions) {
 1188     // Exception checks are not supported here; callers must pass check_exceptions == false.
1189     ShouldNotReachHere();
1190   }
1191 
1192   // Get oop result if there is one and reset the value in the thread.
1193   if (oop_result->is_valid()) {
1194     get_vm_result(oop_result);
1195   }
1196 
1197   _last_calls_return_pc = return_pc;
1198   BLOCK_COMMENT("} call_VM");
1199 }
1200 
1201 void MacroAssembler::call_VM_leaf_base(address entry_point) {
1202   BLOCK_COMMENT("call_VM_leaf {");
1203 #if defined(ABI_ELFv2)
1204   call_c(entry_point, relocInfo::none);
1205 #else
1206   call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, entry_point), relocInfo::none);
1207 #endif
1208   BLOCK_COMMENT("} call_VM_leaf");
1209 }
1210 
1211 void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions) {
1212   call_VM_base(oop_result, noreg, entry_point, check_exceptions);
1213 }
1214 
1215 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1,
1216                              bool check_exceptions) {
1217   // R3_ARG1 is reserved for the thread.
1218   mr_if_needed(R4_ARG2, arg_1);
1219   call_VM(oop_result, entry_point, check_exceptions);
1220 }
1221 
1222 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2,
1223                              bool check_exceptions) {
1224   // R3_ARG1 is reserved for the thread
1225   mr_if_needed(R4_ARG2, arg_1);
1226   assert(arg_2 != R4_ARG2, "smashed argument");
1227   mr_if_needed(R5_ARG3, arg_2);
1228   call_VM(oop_result, entry_point, check_exceptions);
1229 }
1230 
1231 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3,
1232                              bool check_exceptions) {
1233   // R3_ARG1 is reserved for the thread
1234   mr_if_needed(R4_ARG2, arg_1);
1235   assert(arg_2 != R4_ARG2, "smashed argument");
1236   mr_if_needed(R5_ARG3, arg_2);
1237   mr_if_needed(R6_ARG4, arg_3);
1238   call_VM(oop_result, entry_point, check_exceptions);
1239 }
1240 
1241 void MacroAssembler::call_VM_leaf(address entry_point) {
1242   call_VM_leaf_base(entry_point);
1243 }
1244 
1245 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) {
1246   mr_if_needed(R3_ARG1, arg_1);
1247   call_VM_leaf(entry_point);
1248 }
1249 
1250 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) {
1251   mr_if_needed(R3_ARG1, arg_1);
1252   assert(arg_2 != R3_ARG1, "smashed argument");
1253   mr_if_needed(R4_ARG2, arg_2);
1254   call_VM_leaf(entry_point);
1255 }
1256 
1257 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) {
1258   mr_if_needed(R3_ARG1, arg_1);
1259   assert(arg_2 != R3_ARG1, "smashed argument");
1260   mr_if_needed(R4_ARG2, arg_2);
1261   assert(arg_3 != R3_ARG1 && arg_3 != R4_ARG2, "smashed argument");
1262   mr_if_needed(R5_ARG3, arg_3);
1263   call_VM_leaf(entry_point);
1264 }
1265 
1266 // Check whether instruction is a read access to the polling page
1267 // which was emitted by load_from_polling_page(..).
1268 bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext,
1269                                                address* polling_address_ptr) {
1270   if (!is_ld(instruction))
1271     return false; // It's not a ld. Fail.
1272 
1273   int rt = inv_rt_field(instruction);
1274   int ra = inv_ra_field(instruction);
1275   int ds = inv_ds_field(instruction);
1276   if (!(ds == 0 && ra != 0 && rt == 0)) {
1277     return false; // It's not a ld(r0, X, ra). Fail.
1278   }
1279 
1280   if (!ucontext) {
1281     // Set polling address.
1282     if (polling_address_ptr != NULL) {
1283       *polling_address_ptr = NULL;
1284     }
1285     return true; // No ucontext given. Can't check value of ra. Assume true.
1286   }
1287 
1288 #ifdef LINUX
1289   // Ucontext given. Check that register ra contains the address of
 1290   // the safepoint polling page.
1291   ucontext_t* uc = (ucontext_t*) ucontext;
1292   // Set polling address.
1293   address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds;
1294   if (polling_address_ptr != NULL) {
1295     *polling_address_ptr = addr;
1296   }
1297   return SafepointMechanism::is_poll_address(addr);
1298 #else
1299   // Not on Linux, ucontext must be NULL.
1300   ShouldNotReachHere();
1301   return false;
1302 #endif
1303 }
1304 
1305 void MacroAssembler::bang_stack_with_offset(int offset) {
1306   // When increasing the stack, the old stack pointer will be written
 1307   // to the new top of stack according to the PPC64 ABI.
1308   // Therefore, stack banging is not necessary when increasing
1309   // the stack by <= os::vm_page_size() bytes.
1310   // When increasing the stack by a larger amount, this method is
1311   // called repeatedly to bang the intermediate pages.
1312 
1313   // Stack grows down, caller passes positive offset.
1314   assert(offset > 0, "must bang with positive offset");
1315 
1316   long stdoffset = -offset;
1317 
1318   if (is_simm(stdoffset, 16)) {
1319     // Signed 16 bit offset, a simple std is ok.
1320     if (UseLoadInstructionsForStackBangingPPC64) {
1321       ld(R0, (int)(signed short)stdoffset, R1_SP);
1322     } else {
1323       std(R0,(int)(signed short)stdoffset, R1_SP);
1324     }
1325   } else if (is_simm(stdoffset, 31)) {
1326     const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset);
1327     const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset);
1328 
1329     Register tmp = R11;
1330     addis(tmp, R1_SP, hi);
1331     if (UseLoadInstructionsForStackBangingPPC64) {
1332       ld(R0,  lo, tmp);
1333     } else {
1334       std(R0, lo, tmp);
1335     }
1336   } else {
1337     ShouldNotReachHere();
1338   }
1339 }
1340 
1341 // If instruction is a stack bang of the form
1342 //    std    R0,    x(Ry),       (see bang_stack_with_offset())
1343 //    stdu   R1_SP, x(R1_SP),    (see push_frame(), resize_frame())
1344 // or stdux  R1_SP, Rx, R1_SP    (see push_frame(), resize_frame())
1345 // return the banged address. Otherwise, return 0.
1346 address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) {
1347 #ifdef LINUX
1348   ucontext_t* uc = (ucontext_t*) ucontext;
1349   int rs = inv_rs_field(instruction);
1350   int ra = inv_ra_field(instruction);
1351   if (   (is_ld(instruction)   && rs == 0 &&  UseLoadInstructionsForStackBangingPPC64)
1352       || (is_std(instruction)  && rs == 0 && !UseLoadInstructionsForStackBangingPPC64)
1353       || (is_stdu(instruction) && rs == 1)) {
1354     int ds = inv_ds_field(instruction);
1355     // return banged address
1356     return ds+(address)uc->uc_mcontext.regs->gpr[ra];
1357   } else if (is_stdux(instruction) && rs == 1) {
1358     int rb = inv_rb_field(instruction);
1359     address sp = (address)uc->uc_mcontext.regs->gpr[1];
1360     long rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
1361     return ra != 1 || rb_val >= 0 ? NULL         // not a stack bang
1362                                   : sp + rb_val; // banged address
1363   }
1364   return NULL; // not a stack bang
1365 #else
1366   // workaround not needed on !LINUX :-)
1367   ShouldNotCallThis();
1368   return NULL;
1369 #endif
1370 }
1371 
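      // If SP is not below the thread's reserved_stack_activation, re-enable the
      // reserved stack zone and branch to the delayed StackOverflowError stub.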
1372 void MacroAssembler::reserved_stack_check(Register return_pc) {
1373   // Test if reserved zone needs to be enabled.
1374   Label no_reserved_zone_enabling;
1375 
1376   ld_ptr(R0, JavaThread::reserved_stack_activation_offset(), R16_thread);
1377   cmpld(CCR0, R1_SP, R0);
1378   blt_predict_taken(CCR0, no_reserved_zone_enabling);
1379 
1380   // Enable reserved zone again, throw stack overflow exception.
1381   push_frame_reg_args(0, R0);
1382   call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), R16_thread);
1383   pop_frame();
1384   mtlr(return_pc);
1385   load_const_optimized(R0, StubRoutines::throw_delayed_StackOverflowError_entry());
1386   mtctr(R0);
1387   bctr();
1388 
1389   should_not_reach_here();
1390 
1391   bind(no_reserved_zone_enabling);
1392 }
1393 
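      // Atomically exchange the doubleword at addr_base with exchange_value using an
      // ldarx/stdcx_ retry loop; the previous value is returned in dest_current_value.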
1394 void MacroAssembler::getandsetd(Register dest_current_value, Register exchange_value, Register addr_base,
1395                                 bool cmpxchgx_hint) {
1396   Label retry;
1397   bind(retry);
1398   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1399   stdcx_(exchange_value, addr_base);
1400   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1401     bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1402   } else {
1403     bne(                  CCR0, retry); // StXcx_ sets CCR0.
1404   }
1405 }
1406 
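      // Atomically add inc_value to the doubleword at addr_base; the previous value
      // is returned in dest_current_value, tmp is clobbered.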
1407 void MacroAssembler::getandaddd(Register dest_current_value, Register inc_value, Register addr_base,
1408                                 Register tmp, bool cmpxchgx_hint) {
1409   Label retry;
1410   bind(retry);
1411   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1412   add(tmp, dest_current_value, inc_value);
1413   stdcx_(tmp, addr_base);
1414   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1415     bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1416   } else {
1417     bne(                  CCR0, retry); // StXcx_ sets CCR0.
1418   }
1419 }
1420 
1421 // Word/sub-word atomic helper functions
1422 
1423 // Temps and addr_base are killed if size < 4 and processor does not support respective instructions.
1424 // Only signed types are supported with size < 4.
1425 // Atomic add always kills tmp1.
1426 void MacroAssembler::atomic_get_and_modify_generic(Register dest_current_value, Register exchange_value,
1427                                                    Register addr_base, Register tmp1, Register tmp2, Register tmp3,
1428                                                    bool cmpxchgx_hint, bool is_add, int size) {
1429   // Sub-word instructions are available since Power 8.
1430   // For older processors, instruction_type != size holds, and we
1431   // emulate the sub-word instructions by constructing a 4-byte value
1432   // that leaves the other bytes unchanged.
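  // Emulation sketch (little endian, size == 1, get-and-set case):
  //   shift   = (addr & 3) * 8
  //   val32   = aligned word loaded with l?arx
  //   current = val32 >> shift
  //   stwcx_(val32 ^ (((current ^ exchange_value) & 0xff) << shift))
  // i.e. only the addressed byte is replaced; get-and-add works the same way
  // with current + inc_value as the new byte.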
1433   const int instruction_type = VM_Version::has_lqarx() ? size : 4;
1434 
1435   Label retry;
1436   Register shift_amount = noreg,
1437            val32 = dest_current_value,
1438            modval = is_add ? tmp1 : exchange_value;
1439 
1440   if (instruction_type != size) {
1441     assert_different_registers(tmp1, tmp2, tmp3, dest_current_value, exchange_value, addr_base);
1442     modval = tmp1;
1443     shift_amount = tmp2;
1444     val32 = tmp3;
    // Need some preparation: compute shift amount, align address. Note: shorts must be 2-byte aligned.
1446 #ifdef VM_LITTLE_ENDIAN
1447     rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
1448     clrrdi(addr_base, addr_base, 2);
1449 #else
1450     xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
1451     clrrdi(addr_base, addr_base, 2);
1452     rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
1453 #endif
1454   }
1455 
1456   // atomic emulation loop
1457   bind(retry);
1458 
1459   switch (instruction_type) {
1460     case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
1461     case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
1462     case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
1463     default: ShouldNotReachHere();
1464   }
1465 
1466   if (instruction_type != size) {
1467     srw(dest_current_value, val32, shift_amount);
1468   }
1469 
1470   if (is_add) { add(modval, dest_current_value, exchange_value); }
1471 
1472   if (instruction_type != size) {
1473     // Transform exchange value such that the replacement can be done by one xor instruction.
1474     xorr(modval, dest_current_value, is_add ? modval : exchange_value);
1475     clrldi(modval, modval, (size == 1) ? 56 : 48);
1476     slw(modval, modval, shift_amount);
1477     xorr(modval, val32, modval);
1478   }
1479 
1480   switch (instruction_type) {
1481     case 4: stwcx_(modval, addr_base); break;
1482     case 2: sthcx_(modval, addr_base); break;
1483     case 1: stbcx_(modval, addr_base); break;
1484     default: ShouldNotReachHere();
1485   }
1486 
1487   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1488     bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1489   } else {
1490     bne(                  CCR0, retry); // StXcx_ sets CCR0.
1491   }
1492 
1493   // l?arx zero-extends, but Java wants byte/short values sign-extended.
1494   if (size == 1) {
1495     extsb(dest_current_value, dest_current_value);
1496   } else if (size == 2) {
1497     extsh(dest_current_value, dest_current_value);
  }
1499 }
1500 
1501 // Temps, addr_base and exchange_value are killed if size < 4 and processor does not support respective instructions.
1502 // Only signed types are supported with size < 4.
1503 void MacroAssembler::cmpxchg_loop_body(ConditionRegister flag, Register dest_current_value,
1504                                        Register compare_value, Register exchange_value,
1505                                        Register addr_base, Register tmp1, Register tmp2,
1506                                        Label &retry, Label &failed, bool cmpxchgx_hint, int size) {
1507   // Sub-word instructions are available since Power 8.
1508   // For older processors, instruction_type != size holds, and we
1509   // emulate the sub-word instructions by constructing a 4-byte value
1510   // that leaves the other bytes unchanged.
1511   const int instruction_type = VM_Version::has_lqarx() ? size : 4;
1512 
1513   Register shift_amount = noreg,
1514            val32 = dest_current_value,
1515            modval = exchange_value;
1516 
1517   if (instruction_type != size) {
1518     assert_different_registers(tmp1, tmp2, dest_current_value, compare_value, exchange_value, addr_base);
1519     shift_amount = tmp1;
1520     val32 = tmp2;
1521     modval = tmp2;
    // Need some preparation: compute shift amount, align address. Note: shorts must be 2-byte aligned.
1523 #ifdef VM_LITTLE_ENDIAN
1524     rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
1525     clrrdi(addr_base, addr_base, 2);
1526 #else
1527     xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
1528     clrrdi(addr_base, addr_base, 2);
1529     rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
1530 #endif
1531     // Transform exchange value such that the replacement can be done by one xor instruction.
1532     xorr(exchange_value, compare_value, exchange_value);
1533     clrldi(exchange_value, exchange_value, (size == 1) ? 56 : 48);
1534     slw(exchange_value, exchange_value, shift_amount);
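    // exchange_value now holds (compare_value ^ exchange_value), masked to the
    // sub-word width and shifted into position; xor-ing it onto val32 below
    // replaces only the addressed sub-word, which is valid because we only
    // store after that sub-word compared equal to compare_value.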
1535   }
1536 
1537   // atomic emulation loop
1538   bind(retry);
1539 
1540   switch (instruction_type) {
1541     case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
1542     case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
1543     case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
1544     default: ShouldNotReachHere();
1545   }
1546 
1547   if (instruction_type != size) {
1548     srw(dest_current_value, val32, shift_amount);
1549   }
1550   if (size == 1) {
1551     extsb(dest_current_value, dest_current_value);
1552   } else if (size == 2) {
1553     extsh(dest_current_value, dest_current_value);
  }
1555 
1556   cmpw(flag, dest_current_value, compare_value);
1557   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1558     bne_predict_not_taken(flag, failed);
1559   } else {
1560     bne(                  flag, failed);
1561   }
  // branch to failed => (flag == ne), (dest_current_value != compare_value)
  // fall through     => (flag == eq), (dest_current_value == compare_value)
1564 
1565   if (instruction_type != size) {
1566     xorr(modval, val32, exchange_value);
1567   }
1568 
1569   switch (instruction_type) {
1570     case 4: stwcx_(modval, addr_base); break;
1571     case 2: sthcx_(modval, addr_base); break;
1572     case 1: stbcx_(modval, addr_base); break;
1573     default: ShouldNotReachHere();
1574   }
1575 }
1576 
1577 // CmpxchgX sets condition register to cmpX(current, compare).
1578 void MacroAssembler::cmpxchg_generic(ConditionRegister flag, Register dest_current_value,
1579                                      Register compare_value, Register exchange_value,
1580                                      Register addr_base, Register tmp1, Register tmp2,
1581                                      int semantics, bool cmpxchgx_hint,
1582                                      Register int_flag_success, bool contention_hint, bool weak, int size) {
1583   Label retry;
1584   Label failed;
1585   Label done;
1586 
1587   // Save one branch if result is returned via register and
1588   // result register is different from the other ones.
1589   bool use_result_reg    = (int_flag_success != noreg);
1590   bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value &&
1591                             int_flag_success != exchange_value && int_flag_success != addr_base &&
1592                             int_flag_success != tmp1 && int_flag_success != tmp2);
1593   assert(!weak || flag == CCR0, "weak only supported with CCR0");
1594   assert(size == 1 || size == 2 || size == 4, "unsupported");
1595 
1596   if (use_result_reg && preset_result_reg) {
1597     li(int_flag_success, 0); // preset (assume cas failed)
1598   }
1599 
1600   // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1601   if (contention_hint) { // Don't try to reserve if cmp fails.
1602     switch (size) {
1603       case 1: lbz(dest_current_value, 0, addr_base); extsb(dest_current_value, dest_current_value); break;
1604       case 2: lha(dest_current_value, 0, addr_base); break;
1605       case 4: lwz(dest_current_value, 0, addr_base); break;
1606       default: ShouldNotReachHere();
1607     }
1608     cmpw(flag, dest_current_value, compare_value);
1609     bne(flag, failed);
1610   }
1611 
1612   // release/fence semantics
1613   if (semantics & MemBarRel) {
1614     release();
1615   }
1616 
1617   cmpxchg_loop_body(flag, dest_current_value, compare_value, exchange_value, addr_base, tmp1, tmp2,
1618                     retry, failed, cmpxchgx_hint, size);
1619   if (!weak || use_result_reg) {
1620     if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1621       bne_predict_not_taken(CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
1622     } else {
1623       bne(                  CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
1624     }
1625   }
1626   // fall through    => (flag == eq), (dest_current_value == compare_value), (swapped)
1627 
1628   // Result in register (must do this at the end because int_flag_success can be the
1629   // same register as one above).
1630   if (use_result_reg) {
1631     li(int_flag_success, 1);
1632   }
1633 
1634   if (semantics & MemBarFenceAfter) {
1635     fence();
1636   } else if (semantics & MemBarAcq) {
1637     isync();
1638   }
1639 
1640   if (use_result_reg && !preset_result_reg) {
1641     b(done);
1642   }
1643 
1644   bind(failed);
1645   if (use_result_reg && !preset_result_reg) {
1646     li(int_flag_success, 0);
1647   }
1648 
1649   bind(done);
1650   // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1651   // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1652 }
1653 
// Performs an atomic compare-exchange:
1655 //   if (compare_value == *addr_base)
1656 //     *addr_base = exchange_value
1657 //     int_flag_success = 1;
1658 //   else
1659 //     int_flag_success = 0;
1660 //
1661 // ConditionRegister flag       = cmp(compare_value, *addr_base)
1662 // Register dest_current_value  = *addr_base
1663 // Register compare_value       Used to compare with value in memory
1664 // Register exchange_value      Written to memory if compare_value == *addr_base
1665 // Register addr_base           The memory location to compareXChange
1666 // Register int_flag_success    Set to 1 if exchange_value was written to *addr_base
1667 //
// To avoid the costly compare-exchange, the value can be tested beforehand (contention_hint).
// Several special cases exist to avoid emitting unnecessary code.
1670 //
1671 void MacroAssembler::cmpxchgd(ConditionRegister flag,
1672                               Register dest_current_value, RegisterOrConstant compare_value, Register exchange_value,
1673                               Register addr_base, int semantics, bool cmpxchgx_hint,
1674                               Register int_flag_success, Label* failed_ext, bool contention_hint, bool weak) {
1675   Label retry;
1676   Label failed_int;
1677   Label& failed = (failed_ext != NULL) ? *failed_ext : failed_int;
1678   Label done;
1679 
1680   // Save one branch if result is returned via register and result register is different from the other ones.
1681   bool use_result_reg    = (int_flag_success!=noreg);
1682   bool preset_result_reg = (int_flag_success!=dest_current_value && int_flag_success!=compare_value.register_or_noreg() &&
1683                             int_flag_success!=exchange_value && int_flag_success!=addr_base);
1684   assert(!weak || flag == CCR0, "weak only supported with CCR0");
1685   assert(int_flag_success == noreg || failed_ext == NULL, "cannot have both");
1686 
1687   if (use_result_reg && preset_result_reg) {
1688     li(int_flag_success, 0); // preset (assume cas failed)
1689   }
1690 
1691   // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1692   if (contention_hint) { // Don't try to reserve if cmp fails.
1693     ld(dest_current_value, 0, addr_base);
1694     cmpd(flag, compare_value, dest_current_value);
1695     bne(flag, failed);
1696   }
1697 
1698   // release/fence semantics
1699   if (semantics & MemBarRel) {
1700     release();
1701   }
1702 
1703   // atomic emulation loop
1704   bind(retry);
1705 
1706   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1707   cmpd(flag, compare_value, dest_current_value);
1708   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1709     bne_predict_not_taken(flag, failed);
1710   } else {
1711     bne(                  flag, failed);
1712   }
1713 
1714   stdcx_(exchange_value, addr_base);
1715   if (!weak || use_result_reg || failed_ext) {
1716     if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1717       bne_predict_not_taken(CCR0, weak ? failed : retry); // stXcx_ sets CCR0
1718     } else {
1719       bne(                  CCR0, weak ? failed : retry); // stXcx_ sets CCR0
1720     }
1721   }
1722 
1723   // result in register (must do this at the end because int_flag_success can be the same register as one above)
1724   if (use_result_reg) {
1725     li(int_flag_success, 1);
1726   }
1727 
1728   if (semantics & MemBarFenceAfter) {
1729     fence();
1730   } else if (semantics & MemBarAcq) {
1731     isync();
1732   }
1733 
1734   if (use_result_reg && !preset_result_reg) {
1735     b(done);
1736   }
1737 
1738   bind(failed_int);
1739   if (use_result_reg && !preset_result_reg) {
1740     li(int_flag_success, 0);
1741   }
1742 
1743   bind(done);
1744   // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1745   // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1746 }
1747 
1748 // Look up the method for a megamorphic invokeinterface call.
1749 // The target method is determined by <intf_klass, itable_index>.
1750 // The receiver klass is in recv_klass.
1751 // On success, the result will be in method_result, and execution falls through.
1752 // On failure, execution transfers to the given label.
1753 void MacroAssembler::lookup_interface_method(Register recv_klass,
1754                                              Register intf_klass,
1755                                              RegisterOrConstant itable_index,
1756                                              Register method_result,
1757                                              Register scan_temp,
1758                                              Register temp2,
1759                                              Label& L_no_such_interface,
1760                                              bool return_method) {
1761   assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);
1762 
1763   // Compute start of first itableOffsetEntry (which is at the end of the vtable).
1764   int vtable_base = in_bytes(Klass::vtable_start_offset());
1765   int itentry_off = itableMethodEntry::method_offset_in_bytes();
1766   int logMEsize   = exact_log2(itableMethodEntry::size() * wordSize);
1767   int scan_step   = itableOffsetEntry::size() * wordSize;
1768   int log_vte_size= exact_log2(vtableEntry::size_in_bytes());
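  // Itable layout after the vtable (see klassItable):
  //   itableOffsetEntry[]: { interface Klass*, offset of its method block }
  //   itableMethodEntry[]: { Method* }, addressed via that offset plus the
  //                        scaled itable_index.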
1769 
1770   lwz(scan_temp, in_bytes(Klass::vtable_length_offset()), recv_klass);
1771   // %%% We should store the aligned, prescaled offset in the klassoop.
1772   // Then the next several instructions would fold away.
1773 
1774   sldi(scan_temp, scan_temp, log_vte_size);
1775   addi(scan_temp, scan_temp, vtable_base);
1776   add(scan_temp, recv_klass, scan_temp);
1777 
1778   // Adjust recv_klass by scaled itable_index, so we can free itable_index.
1779   if (return_method) {
1780     if (itable_index.is_register()) {
1781       Register itable_offset = itable_index.as_register();
1782       sldi(method_result, itable_offset, logMEsize);
1783       if (itentry_off) { addi(method_result, method_result, itentry_off); }
1784       add(method_result, method_result, recv_klass);
1785     } else {
1786       long itable_offset = (long)itable_index.as_constant();
1787       // static address, no relocation
1788       add_const_optimized(method_result, recv_klass, (itable_offset << logMEsize) + itentry_off, temp2);
1789     }
1790   }
1791 
1792   // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
1793   //   if (scan->interface() == intf) {
1794   //     result = (klass + scan->offset() + itable_index);
1795   //   }
1796   // }
1797   Label search, found_method;
1798 
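  // The scan loop is emitted twice (peel == 1, then peel == 0): the first copy
  // branches to found_method on a hit; the second copy branches back to
  // 'search' on a mismatch and falls through to found_method on a hit.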
1799   for (int peel = 1; peel >= 0; peel--) {
1800     // %%%% Could load both offset and interface in one ldx, if they were
1801     // in the opposite order. This would save a load.
1802     ld(temp2, itableOffsetEntry::interface_offset_in_bytes(), scan_temp);
1803 
1804     // Check that this entry is non-null. A null entry means that
1805     // the receiver class doesn't implement the interface, and wasn't the
1806     // same as when the caller was compiled.
1807     cmpd(CCR0, temp2, intf_klass);
1808 
1809     if (peel) {
1810       beq(CCR0, found_method);
1811     } else {
1812       bne(CCR0, search);
1813       // (invert the test to fall through to found_method...)
1814     }
1815 
1816     if (!peel) break;
1817 
1818     bind(search);
1819 
1820     cmpdi(CCR0, temp2, 0);
1821     beq(CCR0, L_no_such_interface);
1822     addi(scan_temp, scan_temp, scan_step);
1823   }
1824 
1825   bind(found_method);
1826 
1827   // Got a hit.
1828   if (return_method) {
1829     int ito_offset = itableOffsetEntry::offset_offset_in_bytes();
1830     lwz(scan_temp, ito_offset, scan_temp);
1831     ldx(method_result, scan_temp, method_result);
1832   }
1833 }
1834 
1835 // virtual method calling
1836 void MacroAssembler::lookup_virtual_method(Register recv_klass,
1837                                            RegisterOrConstant vtable_index,
1838                                            Register method_result) {
1839 
1840   assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg());
1841 
1842   const int base = in_bytes(Klass::vtable_start_offset());
1843   assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
1844 
1845   if (vtable_index.is_register()) {
1846     sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord);
1847     add(recv_klass, vtable_index.as_register(), recv_klass);
1848   } else {
1849     addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord);
1850   }
1851   ld(R19_method, base + vtableEntry::method_offset_in_bytes(), recv_klass);
1852 }
1853 
1854 /////////////////////////////////////////// subtype checking ////////////////////////////////////////////
1855 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
1856                                                    Register super_klass,
1857                                                    Register temp1_reg,
1858                                                    Register temp2_reg,
1859                                                    Label* L_success,
1860                                                    Label* L_failure,
1861                                                    Label* L_slow_path,
1862                                                    RegisterOrConstant super_check_offset) {
1863 
1864   const Register check_cache_offset = temp1_reg;
1865   const Register cached_super       = temp2_reg;
1866 
1867   assert_different_registers(sub_klass, super_klass, check_cache_offset, cached_super);
1868 
1869   int sco_offset = in_bytes(Klass::super_check_offset_offset());
1870   int sc_offset  = in_bytes(Klass::secondary_super_cache_offset());
1871 
1872   bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
1873   bool need_slow_path = (must_load_sco || super_check_offset.constant_or_zero() == sco_offset);
1874 
1875   Label L_fallthrough;
1876   int label_nulls = 0;
1877   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
1878   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
1879   if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
1880   assert(label_nulls <= 1 ||
1881          (L_slow_path == &L_fallthrough && label_nulls <= 2 && !need_slow_path),
1882          "at most one NULL in the batch, usually");
1883 
1884   // If the pointers are equal, we are done (e.g., String[] elements).
1885   // This self-check enables sharing of secondary supertype arrays among
1886   // non-primary types such as array-of-interface. Otherwise, each such
1887   // type would need its own customized SSA.
1888   // We move this check to the front of the fast path because many
1889   // type checks are in fact trivially successful in this manner,
1890   // so we get a nicely predicted branch right at the start of the check.
1891   cmpd(CCR0, sub_klass, super_klass);
1892   beq(CCR0, *L_success);
1893 
1894   // Check the supertype display:
1895   if (must_load_sco) {
1896     // The super check offset is always positive...
1897     lwz(check_cache_offset, sco_offset, super_klass);
1898     super_check_offset = RegisterOrConstant(check_cache_offset);
1899     // super_check_offset is register.
1900     assert_different_registers(sub_klass, super_klass, cached_super, super_check_offset.as_register());
1901   }
1902   // The loaded value is the offset from KlassOopDesc.
1903 
1904   ld(cached_super, super_check_offset, sub_klass);
1905   cmpd(CCR0, cached_super, super_klass);
1906 
1907   // This check has worked decisively for primary supers.
1908   // Secondary supers are sought in the super_cache ('super_cache_addr').
1909   // (Secondary supers are interfaces and very deeply nested subtypes.)
1910   // This works in the same check above because of a tricky aliasing
1911   // between the super_cache and the primary super display elements.
1912   // (The 'super_check_addr' can address either, as the case requires.)
1913   // Note that the cache is updated below if it does not help us find
1914   // what we need immediately.
1915   // So if it was a primary super, we can just fail immediately.
1916   // Otherwise, it's the slow path for us (no success at this point).
1917 
1918 #define FINAL_JUMP(label) if (&(label) != &L_fallthrough) { b(label); }
1919 
1920   if (super_check_offset.is_register()) {
1921     beq(CCR0, *L_success);
1922     cmpwi(CCR0, super_check_offset.as_register(), sc_offset);
1923     if (L_failure == &L_fallthrough) {
1924       beq(CCR0, *L_slow_path);
1925     } else {
1926       bne(CCR0, *L_failure);
1927       FINAL_JUMP(*L_slow_path);
1928     }
1929   } else {
1930     if (super_check_offset.as_constant() == sc_offset) {
1931       // Need a slow path; fast failure is impossible.
1932       if (L_slow_path == &L_fallthrough) {
1933         beq(CCR0, *L_success);
1934       } else {
1935         bne(CCR0, *L_slow_path);
1936         FINAL_JUMP(*L_success);
1937       }
1938     } else {
1939       // No slow path; it's a fast decision.
1940       if (L_failure == &L_fallthrough) {
1941         beq(CCR0, *L_success);
1942       } else {
1943         bne(CCR0, *L_failure);
1944         FINAL_JUMP(*L_success);
1945       }
1946     }
1947   }
1948 
1949   bind(L_fallthrough);
1950 #undef FINAL_JUMP
1951 }
1952 
1953 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
1954                                                    Register super_klass,
1955                                                    Register temp1_reg,
1956                                                    Register temp2_reg,
1957                                                    Label* L_success,
1958                                                    Register result_reg) {
1959   const Register array_ptr = temp1_reg; // current value from cache array
1960   const Register temp      = temp2_reg;
1961 
1962   assert_different_registers(sub_klass, super_klass, array_ptr, temp);
1963 
1964   int source_offset = in_bytes(Klass::secondary_supers_offset());
1965   int target_offset = in_bytes(Klass::secondary_super_cache_offset());
1966 
1967   int length_offset = Array<Klass*>::length_offset_in_bytes();
1968   int base_offset   = Array<Klass*>::base_offset_in_bytes();
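  // Linear scan over the secondary_supers array; on a hit the super is also
  // stored into the secondary_super_cache so the next fast-path check succeeds.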
1969 
1970   Label hit, loop, failure, fallthru;
1971 
1972   ld(array_ptr, source_offset, sub_klass);
1973 
1974   // TODO: PPC port: assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated.");
1975   lwz(temp, length_offset, array_ptr);
1976   cmpwi(CCR0, temp, 0);
1977   beq(CCR0, result_reg!=noreg ? failure : fallthru); // length 0
1978 
1979   mtctr(temp); // load ctr
1980 
1981   bind(loop);
  // Entries in the table are no longer compressed; they are plain Klass*.
1983   ld(temp, base_offset, array_ptr);
1984   cmpd(CCR0, temp, super_klass);
1985   beq(CCR0, hit);
1986   addi(array_ptr, array_ptr, BytesPerWord);
1987   bdnz(loop);
1988 
1989   bind(failure);
1990   if (result_reg!=noreg) li(result_reg, 1); // load non-zero result (indicates a miss)
1991   b(fallthru);
1992 
1993   bind(hit);
1994   std(super_klass, target_offset, sub_klass); // save result to cache
1995   if (result_reg != noreg) { li(result_reg, 0); } // load zero result (indicates a hit)
1996   if (L_success != NULL) { b(*L_success); }
1997   else if (result_reg == noreg) { blr(); } // return with CR0.eq if neither label nor result reg provided
1998 
1999   bind(fallthru);
2000 }
2001 
2002 // Try fast path, then go to slow one if not successful
2003 void MacroAssembler::check_klass_subtype(Register sub_klass,
2004                          Register super_klass,
2005                          Register temp1_reg,
2006                          Register temp2_reg,
2007                          Label& L_success) {
2008   Label L_failure;
2009   check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success, &L_failure);
2010   check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success);
2011   bind(L_failure); // Fallthru if not successful.
2012 }
2013 
2014 void MacroAssembler::clinit_barrier(Register klass, Register thread, Label* L_fast_path, Label* L_slow_path) {
2015   assert(L_fast_path != NULL || L_slow_path != NULL, "at least one is required");
2016 
2017   Label L_fallthrough;
2018   if (L_fast_path == NULL) {
2019     L_fast_path = &L_fallthrough;
2020   } else if (L_slow_path == NULL) {
2021     L_slow_path = &L_fallthrough;
2022   }
2023 
2024   // Fast path check: class is fully initialized
2025   lbz(R0, in_bytes(InstanceKlass::init_state_offset()), klass);
2026   cmpwi(CCR0, R0, InstanceKlass::fully_initialized);
2027   beq(CCR0, *L_fast_path);
2028 
2029   // Fast path check: current thread is initializer thread
2030   ld(R0, in_bytes(InstanceKlass::init_thread_offset()), klass);
2031   cmpd(CCR0, thread, R0);
2032   if (L_slow_path == &L_fallthrough) {
2033     beq(CCR0, *L_fast_path);
2034   } else if (L_fast_path == &L_fallthrough) {
2035     bne(CCR0, *L_slow_path);
2036   } else {
2037     Unimplemented();
2038   }
2039 
2040   bind(L_fallthrough);
2041 }
2042 
2043 RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot,
2044                                                    Register temp_reg,
2045                                                    int extra_slot_offset) {
2046   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
2047   int stackElementSize = Interpreter::stackElementSize;
2048   int offset = extra_slot_offset * stackElementSize;
2049   if (arg_slot.is_constant()) {
2050     offset += arg_slot.as_constant() * stackElementSize;
2051     return offset;
2052   } else {
2053     assert(temp_reg != noreg, "must specify");
2054     sldi(temp_reg, arg_slot.as_register(), exact_log2(stackElementSize));
2055     if (offset != 0)
2056       addi(temp_reg, temp_reg, offset);
2057     return temp_reg;
2058   }
2059 }
2060 
2061 // Supports temp2_reg = R0.
2062 void MacroAssembler::biased_locking_enter(ConditionRegister cr_reg, Register obj_reg,
2063                                           Register mark_reg, Register temp_reg,
2064                                           Register temp2_reg, Label& done, Label* slow_case) {
2065   assert(UseBiasedLocking, "why call this otherwise?");
2066 
2067 #ifdef ASSERT
2068   assert_different_registers(obj_reg, mark_reg, temp_reg, temp2_reg);
2069 #endif
2070 
2071   Label cas_label;
2072 
2073   // Branch to done if fast path fails and no slow_case provided.
2074   Label *slow_case_int = (slow_case != NULL) ? slow_case : &done;
2075 
2076   // Biased locking
2077   // See whether the lock is currently biased toward our thread and
2078   // whether the epoch is still valid
2079   // Note that the runtime guarantees sufficient alignment of JavaThread
2080   // pointers to allow age to be placed into low bits
2081   assert(markWord::age_shift == markWord::lock_bits + markWord::biased_lock_bits,
2082          "biased locking makes assumptions about bit layout");
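  // 64-bit mark word layout with biased locking (see markWord.hpp):
  //   [ JavaThread*(54) | epoch(2) | unused(1) | age(4) | biased_lock(1) | lock(2) ]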
2083 
2084   if (PrintBiasedLockingStatistics) {
2085     load_const(temp2_reg, (address) BiasedLocking::total_entry_count_addr(), temp_reg);
2086     lwzx(temp_reg, temp2_reg);
2087     addi(temp_reg, temp_reg, 1);
2088     stwx(temp_reg, temp2_reg);
2089   }
2090 
2091   andi(temp_reg, mark_reg, markWord::biased_lock_mask_in_place);
2092   cmpwi(cr_reg, temp_reg, markWord::biased_lock_pattern);
2093   bne(cr_reg, cas_label);
2094 
2095   load_klass(temp_reg, obj_reg);
2096 
2097   load_const_optimized(temp2_reg, ~((int) markWord::age_mask_in_place));
2098   ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
2099   orr(temp_reg, R16_thread, temp_reg);
2100   xorr(temp_reg, mark_reg, temp_reg);
2101   andr(temp_reg, temp_reg, temp2_reg);
2102   cmpdi(cr_reg, temp_reg, 0);
2103   if (PrintBiasedLockingStatistics) {
2104     Label l;
2105     bne(cr_reg, l);
2106     load_const(temp2_reg, (address) BiasedLocking::biased_lock_entry_count_addr());
2107     lwzx(mark_reg, temp2_reg);
2108     addi(mark_reg, mark_reg, 1);
2109     stwx(mark_reg, temp2_reg);
2110     // restore mark_reg
2111     ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg);
2112     bind(l);
2113   }
2114   beq(cr_reg, done);
2115 
2116   Label try_revoke_bias;
2117   Label try_rebias;
2118 
2119   // At this point we know that the header has the bias pattern and
2120   // that we are not the bias owner in the current epoch. We need to
2121   // figure out more details about the state of the header in order to
2122   // know what operations can be legally performed on the object's
2123   // header.
2124 
2125   // If the low three bits in the xor result aren't clear, that means
2126   // the prototype header is no longer biased and we have to revoke
2127   // the bias on this object.
2128   andi(temp2_reg, temp_reg, markWord::biased_lock_mask_in_place);
2129   cmpwi(cr_reg, temp2_reg, 0);
2130   bne(cr_reg, try_revoke_bias);
2131 
2132   // Biasing is still enabled for this data type. See whether the
2133   // epoch of the current bias is still valid, meaning that the epoch
2134   // bits of the mark word are equal to the epoch bits of the
2135   // prototype header. (Note that the prototype header's epoch bits
2136   // only change at a safepoint.) If not, attempt to rebias the object
2137   // toward the current thread. Note that we must be absolutely sure
2138   // that the current epoch is invalid in order to do this because
2139   // otherwise the manipulations it performs on the mark word are
2140   // illegal.
2141 
2142   int shift_amount = 64 - markWord::epoch_shift;
2143   // rotate epoch bits to right (little) end and set other bits to 0
2144   // [ big part | epoch | little part ] -> [ 0..0 | epoch ]
2145   rldicl_(temp2_reg, temp_reg, shift_amount, 64 - markWord::epoch_bits);
2146   // branch if epoch bits are != 0, i.e. they differ, because the epoch has been incremented
2147   bne(CCR0, try_rebias);
2148 
2149   // The epoch of the current bias is still valid but we know nothing
2150   // about the owner; it might be set or it might be clear. Try to
2151   // acquire the bias of the object using an atomic operation. If this
2152   // fails we will go in to the runtime to revoke the object's bias.
2153   // Note that we first construct the presumed unbiased header so we
2154   // don't accidentally blow away another thread's valid bias.
2155   andi(mark_reg, mark_reg, (markWord::biased_lock_mask_in_place |
2156                                 markWord::age_mask_in_place |
2157                                 markWord::epoch_mask_in_place));
2158   orr(temp_reg, R16_thread, mark_reg);
2159 
2160   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2161 
2162   // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg).
2163   cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
2164            /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
2165            /*where=*/obj_reg,
2166            MacroAssembler::MemBarAcq,
2167            MacroAssembler::cmpxchgx_hint_acquire_lock(),
2168            noreg, slow_case_int); // bail out if failed
2169 
2170   // If the biasing toward our thread failed, this means that
2171   // another thread succeeded in biasing it toward itself and we
2172   // need to revoke that bias. The revocation will occur in the
2173   // interpreter runtime in the slow case.
2174   if (PrintBiasedLockingStatistics) {
2175     load_const(temp2_reg, (address) BiasedLocking::anonymously_biased_lock_entry_count_addr(), temp_reg);
2176     lwzx(temp_reg, temp2_reg);
2177     addi(temp_reg, temp_reg, 1);
2178     stwx(temp_reg, temp2_reg);
2179   }
2180   b(done);
2181 
2182   bind(try_rebias);
2183   // At this point we know the epoch has expired, meaning that the
2184   // current "bias owner", if any, is actually invalid. Under these
2185   // circumstances _only_, we are allowed to use the current header's
2186   // value as the comparison value when doing the cas to acquire the
2187   // bias in the current epoch. In other words, we allow transfer of
2188   // the bias from one thread to another directly in this situation.
2189   load_klass(temp_reg, obj_reg);
2190   andi(temp2_reg, mark_reg, markWord::age_mask_in_place);
2191   orr(temp2_reg, R16_thread, temp2_reg);
2192   ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
2193   orr(temp_reg, temp2_reg, temp_reg);
2194 
2195   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2196 
2197   cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
2198                  /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
2199                  /*where=*/obj_reg,
2200                  MacroAssembler::MemBarAcq,
2201                  MacroAssembler::cmpxchgx_hint_acquire_lock(),
2202                  noreg, slow_case_int); // bail out if failed
2203 
2204   // If the biasing toward our thread failed, this means that
2205   // another thread succeeded in biasing it toward itself and we
2206   // need to revoke that bias. The revocation will occur in the
2207   // interpreter runtime in the slow case.
2208   if (PrintBiasedLockingStatistics) {
2209     load_const(temp2_reg, (address) BiasedLocking::rebiased_lock_entry_count_addr(), temp_reg);
2210     lwzx(temp_reg, temp2_reg);
2211     addi(temp_reg, temp_reg, 1);
2212     stwx(temp_reg, temp2_reg);
2213   }
2214   b(done);
2215 
2216   bind(try_revoke_bias);
2217   // The prototype mark in the klass doesn't have the bias bit set any
2218   // more, indicating that objects of this data type are not supposed
2219   // to be biased any more. We are going to try to reset the mark of
2220   // this object to the prototype value and fall through to the
2221   // CAS-based locking scheme. Note that if our CAS fails, it means
2222   // that another thread raced us for the privilege of revoking the
2223   // bias of this particular object, so it's okay to continue in the
2224   // normal locking code.
2225   load_klass(temp_reg, obj_reg);
2226   ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
2227   andi(temp2_reg, mark_reg, markWord::age_mask_in_place);
2228   orr(temp_reg, temp_reg, temp2_reg);
2229 
2230   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2231 
2232   // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg).
2233   cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
2234                  /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
2235                  /*where=*/obj_reg,
2236                  MacroAssembler::MemBarAcq,
2237                  MacroAssembler::cmpxchgx_hint_acquire_lock());
2238 
2239   // reload markWord in mark_reg before continuing with lightweight locking
2240   ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg);
2241 
2242   // Fall through to the normal CAS-based lock, because no matter what
2243   // the result of the above CAS, some thread must have succeeded in
2244   // removing the bias bit from the object's header.
2245   if (PrintBiasedLockingStatistics) {
2246     Label l;
2247     bne(cr_reg, l);
2248     load_const(temp2_reg, (address) BiasedLocking::revoked_lock_entry_count_addr(), temp_reg);
2249     lwzx(temp_reg, temp2_reg);
2250     addi(temp_reg, temp_reg, 1);
2251     stwx(temp_reg, temp2_reg);
2252     bind(l);
2253   }
2254 
2255   bind(cas_label);
2256 }
2257 
2258 void MacroAssembler::biased_locking_exit (ConditionRegister cr_reg, Register mark_addr, Register temp_reg, Label& done) {
2259   // Check for biased locking unlock case, which is a no-op
2260   // Note: we do not have to check the thread ID for two reasons.
2261   // First, the interpreter checks for IllegalMonitorStateException at
2262   // a higher level. Second, if the bias was revoked while we held the
2263   // lock, the object could not be rebiased toward another thread, so
2264   // the bias bit would be clear.
2265 
2266   ld(temp_reg, 0, mark_addr);
2267   andi(temp_reg, temp_reg, markWord::biased_lock_mask_in_place);
2268 
2269   cmpwi(cr_reg, temp_reg, markWord::biased_lock_pattern);
2270   beq(cr_reg, done);
2271 }
2272 
2273 // allocation (for C1)
2274 void MacroAssembler::eden_allocate(
2275   Register obj,                      // result: pointer to object after successful allocation
2276   Register var_size_in_bytes,        // object size in bytes if unknown at compile time; invalid otherwise
2277   int      con_size_in_bytes,        // object size in bytes if   known at compile time
2278   Register t1,                       // temp register
2279   Register t2,                       // temp register
2280   Label&   slow_case                 // continuation point if fast allocation fails
2281 ) {
2282   b(slow_case);
2283 }
2284 
2285 void MacroAssembler::tlab_allocate(
2286   Register obj,                      // result: pointer to object after successful allocation
2287   Register var_size_in_bytes,        // object size in bytes if unknown at compile time; invalid otherwise
2288   int      con_size_in_bytes,        // object size in bytes if   known at compile time
2289   Register t1,                       // temp register
2290   Label&   slow_case                 // continuation point if fast allocation fails
2291 ) {
2292   // make sure arguments make sense
2293   assert_different_registers(obj, var_size_in_bytes, t1);
2294   assert(0 <= con_size_in_bytes && is_simm16(con_size_in_bytes), "illegal object size");
2295   assert((con_size_in_bytes & MinObjAlignmentInBytesMask) == 0, "object size is not multiple of alignment");
2296 
2297   const Register new_top = t1;
2298   //verify_tlab(); not implemented
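  // Bump-pointer allocation within the TLAB:
  //   obj = tlab_top; new_top = obj + size;
  //   if (new_top > tlab_end) goto slow_case; else tlab_top = new_top;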
2299 
2300   ld(obj, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
2301   ld(R0, in_bytes(JavaThread::tlab_end_offset()), R16_thread);
2302   if (var_size_in_bytes == noreg) {
2303     addi(new_top, obj, con_size_in_bytes);
2304   } else {
2305     add(new_top, obj, var_size_in_bytes);
2306   }
2307   cmpld(CCR0, new_top, R0);
2308   bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_case);
2309 
2310 #ifdef ASSERT
2311   // make sure new free pointer is properly aligned
2312   {
2313     Label L;
2314     andi_(R0, new_top, MinObjAlignmentInBytesMask);
2315     beq(CCR0, L);
2316     stop("updated TLAB free is not properly aligned");
2317     bind(L);
2318   }
2319 #endif // ASSERT
2320 
2321   // update the tlab top pointer
2322   std(new_top, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
2323   //verify_tlab(); not implemented
}

void MacroAssembler::incr_allocated_bytes(RegisterOrConstant size_in_bytes, Register t1, Register t2) {
2326   unimplemented("incr_allocated_bytes");
2327 }
2328 
2329 address MacroAssembler::emit_trampoline_stub(int destination_toc_offset,
2330                                              int insts_call_instruction_offset, Register Rtoc) {
2331   // Start the stub.
2332   address stub = start_a_stub(64);
2333   if (stub == NULL) { return NULL; } // CodeCache full: bail out
2334 
2335   // Create a trampoline stub relocation which relates this trampoline stub
2336   // with the call instruction at insts_call_instruction_offset in the
2337   // instructions code-section.
2338   relocate(trampoline_stub_Relocation::spec(code()->insts()->start() + insts_call_instruction_offset));
2339   const int stub_start_offset = offset();
2340 
2341   // For java_to_interp stubs we use R11_scratch1 as scratch register
2342   // and in call trampoline stubs we use R12_scratch2. This way we
2343   // can distinguish them (see is_NativeCallTrampolineStub_at()).
2344   Register reg_scratch = R12_scratch2;
2345 
2346   // Now, create the trampoline stub's code:
2347   // - load the TOC
2348   // - load the call target from the constant pool
2349   // - call
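  // Roughly (when the TOC must be computed here):
  //   R12 = address of the current method's TOC
  //   R12 = [R12 + destination_toc_offset]   // call target
  //   mtctr R12
  //   bctr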
2350   if (Rtoc == noreg) {
2351     calculate_address_from_global_toc(reg_scratch, method_toc());
2352     Rtoc = reg_scratch;
2353   }
2354 
2355   ld_largeoffset_unchecked(reg_scratch, destination_toc_offset, Rtoc, false);
2356   mtctr(reg_scratch);
2357   bctr();
2358 
2359   const address stub_start_addr = addr_at(stub_start_offset);
2360 
2361   // Assert that the encoded destination_toc_offset can be identified and that it is correct.
2362   assert(destination_toc_offset == NativeCallTrampolineStub_at(stub_start_addr)->destination_toc_offset(),
2363          "encoded offset into the constant pool must match");
2364   // Trampoline_stub_size should be good.
2365   assert((uint)(offset() - stub_start_offset) <= trampoline_stub_size, "should be good size");
2366   assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");
2367 
2368   // End the stub.
2369   end_a_stub();
2370   return stub;
2371 }
2372 
2373 // TM on PPC64.
2374 void MacroAssembler::atomic_inc_ptr(Register addr, Register result, int simm16) {
2375   Label retry;
2376   bind(retry);
2377   ldarx(result, addr, /*hint*/ false);
2378   addi(result, result, simm16);
2379   stdcx_(result, addr);
2380   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
2381     bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
2382   } else {
2383     bne(                  CCR0, retry); // stXcx_ sets CCR0
2384   }
2385 }
2386 
2387 void MacroAssembler::atomic_ori_int(Register addr, Register result, int uimm16) {
2388   Label retry;
2389   bind(retry);
2390   lwarx(result, addr, /*hint*/ false);
2391   ori(result, result, uimm16);
2392   stwcx_(result, addr);
2393   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
2394     bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
2395   } else {
2396     bne(                  CCR0, retry); // stXcx_ sets CCR0
2397   }
2398 }
2399 
2400 #if INCLUDE_RTM_OPT
2401 
2402 // Update rtm_counters based on abort status
2403 // input: abort_status
2404 //        rtm_counters_Reg (RTMLockingCounters*)
2405 void MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters_Reg) {
2406   // Mapping to keep PreciseRTMLockingStatistics similar to x86.
2407   // x86 ppc (! means inverted, ? means not the same)
2408   //  0   31  Set if abort caused by XABORT instruction.
2409   //  1  ! 7  If set, the transaction may succeed on a retry. This bit is always clear if bit 0 is set.
2410   //  2   13  Set if another logical processor conflicted with a memory address that was part of the transaction that aborted.
2411   //  3   10  Set if an internal buffer overflowed.
2412   //  4  ?12  Set if a debug breakpoint was hit.
2413   //  5  ?32  Set if an abort occurred during execution of a nested transaction.
2414   const int failure_bit[] = {tm_tabort, // Signal handler will set this too.
2415                              tm_failure_persistent,
2416                              tm_non_trans_cf,
2417                              tm_trans_cf,
2418                              tm_footprint_of,
2419                              tm_failure_code,
2420                              tm_transaction_level};
2421 
2422   const int num_failure_bits = sizeof(failure_bit) / sizeof(int);
2423   const int num_counters = RTMLockingCounters::ABORT_STATUS_LIMIT;
2424 
2425   const int bit2counter_map[][num_counters] =
2426   // 0 = no map; 1 = mapped, no inverted logic; -1 = mapped, inverted logic
2427   // Inverted logic means that if a bit is set don't count it, or vice-versa.
2428   // Care must be taken when mapping bits to counters as bits for a given
2429   // counter must be mutually exclusive. Otherwise, the counter will be
2430   // incremented more than once.
2431   // counters:
2432   // 0        1        2         3         4         5
2433   // abort  , persist, conflict, overflow, debug   , nested         bits:
2434   {{ 1      , 0      , 0       , 0       , 0       , 0      },   // abort
2435    { 0      , -1     , 0       , 0       , 0       , 0      },   // failure_persistent
2436    { 0      , 0      , 1       , 0       , 0       , 0      },   // non_trans_cf
2437    { 0      , 0      , 1       , 0       , 0       , 0      },   // trans_cf
2438    { 0      , 0      , 0       , 1       , 0       , 0      },   // footprint_of
2439    { 0      , 0      , 0       , 0       , -1      , 0      },   // failure_code = 0xD4
2440    { 0      , 0      , 0       , 0       , 0       , 1      }};  // transaction_level > 1
2441   // ...
2442 
2443   // Move abort_status value to R0 and use abort_status register as a
2444   // temporary register because R0 as third operand in ld/std is treated
2445   // as base address zero (value). Likewise, R0 as second operand in addi
2446   // is problematic because it amounts to li.
2447   const Register temp_Reg = abort_status;
2448   const Register abort_status_R0 = R0;
2449   mr(abort_status_R0, abort_status);
2450 
2451   // Increment total abort counter.
2452   int counters_offs = RTMLockingCounters::abort_count_offset();
2453   ld(temp_Reg, counters_offs, rtm_counters_Reg);
2454   addi(temp_Reg, temp_Reg, 1);
2455   std(temp_Reg, counters_offs, rtm_counters_Reg);
2456 
2457   // Increment specific abort counters.
2458   if (PrintPreciseRTMLockingStatistics) {
2459 
2460     // #0 counter offset.
2461     int abortX_offs = RTMLockingCounters::abortX_count_offset();
2462 
2463     for (int nbit = 0; nbit < num_failure_bits; nbit++) {
2464       for (int ncounter = 0; ncounter < num_counters; ncounter++) {
2465         if (bit2counter_map[nbit][ncounter] != 0) {
2466           Label check_abort;
2467           int abort_counter_offs = abortX_offs + (ncounter << 3);
2468 
2469           if (failure_bit[nbit] == tm_transaction_level) {
2470             // Don't check outer transaction, TL = 1 (bit 63). Hence only
2471             // 11 bits in the TL field are checked to find out if failure
            // occurred in a nested transaction. This check also matches
2473             // the case when nesting_of = 1 (nesting overflow).
2474             rldicr_(temp_Reg, abort_status_R0, failure_bit[nbit], 10);
2475           } else if (failure_bit[nbit] == tm_failure_code) {
2476             // Check failure code for trap or illegal caught in TM.
2477             // Bits 0:7 are tested as bit 7 (persistent) is copied from
2478             // tabort or treclaim source operand.
2479             // On Linux: trap or illegal is TM_CAUSE_SIGNAL (0xD4).
2480             rldicl(temp_Reg, abort_status_R0, 8, 56);
2481             cmpdi(CCR0, temp_Reg, 0xD4);
2482           } else {
2483             rldicr_(temp_Reg, abort_status_R0, failure_bit[nbit], 0);
2484           }
2485 
2486           if (bit2counter_map[nbit][ncounter] == 1) {
2487             beq(CCR0, check_abort);
2488           } else {
2489             bne(CCR0, check_abort);
2490           }
2491 
2492           // We don't increment atomically.
2493           ld(temp_Reg, abort_counter_offs, rtm_counters_Reg);
2494           addi(temp_Reg, temp_Reg, 1);
2495           std(temp_Reg, abort_counter_offs, rtm_counters_Reg);
2496 
2497           bind(check_abort);
2498         }
2499       }
2500     }
2501   }
2502   // Restore abort_status.
2503   mr(abort_status, abort_status_R0);
2504 }
2505 
2506 // Branch if (random & (count-1) != 0), count is 2^n
2507 // tmp and CR0 are killed
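// (mftb reads the time base register; its fast-changing low-order bits serve
// as a cheap pseudo-random value here.)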
2508 void MacroAssembler::branch_on_random_using_tb(Register tmp, int count, Label& brLabel) {
2509   mftb(tmp);
2510   andi_(tmp, tmp, count-1);
2511   bne(CCR0, brLabel);
2512 }
2513 
2514 // Perform abort ratio calculation, set no_rtm bit if high ratio.
2515 // input:  rtm_counters_Reg (RTMLockingCounters* address) - KILLED
2516 void MacroAssembler::rtm_abort_ratio_calculation(Register rtm_counters_Reg,
2517                                                  RTMLockingCounters* rtm_counters,
2518                                                  Metadata* method_data) {
2519   Label L_done, L_check_always_rtm1, L_check_always_rtm2;
2520 
2521   if (RTMLockingCalculationDelay > 0) {
2522     // Delay calculation.
2523     ld(rtm_counters_Reg, (RegisterOrConstant)(intptr_t)RTMLockingCounters::rtm_calculation_flag_addr());
2524     cmpdi(CCR0, rtm_counters_Reg, 0);
2525     beq(CCR0, L_done);
2526     load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload
2527   }
2528   // Abort ratio calculation only if abort_count > RTMAbortThreshold.
2529   //   Aborted transactions = abort_count * 100
2530   //   All transactions = total_count *  RTMTotalCountIncrRate
2531   //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
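  //   e.g. with RTMTotalCountIncrRate = 64 and RTMAbortRatio = 50:
  //   abort_count = 3200 and total_count = 100 give 3200 * 100 >= (100 * 64) * 50,
  //   so the no_rtm bit gets set.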
2532   ld(R0, RTMLockingCounters::abort_count_offset(), rtm_counters_Reg);
2533   if (is_simm(RTMAbortThreshold, 16)) {   // cmpdi can handle 16bit immediate only.
2534     cmpdi(CCR0, R0, RTMAbortThreshold);
2535     blt(CCR0, L_check_always_rtm2);  // reload of rtm_counters_Reg not necessary
2536   } else {
2537     load_const_optimized(rtm_counters_Reg, RTMAbortThreshold);
2538     cmpd(CCR0, R0, rtm_counters_Reg);
2539     blt(CCR0, L_check_always_rtm1);  // reload of rtm_counters_Reg required
2540   }
2541   mulli(R0, R0, 100);
2542 
2543   const Register tmpReg = rtm_counters_Reg;
2544   ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg);
2545   mulli(tmpReg, tmpReg, RTMTotalCountIncrRate); // allowable range: int16
2546   mulli(tmpReg, tmpReg, RTMAbortRatio);         // allowable range: int16
2547   cmpd(CCR0, R0, tmpReg);
2548   blt(CCR0, L_check_always_rtm1); // jump to reload
2549   if (method_data != NULL) {
2550     // Set rtm_state to "no rtm" in MDO.
2551     // Not using a metadata relocation. Method and Class Loader are kept alive anyway.
2552     // (See nmethod::metadata_do and CodeBuffer::finalize_oop_references.)
2553     load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg);
2554     atomic_ori_int(R0, tmpReg, NoRTM);
2555   }
2556   b(L_done);
2557 
2558   bind(L_check_always_rtm1);
2559   load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload
2560   bind(L_check_always_rtm2);
2561   ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg);
2562   int64_t thresholdValue = RTMLockingThreshold / RTMTotalCountIncrRate;
2563   if (is_simm(thresholdValue, 16)) {   // cmpdi can handle 16bit immediate only.
2564     cmpdi(CCR0, tmpReg, thresholdValue);
2565   } else {
2566     load_const_optimized(R0, thresholdValue);
2567     cmpd(CCR0, tmpReg, R0);
2568   }
2569   blt(CCR0, L_done);
2570   if (method_data != NULL) {
2571     // Set rtm_state to "always rtm" in MDO.
2572     // Not using a metadata relocation. See above.
2573     load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg);
2574     atomic_ori_int(R0, tmpReg, UseRTM);
2575   }
2576   bind(L_done);
2577 }
2578 
2579 // Update counters and perform abort ratio calculation.
2580 // input: abort_status_Reg
2581 void MacroAssembler::rtm_profiling(Register abort_status_Reg, Register temp_Reg,
2582                                    RTMLockingCounters* rtm_counters,
2583                                    Metadata* method_data,
2584                                    bool profile_rtm) {
2585 
2586   assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2587   // Update rtm counters based on state at abort.
2588   // Reads abort_status_Reg, updates flags.
2589   assert_different_registers(abort_status_Reg, temp_Reg);
2590   load_const_optimized(temp_Reg, (address)rtm_counters, R0);
2591   rtm_counters_update(abort_status_Reg, temp_Reg);
2592   if (profile_rtm) {
2593     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2594     rtm_abort_ratio_calculation(temp_Reg, rtm_counters, method_data);
2595   }
2596 }
2597 
2598 // Retry on abort if abort's status indicates non-persistent failure.
2599 // inputs: retry_count_Reg
2600 //       : abort_status_Reg
2601 // output: retry_count_Reg decremented by 1
2602 void MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg,
2603                                              Label& retryLabel, Label* checkRetry) {
2604   Label doneRetry;
2605 
2606   // Don't retry if failure is persistent.
2607   // The persistent bit is set when a (A) Disallowed operation is performed in
2608   // transactional state, like for instance trying to write the TFHAR after a
2609   // transaction is started; or when there is (B) a Nesting Overflow (too many
2610   // nested transactions); or when (C) the Footprint overflows (too many
  // addresses touched in TM state so there is no more space in the footprint
2612   // area to track them); or in case of (D) a Self-Induced Conflict, i.e. a
2613   // store is performed to a given address in TM state, then once in suspended
2614   // state the same address is accessed. Failure (A) is very unlikely to occur
2615   // in the JVM. Failure (D) will never occur because Suspended state is never
2616   // used in the JVM. Thus mostly (B) a Nesting Overflow or (C) a Footprint
2617   // Overflow will set the persistent bit.
2618   rldicr_(R0, abort_status_Reg, tm_failure_persistent, 0);
2619   bne(CCR0, doneRetry);
2620 
2621   // Don't retry if transaction was deliberately aborted, i.e. caused by a
2622   // tabort instruction.
2623   rldicr_(R0, abort_status_Reg, tm_tabort, 0);
2624   bne(CCR0, doneRetry);
2625 
2626   // Retry if transaction aborted due to a conflict with another thread.
2627   if (checkRetry) { bind(*checkRetry); }
2628   addic_(retry_count_Reg, retry_count_Reg, -1);
2629   blt(CCR0, doneRetry);
2630   b(retryLabel);
2631   bind(doneRetry);
2632 }
2633 
2634 // Spin and retry if lock is busy.
2635 // inputs: owner_addr_Reg (monitor address)
2636 //       : retry_count_Reg
2637 // output: retry_count_Reg decremented by 1
2638 // CTR is killed
2639 void MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register owner_addr_Reg, Label& retryLabel) {
2640   Label SpinLoop, doneRetry, doRetry;
2641   addic_(retry_count_Reg, retry_count_Reg, -1);
2642   blt(CCR0, doneRetry);
2643 
2644   if (RTMSpinLoopCount > 1) {
2645     li(R0, RTMSpinLoopCount);
2646     mtctr(R0);
2647   }
2648 
2649   // low thread priority
2650   smt_prio_low();
2651   bind(SpinLoop);
2652 
2653   if (RTMSpinLoopCount > 1) {
2654     bdz(doRetry);
2655     ld(R0, 0, owner_addr_Reg);
2656     cmpdi(CCR0, R0, 0);
2657     bne(CCR0, SpinLoop);
2658   }
2659 
2660   bind(doRetry);
2661 
2662   // restore thread priority to default in userspace
2663 #ifdef LINUX
2664   smt_prio_medium_low();
2665 #else
2666   smt_prio_medium();
2667 #endif
2668 
2669   b(retryLabel);
2670 
2671   bind(doneRetry);
2672 }
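
// Conceptual model of the busy-spin above (a sketch, not generated code):
//
//   if (--retry_count < 0) return;                                         // retry budget exhausted
//   set_low_smt_priority();
//   for (int i = 0; i < RTMSpinLoopCount && monitor->owner != NULL; i++) { /* spin */ }
//   restore_smt_priority();
//   goto retry;                                                            // take the transaction again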
2673 
2674 // Use RTM for normal stack locks.
2675 // Input: objReg (object to lock)
2676 void MacroAssembler::rtm_stack_locking(ConditionRegister flag,
2677                                        Register obj, Register mark_word, Register tmp,
2678                                        Register retry_on_abort_count_Reg,
2679                                        RTMLockingCounters* stack_rtm_counters,
2680                                        Metadata* method_data, bool profile_rtm,
2681                                        Label& DONE_LABEL, Label& IsInflated) {
2682   assert(UseRTMForStackLocks, "why call this otherwise?");
2683   assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
2684   Label L_rtm_retry, L_decrement_retry, L_on_abort;
2685 
2686   if (RTMRetryCount > 0) {
2687     load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
2688     bind(L_rtm_retry);
2689   }
2690   andi_(R0, mark_word, markWord::monitor_value);  // inflated vs stack-locked|neutral|biased
2691   bne(CCR0, IsInflated);
2692 
2693   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2694     Label L_noincrement;
2695     if (RTMTotalCountIncrRate > 1) {
2696       branch_on_random_using_tb(tmp, RTMTotalCountIncrRate, L_noincrement);
2697     }
2698     assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
2699     load_const_optimized(tmp, (address)stack_rtm_counters->total_count_addr(), R0);
2700     //atomic_inc_ptr(tmp, /*temp, will be reloaded*/mark_word); Increment is not atomic (statistics only, lost updates are acceptable).
2701     ldx(mark_word, tmp);
2702     addi(mark_word, mark_word, 1);
2703     stdx(mark_word, tmp);
2704     bind(L_noincrement);
2705   }
2706   tbegin_();
2707   beq(CCR0, L_on_abort);
2708   ld(mark_word, oopDesc::mark_offset_in_bytes(), obj);      // Reload in transaction, conflicts need to be tracked.
2709   andi(R0, mark_word, markWord::biased_lock_mask_in_place); // look at 3 lock bits
2710   cmpwi(flag, R0, markWord::unlocked_value);                // bits = 001 unlocked
2711   beq(flag, DONE_LABEL);                                    // all done if unlocked
2712 
2713   if (UseRTMXendForLockBusy) {
2714     tend_();
2715     b(L_decrement_retry);
2716   } else {
2717     tabort_();
2718   }
2719   bind(L_on_abort);
2720   const Register abort_status_Reg = tmp;
2721   mftexasr(abort_status_Reg);
2722   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2723     rtm_profiling(abort_status_Reg, /*temp*/mark_word, stack_rtm_counters, method_data, profile_rtm);
2724   }
2725   ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // reload
2726   if (RTMRetryCount > 0) {
2727     // Retry on lock abort if abort status is not permanent.
2728     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry, &L_decrement_retry);
2729   } else {
2730     bind(L_decrement_retry);
2731   }
2732 }
2733 
2734 // Use RTM for inflated locks.
2735 // inputs: obj       (object to lock)
2736 //         mark_word (current header - KILLED)
2737 //         boxReg    (on-stack box address (displaced header location) - KILLED)
2738 void MacroAssembler::rtm_inflated_locking(ConditionRegister flag,
2739                                           Register obj, Register mark_word, Register boxReg,
2740                                           Register retry_on_busy_count_Reg, Register retry_on_abort_count_Reg,
2741                                           RTMLockingCounters* rtm_counters,
2742                                           Metadata* method_data, bool profile_rtm,
2743                                           Label& DONE_LABEL) {
2744   assert(UseRTMLocking, "why call this otherwise?");
2745   Label L_rtm_retry, L_decrement_retry, L_on_abort;
2746   // Clean monitor_value bit to get valid pointer.
2747   int owner_offset = ObjectMonitor::owner_offset_in_bytes() - markWord::monitor_value;
2748 
2749   // Store non-null, using boxReg instead of (intptr_t)markWord::unused_mark().
2750   std(boxReg, BasicLock::displaced_header_offset_in_bytes(), boxReg);
2751   const Register tmpReg = boxReg;
2752   const Register owner_addr_Reg = mark_word;
2753   addi(owner_addr_Reg, mark_word, owner_offset);
2754 
2755   if (RTMRetryCount > 0) {
2756     load_const_optimized(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy.
2757     load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort.
2758     bind(L_rtm_retry);
2759   }
2760   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2761     Label L_noincrement;
2762     if (RTMTotalCountIncrRate > 1) {
2763       branch_on_random_using_tb(R0, RTMTotalCountIncrRate, L_noincrement);
2764     }
2765     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2766     load_const(R0, (address)rtm_counters->total_count_addr(), tmpReg);
2767     //atomic_inc_ptr(R0, tmpReg); Increment is not atomic (statistics only, lost updates are acceptable).
2768     ldx(tmpReg, R0);
2769     addi(tmpReg, tmpReg, 1);
2770     stdx(tmpReg, R0);
2771     bind(L_noincrement);
2772   }
2773   tbegin_();
2774   beq(CCR0, L_on_abort);
2775   // We don't reload mark word. Will only be reset at safepoint.
2776   ld(R0, 0, owner_addr_Reg); // Load in transaction, conflicts need to be tracked.
2777   cmpdi(flag, R0, 0);
2778   beq(flag, DONE_LABEL);
2779 
2780   if (UseRTMXendForLockBusy) {
2781     tend_();
2782     b(L_decrement_retry);
2783   } else {
2784     tabort_();
2785   }
2786   bind(L_on_abort);
2787   const Register abort_status_Reg = tmpReg;
2788   mftexasr(abort_status_Reg);
2789   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2790     rtm_profiling(abort_status_Reg, /*temp*/ owner_addr_Reg, rtm_counters, method_data, profile_rtm);
2791     // Restore owner_addr_Reg
2792     ld(mark_word, oopDesc::mark_offset_in_bytes(), obj);
2793 #ifdef ASSERT
2794     andi_(R0, mark_word, markWord::monitor_value);
2795     asm_assert_ne("must be inflated"); // Deflating only allowed at safepoint.
2796 #endif
2797     addi(owner_addr_Reg, mark_word, owner_offset);
2798   }
2799   if (RTMRetryCount > 0) {
2800     // Retry on lock abort if abort status is not permanent.
2801     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
2802   }
2803 
2804   // Appears unlocked - try to swing _owner from null to non-null.
2805   cmpxchgd(flag, /*current val*/ R0, (intptr_t)0, /*new val*/ R16_thread, owner_addr_Reg,
2806            MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2807            MacroAssembler::cmpxchgx_hint_acquire_lock(), noreg, &L_decrement_retry, true);
2808 
2809   if (RTMRetryCount > 0) {
2810     // Success: done. Otherwise: retry.
2811     b(DONE_LABEL);
2812     bind(L_decrement_retry);
2813     // Spin and retry if lock is busy.
2814     rtm_retry_lock_on_busy(retry_on_busy_count_Reg, owner_addr_Reg, L_rtm_retry);
2815   } else {
2816     bind(L_decrement_retry);
2817   }
2818 }
2819 
2820 #endif //  INCLUDE_RTM_OPT
2821 
2822 // "The box" is the space on the stack where we copy the object mark.
2823 void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register oop, Register box,
2824                                                Register temp, Register displaced_header, Register current_header,
2825                                                bool try_bias,
2826                                                RTMLockingCounters* rtm_counters,
2827                                                RTMLockingCounters* stack_rtm_counters,
2828                                                Metadata* method_data,
2829                                                bool use_rtm, bool profile_rtm) {
2830   assert_different_registers(oop, box, temp, displaced_header, current_header);
2831   assert(flag != CCR0, "bad condition register");
2832   Label cont;
2833   Label object_has_monitor;
2834   Label cas_failed;
2835 
2836   // Load markWord from object into displaced_header.
2837   ld(displaced_header, oopDesc::mark_offset_in_bytes(), oop);
2838 
2839 
2840   if (try_bias) {
2841     biased_locking_enter(flag, oop, displaced_header, temp, current_header, cont);
2842   }
2843 
2844 #if INCLUDE_RTM_OPT
2845   if (UseRTMForStackLocks && use_rtm) {
2846     rtm_stack_locking(flag, oop, displaced_header, temp, /*temp*/ current_header,
2847                       stack_rtm_counters, method_data, profile_rtm,
2848                       cont, object_has_monitor);
2849   }
2850 #endif // INCLUDE_RTM_OPT
2851 
2852   // Handle existing monitor.
2853   // The object has an existing monitor iff (mark & monitor_value) != 0.
2854   andi_(temp, displaced_header, markWord::monitor_value);
2855   bne(CCR0, object_has_monitor);
2856 
2857   // Set displaced_header to be (markWord of object | UNLOCK_VALUE).
2858   ori(displaced_header, displaced_header, markWord::unlocked_value);
2859 
2860   // Load Compare Value application register.
2861 
2862   // Initialize the box. (Must happen before we update the object mark!)
2863   std(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2864 
2865   // Must fence, otherwise preceding store(s) may float below the cmpxchg.
2866   // Compare the object markWord with displaced_header and, if equal, exchange box into the object markWord.
2867   cmpxchgd(/*flag=*/flag,
2868            /*current_value=*/current_header,
2869            /*compare_value=*/displaced_header,
2870            /*exchange_value=*/box,
2871            /*where=*/oop,
2872            MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2873            MacroAssembler::cmpxchgx_hint_acquire_lock(),
2874            noreg,
2875            &cas_failed,
2876            /*check without membar and ldarx first*/true);
2877   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2878 
2879   // If the compare-and-exchange succeeded, then we found an unlocked
2880   // object and we have now locked it.
2881   b(cont);
2882 
2883   bind(cas_failed);
2884   // We did not see an unlocked object so try the fast recursive case.
2885 
2886   // Check if the owner is self by comparing the value in the markWord of object
2887   // (current_header) with the stack pointer.
2888   sub(current_header, current_header, R1_SP);
2889   load_const_optimized(temp, ~(os::vm_page_size()-1) | markWord::lock_mask_in_place);
2890 
2891   and_(R0/*==0?*/, current_header, temp);
2892   // If the condition is true we are done (flag is EQ) and hence we can store 0 as the
2893   // displaced header in the box, which indicates that it is a recursive lock.
2894   mcrf(flag,CCR0);
2895   std(R0/*==0, perhaps*/, BasicLock::displaced_header_offset_in_bytes(), box);
2896 
2897   // Handle existing monitor.
2898   b(cont);
2899 
2900   bind(object_has_monitor);
2901   // The object's monitor m is unlocked iff m->owner == NULL,
2902   // otherwise m->owner may contain a thread or a stack address.
2903 
2904 #if INCLUDE_RTM_OPT
2905   // Use the same RTM locking code in 32- and 64-bit VM.
2906   if (use_rtm) {
2907     rtm_inflated_locking(flag, oop, displaced_header, box, temp, /*temp*/ current_header,
2908                          rtm_counters, method_data, profile_rtm, cont);
2909   } else {
2910 #endif // INCLUDE_RTM_OPT
2911 
2912   // Try to CAS m->owner from NULL to current thread.
2913   addi(temp, displaced_header, ObjectMonitor::owner_offset_in_bytes()-markWord::monitor_value);
2914   cmpxchgd(/*flag=*/flag,
2915            /*current_value=*/current_header,
2916            /*compare_value=*/(intptr_t)0,
2917            /*exchange_value=*/R16_thread,
2918            /*where=*/temp,
2919            MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2920            MacroAssembler::cmpxchgx_hint_acquire_lock());
2921 
2922   // Store a non-null value into the box.
2923   std(box, BasicLock::displaced_header_offset_in_bytes(), box);
2924 
2925 # ifdef ASSERT
2926   bne(flag, cont);
2927   // We have acquired the monitor, check some invariants.
2928   addi(/*monitor=*/temp, temp, -ObjectMonitor::owner_offset_in_bytes());
2929   // Invariant 1: _recursions should be 0.
2930   //assert(ObjectMonitor::recursions_size_in_bytes() == 8, "unexpected size");
2931   asm_assert_mem8_is_zero(ObjectMonitor::recursions_offset_in_bytes(), temp,
2932                             "monitor->_recursions should be 0");
2933 # endif
2934 
2935 #if INCLUDE_RTM_OPT
2936   } // use_rtm()
2937 #endif
2938 
2939   bind(cont);
2940   // flag == EQ indicates success
2941   // flag == NE indicates failure
2942 }
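
// Hedged C-like sketch of the fast-lock path above (biased locking and RTM
// branches omitted; "box" is the on-stack BasicLock, SP the caller's stack
// pointer). This is a model of the emitted code, not code that is compiled:
//
//   markWord m = obj->mark();
//   if (m & monitor_value) {                                      // already inflated
//     ObjectMonitor* mon = (ObjectMonitor*)(m - monitor_value);
//     box->displaced_header = box;                                // any non-null value
//     return CAS(&mon->owner, NULL, current_thread);              // flag == EQ on success
//   }
//   box->displaced_header = m | unlocked_value;
//   if (CAS(&obj->mark(), m | unlocked_value, box)) return true;  // now stack-locked
//   // CAS failed: recursive case iff the observed mark is an address in our own stack.
//   bool recursive = ((observed_mark - SP) & (~(page_size - 1) | lock_mask)) == 0;
//   if (recursive) box->displaced_header = 0;                     // marks a recursive stack lock
//   return recursive;                                             // otherwise the caller takes the slow path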
2943 
2944 void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register oop, Register box,
2945                                                  Register temp, Register displaced_header, Register current_header,
2946                                                  bool try_bias, bool use_rtm) {
2947   assert_different_registers(oop, box, temp, displaced_header, current_header);
2948   assert(flag != CCR0, "bad condition register");
2949   Label cont;
2950   Label object_has_monitor;
2951 
2952   if (try_bias) {
2953     biased_locking_exit(flag, oop, current_header, cont);
2954   }
2955 
2956 #if INCLUDE_RTM_OPT
2957   if (UseRTMForStackLocks && use_rtm) {
2958     assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
2959     Label L_regular_unlock;
2960     ld(current_header, oopDesc::mark_offset_in_bytes(), oop);      // fetch markword
2961     andi(R0, current_header, markWord::biased_lock_mask_in_place); // look at 3 lock bits
2962     cmpwi(flag, R0, markWord::unlocked_value);                     // bits = 001 unlocked
2963     bne(flag, L_regular_unlock);                                   // else RegularLock
2964     tend_();                                                       // otherwise end...
2965     b(cont);                                                       // ... and we're done
2966     bind(L_regular_unlock);
2967   }
2968 #endif
2969 
2970   // Find the lock address and load the displaced header from the stack.
2971   ld(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2972 
2973   // If the displaced header is 0, we have a recursive unlock.
2974   cmpdi(flag, displaced_header, 0);
2975   beq(flag, cont);
2976 
2977   // Handle existing monitor.
2978   // The object has an existing monitor iff (mark & monitor_value) != 0.
2979   RTM_OPT_ONLY( if (!(UseRTMForStackLocks && use_rtm)) ) // skip load if already done
2980   ld(current_header, oopDesc::mark_offset_in_bytes(), oop);
2981   andi_(R0, current_header, markWord::monitor_value);
2982   bne(CCR0, object_has_monitor);
2983 
2984   // Check if it is still a lightweight lock; this is true if we see
2985   // the stack address of the basicLock in the markWord of the object.
2986   // Cmpxchg sets flag to cmpd(current_header, box).
2987   cmpxchgd(/*flag=*/flag,
2988            /*current_value=*/current_header,
2989            /*compare_value=*/box,
2990            /*exchange_value=*/displaced_header,
2991            /*where=*/oop,
2992            MacroAssembler::MemBarRel,
2993            MacroAssembler::cmpxchgx_hint_release_lock(),
2994            noreg,
2995            &cont);
2996 
2997   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2998 
2999   // Handle existing monitor.
3000   b(cont);
3001 
3002   bind(object_has_monitor);
3003   STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
3004   addi(current_header, current_header, -(int)markWord::monitor_value); // monitor
3005   ld(temp,             ObjectMonitor::owner_offset_in_bytes(), current_header);
3006 
3007   // It's inflated.
3008 #if INCLUDE_RTM_OPT
3009   if (use_rtm) {
3010     Label L_regular_inflated_unlock;
3011     // Clean monitor_value bit to get valid pointer
3012     cmpdi(flag, temp, 0);
3013     bne(flag, L_regular_inflated_unlock);
3014     tend_();
3015     b(cont);
3016     bind(L_regular_inflated_unlock);
3017   }
3018 #endif
3019 
3020   ld(displaced_header, ObjectMonitor::recursions_offset_in_bytes(), current_header);
3021   xorr(temp, R16_thread, temp);      // Will be 0 if we are the owner.
3022   orr(temp, temp, displaced_header); // Will be 0 if there are 0 recursions.
3023   cmpdi(flag, temp, 0);
3024   bne(flag, cont);
3025 
3026   ld(temp,             ObjectMonitor::EntryList_offset_in_bytes(), current_header);
3027   ld(displaced_header, ObjectMonitor::cxq_offset_in_bytes(), current_header);
3028   orr(temp, temp, displaced_header); // Will be 0 if both are 0.
3029   cmpdi(flag, temp, 0);
3030   bne(flag, cont);
3031   release();
3032   std(temp, ObjectMonitor::owner_offset_in_bytes(), current_header);
3033 
3034   bind(cont);
3035   // flag == EQ indicates success
3036   // flag == NE indicates failure
3037 }
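
// Hedged C-like sketch of the fast-unlock path above (biased locking and RTM
// branches omitted); a model of the emitted code, not compiled code:
//
//   if (box->displaced_header == 0) return true;                  // recursive stack unlock
//   markWord m = obj->mark();
//   if (!(m & monitor_value)) {
//     return CAS(&obj->mark(), box, box->displaced_header);       // restore the displaced header
//   }
//   ObjectMonitor* mon = (ObjectMonitor*)(m - monitor_value);
//   if (mon->owner != current_thread || mon->recursions != 0) return false;
//   if (mon->EntryList != NULL || mon->cxq != NULL) return false;
//   release_store(&mon->owner, NULL);                             // hand the monitor back
//   return true;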
3038 
3039 void MacroAssembler::safepoint_poll(Label& slow_path, Register temp_reg) {
3040   ld(temp_reg, in_bytes(Thread::polling_page_offset()), R16_thread);
3041   // Armed page has poll_bit set.
3042   andi_(temp_reg, temp_reg, SafepointMechanism::poll_bit());
3043   bne(CCR0, slow_path);
3044 }
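
// Sketch of the check above (names as in the code): the VM arms a thread by
// setting the poll bit in its per-thread polling word.
//
//   bool armed = (thread->polling_page & SafepointMechanism::poll_bit()) != 0;
//   if (armed) goto slow_path;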
3045 
3046 void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2, bool needs_frame) {
3047   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3048   bs->resolve_jobject(this, value, tmp1, tmp2, needs_frame);
3049 }
3050 
3051 // Values for last_Java_pc, and last_Java_sp must comply to the rules
3052 // in frame_ppc.hpp.
3053 void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) {
3054   // Always set last_Java_pc and flags first because once last_Java_sp
3055   // is visible, has_last_Java_frame is true and users will look at the
3056   // rest of the fields. (Note: the flags should already be zero before we
3057   // get here, so they don't need to be set.)
3058 
3059   // Verify that last_Java_pc was zeroed on return to Java
3060   asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread,
3061                           "last_Java_pc not zeroed before leaving Java");
3062 
3063   // When returning from calling out from Java mode the frame anchor's
3064   // last_Java_pc will always be set to NULL. It is set here so that
3065   // if we are doing a call to native (not VM) that we capture the
3066   // known pc and don't have to rely on the native call having a
3067   // standard frame linkage where we can find the pc.
3068   if (last_Java_pc != noreg)
3069     std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
3070 
3071   // Set last_Java_sp last.
3072   std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
3073 }
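
// Publication-order sketch (not generated code): the pc (and flags, already
// zero) is written before the sp, because observers treat a non-zero
// last_Java_sp as has_last_Java_frame:
//
//   anchor->last_Java_pc = pc;   // optional, may stay NULL
//   anchor->last_Java_sp = sp;   // written last: this publishes the frame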
3074 
3075 void MacroAssembler::reset_last_Java_frame(void) {
3076   asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()),
3077                              R16_thread, "SP was not set, still zero");
3078 
3079   BLOCK_COMMENT("reset_last_Java_frame {");
3080   li(R0, 0);
3081 
3082   // _last_Java_sp = 0
3083   std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
3084 
3085   // _last_Java_pc = 0
3086   std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
3087   BLOCK_COMMENT("} reset_last_Java_frame");
3088 }
3089 
3090 void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1) {
3091   assert_different_registers(sp, tmp1);
3092 
3093   // sp points to a TOP_IJAVA_FRAME, retrieve frame's PC via
3094   // TOP_IJAVA_FRAME_ABI.
3095   // FIXME: assert that we really have a TOP_IJAVA_FRAME here!
3096   address entry = pc();
3097   load_const_optimized(tmp1, entry);
3098 
3099   set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1);
3100 }
3101 
3102 void MacroAssembler::get_vm_result(Register oop_result) {
3103   // Read:
3104   //   R16_thread
3105   //   R16_thread->in_bytes(JavaThread::vm_result_offset())
3106   //
3107   // Updated:
3108   //   oop_result
3109   //   R16_thread->in_bytes(JavaThread::vm_result_offset())
3110 
3111   verify_thread();
3112 
3113   ld(oop_result, in_bytes(JavaThread::vm_result_offset()), R16_thread);
3114   li(R0, 0);
3115   std(R0, in_bytes(JavaThread::vm_result_offset()), R16_thread);
3116 
3117   verify_oop(oop_result, FILE_AND_LINE);
3118 }
3119 
3120 void MacroAssembler::get_vm_result_2(Register metadata_result) {
3121   // Read:
3122   //   R16_thread
3123   //   R16_thread->in_bytes(JavaThread::vm_result_2_offset())
3124   //
3125   // Updated:
3126   //   metadata_result
3127   //   R16_thread->in_bytes(JavaThread::vm_result_2_offset())
3128 
3129   ld(metadata_result, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
3130   li(R0, 0);
3131   std(R0, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
3132 }
3133 
3134 Register MacroAssembler::encode_klass_not_null(Register dst, Register src) {
3135   Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided.
3136   if (CompressedKlassPointers::base() != 0) {
3137     // Use dst as temp if it is free.
3138     sub_const_optimized(dst, current, CompressedKlassPointers::base(), R0);
3139     current = dst;
3140   }
3141   if (CompressedKlassPointers::shift() != 0) {
3142     srdi(dst, current, CompressedKlassPointers::shift());
3143     current = dst;
3144   }
3145   return current;
3146 }
3147 
3148 void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) {
3149   if (UseCompressedClassPointers) {
3150     Register compressedKlass = encode_klass_not_null(ck, klass);
3151     stw(compressedKlass, oopDesc::klass_offset_in_bytes(), dst_oop);
3152   } else {
3153     std(klass, oopDesc::klass_offset_in_bytes(), dst_oop);
3154   }
3155 }
3156 
3157 void MacroAssembler::store_klass_gap(Register dst_oop, Register val) {
3158   if (UseCompressedClassPointers) {
3159     if (val == noreg) {
3160       val = R0;
3161       li(val, 0);
3162     }
3163     stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop); // klass gap if compressed
3164   }
3165 }
3166 
3167 int MacroAssembler::instr_size_for_decode_klass_not_null() {
3168   if (!UseCompressedClassPointers) return 0;
3169   int num_instrs = 1;  // shift or move
3170   if (CompressedKlassPointers::base() != 0) num_instrs = 7;  // shift + load const (up to 5 instructions) + add
3171   return num_instrs * BytesPerInstWord;
3172 }
3173 
3174 void MacroAssembler::decode_klass_not_null(Register dst, Register src) {
3175   assert(dst != R0, "Dst reg may not be R0, as R0 is used here.");
3176   if (src == noreg) src = dst;
3177   Register shifted_src = src;
3178   if (CompressedKlassPointers::shift() != 0 ||
3179       (CompressedKlassPointers::base() == 0 && src != dst)) {  // Move required.
3180     shifted_src = dst;
3181     sldi(shifted_src, src, CompressedKlassPointers::shift());
3182   }
3183   if (CompressedKlassPointers::base() != 0) {
3184     add_const_optimized(dst, shifted_src, CompressedKlassPointers::base(), R0);
3185   }
3186 }
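
// The encode/decode pair above implements the usual compressed-class-pointer
// mapping; a sketch with hypothetical plain-C names:
//
//   uint32_t  encode(uintptr_t klass)  { return (uint32_t)((klass - base) >> shift); }
//   uintptr_t decode(uint32_t narrow)  { return base + ((uintptr_t)narrow << shift); }
//
// where base = CompressedKlassPointers::base() and shift = CompressedKlassPointers::shift().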
3187 
3188 void MacroAssembler::load_klass(Register dst, Register src) {
3189   if (UseCompressedClassPointers) {
3190     lwz(dst, oopDesc::klass_offset_in_bytes(), src);
3191     // Attention: no null check here!
3192     decode_klass_not_null(dst, dst);
3193   } else {
3194     ld(dst, oopDesc::klass_offset_in_bytes(), src);
3195   }
3196 }
3197 
3198 // ((OopHandle)result).resolve();
3199 void MacroAssembler::resolve_oop_handle(Register result) {
3200   // OopHandle::resolve is an indirection.
3201   ld(result, 0, result);
3202 }
3203 
3204 void MacroAssembler::load_mirror_from_const_method(Register mirror, Register const_method) {
3205   ld(mirror, in_bytes(ConstMethod::constants_offset()), const_method);
3206   ld(mirror, ConstantPool::pool_holder_offset_in_bytes(), mirror);
3207   ld(mirror, in_bytes(Klass::java_mirror_offset()), mirror);
3208   resolve_oop_handle(mirror);
3209 }
3210 
3211 void MacroAssembler::load_method_holder(Register holder, Register method) {
3212   ld(holder, in_bytes(Method::const_offset()), method);
3213   ld(holder, in_bytes(ConstMethod::constants_offset()), holder);
3214   ld(holder, ConstantPool::pool_holder_offset_in_bytes(), holder);
3215 }
3216 
3217 // Clear Array
3218 // For very short arrays. tmp == R0 is allowed.
3219 void MacroAssembler::clear_memory_unrolled(Register base_ptr, int cnt_dwords, Register tmp, int offset) {
3220   if (cnt_dwords > 0) { li(tmp, 0); }
3221   for (int i = 0; i < cnt_dwords; ++i) { std(tmp, offset + i * 8, base_ptr); }
3222 }
3223 
3224 // Version for constant short array length. Kills base_ptr. tmp == R0 is allowed.
3225 void MacroAssembler::clear_memory_constlen(Register base_ptr, int cnt_dwords, Register tmp) {
3226   if (cnt_dwords < 8) {
3227     clear_memory_unrolled(base_ptr, cnt_dwords, tmp);
3228     return;
3229   }
3230 
3231   Label loop;
3232   const long loopcnt   = cnt_dwords >> 1,
3233              remainder = cnt_dwords & 1;
3234 
3235   li(tmp, loopcnt);
3236   mtctr(tmp);
3237   li(tmp, 0);
3238   bind(loop);
3239     std(tmp, 0, base_ptr);
3240     std(tmp, 8, base_ptr);
3241     addi(base_ptr, base_ptr, 16);
3242     bdnz(loop);
3243   if (remainder) { std(tmp, 0, base_ptr); }
3244 }
3245 
3246 // Kills both input registers. tmp == R0 is allowed.
3247 void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp, long const_cnt) {
3248   // Procedure for large arrays (uses data cache block zero instruction).
3249     Label startloop, fast, fastloop, small_rest, restloop, done;
3250     const int cl_size         = VM_Version::L1_data_cache_line_size(),
3251               cl_dwords       = cl_size >> 3,
3252               cl_dw_addr_bits = exact_log2(cl_dwords),
3253               dcbz_min        = 1,  // Min count of dcbz executions, needs to be >0.
3254               min_cnt         = ((dcbz_min + 1) << cl_dw_addr_bits) - 1;
3255 
3256   if (const_cnt >= 0) {
3257     // Constant case.
3258     if (const_cnt < min_cnt) {
3259       clear_memory_constlen(base_ptr, const_cnt, tmp);
3260       return;
3261     }
3262     load_const_optimized(cnt_dwords, const_cnt, tmp);
3263   } else {
3264     // cnt_dwords already loaded in register. Need to check size.
3265     cmpdi(CCR1, cnt_dwords, min_cnt); // Big enough? (ensure >= dcbz_min lines included).
3266     blt(CCR1, small_rest);
3267   }
3268     rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits); // Extract dword offset within first cache line.
3269     beq(CCR0, fast);                                  // Already 128byte aligned.
3270 
3271     subfic(tmp, tmp, cl_dwords);
3272     mtctr(tmp);                        // Set ctr to hit 128byte boundary (0<ctr<cl_dwords).
3273     subf(cnt_dwords, tmp, cnt_dwords); // rest.
3274     li(tmp, 0);
3275 
3276   bind(startloop);                     // Clear at the beginning to reach 128byte boundary.
3277     std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
3278     addi(base_ptr, base_ptr, 8);
3279     bdnz(startloop);
3280 
3281   bind(fast);                                  // Clear 128byte blocks.
3282     srdi(tmp, cnt_dwords, cl_dw_addr_bits);    // Loop count for 128byte loop (>0).
3283     andi(cnt_dwords, cnt_dwords, cl_dwords-1); // Rest in dwords.
3284     mtctr(tmp);                                // Load counter.
3285 
3286   bind(fastloop);
3287     dcbz(base_ptr);                    // Clear 128byte aligned block.
3288     addi(base_ptr, base_ptr, cl_size);
3289     bdnz(fastloop);
3290 
3291   bind(small_rest);
3292     cmpdi(CCR0, cnt_dwords, 0);        // size 0?
3293     beq(CCR0, done);                   // rest == 0
3294     li(tmp, 0);
3295     mtctr(cnt_dwords);                 // Load counter.
3296 
3297   bind(restloop);                      // Clear rest.
3298     std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
3299     addi(base_ptr, base_ptr, 8);
3300     bdnz(restloop);
3301 
3302   bind(done);
3303 }
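
// Phase sketch of the routine above, assuming cl_dwords doublewords per
// L1 data cache line (dcbz zeroes one whole line at a time):
//
//   while (!cache_line_aligned(p)) { *p++ = 0; n--; }                            // startloop: reach line boundary
//   while (n >= cl_dwords)         { dcbz(p); p += cl_dwords; n -= cl_dwords; }  // fastloop: one line per iteration
//   while (n-- > 0)                { *p++ = 0; }                                 // restloop: tail in dwords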
3304 
3305 /////////////////////////////////////////// String intrinsics ////////////////////////////////////////////
3306 
3307 // Helpers for Intrinsic Emitters
3308 //
3309 // Revert the byte order of a 32bit value in a register
3310 //   src: 0x44556677
3311 //   dst: 0x77665544
3312 // Three steps to obtain the result:
3313 //  1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word
3314 //     into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared.
3315 //     This value initializes dst.
3316 //  2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost
3317 //     byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go.
3318 //     This value is mask inserted into dst with a [0..23] mask of 1s.
3319 //  3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position.
3320 //     This value is mask inserted into dst with a [8..15] mask of 1s.
3321 void MacroAssembler::load_reverse_32(Register dst, Register src) {
3322   assert_different_registers(dst, src);
3323 
3324   rldicl(dst, src, (4+1)*8, 56);       // Rotate byte 4 into position 7 (rightmost), clear all to the left.
3325   rlwimi(dst, src,     3*8,  0, 23);   // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone.
3326   rlwimi(dst, src,     1*8,  8, 15);   // Insert byte 6 into position 5, leave the rest alone.
3327 }
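
// C model of the three steps above, worked for src = 0x44556677 (illustration
// only; rotl32 is a helper defined just for the sketch):
//
//   static inline uint32_t rotl32(uint32_t v, int n) { return (v << n) | (v >> (32 - n)); }
//
//   uint32_t dst =  src >> 24;                                     // 1) 0x00000044
//   dst = (dst &  0x000000ff) | (rotl32(src, 24) & 0xffffff00);    // 2) 0x77445544
//   dst = (dst & ~0x00ff0000) | (rotl32(src,  8) & 0x00ff0000);    // 3) 0x77665544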
3328 
3329 // Calculate the column addresses of the crc32 lookup table into distinct registers.
3330 // This loop-invariant calculation is moved out of the loop body, reducing the loop
3331 // body size from 20 to 16 instructions.
3332 // Returns the offset that was used to calculate the address of column tc3.
3333 // Due to register shortage, setting tc3 may overwrite table. With the return offset
3334 // at hand, the original table address can be easily reconstructed.
3335 int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) {
3336   assert(!VM_Version::has_vpmsumb(), "Vector version should be used instead!");
3337 
3338   // Point to 4 byte folding tables (byte-reversed version for Big Endian)
3339   // Layout: See StubRoutines::generate_crc_constants.
3340 #ifdef VM_LITTLE_ENDIAN
3341   const int ix0 = 3 * CRC32_TABLE_SIZE;
3342   const int ix1 = 2 * CRC32_TABLE_SIZE;
3343   const int ix2 = 1 * CRC32_TABLE_SIZE;
3344   const int ix3 = 0 * CRC32_TABLE_SIZE;
3345 #else
3346   const int ix0 = 1 * CRC32_TABLE_SIZE;
3347   const int ix1 = 2 * CRC32_TABLE_SIZE;
3348   const int ix2 = 3 * CRC32_TABLE_SIZE;
3349   const int ix3 = 4 * CRC32_TABLE_SIZE;
3350 #endif
3351   assert_different_registers(table, tc0, tc1, tc2);
3352   assert(table == tc3, "must be!");
3353 
3354   addi(tc0, table, ix0);
3355   addi(tc1, table, ix1);
3356   addi(tc2, table, ix2);
3357   if (ix3 != 0) addi(tc3, table, ix3);
3358 
3359   return ix3;
3360 }
3361 
3362 /**
3363  * uint32_t crc;
3364  * table[crc & 0xFF] ^ (crc >> 8);
3365  */
3366 void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) {
3367   assert_different_registers(crc, table, tmp);
3368   assert_different_registers(val, table);
3369 
3370   if (crc == val) {                   // Must rotate first to use the unmodified value.
3371     rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
3372                                       // As we use a word (4-byte) instruction, we have to adapt the mask bit positions.
3373     srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
3374   } else {
3375     srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
3376     rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
3377   }
3378   lwzx(tmp, table, tmp);
3379   xorr(crc, crc, tmp);
3380 }
3381 
3382 /**
3383  * Emits code to update CRC-32 with a byte value according to constants in table.
3384  *
3385  * @param [in,out]crc   Register containing the crc.
3386  * @param [in]val       Register containing the byte to fold into the CRC.
3387  * @param [in]table     Register containing the table of crc constants.
3388  *
3389  * uint32_t crc;
3390  * val = crc_table[(val ^ crc) & 0xFF];
3391  * crc = val ^ (crc >> 8);
3392  */
3393 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
3394   BLOCK_COMMENT("update_byte_crc32:");
3395   xorr(val, val, crc);
3396   fold_byte_crc32(crc, val, table, val);
3397 }
3398 
3399 /**
3400  * @param crc   register containing existing CRC (32-bit)
3401  * @param buf   register pointing to input byte buffer (byte*)
3402  * @param len   register containing number of bytes
3403  * @param table register pointing to CRC table
3404  */
3405 void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table,
3406                                            Register data, bool loopAlignment) {
3407   assert_different_registers(crc, buf, len, table, data);
3408 
3409   Label L_mainLoop, L_done;
3410   const int mainLoop_stepping  = 1;
3411   const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4;
3412 
3413   // Process all bytes in a single-byte loop.
3414   clrldi_(len, len, 32);                         // Enforce 32 bit. Anything to do?
3415   beq(CCR0, L_done);
3416 
3417   mtctr(len);
3418   align(mainLoop_alignment);
3419   BIND(L_mainLoop);
3420     lbz(data, 0, buf);                           // Byte from buffer, zero-extended.
3421     addi(buf, buf, mainLoop_stepping);           // Advance buffer position.
3422     update_byte_crc32(crc, data, table);
3423     bdnz(L_mainLoop);                            // Iterate.
3424 
3425   bind(L_done);
3426 }
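
// Plain-C model of the byte loop (standard table-driven, reflected CRC-32
// byte update). The real table layout comes from
// StubRoutines::generate_crc_constants; a plain 256-entry table is assumed here:
//
//   uint32_t crc32_bytes(uint32_t crc, const uint8_t* buf, size_t len, const uint32_t table[256]) {
//     while (len--) crc = table[(crc ^ *buf++) & 0xff] ^ (crc >> 8);
//     return crc;
//   }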
3427 
3428 /**
3429  * Emits code to update CRC-32 with a 4-byte value according to constants in table
3430  * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c
3431  */
3432 // A note on the lookup table address(es):
3433 // The implementation uses 4 table columns (byte-reversed versions for Big Endian).
3434 // To save the effort of adding the column offset to the table address each time
3435 // a table element is looked up, it is possible to pass the pre-calculated
3436 // column addresses.
3437 // Uses R9..R12 as work register. Must be saved/restored by caller, if necessary.
3438 void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc,
3439                                         Register t0,  Register t1,  Register t2,  Register t3,
3440                                         Register tc0, Register tc1, Register tc2, Register tc3) {
3441   assert_different_registers(crc, t3);
3442 
3443   // XOR crc with next four bytes of buffer.
3444   lwz(t3, bufDisp, buf);
3445   if (bufInc != 0) {
3446     addi(buf, buf, bufInc);
3447   }
3448   xorr(t3, t3, crc);
3449 
3450   // Chop the xor result (t3 = data ^ crc) into 4 single-byte pieces, shifted left 2 bits, to form the table indices.
3451   rlwinm(t0, t3,  2,         24-2, 31-2);  // ((t3 >>  0) & 0xff) << 2
3452   rlwinm(t1, t3,  32+(2- 8), 24-2, 31-2);  // ((t3 >>  8) & 0xff) << 2
3453   rlwinm(t2, t3,  32+(2-16), 24-2, 31-2);  // ((t3 >> 16) & 0xff) << 2
3454   rlwinm(t3, t3,  32+(2-24), 24-2, 31-2);  // ((t3 >> 24) & 0xff) << 2
3455 
3456   // Use the pre-calculated column addresses.
3457   // Load pre-calculated table values.
3458   lwzx(t0, tc0, t0);
3459   lwzx(t1, tc1, t1);
3460   lwzx(t2, tc2, t2);
3461   lwzx(t3, tc3, t3);
3462 
3463   // Calculate new crc from table values.
3464   xorr(t0,  t0, t1);
3465   xorr(t2,  t2, t3);
3466   xorr(crc, t0, t2);  // Now crc contains the final checksum value.
3467 }
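
// Slicing-by-4 model of one word step above (which of tc0..tc3 maps to which
// byte depends on the table layout and endianness; plain 256-entry columns
// are assumed for the sketch):
//
//   uint32_t v = crc ^ next_4_buffer_bytes;
//   crc = tc0[ v        & 0xff] ^ tc1[(v >>  8) & 0xff] ^
//         tc2[(v >> 16) & 0xff] ^ tc3[(v >> 24) & 0xff];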
3468 
3469 /**
3470  * @param crc   register containing existing CRC (32-bit)
3471  * @param buf   register pointing to input byte buffer (byte*)
3472  * @param len   register containing number of bytes
3473  * @param table register pointing to CRC table
3474  *
3475  * uses R9..R12 as work register. Must be saved/restored by caller!
3476  */
3477 void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table,
3478                                         Register t0,  Register t1,  Register t2,  Register t3,
3479                                         Register tc0, Register tc1, Register tc2, Register tc3,
3480                                         bool invertCRC) {
3481   assert_different_registers(crc, buf, len, table);
3482 
3483   Label L_mainLoop, L_tail;
3484   Register  tmp          = t0;
3485   Register  data         = t0;
3486   Register  tmp2         = t1;
3487   const int mainLoop_stepping  = 4;
3488   const int tailLoop_stepping  = 1;
3489   const int log_stepping       = exact_log2(mainLoop_stepping);
3490   const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
3491   const int complexThreshold   = 2*mainLoop_stepping;
3492 
3493   // Don't test for len <= 0 here. This pathological case should not occur anyway.
3494   // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles
3495   // for all well-behaved cases. The situation itself is detected and handled correctly
3496   // within update_byteLoop_crc32.
3497   assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
3498 
3499   BLOCK_COMMENT("kernel_crc32_1word {");
3500 
3501   if (invertCRC) {
3502     nand(crc, crc, crc);                      // 1s complement of crc
3503   }
3504 
3505   // Check for short (< complexThreshold) buffer.
3506   cmpdi(CCR0, len, complexThreshold);
3507   blt(CCR0, L_tail);
3508 
3509   // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
3510   // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
3511   {
3512     // Align buf addr to mainLoop_stepping boundary.
3513     neg(tmp2, buf);                              // Calculate # preLoop iterations for alignment.
3514     rldicl(tmp2, tmp2, 0, 64-log_stepping);      // Keep only the low log_stepping bits, i.e. tmp2 &= mainLoop_stepping-1 (bits 62..63 here).
3515 
3516     if (complexThreshold > mainLoop_stepping) {
3517       sub(len, len, tmp2);                       // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
3518     } else {
3519       sub(tmp, len, tmp2);                       // Remaining bytes for main loop.
3520       cmpdi(CCR0, tmp, mainLoop_stepping);
3521       blt(CCR0, L_tail);                         // For less than one mainloop_stepping left, do only tail processing
3522       mr(len, tmp);                              // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
3523     }
3524     update_byteLoop_crc32(crc, buf, tmp2, table, data, false);
3525   }
3526 
3527   srdi(tmp2, len, log_stepping);                 // #iterations for mainLoop
3528   andi(len, len, mainLoop_stepping-1);           // remaining bytes for tailLoop
3529   mtctr(tmp2);
3530 
3531 #ifdef VM_LITTLE_ENDIAN
3532   Register crc_rv = crc;
3533 #else
3534   Register crc_rv = tmp;                         // Load_reverse needs separate registers to work on.
3535                                                  // Occupies tmp, but frees up crc.
3536   load_reverse_32(crc_rv, crc);                  // Revert byte order because we are dealing with big-endian data.
3537   tmp = crc;
3538 #endif
3539 
3540   int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
3541 
3542   align(mainLoop_alignment);                     // Octoword-aligned loop address. Shows 2% improvement.
3543   BIND(L_mainLoop);
3544     update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
3545     bdnz(L_mainLoop);
3546 
3547 #ifndef VM_LITTLE_ENDIAN
3548   load_reverse_32(crc, crc_rv);                  // Revert byte order because we are dealing with big-endian data.
3549   tmp = crc_rv;                                  // tmp uses its original register again.
3550 #endif
3551 
3552   // Restore original table address for tailLoop.
3553   if (reconstructTableOffset != 0) {
3554     addi(table, table, -reconstructTableOffset);
3555   }
3556 
3557   // Process last few (<complexThreshold) bytes of buffer.
3558   BIND(L_tail);
3559   update_byteLoop_crc32(crc, buf, len, table, data, false);
3560 
3561   if (invertCRC) {
3562     nand(crc, crc, crc);                      // 1s complement of crc
3563   }
3564   BLOCK_COMMENT("} kernel_crc32_1word");
3565 }
3566 
3567 /**
3568  * @param crc             register containing existing CRC (32-bit)
3569  * @param buf             register pointing to input byte buffer (byte*)
3570  * @param len             register containing number of bytes
3571  * @param constants       register pointing to precomputed constants
3572  * @param t0-t6           temp registers
3573  */
3574 void MacroAssembler::kernel_crc32_vpmsum(Register crc, Register buf, Register len, Register constants,
3575                                          Register t0, Register t1, Register t2, Register t3,
3576                                          Register t4, Register t5, Register t6, bool invertCRC) {
3577   assert_different_registers(crc, buf, len, constants);
3578 
3579   Label L_tail;
3580 
3581   BLOCK_COMMENT("kernel_crc32_vpmsum {");
3582 
3583   if (invertCRC) {
3584     nand(crc, crc, crc);                      // 1s complement of crc
3585   }
3586 
3587   // Enforce 32 bit.
3588   clrldi(len, len, 32);
3589 
3590   // Align if we have enough bytes for the fast version.
3591   const int alignment = 16,
3592             threshold = 32;
3593   Register prealign = t0;
3594 
3595   neg(prealign, buf);
3596   addi(t1, len, -threshold);
3597   andi(prealign, prealign, alignment - 1);
3598   cmpw(CCR0, t1, prealign);
3599   blt(CCR0, L_tail); // len - prealign < threshold?
3600 
3601   subf(len, prealign, len);
3602   update_byteLoop_crc32(crc, buf, prealign, constants, t2, false);
3603 
3604   // Calculate from first aligned address as far as possible.
3605   addi(constants, constants, CRC32_TABLE_SIZE); // Point to vector constants.
3606   kernel_crc32_vpmsum_aligned(crc, buf, len, constants, t0, t1, t2, t3, t4, t5, t6);
3607   addi(constants, constants, -CRC32_TABLE_SIZE); // Point to table again.
3608 
3609   // Remaining bytes.
3610   BIND(L_tail);
3611   update_byteLoop_crc32(crc, buf, len, constants, t2, false);
3612 
3613   if (invertCRC) {
3614     nand(crc, crc, crc);                      // 1s complement of crc
3615   }
3616 
3617   BLOCK_COMMENT("} kernel_crc32_vpmsum");
3618 }
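
// Alignment bookkeeping sketch for the routine above: process a short prefix
// byte-by-byte until buf is 16-byte aligned, run the vector kernel on the
// aligned middle part, then finish the tail byte-by-byte:
//
//   size_t prealign = (size_t)(-(intptr_t)buf) & 15;
//   if (len - prealign >= threshold) {
//     crc = bytes(crc, buf, prealign);  buf += prealign;  len -= prealign;
//     crc = vpmsum_kernel(crc, buf, &len);        // len updated to the remaining bytes
//   }
//   crc = bytes(crc, buf, len);                   // remaining bytes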
3619 
3620 /**
3621  * @param crc             register containing existing CRC (32-bit)
3622  * @param buf             register pointing to input byte buffer (byte*)
3623  * @param len             register containing number of bytes (will get updated to remaining bytes)
3624  * @param constants       register pointing to CRC table for 128-bit aligned memory
3625  * @param t0-t6           temp registers
3626  */
3627 void MacroAssembler::kernel_crc32_vpmsum_aligned(Register crc, Register buf, Register len, Register constants,
3628     Register t0, Register t1, Register t2, Register t3, Register t4, Register t5, Register t6) {
3629 
3630   // Save non-volatile vector registers (frameless).
3631   Register offset = t1;
3632   int offsetInt = 0;
3633   offsetInt -= 16; li(offset, offsetInt); stvx(VR20, offset, R1_SP);
3634   offsetInt -= 16; li(offset, offsetInt); stvx(VR21, offset, R1_SP);
3635   offsetInt -= 16; li(offset, offsetInt); stvx(VR22, offset, R1_SP);
3636   offsetInt -= 16; li(offset, offsetInt); stvx(VR23, offset, R1_SP);
3637   offsetInt -= 16; li(offset, offsetInt); stvx(VR24, offset, R1_SP);
3638   offsetInt -= 16; li(offset, offsetInt); stvx(VR25, offset, R1_SP);
3639 #ifndef VM_LITTLE_ENDIAN
3640   offsetInt -= 16; li(offset, offsetInt); stvx(VR26, offset, R1_SP);
3641 #endif
3642   offsetInt -= 8; std(R14, offsetInt, R1_SP);
3643   offsetInt -= 8; std(R15, offsetInt, R1_SP);
3644 
3645   // The implementation uses an inner loop which processes between 256 and 16 * unroll_factor
3646   // bytes per iteration. The basic scheme is:
3647   // lvx: load vector (Big Endian needs reversal)
3648   // vpmsumw: carry-less 32 bit multiplications with constant representing a large CRC shift
3649   // vxor: xor partial results together to get unroll_factor2 vectors
3650 
3651   // Outer loop performs the CRC shifts needed to combine the unroll_factor2 vectors.
3652 
3653   // Using 16 * unroll_factor / unroll_factor_2 bytes for constants.
3654   const int unroll_factor = CRC32_UNROLL_FACTOR,
3655             unroll_factor2 = CRC32_UNROLL_FACTOR2;
3656 
3657   const int outer_consts_size = (unroll_factor2 - 1) * 16,
3658             inner_consts_size = (unroll_factor / unroll_factor2) * 16;
3659 
3660   // Support registers.
3661   Register offs[] = { noreg, t0, t1, t2, t3, t4, t5, t6 };
3662   Register num_bytes = R14,
3663            loop_count = R15,
3664            cur_const = crc; // will live in VCRC
3665   // Constant array for outer loop: unroll_factor2 - 1 registers,
3666   // Constant array for inner loop: unroll_factor / unroll_factor2 registers.
3667   VectorRegister consts0[] = { VR16, VR17, VR18, VR19, VR20, VR21, VR22 },
3668                  consts1[] = { VR23, VR24 };
3669   // Data register arrays: 2 arrays with unroll_factor2 registers.
3670   VectorRegister data0[] = { VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7 },
3671                  data1[] = { VR8, VR9, VR10, VR11, VR12, VR13, VR14, VR15 };
3672 
3673   VectorRegister VCRC = data0[0];
3674   VectorRegister Vc = VR25;
3675   VectorRegister swap_bytes = VR26; // Only for Big Endian.
3676 
3677   // We have at least 1 iteration (ensured by caller).
3678   Label L_outer_loop, L_inner_loop, L_last;
3679 
3680   // If supported set DSCR pre-fetch to deepest.
3681   if (VM_Version::has_mfdscr()) {
3682     load_const_optimized(t0, VM_Version::_dscr_val | 7);
3683     mtdscr(t0);
3684   }
3685 
3686   mtvrwz(VCRC, crc); // crc lives in VCRC, now
3687 
3688   for (int i = 1; i < unroll_factor2; ++i) {
3689     li(offs[i], 16 * i);
3690   }
3691 
3692   // Load consts for outer loop
3693   lvx(consts0[0], constants);
3694   for (int i = 1; i < unroll_factor2 - 1; ++i) {
3695     lvx(consts0[i], offs[i], constants);
3696   }
3697 
3698   load_const_optimized(num_bytes, 16 * unroll_factor);
3699 
3700   // Reuse data registers outside of the loop.
3701   VectorRegister Vtmp = data1[0];
3702   VectorRegister Vtmp2 = data1[1];
3703   VectorRegister zeroes = data1[2];
3704 
3705   vspltisb(Vtmp, 0);
3706   vsldoi(VCRC, Vtmp, VCRC, 8); // 96 bit zeroes, 32 bit CRC.
3707 
3708   // Load vector for vpermxor (to xor both 64 bit parts together)
3709   lvsl(Vtmp, buf);   // 000102030405060708090a0b0c0d0e0f
3710   vspltisb(Vc, 4);
3711   vsl(Vc, Vtmp, Vc); // 00102030405060708090a0b0c0d0e0f0
3712   xxspltd(Vc->to_vsr(), Vc->to_vsr(), 0);
3713   vor(Vc, Vtmp, Vc); // 001122334455667708192a3b4c5d6e7f
3714 
3715 #ifdef VM_LITTLE_ENDIAN
3716 #define BE_swap_bytes(x)
3717 #else
3718   vspltisb(Vtmp2, 0xf);
3719   vxor(swap_bytes, Vtmp, Vtmp2);
3720 #define BE_swap_bytes(x) vperm(x, x, x, swap_bytes)
3721 #endif
3722 
3723   cmpd(CCR0, len, num_bytes);
3724   blt(CCR0, L_last);
3725 
3726   addi(cur_const, constants, outer_consts_size); // Point to consts for inner loop
3727   load_const_optimized(loop_count, unroll_factor / (2 * unroll_factor2) - 1); // One double-iteration peeled off.
3728 
3729   // ********** Main loop start **********
3730   align(32);
3731   bind(L_outer_loop);
3732 
3733   // Begin of unrolled first iteration (no xor).
3734   lvx(data1[0], buf);
3735   for (int i = 1; i < unroll_factor2 / 2; ++i) {
3736     lvx(data1[i], offs[i], buf);
3737   }
3738   vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
3739   lvx(consts1[0], cur_const);
3740   mtctr(loop_count);
3741   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3742     BE_swap_bytes(data1[i]);
3743     if (i == 0) { vxor(data1[0], data1[0], VCRC); } // xor in previous CRC.
3744     lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf);
3745     vpmsumw(data0[i], data1[i], consts1[0]);
3746   }
3747   addi(buf, buf, 16 * unroll_factor2);
3748   subf(len, num_bytes, len);
3749   lvx(consts1[1], offs[1], cur_const);
3750   addi(cur_const, cur_const, 32);
3751   // Begin of unrolled second iteration (head).
3752   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3753     BE_swap_bytes(data1[i + unroll_factor2 / 2]);
3754     if (i == 0) { lvx(data1[0], buf); } else { lvx(data1[i], offs[i], buf); }
3755     vpmsumw(data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[0]);
3756   }
3757   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3758     BE_swap_bytes(data1[i]);
3759     lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf);
3760     vpmsumw(data1[i], data1[i], consts1[1]);
3761   }
3762   addi(buf, buf, 16 * unroll_factor2);
3763 
3764   // Generate the most performance-relevant code. Loads + half of the vpmsumw have been generated above.
3765   // Double-iteration allows using the 2 constant registers alternatingly.
3766   align(32);
3767   bind(L_inner_loop);
3768   for (int j = 1; j < 3; ++j) { // j < unroll_factor / unroll_factor2 - 1 for complete unrolling.
3769     if (j & 1) {
3770       lvx(consts1[0], cur_const);
3771     } else {
3772       lvx(consts1[1], offs[1], cur_const);
3773       addi(cur_const, cur_const, 32);
3774     }
3775     for (int i = 0; i < unroll_factor2; ++i) {
3776       int idx = i + unroll_factor2 / 2, inc = 0; // For modulo-scheduled input.
3777       if (idx >= unroll_factor2) { idx -= unroll_factor2; inc = 1; }
3778       BE_swap_bytes(data1[idx]);
3779       vxor(data0[i], data0[i], data1[i]);
3780       if (i == 0) lvx(data1[0], buf); else lvx(data1[i], offs[i], buf);
3781       vpmsumw(data1[idx], data1[idx], consts1[(j + inc) & 1]);
3782     }
3783     addi(buf, buf, 16 * unroll_factor2);
3784   }
3785   bdnz(L_inner_loop);
3786 
3787   addi(cur_const, constants, outer_consts_size); // Reset
3788 
3789   // Tail of last iteration (no loads).
3790   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3791     BE_swap_bytes(data1[i + unroll_factor2 / 2]);
3792     vxor(data0[i], data0[i], data1[i]);
3793     vpmsumw(data1[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[1]);
3794   }
3795   for (int i = 0; i < unroll_factor2 / 2; ++i) {
3796     vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]); // First half of fixup shifts.
3797     vxor(data0[i + unroll_factor2 / 2], data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2]);
3798   }
3799 
3800   // Last data register is ok, other ones need fixup shift.
3801   for (int i = unroll_factor2 / 2; i < unroll_factor2 - 1; ++i) {
3802     vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]);
3803   }
3804 
3805   // Combine to 128 bit result vector VCRC = data0[0].
3806   for (int i = 1; i < unroll_factor2; i<<=1) {
3807     for (int j = 0; j <= unroll_factor2 - 2*i; j+=2*i) {
3808       vxor(data0[j], data0[j], data0[j+i]);
3809     }
3810   }
3811   cmpd(CCR0, len, num_bytes);
3812   bge(CCR0, L_outer_loop);
3813 
3814   // Last chance with lower num_bytes.
3815   bind(L_last);
3816   srdi(loop_count, len, exact_log2(16 * 2 * unroll_factor2)); // Use double-iterations.
3817   // Point behind last const for inner loop.
3818   add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size);
3819   sldi(R0, loop_count, exact_log2(16 * 2)); // Bytes of constants to be used.
3820   clrrdi(num_bytes, len, exact_log2(16 * 2 * unroll_factor2));
3821   subf(cur_const, R0, cur_const); // Point to constant to be used first.
3822 
3823   addic_(loop_count, loop_count, -1); // One double-iteration peeled off.
3824   bgt(CCR0, L_outer_loop);
3825   // ********** Main loop end **********
3826 
3827   // Restore DSCR pre-fetch value.
3828   if (VM_Version::has_mfdscr()) {
3829     load_const_optimized(t0, VM_Version::_dscr_val);
3830     mtdscr(t0);
3831   }
3832 
3833   // ********** Simple loop for remaining 16 byte blocks **********
3834   {
3835     Label L_loop, L_done;
3836 
3837     srdi_(t0, len, 4); // 16 bytes per iteration
3838     clrldi(len, len, 64-4);
3839     beq(CCR0, L_done);
3840 
3841     // Point to const (same as last const for inner loop).
3842     add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size - 16);
3843     mtctr(t0);
3844     lvx(Vtmp2, cur_const);
3845 
3846     align(32);
3847     bind(L_loop);
3848 
3849     lvx(Vtmp, buf);
3850     addi(buf, buf, 16);
3851     vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
3852     BE_swap_bytes(Vtmp);
3853     vxor(VCRC, VCRC, Vtmp);
3854     vpmsumw(VCRC, VCRC, Vtmp2);
3855     bdnz(L_loop);
3856 
3857     bind(L_done);
3858   }
3859   // ********** Simple loop end **********
3860 #undef BE_swap_bytes
3861 
3862   // Point to Barrett constants
3863   add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size);
3864 
3865   vspltisb(zeroes, 0);
3866 
3867   // Combine to 64 bit result.
3868   vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
3869 
3870   // Reduce to 32 bit CRC: Remainder by multiply-high.
3871   lvx(Vtmp, cur_const);
3872   vsldoi(Vtmp2, zeroes, VCRC, 12);  // Extract high 32 bit.
3873   vpmsumd(Vtmp2, Vtmp2, Vtmp);      // Multiply by inverse long poly.
3874   vsldoi(Vtmp2, zeroes, Vtmp2, 12); // Extract high 32 bit.
3875   vsldoi(Vtmp, zeroes, Vtmp, 8);
3876   vpmsumd(Vtmp2, Vtmp2, Vtmp);      // Multiply quotient by long poly.
3877   vxor(VCRC, VCRC, Vtmp2);          // Remainder fits into 32 bit.
3878 
3879   // Move result. len is already updated.
3880   vsldoi(VCRC, VCRC, zeroes, 8);
3881   mfvrd(crc, VCRC);
3882 
3883   // Restore non-volatile Vector registers (frameless).
3884   offsetInt = 0;
3885   offsetInt -= 16; li(offset, offsetInt); lvx(VR20, offset, R1_SP);
3886   offsetInt -= 16; li(offset, offsetInt); lvx(VR21, offset, R1_SP);
3887   offsetInt -= 16; li(offset, offsetInt); lvx(VR22, offset, R1_SP);
3888   offsetInt -= 16; li(offset, offsetInt); lvx(VR23, offset, R1_SP);
3889   offsetInt -= 16; li(offset, offsetInt); lvx(VR24, offset, R1_SP);
3890   offsetInt -= 16; li(offset, offsetInt); lvx(VR25, offset, R1_SP);
3891 #ifndef VM_LITTLE_ENDIAN
3892   offsetInt -= 16; li(offset, offsetInt); lvx(VR26, offset, R1_SP);
3893 #endif
3894   offsetInt -= 8;  ld(R14, offsetInt, R1_SP);
3895   offsetInt -= 8;  ld(R15, offsetInt, R1_SP);
3896 }
3897 
3898 void MacroAssembler::crc32(Register crc, Register buf, Register len, Register t0, Register t1, Register t2,
3899                            Register t3, Register t4, Register t5, Register t6, Register t7, bool is_crc32c) {
3900   load_const_optimized(t0, is_crc32c ? StubRoutines::crc32c_table_addr()
3901                                      : StubRoutines::crc_table_addr()   , R0);
3902 
3903   if (VM_Version::has_vpmsumb()) {
3904     kernel_crc32_vpmsum(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, !is_crc32c);
3905   } else {
3906     kernel_crc32_1word(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, t0, !is_crc32c);
3907   }
3908 }
3909 
3910 void MacroAssembler::kernel_crc32_singleByteReg(Register crc, Register val, Register table, bool invertCRC) {
3911   assert_different_registers(crc, val, table);
3912 
3913   BLOCK_COMMENT("kernel_crc32_singleByteReg:");
3914   if (invertCRC) {
3915     nand(crc, crc, crc);                // 1s complement of crc
3916   }
3917 
3918   update_byte_crc32(crc, val, table);
3919 
3920   if (invertCRC) {
3921     nand(crc, crc, crc);                // 1s complement of crc
3922   }
3923 }
3924 
3925 // dest_lo += src1 + src2
3926 // dest_hi += carry out of each of the two additions above
3927 void MacroAssembler::add2_with_carry(Register dest_hi,
3928                                      Register dest_lo,
3929                                      Register src1, Register src2) {
3930   li(R0, 0);
3931   addc(dest_lo, dest_lo, src1);
3932   adde(dest_hi, dest_hi, R0);
3933   addc(dest_lo, dest_lo, src2);
3934   adde(dest_hi, dest_hi, R0);
3935 }
3936 
3937 // Multiply 64 bit by 64 bit first loop.
3938 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart,
3939                                            Register x_xstart,
3940                                            Register y, Register y_idx,
3941                                            Register z,
3942                                            Register carry,
3943                                            Register product_high, Register product,
3944                                            Register idx, Register kdx,
3945                                            Register tmp) {
3946   //  jlong carry, x[], y[], z[];
3947   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
3948   //    huge_128 product = y[idx] * x[xstart] + carry;
3949   //    z[kdx] = (jlong)product;
3950   //    carry  = (jlong)(product >>> 64);
3951   //  }
3952   //  z[xstart] = carry;
3953 
3954   Label L_first_loop, L_first_loop_exit;
3955   Label L_one_x, L_one_y, L_multiply;
3956 
3957   addic_(xstart, xstart, -1);
3958   blt(CCR0, L_one_x);   // Special case: length of x is 1.
3959 
3960   // Load next two integers of x.
3961   sldi(tmp, xstart, LogBytesPerInt);
3962   ldx(x_xstart, x, tmp);
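  // On little-endian, rotating the doubleword by 32 swaps its two 32-bit halves,
  // so the pair of ints is seen in the same (big-endian) order the algorithm assumes.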
3963 #ifdef VM_LITTLE_ENDIAN
3964   rldicl(x_xstart, x_xstart, 32, 0);
3965 #endif
3966 
3967   align(32, 16);
3968   bind(L_first_loop);
3969 
3970   cmpdi(CCR0, idx, 1);
3971   blt(CCR0, L_first_loop_exit);
3972   addi(idx, idx, -2);
3973   beq(CCR0, L_one_y);
3974 
3975   // Load next two integers of y.
3976   sldi(tmp, idx, LogBytesPerInt);
3977   ldx(y_idx, y, tmp);
3978 #ifdef VM_LITTLE_ENDIAN
3979   rldicl(y_idx, y_idx, 32, 0);
3980 #endif
3981 
3982 
3983   bind(L_multiply);
3984   multiply64(product_high, product, x_xstart, y_idx);
3985 
3986   li(tmp, 0);
3987   addc(product, product, carry);         // Add carry to result.
3988   adde(product_high, product_high, tmp); // Add carry of the last addition.
3989   addi(kdx, kdx, -2);
3990 
3991   // Store result.
3992 #ifdef VM_LITTLE_ENDIAN
3993   rldicl(product, product, 32, 0);
3994 #endif
3995   sldi(tmp, kdx, LogBytesPerInt);
3996   stdx(product, z, tmp);
3997   mr_if_needed(carry, product_high);
3998   b(L_first_loop);
3999 
4000 
4001   bind(L_one_y); // Load one 32 bit portion of y as (0,value).
4002 
4003   lwz(y_idx, 0, y);
4004   b(L_multiply);
4005 
4006 
4007   bind(L_one_x); // Load one 32 bit portion of x as (0,value).
4008 
4009   lwz(x_xstart, 0, x);
4010   b(L_first_loop);
4011 
4012   bind(L_first_loop_exit);
4013 }
4014 
4015 // Multiply 64 bit by 64 bit and add 128 bit.
4016 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y,
4017                                             Register z, Register yz_idx,
4018                                             Register idx, Register carry,
4019                                             Register product_high, Register product,
4020                                             Register tmp, int offset) {
4021 
4022   //  huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
4023   //  z[kdx] = (jlong)product;
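  //  (the upper 64 bits of the product are left in product_high as the carry for the caller)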
4024 
4025   sldi(tmp, idx, LogBytesPerInt);
4026   if (offset) {
4027     addi(tmp, tmp, offset);
4028   }
4029   ldx(yz_idx, y, tmp);
4030 #ifdef VM_LITTLE_ENDIAN
4031   rldicl(yz_idx, yz_idx, 32, 0);
4032 #endif
4033 
4034   multiply64(product_high, product, x_xstart, yz_idx);
4035   ldx(yz_idx, z, tmp);
4036 #ifdef VM_LITTLE_ENDIAN
4037   rldicl(yz_idx, yz_idx, 32, 0);
4038 #endif
4039 
4040   add2_with_carry(product_high, product, carry, yz_idx);
4041 
4042   sldi(tmp, idx, LogBytesPerInt);
4043   if (offset) {
4044     addi(tmp, tmp, offset);
4045   }
4046 #ifdef VM_LITTLE_ENDIAN
4047   rldicl(product, product, 32, 0);
4048 #endif
4049   stdx(product, z, tmp);
4050 }
4051 
4052 // Multiply 128 bit by 128 bit. Unrolled inner loop.
4053 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart,
4054                                              Register y, Register z,
4055                                              Register yz_idx, Register idx, Register carry,
4056                                              Register product_high, Register product,
4057                                              Register carry2, Register tmp) {
4058 
4059   //  jlong carry, x[], y[], z[];
4060   //  int kdx = ystart+1;
4061   //  for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
4062   //    huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
4063   //    z[kdx+idx+1] = (jlong)product;
4064   //    jlong carry2 = (jlong)(product >>> 64);
4065   //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
4066   //    z[kdx+idx] = (jlong)product;
4067   //    carry = (jlong)(product >>> 64);
4068   //  }
4069   //  idx += 2;
4070   //  if (idx > 0) {
4071   //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
4072   //    z[kdx+idx] = (jlong)product;
4073   //    carry = (jlong)(product >>> 64);
4074   //  }
4075 
4076   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
4077   const Register jdx = R0;
4078 
4079   // Scale the index: the unrolled loop below consumes four 32-bit limbs (two 64-bit multiplies) per iteration.
4080   srdi_(jdx, idx, 2);
4081   beq(CCR0, L_third_loop_exit);
4082   mtctr(jdx);
4083 
4084   align(32, 16);
4085   bind(L_third_loop);
4086 
4087   addi(idx, idx, -4);
4088 
4089   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8);
4090   mr_if_needed(carry2, product_high);
4091 
4092   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0);
4093   mr_if_needed(carry, product_high);
4094   bdnz(L_third_loop);
4095 
4096   bind(L_third_loop_exit);  // Handle any left-over operand parts.
4097 
4098   andi_(idx, idx, 0x3);
4099   beq(CCR0, L_post_third_loop_done);
4100 
4101   Label L_check_1;
4102 
4103   addic_(idx, idx, -2);
4104   blt(CCR0, L_check_1);
4105 
4106   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0);
4107   mr_if_needed(carry, product_high);
4108 
4109   bind(L_check_1);
4110 
4111   addi(idx, idx, 0x2);
4112   andi_(idx, idx, 0x1);
4113   addic_(idx, idx, -1);
4114   blt(CCR0, L_post_third_loop_done);
4115 
4116   sldi(tmp, idx, LogBytesPerInt);
4117   lwzx(yz_idx, y, tmp);
4118   multiply64(product_high, product, x_xstart, yz_idx);
4119   lwzx(yz_idx, z, tmp);
4120 
4121   add2_with_carry(product_high, product, yz_idx, carry);
4122 
4123   sldi(tmp, idx, LogBytesPerInt);
4124   stwx(product, z, tmp);
4125   srdi(product, product, 32);
4126 
4127   sldi(product_high, product_high, 32);
4128   orr(product, product, product_high);
4129   mr_if_needed(carry, product);
4130 
4131   bind(L_post_third_loop_done);
4132 }   // multiply_128_x_128_loop
4133 
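// out[] += in[] * k (cf. java.math.BigInteger::implMulAdd). Roughly the loop
// being intrinsified, ignoring how the caller scales 'offset' and 'len':
//   long kLong = k & LONG_MASK, carry = 0;
//   for (int j = len - 1; j >= 0; j--) {
//     long product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry;
//     out[offset--] = (int)product;
//     carry = product >>> 32;
//   }
//   // (int)carry is left in the 'carry' register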
4134 void MacroAssembler::muladd(Register out, Register in,
4135                             Register offset, Register len, Register k,
4136                             Register tmp1, Register tmp2, Register carry) {
4137 
4138   // Labels
4139   Label LOOP, SKIP;
4140 
4141   // Make sure length is positive.
4142   cmpdi  (CCR0,    len,     0);
4143 
4144   // Prepare variables
4145   subi   (offset,  offset,  4);
4146   li     (carry,   0);
4147   ble    (CCR0,    SKIP);
4148 
4149   mtctr  (len);
4150   subi   (len,     len,     1    );
4151   sldi   (len,     len,     2    );
4152 
4153   // Main loop
4154   bind(LOOP);
4155   lwzx   (tmp1,    len,     in   );
4156   lwzx   (tmp2,    offset,  out  );
4157   mulld  (tmp1,    tmp1,    k    );
4158   add    (tmp2,    carry,   tmp2 );
4159   add    (tmp2,    tmp1,    tmp2 );
4160   stwx   (tmp2,    offset,  out  );
4161   srdi   (carry,   tmp2,    32   );
4162   subi   (offset,  offset,  4    );
4163   subi   (len,     len,     4    );
4164   bdnz   (LOOP);
4165   bind(SKIP);
4166 }
4167 
4168 void MacroAssembler::multiply_to_len(Register x, Register xlen,
4169                                      Register y, Register ylen,
4170                                      Register z, Register zlen,
4171                                      Register tmp1, Register tmp2,
4172                                      Register tmp3, Register tmp4,
4173                                      Register tmp5, Register tmp6,
4174                                      Register tmp7, Register tmp8,
4175                                      Register tmp9, Register tmp10,
4176                                      Register tmp11, Register tmp12,
4177                                      Register tmp13) {
4178 
4179   ShortBranchVerifier sbv(this);
4180 
4181   assert_different_registers(x, xlen, y, ylen, z, zlen,
4182                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
4183   assert_different_registers(x, xlen, y, ylen, z, zlen,
4184                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp7);
4185   assert_different_registers(x, xlen, y, ylen, z, zlen,
4186                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp8);
4187 
4188   const Register idx = tmp1;
4189   const Register kdx = tmp2;
4190   const Register xstart = tmp3;
4191 
4192   const Register y_idx = tmp4;
4193   const Register carry = tmp5;
4194   const Register product = tmp6;
4195   const Register product_high = tmp7;
4196   const Register x_xstart = tmp8;
4197   const Register tmp = tmp9;
4198 
4199   // First Loop.
4200   //
4201   //  final static long LONG_MASK = 0xffffffffL;
4202   //  int xstart = xlen - 1;
4203   //  int ystart = ylen - 1;
4204   //  long carry = 0;
4205   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
4206   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
4207   //    z[kdx] = (int)product;
4208   //    carry = product >>> 32;
4209   //  }
4210   //  z[xstart] = (int)carry;
4211 
4212   mr_if_needed(idx, ylen);        // idx = ylen
4213   mr_if_needed(kdx, zlen);        // kdx = xlen + ylen
4214   li(carry, 0);                   // carry = 0
4215 
4216   Label L_done;
4217 
4218   addic_(xstart, xlen, -1);
4219   blt(CCR0, L_done);
4220 
4221   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z,
4222                         carry, product_high, product, idx, kdx, tmp);
4223 
4224   Label L_second_loop;
4225 
4226   cmpdi(CCR0, kdx, 0);
4227   beq(CCR0, L_second_loop);
4228 
4229   Label L_carry;
4230 
4231   addic_(kdx, kdx, -1);
4232   beq(CCR0, L_carry);
4233 
4234   // Store lower 32 bits of carry.
4235   sldi(tmp, kdx, LogBytesPerInt);
4236   stwx(carry, z, tmp);
4237   srdi(carry, carry, 32);
4238   addi(kdx, kdx, -1);
4239 
4240 
4241   bind(L_carry);
4242 
4243   // Store upper 32 bits of carry.
4244   sldi(tmp, kdx, LogBytesPerInt);
4245   stwx(carry, z, tmp);
4246 
4247   // Second and third (nested) loops.
4248   //
4249   //  for (int i = xstart-1; i >= 0; i--) { // Second loop
4250   //    carry = 0;
4251   //    for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
4252   //      long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
4253   //                     (z[k] & LONG_MASK) + carry;
4254   //      z[k] = (int)product;
4255   //      carry = product >>> 32;
4256   //    }
4257   //    z[i] = (int)carry;
4258   //  }
4259   //
4260   //  i = xstart (tmp3), jdx = idx (tmp1), k = kdx (tmp2), carry = tmp5, x[i] = x_xstart (tmp8)
4261 
4262   bind(L_second_loop);
4263 
4264   li(carry, 0);                   // carry = 0;
4265 
4266   addic_(xstart, xstart, -1);     // i = xstart-1;
4267   blt(CCR0, L_done);
4268 
4269   Register zsave = tmp10;
4270 
4271   mr(zsave, z);
4272 
4273 
4274   Label L_last_x;
4275 
4276   sldi(tmp, xstart, LogBytesPerInt);
4277   add(z, z, tmp);                 // z = z + k - j
4278   addi(z, z, 4);
4279   addic_(xstart, xstart, -1);     // i = xstart-1;
4280   blt(CCR0, L_last_x);
4281 
4282   sldi(tmp, xstart, LogBytesPerInt);
4283   ldx(x_xstart, x, tmp);
4284 #ifdef VM_LITTLE_ENDIAN
4285   rldicl(x_xstart, x_xstart, 32, 0);
4286 #endif
4287 
4288 
4289   Label L_third_loop_prologue;
4290 
4291   bind(L_third_loop_prologue);
4292 
4293   Register xsave = tmp11;
4294   Register xlensave = tmp12;
4295   Register ylensave = tmp13;
4296 
4297   mr(xsave, x);
4298   mr(xlensave, xstart);
4299   mr(ylensave, ylen);
4300 
4301 
4302   multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen,
4303                           carry, product_high, product, x, tmp);
4304 
4305   mr(z, zsave);
4306   mr(x, xsave);
4307   mr(xlen, xlensave);   // This is the decrement of the loop counter!
4308   mr(ylen, ylensave);
4309 
4310   addi(tmp3, xlen, 1);
4311   sldi(tmp, tmp3, LogBytesPerInt);
4312   stwx(carry, z, tmp);
4313   addic_(tmp3, tmp3, -1);
4314   blt(CCR0, L_done);
4315 
4316   srdi(carry, carry, 32);
4317   sldi(tmp, tmp3, LogBytesPerInt);
4318   stwx(carry, z, tmp);
4319   b(L_second_loop);
4320 
4321   // Infrequently executed code is placed outside of the loops.
4322   bind(L_last_x);
4323 
4324   lwz(x_xstart, 0, x);
4325   b(L_third_loop_prologue);
4326 
4327   bind(L_done);
4328 }   // multiply_to_len
4329 
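// Expects CCR0 to hold the result of a preceding compare; in debug builds (ASSERT),
// stops the VM with 'msg' when the expected (non-)equality does not hold.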
4330 void MacroAssembler::asm_assert(bool check_equal, const char *msg) {
4331 #ifdef ASSERT
4332   Label ok;
4333   if (check_equal) {
4334     beq(CCR0, ok);
4335   } else {
4336     bne(CCR0, ok);
4337   }
4338   stop(msg);
4339   bind(ok);
4340 #endif
4341 }
4342 
4343 void MacroAssembler::asm_assert_mems_zero(bool check_equal, int size, int mem_offset,
4344                                           Register mem_base, const char* msg) {
4345 #ifdef ASSERT
4346   switch (size) {
4347     case 4:
4348       lwz(R0, mem_offset, mem_base);
4349       cmpwi(CCR0, R0, 0);
4350       break;
4351     case 8:
4352       ld(R0, mem_offset, mem_base);
4353       cmpdi(CCR0, R0, 0);
4354       break;
4355     default:
4356       ShouldNotReachHere();
4357   }
4358   asm_assert(check_equal, msg);
4359 #endif // ASSERT
4360 }
4361 
4362 void MacroAssembler::verify_thread() {
4363   if (VerifyThread) {
4364     unimplemented("'VerifyThread' currently not implemented on PPC");
4365   }
4366 }
4367 
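// Verify an oop that is possibly compressed: decode it if compressed oops are in
// use, run the oop verification, then re-encode it so the register holds its
// original value again on return.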
4368 void MacroAssembler::verify_coop(Register coop, const char* msg) {
4369   if (!VerifyOops) { return; }
4370   if (UseCompressedOops) { decode_heap_oop(coop); }
4371   verify_oop(coop, msg);
4372   if (UseCompressedOops) { encode_heap_oop(coop, coop); }
4373 }
4374 
4375 // Reads: oop. Kills: R0 and possibly volatile floating point registers.
4376 void MacroAssembler::verify_oop(Register oop, const char* msg) {
4377   if (!VerifyOops) {
4378     return;
4379   }
4380 
4381   address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
4382   const Register tmp = R11; // Will be preserved.
4383   const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
4384 
4385   BLOCK_COMMENT("verify_oop {");
4386 
4387   save_volatile_gprs(R1_SP, -nbytes_save); // except R0
4388 
4389   mr_if_needed(R4_ARG2, oop);
4390   save_LR_CR(tmp); // save in old frame
4391   push_frame_reg_args(nbytes_save, tmp);
4392   // load FunctionDescriptor** / entry_address *
4393   load_const_optimized(tmp, fd, R0);
4394   // load FunctionDescriptor* / entry_address
4395   ld(tmp, 0, tmp);
4396   load_const_optimized(R3_ARG1, (address)msg, R0);
4397   // Call destination for its side effect.
4398   call_c(tmp);
4399 
4400   pop_frame();
4401   restore_LR_CR(tmp);
4402   restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
4403 
4404   BLOCK_COMMENT("} verify_oop");
4405 }
4406 
4407 void MacroAssembler::verify_oop_addr(RegisterOrConstant offs, Register base, const char* msg) {
4408   if (!VerifyOops) {
4409     return;
4410   }
4411 
4412   address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
4413   const Register tmp = R11; // Will be preserved.
4414   const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
4415   save_volatile_gprs(R1_SP, -nbytes_save); // except R0
4416 
4417   ld(R4_ARG2, offs, base);
4418   save_LR_CR(tmp); // save in old frame
4419   push_frame_reg_args(nbytes_save, tmp);
4420   // load FunctionDescriptor** / entry_address *
4421   load_const_optimized(tmp, fd, R0);
4422   // load FunctionDescriptor* / entry_address
4423   ld(tmp, 0, tmp);
4424   load_const_optimized(R3_ARG1, (address)msg, R0);
4425   // Call destination for its side effect.
4426   call_c(tmp);
4427 
4428   pop_frame();
4429   restore_LR_CR(tmp);
4430   restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
4431 }
4432 
4433 // Stop the VM by emitting an unconditional trap; the trap type and 'msg' are reported when the trap is handled.
4434 void MacroAssembler::stop(int type, const char* msg) {
4435 #ifndef PRODUCT
4436   block_comment(err_msg("stop(type %d): %s {", type, msg));
4437 #else
4438   block_comment("stop {");
4439 #endif
4440 
4441   if (type != stop_shouldnotreachhere) {
4442     // Use R0 to pass msg. "shouldnotreachhere" preserves R0.
4443     load_const_optimized(R0, (void*)msg);
4444   }
4445   tdi_unchecked(traptoUnconditional, 0/*reg 0*/, type);
4446 
4447   block_comment("} stop;");
4448 }
4449 
4450 #ifndef PRODUCT
4451 // Write the pattern 0x0101010101010101 to the memory region [low - before*BytesPerWord, high + after*BytesPerWord].
4452 // Val, addr are temp registers.
4453 // If low == addr, addr is killed.
4454 // High is preserved.
4455 void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) {
4456   if (!ZapMemory) return;
4457 
4458   assert_different_registers(low, val);
4459 
4460   BLOCK_COMMENT("zap memory region {");
4461   load_const_optimized(val, 0x0101010101010101);
4462   int size = before + after;
4463   if (low == high && size < 5 && size > 0) {
4464     int offset = -before*BytesPerWord;
4465     for (int i = 0; i < size; ++i) {
4466       std(val, offset, low);
4467       offset += (1*BytesPerWord);
4468     }
4469   } else {
4470     addi(addr, low, -before*BytesPerWord);
4471     assert_different_registers(high, val);
4472     if (after) addi(high, high, after * BytesPerWord);
4473     Label loop;
4474     bind(loop);
4475     std(val, 0, addr);
4476     addi(addr, addr, 8);
4477     cmpd(CCR6, addr, high);
4478     ble(CCR6, loop);
4479     if (after) addi(high, high, -after * BytesPerWord);  // Correct back to old value.
4480   }
4481   BLOCK_COMMENT("} zap memory region");
4482 }
4483 
4484 #endif // !PRODUCT
4485 
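// SkipIfEqualZero: skip the code emitted between construction and destruction
// when the byte-sized flag at flag_addr is zero. The constructor emits the
// load/compare/branch, the destructor binds the target label.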
4486 void SkipIfEqualZero::skip_to_label_if_equal_zero(MacroAssembler* masm, Register temp,
4487                                                   const bool* flag_addr, Label& label) {
4488   int simm16_offset = masm->load_const_optimized(temp, (address)flag_addr, R0, true);
4489   assert(sizeof(bool) == 1, "PowerPC ABI");
4490   masm->lbz(temp, simm16_offset, temp);
4491   masm->cmpwi(CCR0, temp, 0);
4492   masm->beq(CCR0, label);
4493 }
4494 
4495 SkipIfEqualZero::SkipIfEqualZero(MacroAssembler* masm, Register temp, const bool* flag_addr) : _masm(masm), _label() {
4496   skip_to_label_if_equal_zero(masm, temp, flag_addr, _label);
4497 }
4498 
4499 SkipIfEqualZero::~SkipIfEqualZero() {
4500   _masm->bind(_label);
4501 }
4502 
4503 void MacroAssembler::cache_wb(Address line) {
4504   assert(line.index() == noreg, "index should be noreg");
4505   assert(line.disp() == 0, "displacement should be 0");
4506   assert(VM_Version::supports_data_cache_line_flush(), "CPU or OS does not support flush to persistent memory");
4507   // Data Cache Block Store: not really a flush but a writeback, i.e. it works
4508   // like a sync of the cache line with persistent memory, copying the line to
4509   // persistent memory without invalidating it.
4510   dcbst(line.base());
4511 }
4512 
4513 void MacroAssembler::cache_wbsync(bool is_presync) {
4514   assert(VM_Version::supports_data_cache_line_flush(), "CPU or OS does not support sync related to persistent memory");
4515   // We only need a post sync barrier. Post means _after_ a cache line flush or
4516   // store instruction; pre means a barrier emitted before such an instruction.
4517   if (!is_presync) {
4518     fence();
4519   }
4520 }