/*
 * Copyright (c) 1997, 2019, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2019, SAP SE. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "compiler/disassembler.hpp"
#include "gc/shared/collectedHeap.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "interpreter/interpreter.hpp"
#include "memory/resourceArea.hpp"
#include "nativeInst_ppc.hpp"
#include "oops/klass.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/biasedLocking.hpp"
#include "runtime/icache.hpp"
#include "runtime/interfaceSupport.inline.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/os.hpp"
#include "runtime/safepoint.hpp"
#include "runtime/safepointMechanism.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/macros.hpp"
#ifdef COMPILER2
#include "opto/intrinsicnode.hpp"
#endif

#ifdef PRODUCT
#define BLOCK_COMMENT(str) // nothing
#else
#define BLOCK_COMMENT(str) block_comment(str)
#endif
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

#ifdef ASSERT
// On RISC, there's no benefit to verifying instruction boundaries.
bool AbstractAssembler::pd_check_instruction_mark() { return false; }
#endif

void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) {
  assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range");
  if (Assembler::is_simm(si31, 16)) {
    ld(d, si31, a);
    if (emit_filler_nop) nop();
  } else {
    const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31);
    const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31);
    addis(d, a, hi);
    ld(d, lo, d);
  }
}
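
// Illustrative example (not emitted verbatim): si31 = 0x1234abcd splits into
// hi = 0x1235 and lo = -0x5433, because the ld sign-extends its 16-bit
// displacement and the hi part must compensate for a negative lo:
//   addis d, a, 0x1235
//   ld    d, -0x5433(d)
// so that (0x1235 << 16) + (-0x5433) == 0x1234abcd again.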

void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) {
  assert_different_registers(d, a);
  ld_largeoffset_unchecked(d, si31, a, emit_filler_nop);
}

void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base,
                                      size_t size_in_bytes, bool is_signed) {
  switch (size_in_bytes) {
  case  8:              ld(dst, offs, base);                         break;
  case  4:  is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break;
  case  2:  is_signed ? lha(dst, offs, base) : lhz(dst, offs, base); break;
  case  1:  lbz(dst, offs, base); if (is_signed) extsb(dst, dst);    break; // lba doesn't exist :(
  default:  ShouldNotReachHere();
  }
}

void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base,
                                       size_t size_in_bytes) {
  switch (size_in_bytes) {
  case  8:  std(dst, offs, base); break;
  case  4:  stw(dst, offs, base); break;
  case  2:  sth(dst, offs, base); break;
  case  1:  stb(dst, offs, base); break;
  default:  ShouldNotReachHere();
  }
}

void MacroAssembler::align(int modulus, int max, int rem) {
  int padding = (rem + modulus - (offset() % modulus)) % modulus;
  if (padding > max) return;
  for (int c = (padding >> 2); c > 0; --c) { nop(); }
}
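
// Example: with modulus == 16 and rem == 0, an offset() of 12 mod 16 yields
// 4 bytes of padding, i.e. a single nop; if the required padding exceeds
// `max', nothing is emitted and the offset stays unaligned.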

// Issue instructions that calculate given TOC from global TOC.
void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16,
                                                       bool add_relocation, bool emit_dummy_addr) {
  int offset = -1;
  if (emit_dummy_addr) {
    offset = -128; // dummy address
  } else if (addr != (address)(intptr_t)-1) {
    offset = MacroAssembler::offset_to_global_toc(addr);
  }

  if (hi16) {
    addis(dst, R29_TOC, MacroAssembler::largeoffset_si16_si16_hi(offset));
  }
  if (lo16) {
    if (add_relocation) {
      // Relocate at the addi to avoid confusion with a load from the method's TOC.
      relocate(internal_word_Relocation::spec(addr));
    }
    addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset));
  }
}
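
// For hi16 && lo16 the emitted sequence is (offset being the distance of
// addr from the global TOC):
//   addis dst, R29_TOC, largeoffset_si16_si16_hi(offset)
//   addi  dst, dst,     largeoffset_si16_si16_lo(offset)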

address MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) {
  const int offset = MacroAssembler::offset_to_global_toc(addr);

  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the addi,
  // and the addi reads and writes the same register dst.
  const int dst = inv_rt_field(inst2);
  assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");

  // Now, find the preceding addis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  while (inst1_addr >= bound) {
    inst1 = *(int *) inst1_addr;
    if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
      // Stop, found the addis which writes dst.
      break;
    }
    inst1_addr -= BytesPerInstWord;
  }

  assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
  set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset));
  set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset));
  return inst1_addr;
}

address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) {
  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the addi,
  // and the addi reads and writes the same register dst.
  const int dst = inv_rt_field(inst2);
  assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");

  // Now, find the preceding addis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  while (inst1_addr >= bound) {
    inst1 = *(int *) inst1_addr;
    if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
      // Stop, found the addis which writes dst.
      break;
    }
    inst1_addr -= BytesPerInstWord;
  }

  assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");

  int offset = (get_imm(inst1_addr, 0) << 16) + get_imm(inst2_addr, 0);
  // -1 is a special case
  if (offset == -1) {
    return (address)(intptr_t)-1;
  } else {
    return global_toc() + offset;
  }
}

#ifdef _LP64
// Patch compressed oops or klass constants.
// Assembler sequence is
// 1) compressed oops:
//    lis  rx = const.hi
//    ori rx = rx | const.lo
// 2) compressed klass:
//    lis  rx = const.hi
//    clrldi rx = rx & 0xFFFFffff // clearMS32b, optional
//    ori rx = rx | const.lo
// The clrldi, if present, is skipped over when patching.
address MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) {
  assert(UseCompressedOops, "Should only patch compressed oops");

  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the ori,
  // and the ori reads and writes the same register dst.
  const int dst = inv_rta_field(inst2);
  assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
  // Now, find the preceding lis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  bool inst1_found = false;
  while (inst1_addr >= bound) {
    inst1 = *(int *)inst1_addr;
    if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
    inst1_addr -= BytesPerInstWord;
  }
  assert(inst1_found, "inst is not lis");

  int xc = (data >> 16) & 0xffff;
  int xd = (data >>  0) & 0xffff;

  set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo
  set_imm((int *)inst2_addr,        (xd)); // unsigned int
  return inst1_addr;
}
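
// Example: data == 0x12345678 is patched as
//   lis rx, 0x1234      // rx = 0x1234 << 16
//   ori rx, rx, 0x5678  // rx |= 0x5678
// If bit 31 of the constant is set, lis sign-extends into the upper 32 bits;
// the optional clrldi in the klass sequence clears those bits again.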

// Get compressed oop or klass constant.
narrowOop MacroAssembler::get_narrow_oop(address a, address bound) {
  assert(UseCompressedOops, "Should only patch compressed oops");

  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the ori,
  // and the ori reads and writes the same register dst.
  const int dst = inv_rta_field(inst2);
  assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
  // Now, find the preceding lis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  bool inst1_found = false;

  while (inst1_addr >= bound) {
    inst1 = *(int *) inst1_addr;
    if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
    inst1_addr -= BytesPerInstWord;
  }
  assert(inst1_found, "inst is not lis");

  uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff));
  uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16);

  return (int) (xl | xh);
}
#endif // _LP64

// Returns true if successful.
bool MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a,
                                                Register toc, bool fixed_size) {
  int toc_offset = 0;
  // Use RelocationHolder::none for the constant pool entry, otherwise
  // we will end up with a failing NativeCall::verify(x) where x is
  // the address of the constant pool entry.
  // FIXME: We should insert relocation information for oops at the constant
  // pool entries instead of inserting it at the loads; patching of a constant
  // pool entry should be less expensive.
  address const_address = address_constant((address)a.value(), RelocationHolder::none);
  if (const_address == NULL) { return false; } // allocation failure
  // Relocate at the pc of the load.
  relocate(a.rspec());
  toc_offset = (int)(const_address - code()->consts()->start());
  ld_largeoffset_unchecked(dst, toc_offset, toc, fixed_size);
  return true;
}

bool MacroAssembler::is_load_const_from_method_toc_at(address a) {
  const address inst1_addr = a;
  const int inst1 = *(int *)inst1_addr;

  // The relocation points to the ld or the addis.
  return (is_ld(inst1)) ||
         (is_addis(inst1) && inv_ra_field(inst1) != 0);
}

int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) {
  assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc");

  const address inst1_addr = a;
  const int inst1 = *(int *)inst1_addr;

  if (is_ld(inst1)) {
    return inv_d1_field(inst1);
  } else if (is_addis(inst1)) {
    const int dst = inv_rt_field(inst1);

    // Now, find the succeeding ld which reads and writes to dst.
    address inst2_addr = inst1_addr + BytesPerInstWord;
    int inst2 = 0;
    while (true) {
      inst2 = *(int *) inst2_addr;
      if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) {
        // Stop, found the ld which reads and writes dst.
        break;
      }
      inst2_addr += BytesPerInstWord;
    }
    return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2);
  }
  ShouldNotReachHere();
  return 0;
}

// Get the constant from a `load_const' sequence.
long MacroAssembler::get_const(address a) {
  assert(is_load_const_at(a), "not a load of a constant");
  const int *p = (const int*) a;
  unsigned long x = (((unsigned long) (get_imm(a,0) & 0xffff)) << 48);
  if (is_ori(*(p+1))) {
    x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 32);
    x |= (((unsigned long) (get_imm(a,3) & 0xffff)) << 16);
    x |= (((unsigned long) (get_imm(a,4) & 0xffff)));
  } else if (is_lis(*(p+1))) {
    x |= (((unsigned long) (get_imm(a,2) & 0xffff)) << 32);
    x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 16);
    x |= (((unsigned long) (get_imm(a,3) & 0xffff)));
  } else {
    ShouldNotReachHere();
    return (long) 0;
  }
  return (long) x;
}
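
// The two `load_const' layouts decoded above, as implied by the immediate
// positions (the instructions without an immediate shift or combine the
// halves):
//   without tmp register:        with tmp register:
//     0: lis  d, A                 0: lis d,   A
//     1: ori  d, d, B              1: lis tmp, C
//     2: shift d left by 32        2: ori d,   d, B
//     3: oris d, d, C              3: ori tmp, tmp, D
//     4: ori  d, d, D              4: combine d and tmp
// where the constant is A:B:C:D from most to least significant 16 bits.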

// Patch the 64 bit constant of a `load_const' sequence. This is a low-level
// procedure; it neither flushes the instruction cache nor is it MT-safe.
void MacroAssembler::patch_const(address a, long x) {
  assert(is_load_const_at(a), "not a load of a constant");
  int *p = (int*) a;
  if (is_ori(*(p+1))) {
    set_imm(0 + p, (x >> 48) & 0xffff);
    set_imm(1 + p, (x >> 32) & 0xffff);
    set_imm(3 + p, (x >> 16) & 0xffff);
    set_imm(4 + p, x & 0xffff);
  } else if (is_lis(*(p+1))) {
    set_imm(0 + p, (x >> 48) & 0xffff);
    set_imm(2 + p, (x >> 32) & 0xffff);
    set_imm(1 + p, (x >> 16) & 0xffff);
    set_imm(3 + p, x & 0xffff);
  } else {
    ShouldNotReachHere();
  }
}

AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) {
  assert(oop_recorder() != NULL, "this assembler needs a Recorder");
  int index = oop_recorder()->allocate_metadata_index(obj);
  RelocationHolder rspec = metadata_Relocation::spec(index);
  return AddressLiteral((address)obj, rspec);
}

AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) {
  assert(oop_recorder() != NULL, "this assembler needs a Recorder");
  int index = oop_recorder()->find_index(obj);
  RelocationHolder rspec = metadata_Relocation::spec(index);
  return AddressLiteral((address)obj, rspec);
}

AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) {
  assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int oop_index = oop_recorder()->allocate_oop_index(obj);
  return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
}

AddressLiteral MacroAssembler::constant_oop_address(jobject obj) {
  assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int oop_index = oop_recorder()->find_index(obj);
  return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
}

RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
                                                      Register tmp, int offset) {
  intptr_t value = *delayed_value_addr;
  if (value != 0) {
    return RegisterOrConstant(value + offset);
  }

  // Load indirectly to solve generation ordering problem.
  // static address, no relocation
  int simm16_offset = load_const_optimized(tmp, delayed_value_addr, noreg, true);
  ld(tmp, simm16_offset, tmp); // must be aligned ((xa & 3) == 0)

  if (offset != 0) {
    addi(tmp, tmp, offset);
  }

  return RegisterOrConstant(tmp);
}

#ifndef PRODUCT
void MacroAssembler::pd_print_patched_instruction(address branch) {
  Unimplemented(); // TODO: PPC port
}
#endif // ndef PRODUCT

// Conditional far branch for destinations encodable in 24+2 bits.
void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) {

  // If requested by flag optimize, relocate the bc_far as a
  // runtime_call and prepare for optimizing it when the code gets
  // relocated.
  if (optimize == bc_far_optimize_on_relocate) {
    relocate(relocInfo::runtime_call_type);
  }

  // variant 2:
  //
  //    b!cxx SKIP
  //    bxx   DEST
  //  SKIP:
  //

  const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
                                                opposite_bcond(inv_boint_bcond(boint)));

  // We emit two branches.
  // First, a conditional branch which jumps around the far branch.
  const address not_taken_pc = pc() + 2 * BytesPerInstWord;
  const address bc_pc        = pc();
  bc(opposite_boint, biint, not_taken_pc);

  const int bc_instr = *(int*)bc_pc;
  assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition");
  assert(opposite_boint == inv_bo_field(bc_instr), "postcondition");
  assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))),
                                     opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))),
         "postcondition");
  assert(biint == inv_bi_field(bc_instr), "postcondition");

  // Second, an unconditional far branch which jumps to dest.
  // Note: target(dest) remembers the current pc (see CodeSection::target)
  //       and returns the current pc if the label is not bound yet; when
  //       the label gets bound, the unconditional far branch will be patched.
  const address target_pc = target(dest);
  const address b_pc  = pc();
  b(target_pc);

  assert(not_taken_pc == pc(),                     "postcondition");
  assert(dest.is_bound() || target_pc == b_pc, "postcondition");
}

// 1 or 2 instructions
void MacroAssembler::bc_far_optimized(int boint, int biint, Label& dest) {
  if (dest.is_bound() && is_within_range_of_bcxx(target(dest), pc())) {
    bc(boint, biint, dest);
  } else {
    bc_far(boint, biint, dest, MacroAssembler::bc_far_optimize_on_relocate);
  }
}

bool MacroAssembler::is_bc_far_at(address instruction_addr) {
  return is_bc_far_variant1_at(instruction_addr) ||
         is_bc_far_variant2_at(instruction_addr) ||
         is_bc_far_variant3_at(instruction_addr);
}

address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) {
  if (is_bc_far_variant1_at(instruction_addr)) {
    const address instruction_1_addr = instruction_addr;
    const int instruction_1 = *(int*)instruction_1_addr;
    return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr);
  } else if (is_bc_far_variant2_at(instruction_addr)) {
    const address instruction_2_addr = instruction_addr + 4;
    return bxx_destination(instruction_2_addr);
  } else if (is_bc_far_variant3_at(instruction_addr)) {
    return instruction_addr + 8;
  }
  // variant 4 ???
  ShouldNotReachHere();
  return NULL;
}

void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) {

  if (is_bc_far_variant3_at(instruction_addr)) {
    // variant 3, far cond branch to the next instruction, already patched to nops:
    //
    //    nop
    //    endgroup
    //  SKIP/DEST:
    //
    return;
  }

  // first, extract boint and biint from the current branch
  int boint = 0;
  int biint = 0;

  ResourceMark rm;
  const int code_size = 2 * BytesPerInstWord;
  CodeBuffer buf(instruction_addr, code_size);
  MacroAssembler masm(&buf);
  if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) {
    // Far branch to next instruction: Optimize it by patching nops (produce variant 3).
    masm.nop();
    masm.endgroup();
  } else {
    if (is_bc_far_variant1_at(instruction_addr)) {
      // variant 1, the 1st instruction contains the destination address:
      //
      //    bcxx  DEST
      //    nop
      //
      const int instruction_1 = *(int*)(instruction_addr);
      boint = inv_bo_field(instruction_1);
      biint = inv_bi_field(instruction_1);
    } else if (is_bc_far_variant2_at(instruction_addr)) {
      // variant 2, the 2nd instruction contains the destination address:
      //
      //    b!cxx SKIP
      //    bxx   DEST
      //  SKIP:
      //
      const int instruction_1 = *(int*)(instruction_addr);
      boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))),
          opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1))));
      biint = inv_bi_field(instruction_1);
    } else {
      // variant 4???
      ShouldNotReachHere();
    }

    // second, set the new branch destination and optimize the code
    if (dest != instruction_addr + 4 && // the bc_far is still unbound!
        masm.is_within_range_of_bcxx(dest, instruction_addr)) {
      // variant 1:
      //
      //    bcxx  DEST
      //    nop
      //
      masm.bc(boint, biint, dest);
      masm.nop();
    } else {
      // variant 2:
      //
      //    b!cxx SKIP
      //    bxx   DEST
      //  SKIP:
      //
      const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
                                                    opposite_bcond(inv_boint_bcond(boint)));
      const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord;
      masm.bc(opposite_boint, biint, not_taken_pc);
      masm.b(dest);
    }
  }
  ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
}

// Emit a NOT mt-safe patchable 64 bit absolute call/jump.
void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) {
  // get current pc
  uint64_t start_pc = (uint64_t) pc();

  const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last
  const address pc_of_b  = (address) (start_pc + (0*BytesPerInstWord)); // b is first

  // relocate here
  if (rt != relocInfo::none) {
    relocate(rt);
  }

  if ( ReoptimizeCallSequences &&
       (( link && is_within_range_of_b(dest, pc_of_bl)) ||
        (!link && is_within_range_of_b(dest, pc_of_b)))) {
    // variant 2:
    // Emit an optimized, pc-relative call/jump.

    if (link) {
      // some padding
      nop();
      nop();
      nop();
      nop();
      nop();
      nop();

      // do the call
      assert(pc() == pc_of_bl, "just checking");
      bl(dest, relocInfo::none);
    } else {
      // do the jump
      assert(pc() == pc_of_b, "just checking");
      b(dest, relocInfo::none);

      // some padding
      nop();
      nop();
      nop();
      nop();
      nop();
      nop();
    }

    // Assert that we can identify the emitted call/jump.
    assert(is_bxx64_patchable_variant2_at((address)start_pc, link),
           "can't identify emitted call");
  } else {
    // variant 1:
    mr(R0, R11);  // spill R11 -> R0.

    // Load the destination address into CTR,
    // calculate destination relative to global toc.
    calculate_address_from_global_toc(R11, dest, true, true, false);

    mtctr(R11);
    mr(R11, R0);  // spill R11 <- R0.
    nop();

    // do the call/jump
    if (link) {
      bctrl();
    } else {
      bctr();
    }
    // Assert that we can identify the emitted call/jump.
    assert(is_bxx64_patchable_variant1b_at((address)start_pc, link),
           "can't identify emitted call");
  }

  // Assert that we can identify the emitted call/jump.
  assert(is_bxx64_patchable_at((address)start_pc, link),
         "can't identify emitted call");
  assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest,
         "wrong encoding of dest address");
}
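
// The two 7-instruction layouts emitted above (indices match the
// is_bxx64_patchable_* predicates below):
//   variant 1b (toc-relative):        variant 2 (pc-relative):
//     0: mr    R0, R11                  link:          !link:
//     1: addis R11, R29_TOC, off@ha       0..5: nop      0:    b dest
//     2: addi  R11, R11, off@l            6: bl dest     1..6: nop
//     3: mtctr R11
//     4: mr    R11, R0
//     5: nop
//     6: bctr[l]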

// Identify a bxx64_patchable instruction.
bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) {
  return is_bxx64_patchable_variant1b_at(instruction_addr, link)
    //|| is_bxx64_patchable_variant1_at(instruction_addr, link)
      || is_bxx64_patchable_variant2_at(instruction_addr, link);
}

// Does the call64_patchable instruction use a pc-relative encoding of
// the call destination?
bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) {
  // variant 2 is pc-relative
  return is_bxx64_patchable_variant2_at(instruction_addr, link);
}

// Identify variant 1.
bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) {
  unsigned int* instr = (unsigned int*) instruction_addr;
  return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
    && is_mtctr(instr[5]) // mtctr
    && is_load_const_at(instruction_addr);
}

// Identify variant 1b: load destination relative to global toc.
bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) {
  unsigned int* instr = (unsigned int*) instruction_addr;
  return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
    && is_mtctr(instr[3]) // mtctr
    && is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr);
}

// Identify variant 2.
bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) {
  unsigned int* instr = (unsigned int*) instruction_addr;
  if (link) {
    return is_bl (instr[6])  // bl dest is last
      && is_nop(instr[0])  // nop
      && is_nop(instr[1])  // nop
      && is_nop(instr[2])  // nop
      && is_nop(instr[3])  // nop
      && is_nop(instr[4])  // nop
      && is_nop(instr[5]); // nop
  } else {
    return is_b  (instr[0])  // b  dest is first
      && is_nop(instr[1])  // nop
      && is_nop(instr[2])  // nop
      && is_nop(instr[3])  // nop
      && is_nop(instr[4])  // nop
      && is_nop(instr[5])  // nop
      && is_nop(instr[6]); // nop
  }
}

// Set dest address of a bxx64_patchable instruction.
void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) {
  ResourceMark rm;
  int code_size = MacroAssembler::bxx64_patchable_size;
  CodeBuffer buf(instruction_addr, code_size);
  MacroAssembler masm(&buf);
  masm.bxx64_patchable(dest, relocInfo::none, link);
  ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
}

// Get dest address of a bxx64_patchable instruction.
address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) {
  if (is_bxx64_patchable_variant1_at(instruction_addr, link)) {
    return (address) (unsigned long) get_const(instruction_addr);
  } else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) {
    unsigned int* instr = (unsigned int*) instruction_addr;
    if (link) {
      const int instr_idx = 6; // bl is last
      int branchoffset = branch_destination(instr[instr_idx], 0);
      return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
    } else {
      const int instr_idx = 0; // b is first
      int branchoffset = branch_destination(instr[instr_idx], 0);
      return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
    }
  // Load dest relative to global toc.
  } else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) {
    return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord,
                                                               instruction_addr);
  } else {
    ShouldNotReachHere();
    return NULL;
  }
}

// Uses ordering which corresponds to ABI:
//    _savegpr0_14:  std  r14,-144(r1)
//    _savegpr0_15:  std  r15,-136(r1)
//    _savegpr0_16:  std  r16,-128(r1)
void MacroAssembler::save_nonvolatile_gprs(Register dst, int offset) {
  std(R14, offset, dst);   offset += 8;
  std(R15, offset, dst);   offset += 8;
  std(R16, offset, dst);   offset += 8;
  std(R17, offset, dst);   offset += 8;
  std(R18, offset, dst);   offset += 8;
  std(R19, offset, dst);   offset += 8;
  std(R20, offset, dst);   offset += 8;
  std(R21, offset, dst);   offset += 8;
  std(R22, offset, dst);   offset += 8;
  std(R23, offset, dst);   offset += 8;
  std(R24, offset, dst);   offset += 8;
  std(R25, offset, dst);   offset += 8;
  std(R26, offset, dst);   offset += 8;
  std(R27, offset, dst);   offset += 8;
  std(R28, offset, dst);   offset += 8;
  std(R29, offset, dst);   offset += 8;
  std(R30, offset, dst);   offset += 8;
  std(R31, offset, dst);   offset += 8;

  stfd(F14, offset, dst);   offset += 8;
  stfd(F15, offset, dst);   offset += 8;
  stfd(F16, offset, dst);   offset += 8;
  stfd(F17, offset, dst);   offset += 8;
  stfd(F18, offset, dst);   offset += 8;
  stfd(F19, offset, dst);   offset += 8;
  stfd(F20, offset, dst);   offset += 8;
  stfd(F21, offset, dst);   offset += 8;
  stfd(F22, offset, dst);   offset += 8;
  stfd(F23, offset, dst);   offset += 8;
  stfd(F24, offset, dst);   offset += 8;
  stfd(F25, offset, dst);   offset += 8;
  stfd(F26, offset, dst);   offset += 8;
  stfd(F27, offset, dst);   offset += 8;
  stfd(F28, offset, dst);   offset += 8;
  stfd(F29, offset, dst);   offset += 8;
  stfd(F30, offset, dst);   offset += 8;
  stfd(F31, offset, dst);
}

// Uses ordering which corresponds to ABI:
//    _restgpr0_14:  ld   r14,-144(r1)
//    _restgpr0_15:  ld   r15,-136(r1)
//    _restgpr0_16:  ld   r16,-128(r1)
void MacroAssembler::restore_nonvolatile_gprs(Register src, int offset) {
  ld(R14, offset, src);   offset += 8;
  ld(R15, offset, src);   offset += 8;
  ld(R16, offset, src);   offset += 8;
  ld(R17, offset, src);   offset += 8;
  ld(R18, offset, src);   offset += 8;
  ld(R19, offset, src);   offset += 8;
  ld(R20, offset, src);   offset += 8;
  ld(R21, offset, src);   offset += 8;
  ld(R22, offset, src);   offset += 8;
  ld(R23, offset, src);   offset += 8;
  ld(R24, offset, src);   offset += 8;
  ld(R25, offset, src);   offset += 8;
  ld(R26, offset, src);   offset += 8;
  ld(R27, offset, src);   offset += 8;
  ld(R28, offset, src);   offset += 8;
  ld(R29, offset, src);   offset += 8;
  ld(R30, offset, src);   offset += 8;
  ld(R31, offset, src);   offset += 8;

  // FP registers
  lfd(F14, offset, src);   offset += 8;
  lfd(F15, offset, src);   offset += 8;
  lfd(F16, offset, src);   offset += 8;
  lfd(F17, offset, src);   offset += 8;
  lfd(F18, offset, src);   offset += 8;
  lfd(F19, offset, src);   offset += 8;
  lfd(F20, offset, src);   offset += 8;
  lfd(F21, offset, src);   offset += 8;
  lfd(F22, offset, src);   offset += 8;
  lfd(F23, offset, src);   offset += 8;
  lfd(F24, offset, src);   offset += 8;
  lfd(F25, offset, src);   offset += 8;
  lfd(F26, offset, src);   offset += 8;
  lfd(F27, offset, src);   offset += 8;
  lfd(F28, offset, src);   offset += 8;
  lfd(F29, offset, src);   offset += 8;
  lfd(F30, offset, src);   offset += 8;
  lfd(F31, offset, src);
}

// For verify_oops.
void MacroAssembler::save_volatile_gprs(Register dst, int offset) {
  std(R2,  offset, dst);   offset += 8;
  std(R3,  offset, dst);   offset += 8;
  std(R4,  offset, dst);   offset += 8;
  std(R5,  offset, dst);   offset += 8;
  std(R6,  offset, dst);   offset += 8;
  std(R7,  offset, dst);   offset += 8;
  std(R8,  offset, dst);   offset += 8;
  std(R9,  offset, dst);   offset += 8;
  std(R10, offset, dst);   offset += 8;
  std(R11, offset, dst);   offset += 8;
  std(R12, offset, dst);   offset += 8;

  stfd(F0, offset, dst);   offset += 8;
  stfd(F1, offset, dst);   offset += 8;
  stfd(F2, offset, dst);   offset += 8;
  stfd(F3, offset, dst);   offset += 8;
  stfd(F4, offset, dst);   offset += 8;
  stfd(F5, offset, dst);   offset += 8;
  stfd(F6, offset, dst);   offset += 8;
  stfd(F7, offset, dst);   offset += 8;
  stfd(F8, offset, dst);   offset += 8;
  stfd(F9, offset, dst);   offset += 8;
  stfd(F10, offset, dst);  offset += 8;
  stfd(F11, offset, dst);  offset += 8;
  stfd(F12, offset, dst);  offset += 8;
  stfd(F13, offset, dst);
}

// For verify_oops.
void MacroAssembler::restore_volatile_gprs(Register src, int offset) {
  ld(R2,  offset, src);   offset += 8;
  ld(R3,  offset, src);   offset += 8;
  ld(R4,  offset, src);   offset += 8;
  ld(R5,  offset, src);   offset += 8;
  ld(R6,  offset, src);   offset += 8;
  ld(R7,  offset, src);   offset += 8;
  ld(R8,  offset, src);   offset += 8;
  ld(R9,  offset, src);   offset += 8;
  ld(R10, offset, src);   offset += 8;
  ld(R11, offset, src);   offset += 8;
  ld(R12, offset, src);   offset += 8;

  lfd(F0, offset, src);   offset += 8;
  lfd(F1, offset, src);   offset += 8;
  lfd(F2, offset, src);   offset += 8;
  lfd(F3, offset, src);   offset += 8;
  lfd(F4, offset, src);   offset += 8;
  lfd(F5, offset, src);   offset += 8;
  lfd(F6, offset, src);   offset += 8;
  lfd(F7, offset, src);   offset += 8;
  lfd(F8, offset, src);   offset += 8;
  lfd(F9, offset, src);   offset += 8;
  lfd(F10, offset, src);  offset += 8;
  lfd(F11, offset, src);  offset += 8;
  lfd(F12, offset, src);  offset += 8;
  lfd(F13, offset, src);
}

void MacroAssembler::save_LR_CR(Register tmp) {
  mfcr(tmp);
  std(tmp, _abi(cr), R1_SP);
  mflr(tmp);
  std(tmp, _abi(lr), R1_SP);
  // Tmp must contain lr on exit! (see return_addr and prolog in ppc64.ad)
}

void MacroAssembler::restore_LR_CR(Register tmp) {
  assert(tmp != R1_SP, "must be distinct");
  ld(tmp, _abi(lr), R1_SP);
  mtlr(tmp);
  ld(tmp, _abi(cr), R1_SP);
  mtcr(tmp);
}

address MacroAssembler::get_PC_trash_LR(Register result) {
  Label L;
  bl(L);
  bind(L);
  address lr_pc = pc();
  mflr(result);
  return lr_pc;
}
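
// The bl targets the immediately following instruction, so execution simply
// falls through while LR receives that instruction's address; mflr then
// copies it into `result'. LR is clobbered, hence the name.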

void MacroAssembler::resize_frame(Register offset, Register tmp) {
#ifdef ASSERT
  assert_different_registers(offset, tmp, R1_SP);
  andi_(tmp, offset, frame::alignment_in_bytes-1);
  asm_assert_eq("resize_frame: unaligned", 0x204);
#endif

  // tmp <- *(SP)
  ld(tmp, _abi(callers_sp), R1_SP);
  // addr <- SP + offset;
  // *(addr) <- tmp;
  // SP <- addr
  stdux(tmp, R1_SP, offset);
}

void MacroAssembler::resize_frame(int offset, Register tmp) {
  assert(is_simm(offset, 16), "too big an offset");
  assert_different_registers(tmp, R1_SP);
  assert((offset & (frame::alignment_in_bytes-1))==0, "resize_frame: unaligned");
  // tmp <- *(SP)
  ld(tmp, _abi(callers_sp), R1_SP);
  // addr <- SP + offset;
  // *(addr) <- tmp;
  // SP <- addr
  stdu(tmp, offset, R1_SP);
}

void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) {
  // (addr == tmp1) || (addr == tmp2) is allowed here!
  assert(tmp1 != tmp2, "must be distinct");

  // compute offset w.r.t. current stack pointer
  // tmp_1 <- addr - SP (!)
  subf(tmp1, R1_SP, addr);

  // atomically update SP keeping back link.
  resize_frame(tmp1/* offset */, tmp2/* tmp */);
}

void MacroAssembler::push_frame(Register bytes, Register tmp) {
#ifdef ASSERT
  assert(bytes != R0, "r0 not allowed here");
  andi_(R0, bytes, frame::alignment_in_bytes-1);
  asm_assert_eq("push_frame(Reg, Reg): unaligned", 0x203);
#endif
  neg(tmp, bytes);
  stdux(R1_SP, R1_SP, tmp);
}

// Push a frame of size `bytes'.
void MacroAssembler::push_frame(unsigned int bytes, Register tmp) {
  long offset = align_addr(bytes, frame::alignment_in_bytes);
  if (is_simm(-offset, 16)) {
    stdu(R1_SP, -offset, R1_SP);
  } else {
    load_const_optimized(tmp, -offset);
    stdux(R1_SP, R1_SP, tmp);
  }
}
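
// Example: push_frame(112, tmp) (size already 16-byte aligned) emits a single
//   stdu R1_SP, -112(R1_SP)
// which stores the caller's SP (the ABI back link) at the new top of stack
// and decrements R1_SP in the same instruction.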

// Push a frame of size `bytes' plus abi_reg_args on top.
void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) {
  push_frame(bytes + frame::abi_reg_args_size, tmp);
}

// Set up a new C frame with a spill area for non-volatile GPRs and
// additional space for local variables.
void MacroAssembler::push_frame_reg_args_nonvolatiles(unsigned int bytes,
                                                      Register tmp) {
  push_frame(bytes + frame::abi_reg_args_size + frame::spill_nonvolatiles_size, tmp);
}

// Pop current C frame.
void MacroAssembler::pop_frame() {
  ld(R1_SP, _abi(callers_sp), R1_SP);
}

#if defined(ABI_ELFv2)
address MacroAssembler::branch_to(Register r_function_entry, bool and_link) {
  // TODO(asmundak): make sure the caller uses R12 as function descriptor
  // most of the times.
  if (R12 != r_function_entry) {
    mr(R12, r_function_entry);
  }
  mtctr(R12);
  // Do a call or a branch.
  if (and_link) {
    bctrl();
  } else {
    bctr();
  }
  _last_calls_return_pc = pc();

  return _last_calls_return_pc;
}

// Call a C function via a function descriptor and use full C
// calling conventions. Updates and returns _last_calls_return_pc.
address MacroAssembler::call_c(Register r_function_entry) {
  return branch_to(r_function_entry, /*and_link=*/true);
}

// For tail calls: only branch, don't link, so callee returns to caller of this function.
address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) {
  return branch_to(r_function_entry, /*and_link=*/false);
}

address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) {
  load_const(R12, function_entry, R0);
  return branch_to(R12,  /*and_link=*/true);
}

#else
// Generic version of a call to C function via a function descriptor
// with variable support for C calling conventions (TOC, ENV, etc.).
// Updates and returns _last_calls_return_pc.
address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call,
                                  bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) {
  // we emit standard ptrgl glue code here
  assert((function_descriptor != R0), "function_descriptor cannot be R0");

  // retrieve necessary entries from the function descriptor
  ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor);
  mtctr(R0);

  if (load_toc_of_callee) {
    ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor);
  }
  if (load_env_of_callee) {
    ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor);
  } else if (load_toc_of_callee) {
    li(R11, 0);
  }

  // do a call or a branch
  if (and_link) {
    bctrl();
  } else {
    bctr();
  }
  _last_calls_return_pc = pc();

  return _last_calls_return_pc;
}

// Call a C function via a function descriptor and use full C calling
// conventions.
// We don't use the TOC in generated code, so there is no need to save
// and restore its value.
address MacroAssembler::call_c(Register fd) {
  return branch_to(fd, /*and_link=*/true,
                       /*save toc=*/false,
                       /*restore toc=*/false,
                       /*load toc=*/true,
                       /*load env=*/true);
}

address MacroAssembler::call_c_and_return_to_caller(Register fd) {
  return branch_to(fd, /*and_link=*/false,
                       /*save toc=*/false,
                       /*restore toc=*/false,
                       /*load toc=*/true,
                       /*load env=*/true);
}

address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) {
  if (rt != relocInfo::none) {
    // this call needs to be relocatable
    if (!ReoptimizeCallSequences
        || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
        || fd == NULL   // support code-size estimation
        || !fd->is_friend_function()
        || fd->entry() == NULL) {
      // it's not a friend function as defined by class FunctionDescriptor,
      // so do a full call-c here.
      load_const(R11, (address)fd, R0);

      bool has_env = (fd != NULL && fd->env() != NULL);
      return branch_to(R11, /*and_link=*/true,
                            /*save toc=*/false,
                            /*restore toc=*/false,
                            /*load toc=*/true,
                            /*load env=*/has_env);
    } else {
      // It's a friend function. Load the entry point and don't care about
      // toc and env. Use an optimizable call instruction, but ensure the
      // same code-size as in the case of a non-friend function.
      nop();
      nop();
      nop();
      bl64_patchable(fd->entry(), rt);
      _last_calls_return_pc = pc();
      return _last_calls_return_pc;
    }
  } else {
    // This call does not need to be relocatable, do more aggressive
    // optimizations.
    if (!ReoptimizeCallSequences
      || !fd->is_friend_function()) {
      // It's not a friend function as defined by class FunctionDescriptor,
      // so do a full call-c here.
      load_const(R11, (address)fd, R0);
      return branch_to(R11, /*and_link=*/true,
                            /*save toc=*/false,
                            /*restore toc=*/false,
                            /*load toc=*/true,
                            /*load env=*/true);
    } else {
      // it's a friend function, load the entry point and don't care about
      // toc and env.
      address dest = fd->entry();
      if (is_within_range_of_b(dest, pc())) {
        bl(dest);
      } else {
        bl64_patchable(dest, rt);
      }
      _last_calls_return_pc = pc();
      return _last_calls_return_pc;
    }
  }
}

// Call a C function.  All constants needed reside in TOC.
//
// Read the address to call from the TOC.
// Read env from TOC, if fd specifies an env.
// Read new TOC from TOC.
address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd,
                                         relocInfo::relocType rt, Register toc) {
  if (!ReoptimizeCallSequences
    || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
    || !fd->is_friend_function()) {
    // It's not a friend function as defined by class FunctionDescriptor,
    // so do a full call-c here.
    assert(fd->entry() != NULL, "function must be linked");

    AddressLiteral fd_entry(fd->entry());
    bool success = load_const_from_method_toc(R11, fd_entry, toc, /*fixed_size*/ true);
    mtctr(R11);
    if (fd->env() == NULL) {
      li(R11, 0);
      nop();
    } else {
      AddressLiteral fd_env(fd->env());
      success = success && load_const_from_method_toc(R11, fd_env, toc, /*fixed_size*/ true);
    }
    AddressLiteral fd_toc(fd->toc());
    // Set R2_TOC (load from toc)
    success = success && load_const_from_method_toc(R2_TOC, fd_toc, toc, /*fixed_size*/ true);
    bctrl();
    _last_calls_return_pc = pc();
    if (!success) { return NULL; }
  } else {
    // It's a friend function, load the entry point and don't care about
    // toc and env. Use an optimizable call instruction, but ensure the
    // same code-size as in the case of a non-friend function.
    nop();
    bl64_patchable(fd->entry(), rt);
    _last_calls_return_pc = pc();
  }
  return _last_calls_return_pc;
}
#endif // ABI_ELFv2

void MacroAssembler::call_VM_base(Register oop_result,
                                  Register last_java_sp,
                                  address  entry_point,
                                  bool     check_exceptions) {
  BLOCK_COMMENT("call_VM {");
  // Determine last_java_sp register.
  if (!last_java_sp->is_valid()) {
    last_java_sp = R1_SP;
  }
  set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1);

  // ARG1 must hold thread address.
  mr(R3_ARG1, R16_thread);
#if defined(ABI_ELFv2)
  address return_pc = call_c(entry_point, relocInfo::none);
#else
  address return_pc = call_c((FunctionDescriptor*)entry_point, relocInfo::none);
#endif

  reset_last_Java_frame();

  // Check for pending exceptions.
  if (check_exceptions) {
    // We don't check for exceptions here.
    ShouldNotReachHere();
  }

  // Get oop result if there is one and reset the value in the thread.
  if (oop_result->is_valid()) {
    get_vm_result(oop_result);
  }

  _last_calls_return_pc = return_pc;
  BLOCK_COMMENT("} call_VM");
}

void MacroAssembler::call_VM_leaf_base(address entry_point) {
  BLOCK_COMMENT("call_VM_leaf {");
#if defined(ABI_ELFv2)
  call_c(entry_point, relocInfo::none);
#else
  call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, entry_point), relocInfo::none);
#endif
  BLOCK_COMMENT("} call_VM_leaf");
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions) {
  call_VM_base(oop_result, noreg, entry_point, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread.
  mr_if_needed(R4_ARG2, arg_1);
  call_VM(oop_result, entry_point, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread
  mr_if_needed(R4_ARG2, arg_1);
  assert(arg_2 != R4_ARG2, "smashed argument");
  mr_if_needed(R5_ARG3, arg_2);
  call_VM(oop_result, entry_point, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread
  mr_if_needed(R4_ARG2, arg_1);
  assert(arg_2 != R4_ARG2, "smashed argument");
  mr_if_needed(R5_ARG3, arg_2);
  mr_if_needed(R6_ARG4, arg_3);
  call_VM(oop_result, entry_point, check_exceptions);
}

void MacroAssembler::call_VM_leaf(address entry_point) {
  call_VM_leaf_base(entry_point);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) {
  mr_if_needed(R3_ARG1, arg_1);
  call_VM_leaf(entry_point);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) {
  mr_if_needed(R3_ARG1, arg_1);
  assert(arg_2 != R3_ARG1, "smashed argument");
  mr_if_needed(R4_ARG2, arg_2);
  call_VM_leaf(entry_point);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) {
  mr_if_needed(R3_ARG1, arg_1);
  assert(arg_2 != R3_ARG1, "smashed argument");
  mr_if_needed(R4_ARG2, arg_2);
  assert(arg_3 != R3_ARG1 && arg_3 != R4_ARG2, "smashed argument");
  mr_if_needed(R5_ARG3, arg_3);
  call_VM_leaf(entry_point);
}

// Check whether instruction is a read access to the polling page
// which was emitted by load_from_polling_page(..).
bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext,
                                               address* polling_address_ptr) {
  if (!is_ld(instruction))
    return false; // It's not a ld. Fail.

  int rt = inv_rt_field(instruction);
  int ra = inv_ra_field(instruction);
  int ds = inv_ds_field(instruction);
  if (!(ds == 0 && ra != 0 && rt == 0)) {
    return false; // It's not a ld(r0, X, ra). Fail.
  }

  if (!ucontext) {
    // Set polling address.
    if (polling_address_ptr != NULL) {
      *polling_address_ptr = NULL;
    }
    return true; // No ucontext given. Can't check value of ra. Assume true.
  }

#ifdef LINUX
  // Ucontext given. Check that register ra contains the address of
  // the safepoint polling page.
  ucontext_t* uc = (ucontext_t*) ucontext;
  // Set polling address.
  address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds;
  if (polling_address_ptr != NULL) {
    *polling_address_ptr = addr;
  }
  return os::is_poll_address(addr);
#else
  // Not on Linux, ucontext must be NULL.
  ShouldNotReachHere();
  return false;
#endif
}
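
// The poll accepted above thus has the shape
//   ld R0, 0(Rpoll)
// i.e. a load of the first word of the polling page into R0.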

void MacroAssembler::bang_stack_with_offset(int offset) {
  // When increasing the stack, the old stack pointer will be written
  // to the new top of stack according to the PPC64 abi.
  // Therefore, stack banging is not necessary when increasing
  // the stack by <= os::vm_page_size() bytes.
  // When increasing the stack by a larger amount, this method is
  // called repeatedly to bang the intermediate pages.

  // Stack grows down, caller passes positive offset.
  assert(offset > 0, "must bang with positive offset");

  long stdoffset = -offset;

  if (is_simm(stdoffset, 16)) {
    // Signed 16 bit offset, a simple std is ok.
    if (UseLoadInstructionsForStackBangingPPC64) {
      ld(R0, (int)(signed short)stdoffset, R1_SP);
    } else {
      std(R0,(int)(signed short)stdoffset, R1_SP);
    }
  } else if (is_simm(stdoffset, 31)) {
    const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset);
    const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset);

    Register tmp = R11;
    addis(tmp, R1_SP, hi);
    if (UseLoadInstructionsForStackBangingPPC64) {
      ld(R0,  lo, tmp);
    } else {
      std(R0, lo, tmp);
    }
  } else {
    ShouldNotReachHere();
  }
}

// If instruction is a stack bang of the form
//    std    R0,    x(Ry),       (see bang_stack_with_offset())
//    stdu   R1_SP, x(R1_SP),    (see push_frame(), resize_frame())
// or stdux  R1_SP, Rx, R1_SP    (see push_frame(), resize_frame())
// return the banged address. Otherwise, return 0.
address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) {
#ifdef LINUX
  ucontext_t* uc = (ucontext_t*) ucontext;
  int rs = inv_rs_field(instruction);
  int ra = inv_ra_field(instruction);
  if (   (is_ld(instruction)   && rs == 0 &&  UseLoadInstructionsForStackBangingPPC64)
      || (is_std(instruction)  && rs == 0 && !UseLoadInstructionsForStackBangingPPC64)
      || (is_stdu(instruction) && rs == 1)) {
    int ds = inv_ds_field(instruction);
    // return banged address
    return ds+(address)uc->uc_mcontext.regs->gpr[ra];
  } else if (is_stdux(instruction) && rs == 1) {
    int rb = inv_rb_field(instruction);
    address sp = (address)uc->uc_mcontext.regs->gpr[1];
    long rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
    return ra != 1 || rb_val >= 0 ? NULL         // not a stack bang
                                  : sp + rb_val; // banged address
  }
  return NULL; // not a stack bang
#else
  // workaround not needed on !LINUX :-)
  ShouldNotCallThis();
  return NULL;
#endif
}

void MacroAssembler::reserved_stack_check(Register return_pc) {
  // Test if reserved zone needs to be enabled.
  Label no_reserved_zone_enabling;

  ld_ptr(R0, JavaThread::reserved_stack_activation_offset(), R16_thread);
  cmpld(CCR0, R1_SP, R0);
  blt_predict_taken(CCR0, no_reserved_zone_enabling);

  // Enable reserved zone again, throw stack overflow exception.
  push_frame_reg_args(0, R0);
  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), R16_thread);
  pop_frame();
  mtlr(return_pc);
  load_const_optimized(R0, StubRoutines::throw_delayed_StackOverflowError_entry());
  mtctr(R0);
  bctr();

  should_not_reach_here();

  bind(no_reserved_zone_enabling);
}

void MacroAssembler::getandsetd(Register dest_current_value, Register exchange_value, Register addr_base,
                                bool cmpxchgx_hint) {
  Label retry;
  bind(retry);
  ldarx(dest_current_value, addr_base, cmpxchgx_hint);
  stdcx_(exchange_value, addr_base);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
  } else {
    bne(                  CCR0, retry); // StXcx_ sets CCR0.
  }
}
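
// ldarx/stdcx_ form a load-reserve/store-conditional pair: the stdcx_ only
// succeeds (CCR0 eq set) if no other processor has written to the
// reservation granule in the meantime; on failure we simply retry.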
1407 
1408 void MacroAssembler::getandaddd(Register dest_current_value, Register inc_value, Register addr_base,
1409                                 Register tmp, bool cmpxchgx_hint) {
1410   Label retry;
1411   bind(retry);
1412   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1413   add(tmp, dest_current_value, inc_value);
1414   stdcx_(tmp, addr_base);
1415   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1416     bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1417   } else {
1418     bne(                  CCR0, retry); // StXcx_ sets CCR0.
1419   }
1420 }
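// Same reservation loop as above, i.e. an atomic fetch-and-add (sketch):
//   atomically { old = *addr_base; *addr_base = old + inc_value; }
//   dest_current_value = old;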
1421 
1422 // Word/sub-word atomic helper functions
1423 
// Temps and addr_base are killed if size < 4 and the processor does not support the respective instructions.
1425 // Only signed types are supported with size < 4.
1426 // Atomic add always kills tmp1.
1427 void MacroAssembler::atomic_get_and_modify_generic(Register dest_current_value, Register exchange_value,
1428                                                    Register addr_base, Register tmp1, Register tmp2, Register tmp3,
1429                                                    bool cmpxchgx_hint, bool is_add, int size) {
1430   // Sub-word instructions are available since Power 8.
1431   // For older processors, instruction_type != size holds, and we
1432   // emulate the sub-word instructions by constructing a 4-byte value
1433   // that leaves the other bytes unchanged.
1434   const int instruction_type = VM_Version::has_lqarx() ? size : 4;
1435 
1436   Label retry;
1437   Register shift_amount = noreg,
1438            val32 = dest_current_value,
1439            modval = is_add ? tmp1 : exchange_value;
1440 
1441   if (instruction_type != size) {
1442     assert_different_registers(tmp1, tmp2, tmp3, dest_current_value, exchange_value, addr_base);
1443     modval = tmp1;
1444     shift_amount = tmp2;
1445     val32 = tmp3;
    // Need some preparation: Compute shift amount, align address. Note: shorts must be 2-byte aligned.
1447 #ifdef VM_LITTLE_ENDIAN
1448     rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
1449     clrrdi(addr_base, addr_base, 2);
1450 #else
1451     xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
1452     clrrdi(addr_base, addr_base, 2);
1453     rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
1454 #endif
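    // Worked example (illustrative): on little-endian, a byte at
    // (addr_base & 3) == 2 yields shift_amount == 16, i.e. it occupies
    // bits 16..23 of the aligned word loaded below.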
1455   }
1456 
1457   // atomic emulation loop
1458   bind(retry);
1459 
1460   switch (instruction_type) {
1461     case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
1462     case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
1463     case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
1464     default: ShouldNotReachHere();
1465   }
1466 
1467   if (instruction_type != size) {
1468     srw(dest_current_value, val32, shift_amount);
1469   }
1470 
1471   if (is_add) { add(modval, dest_current_value, exchange_value); }
1472 
1473   if (instruction_type != size) {
1474     // Transform exchange value such that the replacement can be done by one xor instruction.
1475     xorr(modval, dest_current_value, is_add ? modval : exchange_value);
1476     clrldi(modval, modval, (size == 1) ? 56 : 48);
1477     slw(modval, modval, shift_amount);
1478     xorr(modval, val32, modval);
1479   }
1480 
1481   switch (instruction_type) {
1482     case 4: stwcx_(modval, addr_base); break;
1483     case 2: sthcx_(modval, addr_base); break;
1484     case 1: stbcx_(modval, addr_base); break;
1485     default: ShouldNotReachHere();
1486   }
1487 
1488   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1489     bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1490   } else {
1491     bne(                  CCR0, retry); // StXcx_ sets CCR0.
1492   }
1493 
1494   // l?arx zero-extends, but Java wants byte/short values sign-extended.
1495   if (size == 1) {
1496     extsb(dest_current_value, dest_current_value);
1497   } else if (size == 2) {
1498     extsh(dest_current_value, dest_current_value);
  }
1500 }
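// Rough C sketch of the sub-word emulation above (illustrative only;
// load_reserved/store_conditional stand in for l?arx/st?cx_):
//   uint32_t* w   = (uint32_t*)(addr & ~(uintptr_t)3);
//   do {
//     uint32_t v    = load_reserved(w);
//     uint32_t old  = (v >> shift) & lane_mask;   // current sub-word value
//     uint32_t repl = is_add ? old + delta : new_val;
//     v ^= ((old ^ repl) & lane_mask) << shift;   // splice lane via one xor
//   } while (!store_conditional(w, v));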
1501 
// Temps, addr_base and exchange_value are killed if size < 4 and the processor does not support the respective instructions.
1503 // Only signed types are supported with size < 4.
1504 void MacroAssembler::cmpxchg_loop_body(ConditionRegister flag, Register dest_current_value,
1505                                        Register compare_value, Register exchange_value,
1506                                        Register addr_base, Register tmp1, Register tmp2,
1507                                        Label &retry, Label &failed, bool cmpxchgx_hint, int size) {
1508   // Sub-word instructions are available since Power 8.
1509   // For older processors, instruction_type != size holds, and we
1510   // emulate the sub-word instructions by constructing a 4-byte value
1511   // that leaves the other bytes unchanged.
1512   const int instruction_type = VM_Version::has_lqarx() ? size : 4;
1513 
1514   Register shift_amount = noreg,
1515            val32 = dest_current_value,
1516            modval = exchange_value;
1517 
1518   if (instruction_type != size) {
1519     assert_different_registers(tmp1, tmp2, dest_current_value, compare_value, exchange_value, addr_base);
1520     shift_amount = tmp1;
1521     val32 = tmp2;
1522     modval = tmp2;
    // Need some preparation: Compute shift amount, align address. Note: shorts must be 2-byte aligned.
1524 #ifdef VM_LITTLE_ENDIAN
1525     rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
1526     clrrdi(addr_base, addr_base, 2);
1527 #else
1528     xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
1529     clrrdi(addr_base, addr_base, 2);
1530     rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
1531 #endif
1532     // Transform exchange value such that the replacement can be done by one xor instruction.
1533     xorr(exchange_value, compare_value, exchange_value);
1534     clrldi(exchange_value, exchange_value, (size == 1) ? 56 : 48);
1535     slw(exchange_value, exchange_value, shift_amount);
1536   }
1537 
1538   // atomic emulation loop
1539   bind(retry);
1540 
1541   switch (instruction_type) {
1542     case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
1543     case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
1544     case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
1545     default: ShouldNotReachHere();
1546   }
1547 
1548   if (instruction_type != size) {
1549     srw(dest_current_value, val32, shift_amount);
1550   }
1551   if (size == 1) {
1552     extsb(dest_current_value, dest_current_value);
1553   } else if (size == 2) {
1554     extsh(dest_current_value, dest_current_value);
  }
1556 
1557   cmpw(flag, dest_current_value, compare_value);
1558   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1559     bne_predict_not_taken(flag, failed);
1560   } else {
1561     bne(                  flag, failed);
1562   }
1563   // branch to done  => (flag == ne), (dest_current_value != compare_value)
1564   // fall through    => (flag == eq), (dest_current_value == compare_value)
1565 
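  // Note: when emulating, exchange_value was pre-transformed above into
  // (compare_value ^ new_value), masked and shifted into its lane. We only
  // get here when the loaded lane equals compare_value, so a single xor
  // with val32 replaces exactly that lane and leaves the other bytes of
  // the word unchanged.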
1566   if (instruction_type != size) {
1567     xorr(modval, val32, exchange_value);
1568   }
1569 
1570   switch (instruction_type) {
1571     case 4: stwcx_(modval, addr_base); break;
1572     case 2: sthcx_(modval, addr_base); break;
1573     case 1: stbcx_(modval, addr_base); break;
1574     default: ShouldNotReachHere();
1575   }
1576 }
1577 
1578 // CmpxchgX sets condition register to cmpX(current, compare).
1579 void MacroAssembler::cmpxchg_generic(ConditionRegister flag, Register dest_current_value,
1580                                      Register compare_value, Register exchange_value,
1581                                      Register addr_base, Register tmp1, Register tmp2,
1582                                      int semantics, bool cmpxchgx_hint,
1583                                      Register int_flag_success, bool contention_hint, bool weak, int size) {
1584   Label retry;
1585   Label failed;
1586   Label done;
1587 
1588   // Save one branch if result is returned via register and
1589   // result register is different from the other ones.
1590   bool use_result_reg    = (int_flag_success != noreg);
1591   bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value &&
1592                             int_flag_success != exchange_value && int_flag_success != addr_base &&
1593                             int_flag_success != tmp1 && int_flag_success != tmp2);
1594   assert(!weak || flag == CCR0, "weak only supported with CCR0");
1595   assert(size == 1 || size == 2 || size == 4, "unsupported");
1596 
1597   if (use_result_reg && preset_result_reg) {
1598     li(int_flag_success, 0); // preset (assume cas failed)
1599   }
1600 
1601   // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1602   if (contention_hint) { // Don't try to reserve if cmp fails.
1603     switch (size) {
1604       case 1: lbz(dest_current_value, 0, addr_base); extsb(dest_current_value, dest_current_value); break;
1605       case 2: lha(dest_current_value, 0, addr_base); break;
1606       case 4: lwz(dest_current_value, 0, addr_base); break;
1607       default: ShouldNotReachHere();
1608     }
1609     cmpw(flag, dest_current_value, compare_value);
1610     bne(flag, failed);
1611   }
1612 
1613   // release/fence semantics
1614   if (semantics & MemBarRel) {
1615     release();
1616   }
1617 
1618   cmpxchg_loop_body(flag, dest_current_value, compare_value, exchange_value, addr_base, tmp1, tmp2,
1619                     retry, failed, cmpxchgx_hint, size);
1620   if (!weak || use_result_reg) {
1621     if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1622       bne_predict_not_taken(CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
1623     } else {
1624       bne(                  CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
1625     }
1626   }
1627   // fall through    => (flag == eq), (dest_current_value == compare_value), (swapped)
1628 
1629   // Result in register (must do this at the end because int_flag_success can be the
1630   // same register as one above).
1631   if (use_result_reg) {
1632     li(int_flag_success, 1);
1633   }
1634 
1635   if (semantics & MemBarFenceAfter) {
1636     fence();
1637   } else if (semantics & MemBarAcq) {
1638     isync();
1639   }
1640 
1641   if (use_result_reg && !preset_result_reg) {
1642     b(done);
1643   }
1644 
1645   bind(failed);
1646   if (use_result_reg && !preset_result_reg) {
1647     li(int_flag_success, 0);
1648   }
1649 
1650   bind(done);
1651   // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1652   // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1653 }
1654 
// Performs an atomic compare-exchange:
1656 //   if (compare_value == *addr_base)
1657 //     *addr_base = exchange_value
1658 //     int_flag_success = 1;
1659 //   else
1660 //     int_flag_success = 0;
1661 //
1662 // ConditionRegister flag       = cmp(compare_value, *addr_base)
1663 // Register dest_current_value  = *addr_base
1664 // Register compare_value       Used to compare with value in memory
1665 // Register exchange_value      Written to memory if compare_value == *addr_base
1666 // Register addr_base           The memory location to compareXChange
1667 // Register int_flag_success    Set to 1 if exchange_value was written to *addr_base
1668 //
// To avoid the costly compare-exchange, the value can be tested beforehand (contention_hint).
// Several special cases exist to avoid generating unnecessary code.
1671 //
1672 void MacroAssembler::cmpxchgd(ConditionRegister flag,
1673                               Register dest_current_value, RegisterOrConstant compare_value, Register exchange_value,
1674                               Register addr_base, int semantics, bool cmpxchgx_hint,
1675                               Register int_flag_success, Label* failed_ext, bool contention_hint, bool weak) {
1676   Label retry;
1677   Label failed_int;
1678   Label& failed = (failed_ext != NULL) ? *failed_ext : failed_int;
1679   Label done;
1680 
1681   // Save one branch if result is returned via register and result register is different from the other ones.
  bool use_result_reg    = (int_flag_success != noreg);
  bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value.register_or_noreg() &&
                            int_flag_success != exchange_value && int_flag_success != addr_base);
1685   assert(!weak || flag == CCR0, "weak only supported with CCR0");
1686   assert(int_flag_success == noreg || failed_ext == NULL, "cannot have both");
1687 
1688   if (use_result_reg && preset_result_reg) {
1689     li(int_flag_success, 0); // preset (assume cas failed)
1690   }
1691 
1692   // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1693   if (contention_hint) { // Don't try to reserve if cmp fails.
1694     ld(dest_current_value, 0, addr_base);
1695     cmpd(flag, compare_value, dest_current_value);
1696     bne(flag, failed);
1697   }
1698 
1699   // release/fence semantics
1700   if (semantics & MemBarRel) {
1701     release();
1702   }
1703 
1704   // atomic emulation loop
1705   bind(retry);
1706 
1707   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1708   cmpd(flag, compare_value, dest_current_value);
1709   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1710     bne_predict_not_taken(flag, failed);
1711   } else {
1712     bne(                  flag, failed);
1713   }
1714 
1715   stdcx_(exchange_value, addr_base);
1716   if (!weak || use_result_reg || failed_ext) {
1717     if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1718       bne_predict_not_taken(CCR0, weak ? failed : retry); // stXcx_ sets CCR0
1719     } else {
1720       bne(                  CCR0, weak ? failed : retry); // stXcx_ sets CCR0
1721     }
1722   }
1723 
1724   // result in register (must do this at the end because int_flag_success can be the same register as one above)
1725   if (use_result_reg) {
1726     li(int_flag_success, 1);
1727   }
1728 
1729   if (semantics & MemBarFenceAfter) {
1730     fence();
1731   } else if (semantics & MemBarAcq) {
1732     isync();
1733   }
1734 
1735   if (use_result_reg && !preset_result_reg) {
1736     b(done);
1737   }
1738 
1739   bind(failed_int);
1740   if (use_result_reg && !preset_result_reg) {
1741     li(int_flag_success, 0);
1742   }
1743 
1744   bind(done);
1745   // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1746   // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1747 }
1748 
1749 // Look up the method for a megamorphic invokeinterface call.
1750 // The target method is determined by <intf_klass, itable_index>.
1751 // The receiver klass is in recv_klass.
1752 // On success, the result will be in method_result, and execution falls through.
1753 // On failure, execution transfers to the given label.
1754 void MacroAssembler::lookup_interface_method(Register recv_klass,
1755                                              Register intf_klass,
1756                                              RegisterOrConstant itable_index,
1757                                              Register method_result,
1758                                              Register scan_temp,
1759                                              Register temp2,
1760                                              Label& L_no_such_interface,
1761                                              bool return_method) {
1762   assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);
1763 
1764   // Compute start of first itableOffsetEntry (which is at the end of the vtable).
1765   int vtable_base = in_bytes(Klass::vtable_start_offset());
1766   int itentry_off = itableMethodEntry::method_offset_in_bytes();
1767   int logMEsize   = exact_log2(itableMethodEntry::size() * wordSize);
1768   int scan_step   = itableOffsetEntry::size() * wordSize;
1769   int log_vte_size= exact_log2(vtableEntry::size_in_bytes());
1770 
1771   lwz(scan_temp, in_bytes(Klass::vtable_length_offset()), recv_klass);
1772   // %%% We should store the aligned, prescaled offset in the klassoop.
1773   // Then the next several instructions would fold away.
1774 
1775   sldi(scan_temp, scan_temp, log_vte_size);
1776   addi(scan_temp, scan_temp, vtable_base);
1777   add(scan_temp, recv_klass, scan_temp);
1778 
1779   // Adjust recv_klass by scaled itable_index, so we can free itable_index.
1780   if (return_method) {
1781     if (itable_index.is_register()) {
1782       Register itable_offset = itable_index.as_register();
1783       sldi(method_result, itable_offset, logMEsize);
1784       if (itentry_off) { addi(method_result, method_result, itentry_off); }
1785       add(method_result, method_result, recv_klass);
1786     } else {
1787       long itable_offset = (long)itable_index.as_constant();
1788       // static address, no relocation
1789       add_const_optimized(method_result, recv_klass, (itable_offset << logMEsize) + itentry_off, temp2);
1790     }
1791   }
1792 
1793   // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
1794   //   if (scan->interface() == intf) {
1795   //     result = (klass + scan->offset() + itable_index);
1796   //   }
1797   // }
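  // Itable layout assumed by this scan (sketch):
  //   recv_klass + vtable_start + vtable_length * vtable_entry_size
  //     --> itableOffsetEntry[0] = { interface, offset }
  //         itableOffsetEntry[1] = { interface, offset }
  //         ...                    (terminated by a NULL interface entry)
  //   recv_klass + offset --> itableMethodEntry[itable_index] = { Method* }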
1798   Label search, found_method;
1799 
1800   for (int peel = 1; peel >= 0; peel--) {
1801     // %%%% Could load both offset and interface in one ldx, if they were
1802     // in the opposite order. This would save a load.
1803     ld(temp2, itableOffsetEntry::interface_offset_in_bytes(), scan_temp);
1804 
1805     // Check that this entry is non-null. A null entry means that
1806     // the receiver class doesn't implement the interface, and wasn't the
1807     // same as when the caller was compiled.
1808     cmpd(CCR0, temp2, intf_klass);
1809 
1810     if (peel) {
1811       beq(CCR0, found_method);
1812     } else {
1813       bne(CCR0, search);
1814       // (invert the test to fall through to found_method...)
1815     }
1816 
1817     if (!peel) break;
1818 
1819     bind(search);
1820 
1821     cmpdi(CCR0, temp2, 0);
1822     beq(CCR0, L_no_such_interface);
1823     addi(scan_temp, scan_temp, scan_step);
1824   }
1825 
1826   bind(found_method);
1827 
1828   // Got a hit.
1829   if (return_method) {
1830     int ito_offset = itableOffsetEntry::offset_offset_in_bytes();
1831     lwz(scan_temp, ito_offset, scan_temp);
1832     ldx(method_result, scan_temp, method_result);
1833   }
1834 }
1835 
1836 // virtual method calling
1837 void MacroAssembler::lookup_virtual_method(Register recv_klass,
1838                                            RegisterOrConstant vtable_index,
1839                                            Register method_result) {
1840 
1841   assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg());
1842 
1843   const int base = in_bytes(Klass::vtable_start_offset());
1844   assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
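  // Equivalent pseudo-code (illustrative; note that the result is loaded
  // into R19_method below):
  //   R19_method = *(Method**)((address)recv_klass + vtable_start
  //                            + vtable_index * wordSize + method_offset);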
1845 
1846   if (vtable_index.is_register()) {
1847     sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord);
1848     add(recv_klass, vtable_index.as_register(), recv_klass);
1849   } else {
1850     addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord);
1851   }
1852   ld(R19_method, base + vtableEntry::method_offset_in_bytes(), recv_klass);
1853 }
1854 
1855 /////////////////////////////////////////// subtype checking ////////////////////////////////////////////
1856 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
1857                                                    Register super_klass,
1858                                                    Register temp1_reg,
1859                                                    Register temp2_reg,
1860                                                    Label* L_success,
1861                                                    Label* L_failure,
1862                                                    Label* L_slow_path,
1863                                                    RegisterOrConstant super_check_offset) {
1864 
1865   const Register check_cache_offset = temp1_reg;
1866   const Register cached_super       = temp2_reg;
1867 
1868   assert_different_registers(sub_klass, super_klass, check_cache_offset, cached_super);
1869 
1870   int sco_offset = in_bytes(Klass::super_check_offset_offset());
1871   int sc_offset  = in_bytes(Klass::secondary_super_cache_offset());
1872 
1873   bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
1874   bool need_slow_path = (must_load_sco || super_check_offset.constant_or_zero() == sco_offset);
1875 
1876   Label L_fallthrough;
1877   int label_nulls = 0;
1878   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
1879   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
1880   if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
1881   assert(label_nulls <= 1 ||
1882          (L_slow_path == &L_fallthrough && label_nulls <= 2 && !need_slow_path),
1883          "at most one NULL in the batch, usually");
1884 
1885   // If the pointers are equal, we are done (e.g., String[] elements).
1886   // This self-check enables sharing of secondary supertype arrays among
1887   // non-primary types such as array-of-interface. Otherwise, each such
1888   // type would need its own customized SSA.
1889   // We move this check to the front of the fast path because many
1890   // type checks are in fact trivially successful in this manner,
1891   // so we get a nicely predicted branch right at the start of the check.
1892   cmpd(CCR0, sub_klass, super_klass);
1893   beq(CCR0, *L_success);
1894 
1895   // Check the supertype display:
1896   if (must_load_sco) {
1897     // The super check offset is always positive...
1898     lwz(check_cache_offset, sco_offset, super_klass);
1899     super_check_offset = RegisterOrConstant(check_cache_offset);
    // super_check_offset is now a register.
1901     assert_different_registers(sub_klass, super_klass, cached_super, super_check_offset.as_register());
1902   }
1903   // The loaded value is the offset from KlassOopDesc.
1904 
1905   ld(cached_super, super_check_offset, sub_klass);
1906   cmpd(CCR0, cached_super, super_klass);
1907 
1908   // This check has worked decisively for primary supers.
1909   // Secondary supers are sought in the super_cache ('super_cache_addr').
1910   // (Secondary supers are interfaces and very deeply nested subtypes.)
1911   // This works in the same check above because of a tricky aliasing
1912   // between the super_cache and the primary super display elements.
1913   // (The 'super_check_addr' can address either, as the case requires.)
1914   // Note that the cache is updated below if it does not help us find
1915   // what we need immediately.
1916   // So if it was a primary super, we can just fail immediately.
1917   // Otherwise, it's the slow path for us (no success at this point).
1918 
1919 #define FINAL_JUMP(label) if (&(label) != &L_fallthrough) { b(label); }
1920 
1921   if (super_check_offset.is_register()) {
1922     beq(CCR0, *L_success);
1923     cmpwi(CCR0, super_check_offset.as_register(), sc_offset);
1924     if (L_failure == &L_fallthrough) {
1925       beq(CCR0, *L_slow_path);
1926     } else {
1927       bne(CCR0, *L_failure);
1928       FINAL_JUMP(*L_slow_path);
1929     }
1930   } else {
1931     if (super_check_offset.as_constant() == sc_offset) {
1932       // Need a slow path; fast failure is impossible.
1933       if (L_slow_path == &L_fallthrough) {
1934         beq(CCR0, *L_success);
1935       } else {
1936         bne(CCR0, *L_slow_path);
1937         FINAL_JUMP(*L_success);
1938       }
1939     } else {
1940       // No slow path; it's a fast decision.
1941       if (L_failure == &L_fallthrough) {
1942         beq(CCR0, *L_success);
1943       } else {
1944         bne(CCR0, *L_failure);
1945         FINAL_JUMP(*L_success);
1946       }
1947     }
1948   }
1949 
1950   bind(L_fallthrough);
1951 #undef FINAL_JUMP
1952 }
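// Pseudo-code for the fast path above (illustrative):
//   if (sub_klass == super_klass) goto L_success;
//   sco = super_klass->super_check_offset;               // load if not constant
//   if (*(Klass**)((address)sub_klass + sco) == super_klass) goto L_success;
//   if (sco == secondary_super_cache_offset) goto L_slow_path;   // need scan
//   goto L_failure;                                      // decisive miss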
1953 
1954 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
1955                                                    Register super_klass,
1956                                                    Register temp1_reg,
1957                                                    Register temp2_reg,
1958                                                    Label* L_success,
1959                                                    Register result_reg) {
1960   const Register array_ptr = temp1_reg; // current value from cache array
1961   const Register temp      = temp2_reg;
1962 
1963   assert_different_registers(sub_klass, super_klass, array_ptr, temp);
1964 
1965   int source_offset = in_bytes(Klass::secondary_supers_offset());
1966   int target_offset = in_bytes(Klass::secondary_super_cache_offset());
1967 
1968   int length_offset = Array<Klass*>::length_offset_in_bytes();
1969   int base_offset   = Array<Klass*>::base_offset_in_bytes();
1970 
1971   Label hit, loop, failure, fallthru;
1972 
1973   ld(array_ptr, source_offset, sub_klass);
1974 
1975   // TODO: PPC port: assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated.");
1976   lwz(temp, length_offset, array_ptr);
1977   cmpwi(CCR0, temp, 0);
  beq(CCR0, result_reg != noreg ? failure : fallthru); // length 0
1979 
1980   mtctr(temp); // load ctr
1981 
1982   bind(loop);
  // Entries in the table are no longer compressed (plain Klass*).
1984   ld(temp, base_offset, array_ptr);
1985   cmpd(CCR0, temp, super_klass);
1986   beq(CCR0, hit);
1987   addi(array_ptr, array_ptr, BytesPerWord);
1988   bdnz(loop);
1989 
1990   bind(failure);
  if (result_reg != noreg) { li(result_reg, 1); } // load non-zero result (indicates a miss)
1992   b(fallthru);
1993 
1994   bind(hit);
1995   std(super_klass, target_offset, sub_klass); // save result to cache
1996   if (result_reg != noreg) { li(result_reg, 0); } // load zero result (indicates a hit)
1997   if (L_success != NULL) { b(*L_success); }
1998   else if (result_reg == noreg) { blr(); } // return with CR0.eq if neither label nor result reg provided
1999 
2000   bind(fallthru);
2001 }
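// Pseudo-code for the slow path above (illustrative):
//   Array<Klass*>* ss = sub_klass->secondary_supers;
//   for (int i = 0; i < ss->length(); i++) {
//     if (ss->at(i) == super_klass) {
//       sub_klass->secondary_super_cache = super_klass;  // remember the hit
//       goto hit;    // result_reg = 0, branch to L_success if provided
//     }
//   }
//   // miss: result_reg = 1, fall through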
2002 
2003 // Try fast path, then go to slow one if not successful
2004 void MacroAssembler::check_klass_subtype(Register sub_klass,
2005                          Register super_klass,
2006                          Register temp1_reg,
2007                          Register temp2_reg,
2008                          Label& L_success) {
2009   Label L_failure;
2010   check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success, &L_failure);
2011   check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success);
2012   bind(L_failure); // Fallthru if not successful.
2013 }
2014 
2015 void MacroAssembler::clinit_barrier(Register klass, Register thread, Label* L_fast_path, Label* L_slow_path) {
2016   assert(L_fast_path != NULL || L_slow_path != NULL, "at least one is required");
2017 
2018   Label L_fallthrough;
2019   if (L_fast_path == NULL) {
2020     L_fast_path = &L_fallthrough;
2021   } else if (L_slow_path == NULL) {
2022     L_slow_path = &L_fallthrough;
2023   }
2024 
2025   // Fast path check: class is fully initialized
2026   lbz(R0, in_bytes(InstanceKlass::init_state_offset()), klass);
2027   cmpwi(CCR0, R0, InstanceKlass::fully_initialized);
2028   beq(CCR0, *L_fast_path);
2029 
2030   // Fast path check: current thread is initializer thread
2031   ld(R0, in_bytes(InstanceKlass::init_thread_offset()), klass);
2032   cmpd(CCR0, thread, R0);
2033   if (L_slow_path == &L_fallthrough) {
2034     beq(CCR0, *L_fast_path);
2035   } else if (L_fast_path == &L_fallthrough) {
2036     bne(CCR0, *L_slow_path);
2037   } else {
2038     Unimplemented();
2039   }
2040 
2041   bind(L_fallthrough);
2042 }
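// Pseudo-code for the barrier above (illustrative):
//   if (klass->init_state == fully_initialized) goto fast_path;
//   if (klass->init_thread == current_thread)   goto fast_path;
//   goto slow_path;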
2043 
2044 RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot,
2045                                                    Register temp_reg,
2046                                                    int extra_slot_offset) {
2047   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
2048   int stackElementSize = Interpreter::stackElementSize;
2049   int offset = extra_slot_offset * stackElementSize;
2050   if (arg_slot.is_constant()) {
2051     offset += arg_slot.as_constant() * stackElementSize;
2052     return offset;
2053   } else {
2054     assert(temp_reg != noreg, "must specify");
2055     sldi(temp_reg, arg_slot.as_register(), exact_log2(stackElementSize));
2056     if (offset != 0)
2057       addi(temp_reg, temp_reg, offset);
2058     return temp_reg;
2059   }
2060 }
2061 
2062 // Supports temp2_reg = R0.
2063 void MacroAssembler::biased_locking_enter(ConditionRegister cr_reg, Register obj_reg,
2064                                           Register mark_reg, Register temp_reg,
2065                                           Register temp2_reg, Label& done, Label* slow_case) {
2066   assert(UseBiasedLocking, "why call this otherwise?");
2067 
2068 #ifdef ASSERT
2069   assert_different_registers(obj_reg, mark_reg, temp_reg, temp2_reg);
2070 #endif
2071 
2072   Label cas_label;
2073 
2074   // Branch to done if fast path fails and no slow_case provided.
2075   Label *slow_case_int = (slow_case != NULL) ? slow_case : &done;
2076 
2077   // Biased locking
2078   // See whether the lock is currently biased toward our thread and
2079   // whether the epoch is still valid
2080   // Note that the runtime guarantees sufficient alignment of JavaThread
2081   // pointers to allow age to be placed into low bits
2082   assert(markWord::age_shift == markWord::lock_bits + markWord::biased_lock_bits,
2083          "biased locking makes assumptions about bit layout");
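  // 64-bit mark word layout assumed here (sketch; low bits rightmost):
  //   [ JavaThread* : 54 | epoch : 2 | unused : 1 | age : 4 | biased_lock : 1 | lock : 2 ]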
2084 
2085   if (PrintBiasedLockingStatistics) {
2086     load_const(temp2_reg, (address) BiasedLocking::total_entry_count_addr(), temp_reg);
2087     lwzx(temp_reg, temp2_reg);
2088     addi(temp_reg, temp_reg, 1);
2089     stwx(temp_reg, temp2_reg);
2090   }
2091 
2092   andi(temp_reg, mark_reg, markWord::biased_lock_mask_in_place);
2093   cmpwi(cr_reg, temp_reg, markWord::biased_lock_pattern);
2094   bne(cr_reg, cas_label);
2095 
2096   load_klass(temp_reg, obj_reg);
2097 
2098   load_const_optimized(temp2_reg, ~((int) markWord::age_mask_in_place));
2099   ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
2100   orr(temp_reg, R16_thread, temp_reg);
2101   xorr(temp_reg, mark_reg, temp_reg);
2102   andr(temp_reg, temp_reg, temp2_reg);
2103   cmpdi(cr_reg, temp_reg, 0);
2104   if (PrintBiasedLockingStatistics) {
2105     Label l;
2106     bne(cr_reg, l);
2107     load_const(temp2_reg, (address) BiasedLocking::biased_lock_entry_count_addr());
2108     lwzx(mark_reg, temp2_reg);
2109     addi(mark_reg, mark_reg, 1);
2110     stwx(mark_reg, temp2_reg);
2111     // restore mark_reg
2112     ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg);
2113     bind(l);
2114   }
2115   beq(cr_reg, done);
2116 
2117   Label try_revoke_bias;
2118   Label try_rebias;
2119 
2120   // At this point we know that the header has the bias pattern and
2121   // that we are not the bias owner in the current epoch. We need to
2122   // figure out more details about the state of the header in order to
2123   // know what operations can be legally performed on the object's
2124   // header.
2125 
2126   // If the low three bits in the xor result aren't clear, that means
2127   // the prototype header is no longer biased and we have to revoke
2128   // the bias on this object.
2129   andi(temp2_reg, temp_reg, markWord::biased_lock_mask_in_place);
2130   cmpwi(cr_reg, temp2_reg, 0);
2131   bne(cr_reg, try_revoke_bias);
2132 
2133   // Biasing is still enabled for this data type. See whether the
2134   // epoch of the current bias is still valid, meaning that the epoch
2135   // bits of the mark word are equal to the epoch bits of the
2136   // prototype header. (Note that the prototype header's epoch bits
2137   // only change at a safepoint.) If not, attempt to rebias the object
2138   // toward the current thread. Note that we must be absolutely sure
2139   // that the current epoch is invalid in order to do this because
2140   // otherwise the manipulations it performs on the mark word are
2141   // illegal.
2142 
2143   int shift_amount = 64 - markWord::epoch_shift;
2144   // rotate epoch bits to right (little) end and set other bits to 0
2145   // [ big part | epoch | little part ] -> [ 0..0 | epoch ]
2146   rldicl_(temp2_reg, temp_reg, shift_amount, 64 - markWord::epoch_bits);
2147   // branch if epoch bits are != 0, i.e. they differ, because the epoch has been incremented
2148   bne(CCR0, try_rebias);
2149 
2150   // The epoch of the current bias is still valid but we know nothing
2151   // about the owner; it might be set or it might be clear. Try to
2152   // acquire the bias of the object using an atomic operation. If this
2153   // fails we will go in to the runtime to revoke the object's bias.
2154   // Note that we first construct the presumed unbiased header so we
2155   // don't accidentally blow away another thread's valid bias.
2156   andi(mark_reg, mark_reg, (markWord::biased_lock_mask_in_place |
2157                                 markWord::age_mask_in_place |
2158                                 markWord::epoch_mask_in_place));
2159   orr(temp_reg, R16_thread, mark_reg);
2160 
2161   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2162 
2163   // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg).
2164   cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
2165            /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
2166            /*where=*/obj_reg,
2167            MacroAssembler::MemBarAcq,
2168            MacroAssembler::cmpxchgx_hint_acquire_lock(),
2169            noreg, slow_case_int); // bail out if failed
2170 
2171   // If the biasing toward our thread failed, this means that
2172   // another thread succeeded in biasing it toward itself and we
2173   // need to revoke that bias. The revocation will occur in the
2174   // interpreter runtime in the slow case.
2175   if (PrintBiasedLockingStatistics) {
2176     load_const(temp2_reg, (address) BiasedLocking::anonymously_biased_lock_entry_count_addr(), temp_reg);
2177     lwzx(temp_reg, temp2_reg);
2178     addi(temp_reg, temp_reg, 1);
2179     stwx(temp_reg, temp2_reg);
2180   }
2181   b(done);
2182 
2183   bind(try_rebias);
2184   // At this point we know the epoch has expired, meaning that the
2185   // current "bias owner", if any, is actually invalid. Under these
2186   // circumstances _only_, we are allowed to use the current header's
2187   // value as the comparison value when doing the cas to acquire the
2188   // bias in the current epoch. In other words, we allow transfer of
2189   // the bias from one thread to another directly in this situation.
2190   load_klass(temp_reg, obj_reg);
2191   andi(temp2_reg, mark_reg, markWord::age_mask_in_place);
2192   orr(temp2_reg, R16_thread, temp2_reg);
2193   ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
2194   orr(temp_reg, temp2_reg, temp_reg);
2195 
2196   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2197 
2198   cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
2199                  /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
2200                  /*where=*/obj_reg,
2201                  MacroAssembler::MemBarAcq,
2202                  MacroAssembler::cmpxchgx_hint_acquire_lock(),
2203                  noreg, slow_case_int); // bail out if failed
2204 
2205   // If the biasing toward our thread failed, this means that
2206   // another thread succeeded in biasing it toward itself and we
2207   // need to revoke that bias. The revocation will occur in the
2208   // interpreter runtime in the slow case.
2209   if (PrintBiasedLockingStatistics) {
2210     load_const(temp2_reg, (address) BiasedLocking::rebiased_lock_entry_count_addr(), temp_reg);
2211     lwzx(temp_reg, temp2_reg);
2212     addi(temp_reg, temp_reg, 1);
2213     stwx(temp_reg, temp2_reg);
2214   }
2215   b(done);
2216 
2217   bind(try_revoke_bias);
2218   // The prototype mark in the klass doesn't have the bias bit set any
2219   // more, indicating that objects of this data type are not supposed
2220   // to be biased any more. We are going to try to reset the mark of
2221   // this object to the prototype value and fall through to the
2222   // CAS-based locking scheme. Note that if our CAS fails, it means
2223   // that another thread raced us for the privilege of revoking the
2224   // bias of this particular object, so it's okay to continue in the
2225   // normal locking code.
2226   load_klass(temp_reg, obj_reg);
2227   ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
2228   andi(temp2_reg, mark_reg, markWord::age_mask_in_place);
2229   orr(temp_reg, temp_reg, temp2_reg);
2230 
2231   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2232 
2233   // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg).
2234   cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
2235                  /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
2236                  /*where=*/obj_reg,
2237                  MacroAssembler::MemBarAcq,
2238                  MacroAssembler::cmpxchgx_hint_acquire_lock());
2239 
2240   // reload markWord in mark_reg before continuing with lightweight locking
2241   ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg);
2242 
2243   // Fall through to the normal CAS-based lock, because no matter what
2244   // the result of the above CAS, some thread must have succeeded in
2245   // removing the bias bit from the object's header.
2246   if (PrintBiasedLockingStatistics) {
2247     Label l;
2248     bne(cr_reg, l);
2249     load_const(temp2_reg, (address) BiasedLocking::revoked_lock_entry_count_addr(), temp_reg);
2250     lwzx(temp_reg, temp2_reg);
2251     addi(temp_reg, temp_reg, 1);
2252     stwx(temp_reg, temp2_reg);
2253     bind(l);
2254   }
2255 
2256   bind(cas_label);
2257 }
2258 
2259 void MacroAssembler::biased_locking_exit (ConditionRegister cr_reg, Register mark_addr, Register temp_reg, Label& done) {
2260   // Check for biased locking unlock case, which is a no-op
2261   // Note: we do not have to check the thread ID for two reasons.
2262   // First, the interpreter checks for IllegalMonitorStateException at
2263   // a higher level. Second, if the bias was revoked while we held the
2264   // lock, the object could not be rebiased toward another thread, so
2265   // the bias bit would be clear.
2266 
2267   ld(temp_reg, 0, mark_addr);
2268   andi(temp_reg, temp_reg, markWord::biased_lock_mask_in_place);
2269 
2270   cmpwi(cr_reg, temp_reg, markWord::biased_lock_pattern);
2271   beq(cr_reg, done);
2272 }
2273 
2274 // allocation (for C1)
2275 void MacroAssembler::eden_allocate(
2276   Register obj,                      // result: pointer to object after successful allocation
2277   Register var_size_in_bytes,        // object size in bytes if unknown at compile time; invalid otherwise
2278   int      con_size_in_bytes,        // object size in bytes if   known at compile time
2279   Register t1,                       // temp register
2280   Register t2,                       // temp register
2281   Label&   slow_case                 // continuation point if fast allocation fails
2282 ) {
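  // Eden allocation is not implemented here; always take the slow path.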
2283   b(slow_case);
2284 }
2285 
2286 void MacroAssembler::tlab_allocate(
2287   Register obj,                      // result: pointer to object after successful allocation
2288   Register var_size_in_bytes,        // object size in bytes if unknown at compile time; invalid otherwise
2289   int      con_size_in_bytes,        // object size in bytes if   known at compile time
2290   Register t1,                       // temp register
2291   Label&   slow_case                 // continuation point if fast allocation fails
2292 ) {
2293   // make sure arguments make sense
2294   assert_different_registers(obj, var_size_in_bytes, t1);
2295   assert(0 <= con_size_in_bytes && is_simm16(con_size_in_bytes), "illegal object size");
2296   assert((con_size_in_bytes & MinObjAlignmentInBytesMask) == 0, "object size is not multiple of alignment");
2297 
2298   const Register new_top = t1;
2299   //verify_tlab(); not implemented
2300 
2301   ld(obj, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
2302   ld(R0, in_bytes(JavaThread::tlab_end_offset()), R16_thread);
2303   if (var_size_in_bytes == noreg) {
2304     addi(new_top, obj, con_size_in_bytes);
2305   } else {
2306     add(new_top, obj, var_size_in_bytes);
2307   }
2308   cmpld(CCR0, new_top, R0);
2309   bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_case);
2310 
2311 #ifdef ASSERT
2312   // make sure new free pointer is properly aligned
2313   {
2314     Label L;
2315     andi_(R0, new_top, MinObjAlignmentInBytesMask);
2316     beq(CCR0, L);
2317     stop("updated TLAB free is not properly aligned", 0x934);
2318     bind(L);
2319   }
2320 #endif // ASSERT
2321 
2322   // update the tlab top pointer
2323   std(new_top, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
2324   //verify_tlab(); not implemented
2325 }
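// Rough C sketch of the TLAB fast path above (illustrative only):
//   obj     = thread->tlab_top;
//   new_top = obj + size_in_bytes;
//   if (new_top > thread->tlab_end) goto slow_case;
//   thread->tlab_top = new_top;     // bump pointer; obj is the new object
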
2326 void MacroAssembler::incr_allocated_bytes(RegisterOrConstant size_in_bytes, Register t1, Register t2) {
2327   unimplemented("incr_allocated_bytes");
2328 }
2329 
2330 address MacroAssembler::emit_trampoline_stub(int destination_toc_offset,
2331                                              int insts_call_instruction_offset, Register Rtoc) {
2332   // Start the stub.
2333   address stub = start_a_stub(64);
2334   if (stub == NULL) { return NULL; } // CodeCache full: bail out
2335 
2336   // Create a trampoline stub relocation which relates this trampoline stub
2337   // with the call instruction at insts_call_instruction_offset in the
2338   // instructions code-section.
2339   relocate(trampoline_stub_Relocation::spec(code()->insts()->start() + insts_call_instruction_offset));
2340   const int stub_start_offset = offset();
2341 
2342   // For java_to_interp stubs we use R11_scratch1 as scratch register
2343   // and in call trampoline stubs we use R12_scratch2. This way we
2344   // can distinguish them (see is_NativeCallTrampolineStub_at()).
2345   Register reg_scratch = R12_scratch2;
2346 
2347   // Now, create the trampoline stub's code:
2348   // - load the TOC
2349   // - load the call target from the constant pool
2350   // - call
2351   if (Rtoc == noreg) {
2352     calculate_address_from_global_toc(reg_scratch, method_toc());
2353     Rtoc = reg_scratch;
2354   }
2355 
2356   ld_largeoffset_unchecked(reg_scratch, destination_toc_offset, Rtoc, false);
2357   mtctr(reg_scratch);
2358   bctr();
2359 
2360   const address stub_start_addr = addr_at(stub_start_offset);
2361 
2362   // Assert that the encoded destination_toc_offset can be identified and that it is correct.
2363   assert(destination_toc_offset == NativeCallTrampolineStub_at(stub_start_addr)->destination_toc_offset(),
2364          "encoded offset into the constant pool must match");
2365   // Trampoline_stub_size should be good.
2366   assert((uint)(offset() - stub_start_offset) <= trampoline_stub_size, "should be good size");
2367   assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");
2368 
2369   // End the stub.
2370   end_a_stub();
2371   return stub;
2372 }
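// Shape of the trampoline emitted above (sketch):
//   <load TOC into R12_scratch2, if no Rtoc was supplied>
//   ld    R12_scratch2, destination_toc_offset(Rtoc)  // call target from constant pool
//   mtctr R12_scratch2
//   bctr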
2373 
2374 // TM on PPC64.
2375 void MacroAssembler::atomic_inc_ptr(Register addr, Register result, int simm16) {
2376   Label retry;
2377   bind(retry);
2378   ldarx(result, addr, /*hint*/ false);
2379   addi(result, result, simm16);
2380   stdcx_(result, addr);
2381   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
2382     bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
2383   } else {
2384     bne(                  CCR0, retry); // stXcx_ sets CCR0
2385   }
2386 }
2387 
2388 void MacroAssembler::atomic_ori_int(Register addr, Register result, int uimm16) {
2389   Label retry;
2390   bind(retry);
2391   lwarx(result, addr, /*hint*/ false);
2392   ori(result, result, uimm16);
2393   stwcx_(result, addr);
2394   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
2395     bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
2396   } else {
2397     bne(                  CCR0, retry); // stXcx_ sets CCR0
2398   }
2399 }
2400 
2401 #if INCLUDE_RTM_OPT
2402 
2403 // Update rtm_counters based on abort status
2404 // input: abort_status
2405 //        rtm_counters_Reg (RTMLockingCounters*)
2406 void MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters_Reg) {
2407   // Mapping to keep PreciseRTMLockingStatistics similar to x86.
2408   // x86 ppc (! means inverted, ? means not the same)
2409   //  0   31  Set if abort caused by XABORT instruction.
2410   //  1  ! 7  If set, the transaction may succeed on a retry. This bit is always clear if bit 0 is set.
2411   //  2   13  Set if another logical processor conflicted with a memory address that was part of the transaction that aborted.
2412   //  3   10  Set if an internal buffer overflowed.
2413   //  4  ?12  Set if a debug breakpoint was hit.
2414   //  5  ?32  Set if an abort occurred during execution of a nested transaction.
2415   const int failure_bit[] = {tm_tabort, // Signal handler will set this too.
2416                              tm_failure_persistent,
2417                              tm_non_trans_cf,
2418                              tm_trans_cf,
2419                              tm_footprint_of,
2420                              tm_failure_code,
2421                              tm_transaction_level};
2422 
2423   const int num_failure_bits = sizeof(failure_bit) / sizeof(int);
2424   const int num_counters = RTMLockingCounters::ABORT_STATUS_LIMIT;
2425 
2426   const int bit2counter_map[][num_counters] =
2427   // 0 = no map; 1 = mapped, no inverted logic; -1 = mapped, inverted logic
2428   // Inverted logic means that if a bit is set don't count it, or vice-versa.
2429   // Care must be taken when mapping bits to counters as bits for a given
2430   // counter must be mutually exclusive. Otherwise, the counter will be
2431   // incremented more than once.
2432   // counters:
2433   // 0        1        2         3         4         5
2434   // abort  , persist, conflict, overflow, debug   , nested         bits:
2435   {{ 1      , 0      , 0       , 0       , 0       , 0      },   // abort
2436    { 0      , -1     , 0       , 0       , 0       , 0      },   // failure_persistent
2437    { 0      , 0      , 1       , 0       , 0       , 0      },   // non_trans_cf
2438    { 0      , 0      , 1       , 0       , 0       , 0      },   // trans_cf
2439    { 0      , 0      , 0       , 1       , 0       , 0      },   // footprint_of
2440    { 0      , 0      , 0       , 0       , -1      , 0      },   // failure_code = 0xD4
2441    { 0      , 0      , 0       , 0       , 0       , 1      }};  // transaction_level > 1
2443 
2444   // Move abort_status value to R0 and use abort_status register as a
2445   // temporary register because R0 as third operand in ld/std is treated
2446   // as base address zero (value). Likewise, R0 as second operand in addi
2447   // is problematic because it amounts to li.
2448   const Register temp_Reg = abort_status;
2449   const Register abort_status_R0 = R0;
2450   mr(abort_status_R0, abort_status);
2451 
2452   // Increment total abort counter.
2453   int counters_offs = RTMLockingCounters::abort_count_offset();
2454   ld(temp_Reg, counters_offs, rtm_counters_Reg);
2455   addi(temp_Reg, temp_Reg, 1);
2456   std(temp_Reg, counters_offs, rtm_counters_Reg);
2457 
2458   // Increment specific abort counters.
2459   if (PrintPreciseRTMLockingStatistics) {
2460 
2461     // #0 counter offset.
2462     int abortX_offs = RTMLockingCounters::abortX_count_offset();
2463 
2464     for (int nbit = 0; nbit < num_failure_bits; nbit++) {
2465       for (int ncounter = 0; ncounter < num_counters; ncounter++) {
2466         if (bit2counter_map[nbit][ncounter] != 0) {
2467           Label check_abort;
2468           int abort_counter_offs = abortX_offs + (ncounter << 3);
2469 
2470           if (failure_bit[nbit] == tm_transaction_level) {
2471             // Don't check outer transaction, TL = 1 (bit 63). Hence only
2472             // 11 bits in the TL field are checked to find out if failure
            // occurred in a nested transaction. This check also matches
2474             // the case when nesting_of = 1 (nesting overflow).
2475             rldicr_(temp_Reg, abort_status_R0, failure_bit[nbit], 10);
2476           } else if (failure_bit[nbit] == tm_failure_code) {
2477             // Check failure code for trap or illegal caught in TM.
2478             // Bits 0:7 are tested as bit 7 (persistent) is copied from
2479             // tabort or treclaim source operand.
2480             // On Linux: trap or illegal is TM_CAUSE_SIGNAL (0xD4).
2481             rldicl(temp_Reg, abort_status_R0, 8, 56);
2482             cmpdi(CCR0, temp_Reg, 0xD4);
2483           } else {
2484             rldicr_(temp_Reg, abort_status_R0, failure_bit[nbit], 0);
2485           }
2486 
2487           if (bit2counter_map[nbit][ncounter] == 1) {
2488             beq(CCR0, check_abort);
2489           } else {
2490             bne(CCR0, check_abort);
2491           }
2492 
2493           // We don't increment atomically.
2494           ld(temp_Reg, abort_counter_offs, rtm_counters_Reg);
2495           addi(temp_Reg, temp_Reg, 1);
2496           std(temp_Reg, abort_counter_offs, rtm_counters_Reg);
2497 
2498           bind(check_abort);
2499         }
2500       }
2501     }
2502   }
2503   // Restore abort_status.
2504   mr(abort_status, abort_status_R0);
2505 }
2506 
2507 // Branch if (random & (count-1) != 0), count is 2^n
2508 // tmp and CR0 are killed
2509 void MacroAssembler::branch_on_random_using_tb(Register tmp, int count, Label& brLabel) {
2510   mftb(tmp);
2511   andi_(tmp, tmp, count-1);
2512   bne(CCR0, brLabel);
2513 }
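// Pseudo-code (illustrative): the low bits of the timebase serve as a cheap
// pseudo-random source:
//   if ((mftb() & (count - 1)) != 0) goto brLabel;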
2514 
2515 // Perform abort ratio calculation, set no_rtm bit if high ratio.
2516 // input:  rtm_counters_Reg (RTMLockingCounters* address) - KILLED
2517 void MacroAssembler::rtm_abort_ratio_calculation(Register rtm_counters_Reg,
2518                                                  RTMLockingCounters* rtm_counters,
2519                                                  Metadata* method_data) {
2520   Label L_done, L_check_always_rtm1, L_check_always_rtm2;
2521 
2522   if (RTMLockingCalculationDelay > 0) {
2523     // Delay calculation.
2524     ld(rtm_counters_Reg, (RegisterOrConstant)(intptr_t)RTMLockingCounters::rtm_calculation_flag_addr());
2525     cmpdi(CCR0, rtm_counters_Reg, 0);
2526     beq(CCR0, L_done);
2527     load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload
2528   }
2529   // Abort ratio calculation only if abort_count > RTMAbortThreshold.
2530   //   Aborted transactions = abort_count * 100
2531   //   All transactions = total_count *  RTMTotalCountIncrRate
2532   //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
2533   ld(R0, RTMLockingCounters::abort_count_offset(), rtm_counters_Reg);
2534   if (is_simm(RTMAbortThreshold, 16)) {   // cmpdi can handle 16bit immediate only.
2535     cmpdi(CCR0, R0, RTMAbortThreshold);
2536     blt(CCR0, L_check_always_rtm2);  // reload of rtm_counters_Reg not necessary
2537   } else {
2538     load_const_optimized(rtm_counters_Reg, RTMAbortThreshold);
2539     cmpd(CCR0, R0, rtm_counters_Reg);
2540     blt(CCR0, L_check_always_rtm1);  // reload of rtm_counters_Reg required
2541   }
2542   mulli(R0, R0, 100);
2543 
2544   const Register tmpReg = rtm_counters_Reg;
2545   ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg);
2546   mulli(tmpReg, tmpReg, RTMTotalCountIncrRate); // allowable range: int16
2547   mulli(tmpReg, tmpReg, RTMAbortRatio);         // allowable range: int16
2548   cmpd(CCR0, R0, tmpReg);
2549   blt(CCR0, L_check_always_rtm1); // jump to reload
2550   if (method_data != NULL) {
2551     // Set rtm_state to "no rtm" in MDO.
2552     // Not using a metadata relocation. Method and Class Loader are kept alive anyway.
2553     // (See nmethod::metadata_do and CodeBuffer::finalize_oop_references.)
2554     load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg);
2555     atomic_ori_int(R0, tmpReg, NoRTM);
2556   }
2557   b(L_done);
2558 
2559   bind(L_check_always_rtm1);
2560   load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload
2561   bind(L_check_always_rtm2);
2562   ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg);
2563   int64_t thresholdValue = RTMLockingThreshold / RTMTotalCountIncrRate;
2564   if (is_simm(thresholdValue, 16)) {   // cmpdi can handle 16bit immediate only.
2565     cmpdi(CCR0, tmpReg, thresholdValue);
2566   } else {
2567     load_const_optimized(R0, thresholdValue);
2568     cmpd(CCR0, tmpReg, R0);
2569   }
2570   blt(CCR0, L_done);
2571   if (method_data != NULL) {
2572     // Set rtm_state to "always rtm" in MDO.
2573     // Not using a metadata relocation. See above.
2574     load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg);
2575     atomic_ori_int(R0, tmpReg, UseRTM);
2576   }
2577   bind(L_done);
2578 }
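// The decision computed above, as pseudo-code (illustrative; the MDO update
// is the atomic_ori_int of the rtm_state field):
//   if (abort_count >= RTMAbortThreshold &&
//       abort_count * 100 >= total_count * RTMTotalCountIncrRate * RTMAbortRatio) {
//     rtm_state = NoRTM;
//   } else if (total_count >= RTMLockingThreshold / RTMTotalCountIncrRate) {
//     rtm_state = UseRTM;
//   }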
2579 
2580 // Update counters and perform abort ratio calculation.
2581 // input: abort_status_Reg
2582 void MacroAssembler::rtm_profiling(Register abort_status_Reg, Register temp_Reg,
2583                                    RTMLockingCounters* rtm_counters,
2584                                    Metadata* method_data,
2585                                    bool profile_rtm) {
2586 
2587   assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2588   // Update rtm counters based on state at abort.
2589   // Reads abort_status_Reg, updates flags.
2590   assert_different_registers(abort_status_Reg, temp_Reg);
2591   load_const_optimized(temp_Reg, (address)rtm_counters, R0);
2592   rtm_counters_update(abort_status_Reg, temp_Reg);
2593   if (profile_rtm) {
2594     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2595     rtm_abort_ratio_calculation(temp_Reg, rtm_counters, method_data);
2596   }
2597 }
2598 
2599 // Retry on abort if abort's status indicates non-persistent failure.
2600 // inputs: retry_count_Reg
2601 //       : abort_status_Reg
2602 // output: retry_count_Reg decremented by 1
2603 void MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg,
2604                                              Label& retryLabel, Label* checkRetry) {
2605   Label doneRetry;
2606 
2607   // Don't retry if failure is persistent.
  // The persistent bit is set when (A) a disallowed operation is performed in
  // transactional state, for instance trying to write the TFHAR after a
  // transaction is started; or when there is (B) a Nesting Overflow (too many
  // nested transactions); or when (C) the Footprint overflows (too many
  // addresses touched in TM state so there is no more space in the footprint
  // area to track them); or in case of (D) a Self-Induced Conflict, i.e. a
2614   // store is performed to a given address in TM state, then once in suspended
2615   // state the same address is accessed. Failure (A) is very unlikely to occur
2616   // in the JVM. Failure (D) will never occur because Suspended state is never
2617   // used in the JVM. Thus mostly (B) a Nesting Overflow or (C) a Footprint
2618   // Overflow will set the persistent bit.
2619   rldicr_(R0, abort_status_Reg, tm_failure_persistent, 0);
2620   bne(CCR0, doneRetry);
2621 
2622   // Don't retry if transaction was deliberately aborted, i.e. caused by a
2623   // tabort instruction.
2624   rldicr_(R0, abort_status_Reg, tm_tabort, 0);
2625   bne(CCR0, doneRetry);
2626 
2627   // Retry if transaction aborted due to a conflict with another thread.
2628   if (checkRetry) { bind(*checkRetry); }
2629   addic_(retry_count_Reg, retry_count_Reg, -1);
2630   blt(CCR0, doneRetry);
2631   b(retryLabel);
2632   bind(doneRetry);
2633 }
2634 
2635 // Spin and retry if lock is busy.
2636 // inputs: owner_addr_Reg (monitor address)
2637 //       : retry_count_Reg
2638 // output: retry_count_Reg decremented by 1
2639 // CTR is killed
2640 void MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register owner_addr_Reg, Label& retryLabel) {
2641   Label SpinLoop, doneRetry, doRetry;
2642   addic_(retry_count_Reg, retry_count_Reg, -1);
2643   blt(CCR0, doneRetry);
2644 
2645   if (RTMSpinLoopCount > 1) {
2646     li(R0, RTMSpinLoopCount);
2647     mtctr(R0);
2648   }
2649 
2650   // low thread priority
2651   smt_prio_low();
2652   bind(SpinLoop);
2653 
2654   if (RTMSpinLoopCount > 1) {
2655     bdz(doRetry);
2656     ld(R0, 0, owner_addr_Reg);
2657     cmpdi(CCR0, R0, 0);
2658     bne(CCR0, SpinLoop);
2659   }
2660 
2661   bind(doRetry);
2662 
2663   // restore thread priority to default in userspace
2664 #ifdef LINUX
2665   smt_prio_medium_low();
2666 #else
2667   smt_prio_medium();
2668 #endif
2669 
2670   b(retryLabel);
2671 
2672   bind(doneRetry);
2673 }
2674 
2675 // Use RTM for normal stack locks.
2676 // Input: objReg (object to lock)
2677 void MacroAssembler::rtm_stack_locking(ConditionRegister flag,
2678                                        Register obj, Register mark_word, Register tmp,
2679                                        Register retry_on_abort_count_Reg,
2680                                        RTMLockingCounters* stack_rtm_counters,
2681                                        Metadata* method_data, bool profile_rtm,
2682                                        Label& DONE_LABEL, Label& IsInflated) {
2683   assert(UseRTMForStackLocks, "why call this otherwise?");
2684   assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
2685   Label L_rtm_retry, L_decrement_retry, L_on_abort;
2686 
2687   if (RTMRetryCount > 0) {
2688     load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
2689     bind(L_rtm_retry);
2690   }
2691   andi_(R0, mark_word, markWord::monitor_value);  // inflated vs stack-locked|neutral|biased
2692   bne(CCR0, IsInflated);
2693 
2694   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2695     Label L_noincrement;
2696     if (RTMTotalCountIncrRate > 1) {
2697       branch_on_random_using_tb(tmp, RTMTotalCountIncrRate, L_noincrement);
2698     }
2699     assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
2700     load_const_optimized(tmp, (address)stack_rtm_counters->total_count_addr(), R0);
2701     // Increment non-atomically (not atomic_inc_ptr): slight inaccuracy of this statistic is acceptable.
2702     ldx(mark_word, tmp);
2703     addi(mark_word, mark_word, 1);
2704     stdx(mark_word, tmp);
2705     bind(L_noincrement);
2706   }
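       // Start a transaction: if tbegin. fails to start one, or the transaction
       // later aborts, control arrives after the tbegin. with CCR0.EQ set and we
       // branch to the abort handler.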
2707   tbegin_();
2708   beq(CCR0, L_on_abort);
2709   ld(mark_word, oopDesc::mark_offset_in_bytes(), obj);      // Reload in transaction, conflicts need to be tracked.
2710   andi(R0, mark_word, markWord::biased_lock_mask_in_place); // look at 3 lock bits
2711   cmpwi(flag, R0, markWord::unlocked_value);                // bits = 001 unlocked
2712   beq(flag, DONE_LABEL);                                    // all done if unlocked
2713 
2714   if (UseRTMXendForLockBusy) {
2715     tend_();
2716     b(L_decrement_retry);
2717   } else {
2718     tabort_();
2719   }
2720   bind(L_on_abort);
2721   const Register abort_status_Reg = tmp;
2722   mftexasr(abort_status_Reg);
2723   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2724     rtm_profiling(abort_status_Reg, /*temp*/mark_word, stack_rtm_counters, method_data, profile_rtm);
2725   }
2726   ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // reload
2727   if (RTMRetryCount > 0) {
2728     // Retry on lock abort if abort status is not permanent.
2729     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry, &L_decrement_retry);
2730   } else {
2731     bind(L_decrement_retry);
2732   }
2733 }
2734 
2735 // Use RTM for inflated locks.
2736 // inputs: obj       (object to lock)
2737 //         mark_word (current header - KILLED)
2738 //         boxReg    (on-stack box address (displaced header location) - KILLED)
2739 void MacroAssembler::rtm_inflated_locking(ConditionRegister flag,
2740                                           Register obj, Register mark_word, Register boxReg,
2741                                           Register retry_on_busy_count_Reg, Register retry_on_abort_count_Reg,
2742                                           RTMLockingCounters* rtm_counters,
2743                                           Metadata* method_data, bool profile_rtm,
2744                                           Label& DONE_LABEL) {
2745   assert(UseRTMLocking, "why call this otherwise?");
2746   Label L_rtm_retry, L_decrement_retry, L_on_abort;
2747   // Clean monitor_value bit to get valid pointer.
2748   int owner_offset = ObjectMonitor::owner_offset_in_bytes() - markWord::monitor_value;
2749 
2750   // Store non-null, using boxReg instead of (intptr_t)markWord::unused_mark().
2751   std(boxReg, BasicLock::displaced_header_offset_in_bytes(), boxReg);
2752   const Register tmpReg = boxReg;
2753   const Register owner_addr_Reg = mark_word;
2754   addi(owner_addr_Reg, mark_word, owner_offset);
2755 
2756   if (RTMRetryCount > 0) {
2757     load_const_optimized(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy.
2758     load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort.
2759     bind(L_rtm_retry);
2760   }
2761   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2762     Label L_noincrement;
2763     if (RTMTotalCountIncrRate > 1) {
2764       branch_on_random_using_tb(R0, RTMTotalCountIncrRate, L_noincrement);
2765     }
2766     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2767     load_const(R0, (address)rtm_counters->total_count_addr(), tmpReg);
2768     // Increment non-atomically (not atomic_inc_ptr): slight inaccuracy of this statistic is acceptable.
2769     ldx(tmpReg, R0);
2770     addi(tmpReg, tmpReg, 1);
2771     stdx(tmpReg, R0);
2772     bind(L_noincrement);
2773   }
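       // Start the transactional region; as above, CCR0.EQ set after tbegin. means
       // the transaction aborted (or could not start) and we take the abort path.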
2774   tbegin_();
2775   beq(CCR0, L_on_abort);
2776   // We don't reload mark word. Will only be reset at safepoint.
2777   ld(R0, 0, owner_addr_Reg); // Load in transaction, conflicts need to be tracked.
2778   cmpdi(flag, R0, 0);
2779   beq(flag, DONE_LABEL);
2780 
2781   if (UseRTMXendForLockBusy) {
2782     tend_();
2783     b(L_decrement_retry);
2784   } else {
2785     tabort_();
2786   }
2787   bind(L_on_abort);
2788   const Register abort_status_Reg = tmpReg;
2789   mftexasr(abort_status_Reg);
2790   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2791     rtm_profiling(abort_status_Reg, /*temp*/ owner_addr_Reg, rtm_counters, method_data, profile_rtm);
2792     // Restore owner_addr_Reg
2793     ld(mark_word, oopDesc::mark_offset_in_bytes(), obj);
2794 #ifdef ASSERT
2795     andi_(R0, mark_word, markWord::monitor_value);
2796     asm_assert_ne("must be inflated", 0xa754); // Deflating only allowed at safepoint.
2797 #endif
2798     addi(owner_addr_Reg, mark_word, owner_offset);
2799   }
2800   if (RTMRetryCount > 0) {
2801     // Retry on lock abort if abort status is not permanent.
2802     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
2803   }
2804 
2805   // Appears unlocked - try to swing _owner from null to non-null.
2806   cmpxchgd(flag, /*current val*/ R0, (intptr_t)0, /*new val*/ R16_thread, owner_addr_Reg,
2807            MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2808            MacroAssembler::cmpxchgx_hint_acquire_lock(), noreg, &L_decrement_retry, true);
2809 
2810   if (RTMRetryCount > 0) {
2811     // Success: done. Otherwise spin and retry.
2812     b(DONE_LABEL);
2813     bind(L_decrement_retry);
2814     // Spin and retry if lock is busy.
2815     rtm_retry_lock_on_busy(retry_on_busy_count_Reg, owner_addr_Reg, L_rtm_retry);
2816   } else {
2817     bind(L_decrement_retry);
2818   }
2819 }
2820 
2821 #endif //  INCLUDE_RTM_OPT
2822 
2823 // "The box" is the space on the stack where we copy the object mark.
2824 void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register oop, Register box,
2825                                                Register temp, Register displaced_header, Register current_header,
2826                                                bool try_bias,
2827                                                RTMLockingCounters* rtm_counters,
2828                                                RTMLockingCounters* stack_rtm_counters,
2829                                                Metadata* method_data,
2830                                                bool use_rtm, bool profile_rtm) {
2831   assert_different_registers(oop, box, temp, displaced_header, current_header);
2832   assert(flag != CCR0, "bad condition register");
2833   Label cont;
2834   Label object_has_monitor;
2835   Label cas_failed;
2836 
2837   // Load markWord from object into displaced_header.
2838   ld(displaced_header, oopDesc::mark_offset_in_bytes(), oop);
2839 
2841   if (try_bias) {
2842     biased_locking_enter(flag, oop, displaced_header, temp, current_header, cont);
2843   }
2844 
2845 #if INCLUDE_RTM_OPT
2846   if (UseRTMForStackLocks && use_rtm) {
2847     rtm_stack_locking(flag, oop, displaced_header, temp, /*temp*/ current_header,
2848                       stack_rtm_counters, method_data, profile_rtm,
2849                       cont, object_has_monitor);
2850   }
2851 #endif // INCLUDE_RTM_OPT
2852 
2853   // Handle existing monitor.
2854   // The object has an existing monitor iff (mark & monitor_value) != 0.
2855   andi_(temp, displaced_header, markWord::monitor_value);
2856   bne(CCR0, object_has_monitor);
2857 
2858   // Set displaced_header to be (markWord of object | UNLOCK_VALUE).
2859   ori(displaced_header, displaced_header, markWord::unlocked_value);
2860 
2861   // displaced_header now holds the compare value for the CAS below.
2862 
2863   // Initialize the box. (Must happen before we update the object mark!)
2864   std(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2865 
2866   // Must fence, otherwise, preceding store(s) may float below cmpxchg.
2867   // Compare object markWord with mark and if equal exchange scratch1 with object markWord.
2868   cmpxchgd(/*flag=*/flag,
2869            /*current_value=*/current_header,
2870            /*compare_value=*/displaced_header,
2871            /*exchange_value=*/box,
2872            /*where=*/oop,
2873            MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2874            MacroAssembler::cmpxchgx_hint_acquire_lock(),
2875            noreg,
2876            &cas_failed,
2877            /*check without membar and ldarx first*/true);
2878   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2879 
2880   // If the compare-and-exchange succeeded, then we found an unlocked
2881   // object and we have now locked it.
2882   b(cont);
2883 
2884   bind(cas_failed);
2885   // We did not see an unlocked object so try the fast recursive case.
2886 
2887   // Check if the owner is self by comparing the value in the markWord of object
2888   // (current_header) with the stack pointer.
2889   sub(current_header, current_header, R1_SP);
2890   load_const_optimized(temp, ~(os::vm_page_size()-1) | markWord::lock_mask_in_place);
2891 
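       // temp = ~(page_size-1) | lock_mask: the AND below is 0 iff the markWord
       // points into our own stack page (small non-negative offset from SP) and
       // the lock bits are 00, i.e. the object is stack-locked by this thread.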
2892   and_(R0/*==0?*/, current_header, temp);
2893   // If the AND yields 0, this is a recursive lock by the current thread and
2894   // we can store 0 as the displaced header in the box, marking it recursive.
2895   mcrf(flag, CCR0);
2896   std(R0/*==0, perhaps*/, BasicLock::displaced_header_offset_in_bytes(), box);
2897 
2898   // Handle existing monitor.
2899   b(cont);
2900 
2901   bind(object_has_monitor);
2902   // The object's monitor m is unlocked iff m->owner == NULL,
2903   // otherwise m->owner may contain a thread or a stack address.
2904 
2905 #if INCLUDE_RTM_OPT
2906   // Use the same RTM locking code in 32- and 64-bit VM.
2907   if (use_rtm) {
2908     rtm_inflated_locking(flag, oop, displaced_header, box, temp, /*temp*/ current_header,
2909                          rtm_counters, method_data, profile_rtm, cont);
2910   } else {
2911 #endif // INCLUDE_RTM_OPT
2912 
2913   // Try to CAS m->owner from NULL to current thread.
2914   addi(temp, displaced_header, ObjectMonitor::owner_offset_in_bytes()-markWord::monitor_value);
2915   cmpxchgd(/*flag=*/flag,
2916            /*current_value=*/current_header,
2917            /*compare_value=*/(intptr_t)0,
2918            /*exchange_value=*/R16_thread,
2919            /*where=*/temp,
2920            MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2921            MacroAssembler::cmpxchgx_hint_acquire_lock());
2922 
2923   // Store a non-null value into the box.
2924   std(box, BasicLock::displaced_header_offset_in_bytes(), box);
2925 
2926 # ifdef ASSERT
2927   bne(flag, cont);
2928   // We have acquired the monitor, check some invariants.
2929   addi(/*monitor=*/temp, temp, -ObjectMonitor::owner_offset_in_bytes());
2930   // Invariant 1: _recursions should be 0.
2931   //assert(ObjectMonitor::recursions_size_in_bytes() == 8, "unexpected size");
2932   asm_assert_mem8_is_zero(ObjectMonitor::recursions_offset_in_bytes(), temp,
2933                             "monitor->_recursions should be 0", -1);
2934 # endif
2935 
2936 #if INCLUDE_RTM_OPT
2937   } // use_rtm()
2938 #endif
2939 
2940   bind(cont);
2941   // flag == EQ indicates success
2942   // flag == NE indicates failure
2943 }
2944 
2945 void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register oop, Register box,
2946                                                  Register temp, Register displaced_header, Register current_header,
2947                                                  bool try_bias, bool use_rtm) {
2948   assert_different_registers(oop, box, temp, displaced_header, current_header);
2949   assert(flag != CCR0, "bad condition register");
2950   Label cont;
2951   Label object_has_monitor;
2952 
2953   if (try_bias) {
2954     biased_locking_exit(flag, oop, current_header, cont);
2955   }
2956 
2957 #if INCLUDE_RTM_OPT
2958   if (UseRTMForStackLocks && use_rtm) {
2959     assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
2960     Label L_regular_unlock;
2961     ld(current_header, oopDesc::mark_offset_in_bytes(), oop);      // fetch markword
2962     andi(R0, current_header, markWord::biased_lock_mask_in_place); // look at 3 lock bits
2963     cmpwi(flag, R0, markWord::unlocked_value);                     // bits = 001 unlocked
2964     bne(flag, L_regular_unlock);                                   // else RegularLock
2965     tend_();                                                       // otherwise end...
2966     b(cont);                                                       // ... and we're done
2967     bind(L_regular_unlock);
2968   }
2969 #endif
2970 
2971   // Find the lock address and load the displaced header from the stack.
2972   ld(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2973 
2974   // If the displaced header is 0, we have a recursive unlock.
2975   cmpdi(flag, displaced_header, 0);
2976   beq(flag, cont);
2977 
2978   // Handle existing monitor.
2979   // The object has an existing monitor iff (mark & monitor_value) != 0.
2980   RTM_OPT_ONLY( if (!(UseRTMForStackLocks && use_rtm)) ) // skip load if already done
2981   ld(current_header, oopDesc::mark_offset_in_bytes(), oop);
2982   andi_(R0, current_header, markWord::monitor_value);
2983   bne(CCR0, object_has_monitor);
2984 
2985   // Check if it is still a lightweight lock; this is true if we see
2986   // the stack address of the basicLock in the markWord of the object.
2987   // Cmpxchg sets flag to cmpd(current_header, box).
2988   cmpxchgd(/*flag=*/flag,
2989            /*current_value=*/current_header,
2990            /*compare_value=*/box,
2991            /*exchange_value=*/displaced_header,
2992            /*where=*/oop,
2993            MacroAssembler::MemBarRel,
2994            MacroAssembler::cmpxchgx_hint_release_lock(),
2995            noreg,
2996            &cont);
2997 
2998   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2999 
3000   // Handle existing monitor.
3001   b(cont);
3002 
3003   bind(object_has_monitor);
3004   STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
3005   addi(current_header, current_header, -(int)markWord::monitor_value); // monitor
3006   ld(temp,             ObjectMonitor::owner_offset_in_bytes(), current_header);
3007 
3008   // It's inflated.
3009 #if INCLUDE_RTM_OPT
3010   if (use_rtm) {
3011     Label L_regular_inflated_unlock;
3012     // Clean monitor_value bit to get valid pointer
3013     cmpdi(flag, temp, 0);
3014     bne(flag, L_regular_inflated_unlock);
3015     tend_();
3016     b(cont);
3017     bind(L_regular_inflated_unlock);
3018   }
3019 #endif
3020 
3021   ld(displaced_header, ObjectMonitor::recursions_offset_in_bytes(), current_header);
3022   xorr(temp, R16_thread, temp);      // Will be 0 if we are the owner.
3023   orr(temp, temp, displaced_header); // Will be 0 if there are 0 recursions.
3024   cmpdi(flag, temp, 0);
3025   bne(flag, cont);
3026 
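       // We own the monitor with no recursions. The fast path may release it only
       // if no thread is waiting to enter (EntryList and cxq both NULL); otherwise
       // flag stays NE and the slow path performs the exit.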
3027   ld(temp,             ObjectMonitor::EntryList_offset_in_bytes(), current_header);
3028   ld(displaced_header, ObjectMonitor::cxq_offset_in_bytes(), current_header);
3029   orr(temp, temp, displaced_header); // Will be 0 if both are 0.
3030   cmpdi(flag, temp, 0);
3031   bne(flag, cont);
3032   release();
3033   std(temp, ObjectMonitor::owner_offset_in_bytes(), current_header);
3034 
3035   bind(cont);
3036   // flag == EQ indicates success
3037   // flag == NE indicates failure
3038 }
3039 
3040 void MacroAssembler::safepoint_poll(Label& slow_path, Register temp_reg) {
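       // Two polling schemes: with thread-local handshakes each thread polls a word
       // in its own JavaThread; otherwise the global safepoint state is inspected.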
3041   if (SafepointMechanism::uses_thread_local_poll()) {
3042     ld(temp_reg, in_bytes(Thread::polling_page_offset()), R16_thread);
3043     // Armed page has poll_bit set.
3044     andi_(temp_reg, temp_reg, SafepointMechanism::poll_bit());
3045   } else {
3046     lwz(temp_reg, (RegisterOrConstant)(intptr_t)SafepointSynchronize::address_of_state());
3047     cmpwi(CCR0, temp_reg, SafepointSynchronize::_not_synchronized);
3048   }
3049   bne(CCR0, slow_path);
3050 }
3051 
3052 void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2, bool needs_frame) {
3053   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3054   bs->resolve_jobject(this, value, tmp1, tmp2, needs_frame);
3055 }
3056 
3057 // Values for last_Java_pc, and last_Java_sp must comply to the rules
3058 // in frame_ppc.hpp.
3059 void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) {
3060   // Always set last_Java_pc and flags first because once last_Java_sp
3061   // is visible, has_last_Java_frame is true and users will look at the
3062   // rest of the fields. (Note: flags should always be zero before we
3063   // get here, so they don't need to be set.)
3064 
3065   // Verify that last_Java_pc was zeroed on return to Java
3066   asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread,
3067                           "last_Java_pc not zeroed before leaving Java", 0x200);
3068 
3069   // When returning from calling out from Java mode the frame anchor's
3070   // last_Java_pc will always be set to NULL. It is set here so that
3071   // if we are doing a call to native (not VM) that we capture the
3072   // known pc and don't have to rely on the native call having a
3073   // standard frame linkage where we can find the pc.
3074   if (last_Java_pc != noreg)
3075     std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
3076 
3077   // Set last_Java_sp last.
3078   std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
3079 }
3080 
3081 void MacroAssembler::reset_last_Java_frame(void) {
3082   asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()),
3083                              R16_thread, "SP was not set, still zero", 0x202);
3084 
3085   BLOCK_COMMENT("reset_last_Java_frame {");
3086   li(R0, 0);
3087 
3088   // _last_Java_sp = 0
3089   std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
3090 
3091   // _last_Java_pc = 0
3092   std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
3093   BLOCK_COMMENT("} reset_last_Java_frame");
3094 }
3095 
3096 void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1) {
3097   assert_different_registers(sp, tmp1);
3098 
3099   // sp points to a TOP_IJAVA_FRAME, retrieve frame's PC via
3100   // TOP_IJAVA_FRAME_ABI.
3101   // FIXME: assert that we really have a TOP_IJAVA_FRAME here!
3102   address entry = pc();
3103   load_const_optimized(tmp1, entry);
3104 
3105   set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1);
3106 }
3107 
3108 void MacroAssembler::get_vm_result(Register oop_result) {
3109   // Read:
3110   //   R16_thread
3111   //   R16_thread->in_bytes(JavaThread::vm_result_offset())
3112   //
3113   // Updated:
3114   //   oop_result
3115   //   R16_thread->in_bytes(JavaThread::vm_result_offset())
3116 
3117   verify_thread();
3118 
3119   ld(oop_result, in_bytes(JavaThread::vm_result_offset()), R16_thread);
3120   li(R0, 0);
3121   std(R0, in_bytes(JavaThread::vm_result_offset()), R16_thread);
3122 
3123   verify_oop(oop_result);
3124 }
3125 
3126 void MacroAssembler::get_vm_result_2(Register metadata_result) {
3127   // Read:
3128   //   R16_thread
3129   //   R16_thread->in_bytes(JavaThread::vm_result_2_offset())
3130   //
3131   // Updated:
3132   //   metadata_result
3133   //   R16_thread->in_bytes(JavaThread::vm_result_2_offset())
3134 
3135   ld(metadata_result, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
3136   li(R0, 0);
3137   std(R0, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
3138 }
3139 
3140 Register MacroAssembler::encode_klass_not_null(Register dst, Register src) {
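       // Encoding sketch: narrow_klass = (klass - base) >> shift, with base and
       // shift provided by CompressedKlassPointers.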
3141   Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided.
3142   if (CompressedKlassPointers::base() != 0) {
3143     // Use dst as temp if it is free.
3144     sub_const_optimized(dst, current, CompressedKlassPointers::base(), R0);
3145     current = dst;
3146   }
3147   if (CompressedKlassPointers::shift() != 0) {
3148     srdi(dst, current, CompressedKlassPointers::shift());
3149     current = dst;
3150   }
3151   return current;
3152 }
3153 
3154 void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) {
3155   if (UseCompressedClassPointers) {
3156     Register compressedKlass = encode_klass_not_null(ck, klass);
3157     stw(compressedKlass, oopDesc::klass_offset_in_bytes(), dst_oop);
3158   } else {
3159     std(klass, oopDesc::klass_offset_in_bytes(), dst_oop);
3160   }
3161 }
3162 
3163 void MacroAssembler::store_klass_gap(Register dst_oop, Register val) {
3164   if (UseCompressedClassPointers) {
3165     if (val == noreg) {
3166       val = R0;
3167       li(val, 0);
3168     }
3169     stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop); // klass gap if compressed
3170   }
3171 }
3172 
3173 int MacroAssembler::instr_size_for_decode_klass_not_null() {
3174   if (!UseCompressedClassPointers) return 0;
3175   int num_instrs = 1;  // shift or move
3176   if (CompressedKlassPointers::base() != 0) num_instrs = 7;  // shift + load const + add
3177   return num_instrs * BytesPerInstWord;
3178 }
3179 
3180 void MacroAssembler::decode_klass_not_null(Register dst, Register src) {
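       // Inverse of encode_klass_not_null: klass = (narrow_klass << shift) + base.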
3181   assert(dst != R0, "Dst reg may not be R0, as R0 is used here.");
3182   if (src == noreg) src = dst;
3183   Register shifted_src = src;
3184   if (CompressedKlassPointers::shift() != 0 ||
3185       (CompressedKlassPointers::base() == 0 && src != dst)) {  // Move required.
3186     shifted_src = dst;
3187     sldi(shifted_src, src, CompressedKlassPointers::shift());
3188   }
3189   if (CompressedKlassPointers::base() != 0) {
3190     add_const_optimized(dst, shifted_src, CompressedKlassPointers::base(), R0);
3191   }
3192 }
3193 
3194 void MacroAssembler::load_klass(Register dst, Register src) {
3195   if (UseCompressedClassPointers) {
3196     lwz(dst, oopDesc::klass_offset_in_bytes(), src);
3197     // Attention: no null check here!
3198     decode_klass_not_null(dst, dst);
3199   } else {
3200     ld(dst, oopDesc::klass_offset_in_bytes(), src);
3201   }
3202 }
3203 
3204 // ((OopHandle)result).resolve();
3205 void MacroAssembler::resolve_oop_handle(Register result) {
3206   // OopHandle::resolve is an indirection.
3207   ld(result, 0, result);
3208 }
3209 
3210 void MacroAssembler::load_mirror_from_const_method(Register mirror, Register const_method) {
3211   ld(mirror, in_bytes(ConstMethod::constants_offset()), const_method);
3212   ld(mirror, ConstantPool::pool_holder_offset_in_bytes(), mirror);
3213   ld(mirror, in_bytes(Klass::java_mirror_offset()), mirror);
3214   resolve_oop_handle(mirror);
3215 }
3216 
3217 void MacroAssembler::load_method_holder(Register holder, Register method) {
3218   ld(holder, in_bytes(Method::const_offset()), method);
3219   ld(holder, in_bytes(ConstMethod::constants_offset()), holder);
3220   ld(holder, ConstantPool::pool_holder_offset_in_bytes(), holder);
3221 }
3222 
3223 // Clear Array
3224 // For very short arrays. tmp == R0 is allowed.
3225 void MacroAssembler::clear_memory_unrolled(Register base_ptr, int cnt_dwords, Register tmp, int offset) {
3226   if (cnt_dwords > 0) { li(tmp, 0); }
3227   for (int i = 0; i < cnt_dwords; ++i) { std(tmp, offset + i * 8, base_ptr); }
3228 }
3229 
3230 // Version for constant short array length. Kills base_ptr. tmp == R0 is allowed.
3231 void MacroAssembler::clear_memory_constlen(Register base_ptr, int cnt_dwords, Register tmp) {
3232   if (cnt_dwords < 8) {
3233     clear_memory_unrolled(base_ptr, cnt_dwords, tmp);
3234     return;
3235   }
3236 
3237   Label loop;
3238   const long loopcnt   = cnt_dwords >> 1,
3239              remainder = cnt_dwords & 1;
3240 
3241   li(tmp, loopcnt);
3242   mtctr(tmp);
3243   li(tmp, 0);
3244   bind(loop);
3245     std(tmp, 0, base_ptr);
3246     std(tmp, 8, base_ptr);
3247     addi(base_ptr, base_ptr, 16);
3248     bdnz(loop);
3249   if (remainder) { std(tmp, 0, base_ptr); }
3250 }
3251 
3252 // Kills both input registers. tmp == R0 is allowed.
3253 void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp, long const_cnt) {
3254   // Procedure for large arrays (uses data cache block zero instruction).
3255     Label startloop, fast, fastloop, small_rest, restloop, done;
3256     const int cl_size         = VM_Version::L1_data_cache_line_size(),
3257               cl_dwords       = cl_size >> 3,
3258               cl_dw_addr_bits = exact_log2(cl_dwords),
3259               dcbz_min        = 1,  // Min count of dcbz executions, needs to be >0.
3260               min_cnt         = ((dcbz_min + 1) << cl_dw_addr_bits) - 1;
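         // min_cnt ensures that, even after aligning base_ptr up to a cache line
         // boundary, at least dcbz_min whole cache lines remain for the dcbz loop.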
3261 
3262   if (const_cnt >= 0) {
3263     // Constant case.
3264     if (const_cnt < min_cnt) {
3265       clear_memory_constlen(base_ptr, const_cnt, tmp);
3266       return;
3267     }
3268     load_const_optimized(cnt_dwords, const_cnt, tmp);
3269   } else {
3270     // cnt_dwords already loaded in register. Need to check size.
3271     cmpdi(CCR1, cnt_dwords, min_cnt); // Big enough? (ensure >= dcbz_min lines included).
3272     blt(CCR1, small_rest);
3273   }
3274     rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits); // Extract dword offset within first cache line.
3275     beq(CCR0, fast);                                  // Already 128byte aligned.
3276 
3277     subfic(tmp, tmp, cl_dwords);
3278     mtctr(tmp);                        // Set ctr to hit 128byte boundary (0<ctr<cl_dwords).
3279     subf(cnt_dwords, tmp, cnt_dwords); // rest.
3280     li(tmp, 0);
3281 
3282   bind(startloop);                     // Clear at the beginning to reach 128byte boundary.
3283     std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
3284     addi(base_ptr, base_ptr, 8);
3285     bdnz(startloop);
3286 
3287   bind(fast);                                  // Clear 128byte blocks.
3288     srdi(tmp, cnt_dwords, cl_dw_addr_bits);    // Loop count for 128byte loop (>0).
3289     andi(cnt_dwords, cnt_dwords, cl_dwords-1); // Rest in dwords.
3290     mtctr(tmp);                                // Load counter.
3291 
3292   bind(fastloop);
3293     dcbz(base_ptr);                    // Clear 128byte aligned block.
3294     addi(base_ptr, base_ptr, cl_size);
3295     bdnz(fastloop);
3296 
3297   bind(small_rest);
3298     cmpdi(CCR0, cnt_dwords, 0);        // size 0?
3299     beq(CCR0, done);                   // rest == 0
3300     li(tmp, 0);
3301     mtctr(cnt_dwords);                 // Load counter.
3302 
3303   bind(restloop);                      // Clear rest.
3304     std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
3305     addi(base_ptr, base_ptr, 8);
3306     bdnz(restloop);
3307 
3308   bind(done);
3309 }
3310 
3311 /////////////////////////////////////////// String intrinsics ////////////////////////////////////////////
3312 
3313 #ifdef COMPILER2
3314 // Intrinsics for CompactStrings
3315 
3316 // Compress char[] to byte[] by compressing 16 bytes at once.
3317 void MacroAssembler::string_compress_16(Register src, Register dst, Register cnt,
3318                                         Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5,
3319                                         Label& Lfailure) {
3320 
3321   const Register tmp0 = R0;
3322   assert_different_registers(src, dst, cnt, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5);
3323   Label Lloop, Lslow;
3324 
3325   // Check if cnt >= 8 (= 16 bytes)
3326   lis(tmp1, 0xFF);                // tmp1 = 0x00FF00FF00FF00FF
3327   srwi_(tmp2, cnt, 3);
3328   beq(CCR0, Lslow);
3329   ori(tmp1, tmp1, 0xFF);
3330   rldimi(tmp1, tmp1, 32, 0);
3331   mtctr(tmp2);
3332 
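       // Each iteration below handles 8 chars (16 bytes): tmp1 = 0x00FF00FF00FF00FF
       // masks the low byte of every big-endian char, so andc_ keeps only the high
       // bytes, which must all be zero for latin1; the rotate/insert sequence then
       // packs the 8 low bytes into two words.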
3333   // 2x unrolled loop
3334   bind(Lloop);
3335   ld(tmp2, 0, src);               // _0_1_2_3 (Big Endian)
3336   ld(tmp4, 8, src);               // _4_5_6_7
3337 
3338   orr(tmp0, tmp2, tmp4);
3339   rldicl(tmp3, tmp2, 6*8, 64-24); // _____1_2
3340   rldimi(tmp2, tmp2, 2*8, 2*8);   // _0_2_3_3
3341   rldicl(tmp5, tmp4, 6*8, 64-24); // _____5_6
3342   rldimi(tmp4, tmp4, 2*8, 2*8);   // _4_6_7_7
3343 
3344   andc_(tmp0, tmp0, tmp1);
3345   bne(CCR0, Lfailure);            // Not latin1.
3346   addi(src, src, 16);
3347 
3348   rlwimi(tmp3, tmp2, 0*8, 24, 31);// _____1_3
3349   srdi(tmp2, tmp2, 3*8);          // ____0_2_
3350   rlwimi(tmp5, tmp4, 0*8, 24, 31);// _____5_7
3351   srdi(tmp4, tmp4, 3*8);          // ____4_6_
3352 
3353   orr(tmp2, tmp2, tmp3);          // ____0123
3354   orr(tmp4, tmp4, tmp5);          // ____4567
3355 
3356   stw(tmp2, 0, dst);
3357   stw(tmp4, 4, dst);
3358   addi(dst, dst, 8);
3359   bdnz(Lloop);
3360 
3361   bind(Lslow);                    // Fallback to slow version
3362 }
3363 
3364 // Compress char[] to byte[]. cnt must be positive int.
3365 void MacroAssembler::string_compress(Register src, Register dst, Register cnt, Register tmp, Label& Lfailure) {
3366   Label Lloop;
3367   mtctr(cnt);
3368 
3369   bind(Lloop);
3370   lhz(tmp, 0, src);
3371   cmplwi(CCR0, tmp, 0xff);
3372   bgt(CCR0, Lfailure);            // Not latin1.
3373   addi(src, src, 2);
3374   stb(tmp, 0, dst);
3375   addi(dst, dst, 1);
3376   bdnz(Lloop);
3377 }
3378 
3379 // Inflate byte[] to char[] by inflating 16 bytes at once.
3380 void MacroAssembler::string_inflate_16(Register src, Register dst, Register cnt,
3381                                        Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) {
3382   const Register tmp0 = R0;
3383   assert_different_registers(src, dst, cnt, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5);
3384   Label Lloop, Lslow;
3385 
3386   // Check if cnt >= 8
3387   srwi_(tmp2, cnt, 3);
3388   beq(CCR0, Lslow);
3389   lis(tmp1, 0xFF);                // tmp1 = 0x00FF00FF
3390   ori(tmp1, tmp1, 0xFF);
3391   mtctr(tmp2);
3392 
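       // Inverse of string_compress_16: each iteration loads 8 bytes and widens
       // them to 8 chars by interleaving zero high bytes via rotate/insert.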
3393   // 2x unrolled loop
3394   bind(Lloop);
3395   lwz(tmp2, 0, src);              // ____0123 (Big Endian)
3396   lwz(tmp4, 4, src);              // ____4567
3397   addi(src, src, 8);
3398 
3399   rldicl(tmp3, tmp2, 7*8, 64-8);  // _______2
3400   rlwimi(tmp2, tmp2, 3*8, 16, 23);// ____0113
3401   rldicl(tmp5, tmp4, 7*8, 64-8);  // _______6
3402   rlwimi(tmp4, tmp4, 3*8, 16, 23);// ____4557
3403 
3404   andc(tmp0, tmp2, tmp1);         // ____0_1_
3405   rlwimi(tmp2, tmp3, 2*8, 0, 23); // _____2_3
3406   andc(tmp3, tmp4, tmp1);         // ____4_5_
3407   rlwimi(tmp4, tmp5, 2*8, 0, 23); // _____6_7
3408 
3409   rldimi(tmp2, tmp0, 3*8, 0*8);   // _0_1_2_3
3410   rldimi(tmp4, tmp3, 3*8, 0*8);   // _4_5_6_7
3411 
3412   std(tmp2, 0, dst);
3413   std(tmp4, 8, dst);
3414   addi(dst, dst, 16);
3415   bdnz(Lloop);
3416 
3417   bind(Lslow);                    // Fallback to slow version
3418 }
3419 
3420 // Inflate byte[] to char[]. cnt must be positive int.
3421 void MacroAssembler::string_inflate(Register src, Register dst, Register cnt, Register tmp) {
3422   Label Lloop;
3423   mtctr(cnt);
3424 
3425   bind(Lloop);
3426   lbz(tmp, 0, src);
3427   addi(src, src, 1);
3428   sth(tmp, 0, dst);
3429   addi(dst, dst, 2);
3430   bdnz(Lloop);
3431 }
3432 
3433 void MacroAssembler::string_compare(Register str1, Register str2,
3434                                     Register cnt1, Register cnt2,
3435                                     Register tmp1, Register result, int ae) {
3436   const Register tmp0 = R0,
3437                  diff = tmp1;
3438 
3439   assert_different_registers(str1, str2, cnt1, cnt2, tmp0, tmp1, result);
3440   Label Ldone, Lslow, Lloop, Lreturn_diff;
3441 
3442   // Note: Making use of the fact that compareTo(a, b) == -compareTo(b, a),
3443   // we interchange str1 and str2 in the UL case and negate the result.
3444   // This way, str1 is always latin1 encoded, except for the UU case.
3445   // In addition, the counts need to be zero-extended (the sign bit is 0).
3446 
3447   if (ae == StrIntrinsicNode::UU) {
3448     srwi(cnt1, cnt1, 1);
3449   } else {
3450     clrldi(cnt1, cnt1, 32);
3451   }
3452 
3453   if (ae != StrIntrinsicNode::LL) {
3454     srwi(cnt2, cnt2, 1);
3455   } else {
3456     clrldi(cnt2, cnt2, 32);
3457   }
3458 
3459   // See if the lengths are different, and calculate min in cnt1.
3460   // Save diff in case we need it for a tie-breaker.
3461   subf_(diff, cnt2, cnt1); // diff = cnt1 - cnt2
3462   // if (diff > 0) { cnt1 = cnt2; }
3463   if (VM_Version::has_isel()) {
3464     isel(cnt1, CCR0, Assembler::greater, /*invert*/ false, cnt2);
3465   } else {
3466     Label Lskip;
3467     blt(CCR0, Lskip);
3468     mr(cnt1, cnt2);
3469     bind(Lskip);
3470   }
3471 
3472   // Rename registers
3473   Register chr1 = result;
3474   Register chr2 = tmp0;
3475 
3476   // Compare multiple characters in fast loop (only implemented for same encoding).
3477   int stride1 = 8, stride2 = 8;
3478   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3479     int log2_chars_per_iter = (ae == StrIntrinsicNode::LL) ? 3 : 2;
3480     Label Lfastloop, Lskipfast;
3481 
3482     srwi_(tmp0, cnt1, log2_chars_per_iter);
3483     beq(CCR0, Lskipfast);
3484     rldicl(cnt2, cnt1, 0, 64 - log2_chars_per_iter); // Remaining characters.
3485     li(cnt1, 1 << log2_chars_per_iter); // Initialize for failure case: Rescan characters from current iteration.
3486     mtctr(tmp0);
3487 
3488     bind(Lfastloop);
3489     ld(chr1, 0, str1);
3490     ld(chr2, 0, str2);
3491     cmpd(CCR0, chr1, chr2);
3492     bne(CCR0, Lslow);
3493     addi(str1, str1, stride1);
3494     addi(str2, str2, stride2);
3495     bdnz(Lfastloop);
3496     mr(cnt1, cnt2); // Remaining characters.
3497     bind(Lskipfast);
3498   }
3499 
3500   // Loop which searches the first difference character by character.
3501   cmpwi(CCR0, cnt1, 0);
3502   beq(CCR0, Lreturn_diff);
3503   bind(Lslow);
3504   mtctr(cnt1);
3505 
3506   switch (ae) {
3507     case StrIntrinsicNode::LL: stride1 = 1; stride2 = 1; break;
3508     case StrIntrinsicNode::UL: // fallthru (see comment above)
3509     case StrIntrinsicNode::LU: stride1 = 1; stride2 = 2; break;
3510     case StrIntrinsicNode::UU: stride1 = 2; stride2 = 2; break;
3511     default: ShouldNotReachHere(); break;
3512   }
3513 
3514   bind(Lloop);
3515   if (stride1 == 1) { lbz(chr1, 0, str1); } else { lhz(chr1, 0, str1); }
3516   if (stride2 == 1) { lbz(chr2, 0, str2); } else { lhz(chr2, 0, str2); }
3517   subf_(result, chr2, chr1); // result = chr1 - chr2
3518   bne(CCR0, Ldone);
3519   addi(str1, str1, stride1);
3520   addi(str2, str2, stride2);
3521   bdnz(Lloop);
3522 
3523   // If strings are equal up to min length, return the length difference.
3524   bind(Lreturn_diff);
3525   mr(result, diff);
3526 
3527   // Otherwise, return the difference between the first mismatched chars.
3528   bind(Ldone);
3529   if (ae == StrIntrinsicNode::UL) {
3530     neg(result, result); // Negate result (see note above).
3531   }
3532 }
3533 
3534 void MacroAssembler::array_equals(bool is_array_equ, Register ary1, Register ary2,
3535                                   Register limit, Register tmp1, Register result, bool is_byte) {
3536   const Register tmp0 = R0;
3537   assert_different_registers(ary1, ary2, limit, tmp0, tmp1, result);
3538   Label Ldone, Lskiploop, Lloop, Lfastloop, Lskipfast;
3539   bool limit_needs_shift = false;
3540 
3541   if (is_array_equ) {
3542     const int length_offset = arrayOopDesc::length_offset_in_bytes();
3543     const int base_offset   = arrayOopDesc::base_offset_in_bytes(is_byte ? T_BYTE : T_CHAR);
3544 
3545     // Return true if the same array.
3546     cmpd(CCR0, ary1, ary2);
3547     beq(CCR0, Lskiploop);
3548 
3549     // Return false if one of them is NULL.
3550     cmpdi(CCR0, ary1, 0);
3551     cmpdi(CCR1, ary2, 0);
3552     li(result, 0);
3553     cror(CCR0, Assembler::equal, CCR1, Assembler::equal);
3554     beq(CCR0, Ldone);
3555 
3556     // Load the lengths of arrays.
3557     lwz(limit, length_offset, ary1);
3558     lwz(tmp0, length_offset, ary2);
3559 
3560     // Return false if the two arrays are not equal length.
3561     cmpw(CCR0, limit, tmp0);
3562     bne(CCR0, Ldone);
3563 
3564     // Load array addresses.
3565     addi(ary1, ary1, base_offset);
3566     addi(ary2, ary2, base_offset);
3567   } else {
3568     limit_needs_shift = !is_byte;
3569     li(result, 0); // Assume not equal.
3570   }
3571 
3572   // Rename registers
3573   Register chr1 = tmp0;
3574   Register chr2 = tmp1;
3575 
3576   // Compare 8 bytes per iteration in fast loop.
3577   const int log2_chars_per_iter = is_byte ? 3 : 2;
3578 
3579   srwi_(tmp0, limit, log2_chars_per_iter + (limit_needs_shift ? 1 : 0));
3580   beq(CCR0, Lskipfast);
3581   mtctr(tmp0);
3582 
3583   bind(Lfastloop);
3584   ld(chr1, 0, ary1);
3585   ld(chr2, 0, ary2);
3586   addi(ary1, ary1, 8);
3587   addi(ary2, ary2, 8);
3588   cmpd(CCR0, chr1, chr2);
3589   bne(CCR0, Ldone);
3590   bdnz(Lfastloop);
3591 
3592   bind(Lskipfast);
3593   rldicl_(limit, limit, limit_needs_shift ? 64 - 1 : 0, 64 - log2_chars_per_iter); // Remaining characters.
3594   beq(CCR0, Lskiploop);
3595   mtctr(limit);
3596 
3597   // Character by character.
3598   bind(Lloop);
3599   if (is_byte) {
3600     lbz(chr1, 0, ary1);
3601     lbz(chr2, 0, ary2);
3602     addi(ary1, ary1, 1);
3603     addi(ary2, ary2, 1);
3604   } else {
3605     lhz(chr1, 0, ary1);
3606     lhz(chr2, 0, ary2);
3607     addi(ary1, ary1, 2);
3608     addi(ary2, ary2, 2);
3609   }
3610   cmpw(CCR0, chr1, chr2);
3611   bne(CCR0, Ldone);
3612   bdnz(Lloop);
3613 
3614   bind(Lskiploop);
3615   li(result, 1); // All characters are equal.
3616   bind(Ldone);
3617 }
3618 
3619 void MacroAssembler::string_indexof(Register result, Register haystack, Register haycnt,
3620                                     Register needle, ciTypeArray* needle_values, Register needlecnt, int needlecntval,
3621                                     Register tmp1, Register tmp2, Register tmp3, Register tmp4, int ae) {
3622 
3623   // Ensure 0<needlecnt<=haycnt in ideal graph as prerequisite!
3624   Label L_TooShort, L_Found, L_NotFound, L_End;
3625   Register last_addr = haycnt, // Kill haycnt at the beginning.
3626   addr      = tmp1,
3627   n_start   = tmp2,
3628   ch1       = tmp3,
3629   ch2       = R0;
3630 
3631   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
3632   const int h_csize = (ae == StrIntrinsicNode::LL) ? 1 : 2;
3633   const int n_csize = (ae == StrIntrinsicNode::UU) ? 2 : 1;
3634 
3635   // **************************************************************************************************
3636   // Prepare for main loop: optimized for needle count >=2, bail out otherwise.
3637   // **************************************************************************************************
3638 
3639   // Compute last haystack addr to use if no match gets found.
3640   clrldi(haycnt, haycnt, 32);         // Ensure positive int is valid as 64 bit value.
3641   addi(addr, haystack, -h_csize);     // Accesses use pre-increment.
3642   if (needlecntval == 0) { // variable needlecnt
3643    cmpwi(CCR6, needlecnt, 2);
3644    clrldi(needlecnt, needlecnt, 32);  // Ensure positive int is valid as 64 bit value.
3645    blt(CCR6, L_TooShort);             // Variable needlecnt: handle short needle separately.
3646   }
3647 
3648   if (n_csize == 2) { lwz(n_start, 0, needle); } else { lhz(n_start, 0, needle); } // Load first 2 characters of needle.
3649 
3650   if (needlecntval == 0) { // variable needlecnt
3651    subf(ch1, needlecnt, haycnt);      // Last character index to compare is haycnt-needlecnt.
3652    addi(needlecnt, needlecnt, -2);    // Rest of needle.
3653   } else { // constant needlecnt
3654   guarantee(needlecntval != 1, "IndexOf with single-character needle must be handled separately");
3655   assert((needlecntval & 0x7fff) == needlecntval, "wrong immediate");
3656    addi(ch1, haycnt, -needlecntval);  // Last character index to compare is haycnt-needlecnt.
3657    if (needlecntval > 3) { li(needlecnt, needlecntval - 2); } // Rest of needle.
3658   }
3659 
3660   if (h_csize == 2) { slwi(ch1, ch1, 1); } // Scale to number of bytes.
3661 
3662   if (ae == StrIntrinsicNode::UL) {
3663    srwi(tmp4, n_start, 1*8);          // ___0
3664    rlwimi(n_start, tmp4, 2*8, 0, 23); // _0_1
3665   }
3666 
3667   add(last_addr, haystack, ch1);      // Point to last address to compare (haystack+2*(haycnt-needlecnt)).
3668 
3669   // Main Loop (now we have at least 2 characters).
3670   Label L_OuterLoop, L_InnerLoop, L_FinalCheck, L_Comp1, L_Comp2;
3671   bind(L_OuterLoop); // Search for 1st 2 characters.
3672   Register addr_diff = tmp4;
3673    subf(addr_diff, addr, last_addr);  // Difference between already checked address and last address to check.
3674    addi(addr, addr, h_csize);         // This is the new address we want to use for comparing.
3675    srdi_(ch2, addr_diff, h_csize);
3676    beq(CCR0, L_FinalCheck);           // 2 characters left?
3677    mtctr(ch2);                        // num of characters / 2
3678   bind(L_InnerLoop);                  // Main work horse (2x unrolled search loop)
3679    if (h_csize == 2) {                // Load 2 characters of haystack (ignore alignment).
3680     lwz(ch1, 0, addr);
3681     lwz(ch2, 2, addr);
3682    } else {
3683     lhz(ch1, 0, addr);
3684     lhz(ch2, 1, addr);
3685    }
3686    cmpw(CCR0, ch1, n_start);          // Compare 2 characters (1 would be sufficient but try to reduce branches to CompLoop).
3687    cmpw(CCR1, ch2, n_start);
3688    beq(CCR0, L_Comp1);                // Did we find the needle start?
3689    beq(CCR1, L_Comp2);
3690    addi(addr, addr, 2 * h_csize);
3691    bdnz(L_InnerLoop);
3692   bind(L_FinalCheck);
3693    andi_(addr_diff, addr_diff, h_csize); // Remaining characters not covered by InnerLoop: (num of characters) & 1.
3694    beq(CCR0, L_NotFound);
3695    if (h_csize == 2) { lwz(ch1, 0, addr); } else { lhz(ch1, 0, addr); } // One position left at which we have to compare.
3696    cmpw(CCR1, ch1, n_start);
3697    beq(CCR1, L_Comp1);
3698   bind(L_NotFound);
3699    li(result, -1);                    // not found
3700    b(L_End);
3701 
3702    // **************************************************************************************************
3703    // Special Case: unfortunately, the variable needle case can be called with needlecnt<2
3704    // **************************************************************************************************
3705   if (needlecntval == 0) {           // We have to handle these cases separately.
3706   Label L_OneCharLoop;
3707   bind(L_TooShort);
3708    mtctr(haycnt);
3709    if (n_csize == 2) { lhz(n_start, 0, needle); } else { lbz(n_start, 0, needle); } // First character of needle
3710   bind(L_OneCharLoop);
3711    if (h_csize == 2) { lhzu(ch1, 2, addr); } else { lbzu(ch1, 1, addr); }
3712    cmpw(CCR1, ch1, n_start);
3713    beq(CCR1, L_Found);               // Did we find the one character needle?
3714    bdnz(L_OneCharLoop);
3715    li(result, -1);                   // Not found.
3716    b(L_End);
3717   }
3718 
3719   // **************************************************************************************************
3720   // Regular Case Part II: compare rest of needle (first 2 characters have been compared already)
3721   // **************************************************************************************************
3722 
3723   // Compare the rest
3724   bind(L_Comp2);
3725    addi(addr, addr, h_csize);        // First comparison has failed, 2nd one hit.
3726   bind(L_Comp1);                     // Addr points to possible needle start.
3727   if (needlecntval != 2) {           // Const needlecnt==2?
3728    if (needlecntval != 3) {
3729     if (needlecntval == 0) { beq(CCR6, L_Found); } // Variable needlecnt==2?
3730     Register n_ind = tmp4,
3731              h_ind = n_ind;
3732     li(n_ind, 2 * n_csize);          // First 2 characters are already compared, use index 2.
3733     mtctr(needlecnt);                // Decremented by 2, still > 0.
3734    Label L_CompLoop;
3735    bind(L_CompLoop);
3736     if (ae == StrIntrinsicNode::UL) {
3737       h_ind = ch1;
3738       sldi(h_ind, n_ind, 1);
3739     }
3740     if (n_csize == 2) { lhzx(ch2, needle, n_ind); } else { lbzx(ch2, needle, n_ind); }
3741     if (h_csize == 2) { lhzx(ch1, addr, h_ind); } else { lbzx(ch1, addr, h_ind); }
3742     cmpw(CCR1, ch1, ch2);
3743     bne(CCR1, L_OuterLoop);
3744     addi(n_ind, n_ind, n_csize);
3745     bdnz(L_CompLoop);
3746    } else { // No loop required if there's only one needle character left.
3747     if (n_csize == 2) { lhz(ch2, 2 * 2, needle); } else { lbz(ch2, 2 * 1, needle); }
3748     if (h_csize == 2) { lhz(ch1, 2 * 2, addr); } else { lbz(ch1, 2 * 1, addr); }
3749     cmpw(CCR1, ch1, ch2);
3750     bne(CCR1, L_OuterLoop);
3751    }
3752   }
3753   // Return index ...
3754   bind(L_Found);
3755    subf(result, haystack, addr);     // relative to haystack, ...
3756    if (h_csize == 2) { srdi(result, result, 1); } // in characters.
3757   bind(L_End);
3758 } // string_indexof
3759 
3760 void MacroAssembler::string_indexof_char(Register result, Register haystack, Register haycnt,
3761                                          Register needle, jchar needleChar, Register tmp1, Register tmp2, bool is_byte) {
3762   assert_different_registers(haystack, haycnt, needle, tmp1, tmp2);
3763 
3764   Label L_InnerLoop, L_FinalCheck, L_Found1, L_Found2, L_NotFound, L_End;
3765   Register addr = tmp1,
3766            ch1 = tmp2,
3767            ch2 = R0;
3768 
3769   const int h_csize = is_byte ? 1 : 2;
3770 
3771 //4:
3772    srwi_(tmp2, haycnt, 1);   // Shift right by exact_log2(UNROLL_FACTOR).
3773    mr(addr, haystack);
3774    beq(CCR0, L_FinalCheck);
3775    mtctr(tmp2);              // Move to count register.
3776 //8:
3777   bind(L_InnerLoop);         // Main work horse (2x unrolled search loop).
3778    if (!is_byte) {
3779     lhz(ch1, 0, addr);
3780     lhz(ch2, 2, addr);
3781    } else {
3782     lbz(ch1, 0, addr);
3783     lbz(ch2, 1, addr);
3784    }
3785    (needle != R0) ? cmpw(CCR0, ch1, needle) : cmplwi(CCR0, ch1, (unsigned int)needleChar);
3786    (needle != R0) ? cmpw(CCR1, ch2, needle) : cmplwi(CCR1, ch2, (unsigned int)needleChar);
3787    beq(CCR0, L_Found1);      // Did we find the needle?
3788    beq(CCR1, L_Found2);
3789    addi(addr, addr, 2 * h_csize);
3790    bdnz(L_InnerLoop);
3791 //16:
3792   bind(L_FinalCheck);
3793    andi_(R0, haycnt, 1);
3794    beq(CCR0, L_NotFound);
3795    if (!is_byte) { lhz(ch1, 0, addr); } else { lbz(ch1, 0, addr); } // One position left at which we have to compare.
3796    (needle != R0) ? cmpw(CCR1, ch1, needle) : cmplwi(CCR1, ch1, (unsigned int)needleChar);
3797    beq(CCR1, L_Found1);
3798 //21:
3799   bind(L_NotFound);
3800    li(result, -1);           // Not found.
3801    b(L_End);
3802 
3803   bind(L_Found2);
3804    addi(addr, addr, h_csize);
3805 //24:
3806   bind(L_Found1);            // Return index ...
3807    subf(result, haystack, addr); // relative to haystack, ...
3808    if (!is_byte) { srdi(result, result, 1); } // in characters.
3809   bind(L_End);
3810 } // string_indexof_char
3811 
3812 
3813 void MacroAssembler::has_negatives(Register src, Register cnt, Register result,
3814                                    Register tmp1, Register tmp2) {
3815   const Register tmp0 = R0;
3816   assert_different_registers(src, result, cnt, tmp0, tmp1, tmp2);
3817   Label Lfastloop, Lslow, Lloop, Lnoneg, Ldone;
3818 
3819   // Check if cnt >= 8 (= 16 bytes)
3820   lis(tmp1, (int)(short)0x8080);  // tmp1 = 0x8080808080808080
3821   srwi_(tmp2, cnt, 4);
3822   li(result, 1);                  // Assume there's a negative byte.
3823   beq(CCR0, Lslow);
3824   ori(tmp1, tmp1, 0x8080);
3825   rldimi(tmp1, tmp1, 32, 0);
3826   mtctr(tmp2);
3827 
3828   // 2x unrolled loop
3829   bind(Lfastloop);
3830   ld(tmp2, 0, src);
3831   ld(tmp0, 8, src);
3832 
3833   orr(tmp0, tmp2, tmp0);
3834 
3835   and_(tmp0, tmp0, tmp1);
3836   bne(CCR0, Ldone);               // Found negative byte.
3837   addi(src, src, 16);
3838 
3839   bdnz(Lfastloop);
3840 
3841   bind(Lslow);                    // Fallback to slow version
3842   rldicl_(tmp0, cnt, 0, 64-4);
3843   beq(CCR0, Lnoneg);
3844   mtctr(tmp0);
3845   bind(Lloop);
3846   lbz(tmp0, 0, src);
3847   addi(src, src, 1);
3848   andi_(tmp0, tmp0, 0x80);
3849   bne(CCR0, Ldone);               // Found negative byte.
3850   bdnz(Lloop);
3851   bind(Lnoneg);
3852   li(result, 0);
3853 
3854   bind(Ldone);
3855 }
3856 
3857 #endif // COMPILER2
3858 
3859 // Helpers for Intrinsic Emitters
3860 //
3861 // Reverse the byte order of a 32-bit value in a register
3862 //   src: 0x44556677
3863 //   dst: 0x77665544
3864 // Three steps to obtain the result:
3865 //  1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word
3866 //     into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared.
3867 //     This value initializes dst.
3868 //  2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost
3869 //     byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go.
3870 //     This value is mask inserted into dst with a [0..23] mask of 1s.
3871 //  3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position.
3872 //     This value is mask inserted into dst with a [8..15] mask of 1s.
3873 void MacroAssembler::load_reverse_32(Register dst, Register src) {
3874   assert_different_registers(dst, src);
3875 
3876   rldicl(dst, src, (4+1)*8, 56);       // Rotate byte 4 into position 7 (rightmost), clear all to the left.
3877   rlwimi(dst, src,     3*8,  0, 23);   // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone.
3878   rlwimi(dst, src,     1*8,  8, 15);   // Insert byte 6 into position 5, leave the rest alone.
3879 }
3880 
3881 // Calculate the column addresses of the crc32 lookup table into distinct registers.
3882 // This loop-invariant calculation is moved out of the loop body, reducing the loop
3883 // body size from 20 to 16 instructions.
3884 // Returns the offset that was used to calculate the address of column tc3.
3885 // Due to register shortage, setting tc3 may overwrite table. With the return offset
3886 // at hand, the original table address can be easily reconstructed.
3887 int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) {
3888   assert(!VM_Version::has_vpmsumb(), "Vector version should be used instead!");
3889 
3890   // Point to 4 byte folding tables (byte-reversed version for Big Endian)
3891   // Layout: See StubRoutines::generate_crc_constants.
3892 #ifdef VM_LITTLE_ENDIAN
3893   const int ix0 = 3 * CRC32_TABLE_SIZE;
3894   const int ix1 = 2 * CRC32_TABLE_SIZE;
3895   const int ix2 = 1 * CRC32_TABLE_SIZE;
3896   const int ix3 = 0 * CRC32_TABLE_SIZE;
3897 #else
3898   const int ix0 = 1 * CRC32_TABLE_SIZE;
3899   const int ix1 = 2 * CRC32_TABLE_SIZE;
3900   const int ix2 = 3 * CRC32_TABLE_SIZE;
3901   const int ix3 = 4 * CRC32_TABLE_SIZE;
3902 #endif
3903   assert_different_registers(table, tc0, tc1, tc2);
3904   assert(table == tc3, "must be!");
3905 
3906   addi(tc0, table, ix0);
3907   addi(tc1, table, ix1);
3908   addi(tc2, table, ix2);
3909   if (ix3 != 0) addi(tc3, table, ix3);
3910 
3911   return ix3;
3912 }
3913 
3914 /**
3915  * uint32_t crc;
3916  * table[crc & 0xFF] ^ (crc >> 8);
3917  */
3918 void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) {
3919   assert_different_registers(crc, table, tmp);
3920   assert_different_registers(val, table);
3921 
3922   if (crc == val) {                   // Must rotate first to use the unmodified value.
3923     rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
3924                                       // As we use a word (4-byte) instruction, we have to adapt the mask bit positions.
3925     srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
3926   } else {
3927     srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
3928     rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
3929   }
3930   lwzx(tmp, table, tmp);
3931   xorr(crc, crc, tmp);
3932 }
3933 
3934 /**
3935  * Emits code to update CRC-32 with a byte value according to constants in table.
3936  *
3937  * @param [in,out]crc   Register containing the crc.
3938  * @param [in]val       Register containing the byte to fold into the CRC.
3939  * @param [in]table     Register containing the table of crc constants.
3940  *
3941  * uint32_t crc;
3942  * val = crc_table[(val ^ crc) & 0xFF];
3943  * crc = val ^ (crc >> 8);
3944  */
3945 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
3946   BLOCK_COMMENT("update_byte_crc32:");
3947   xorr(val, val, crc);
3948   fold_byte_crc32(crc, val, table, val);
3949 }
3950 
3951 /**
3952  * @param crc   register containing existing CRC (32-bit)
3953  * @param buf   register pointing to input byte buffer (byte*)
3954  * @param len   register containing number of bytes
3955  * @param table register pointing to CRC table
3956  */
3957 void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table,
3958                                            Register data, bool loopAlignment) {
3959   assert_different_registers(crc, buf, len, table, data);
3960 
3961   Label L_mainLoop, L_done;
3962   const int mainLoop_stepping  = 1;
3963   const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4;
3964 
3965   // Process all bytes in a single-byte loop.
3966   clrldi_(len, len, 32);                         // Enforce 32 bit. Anything to do?
3967   beq(CCR0, L_done);
3968 
3969   mtctr(len);
3970   align(mainLoop_alignment);
3971   BIND(L_mainLoop);
3972     lbz(data, 0, buf);                           // Byte from buffer, zero-extended.
3973     addi(buf, buf, mainLoop_stepping);           // Advance buffer position.
3974     update_byte_crc32(crc, data, table);
3975     bdnz(L_mainLoop);                            // Iterate.
3976 
3977   bind(L_done);
3978 }

/**
 * Emits code to update CRC-32 with a 4-byte value according to constants in table.
 * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c.
 */
// A note on the lookup table address(es):
// The implementation uses 4 table columns (byte-reversed versions for Big Endian).
// To save the effort of adding the column offset to the table address each time
// a table element is looked up, it is possible to pass the pre-calculated
// column addresses.
// Uses R9..R12 as work registers. They must be saved/restored by the caller, if necessary.
void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc,
                                        Register t0,  Register t1,  Register t2,  Register t3,
                                        Register tc0, Register tc1, Register tc2, Register tc3) {
  assert_different_registers(crc, t3);

  // XOR crc with next four bytes of buffer.
  lwz(t3, bufDisp, buf);
  if (bufInc != 0) {
    addi(buf, buf, bufInc);
  }
  xorr(t3, t3, crc);

  // Chop crc into 4 single-byte pieces, shifted left 2 bits, to form the table indices.
  rlwinm(t0, t3,  2,         24-2, 31-2);  // ((t3 >>  0) & 0xff) << 2
  rlwinm(t1, t3,  32+(2- 8), 24-2, 31-2);  // ((t3 >>  8) & 0xff) << 2
  rlwinm(t2, t3,  32+(2-16), 24-2, 31-2);  // ((t3 >> 16) & 0xff) << 2
  rlwinm(t3, t3,  32+(2-24), 24-2, 31-2);  // ((t3 >> 24) & 0xff) << 2

  // Use the pre-calculated column addresses.
  // Load pre-calculated table values.
  lwzx(t0, tc0, t0);
  lwzx(t1, tc1, t1);
  lwzx(t2, tc2, t2);
  lwzx(t3, tc3, t3);

  // Calculate new crc from table values.
  xorr(t0,  t0, t1);
  xorr(t2,  t2, t3);
  xorr(crc, t0, t2);  // Now crc contains the final checksum value.
}
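
// C reference of the word step above ("slicing by four", cf. zlib's crc32.c;
// a sketch with hypothetical names load_le32/col0..col3, not compiled):
//   uint32_t c = crc ^ load_le32(buf);  buf += 4;
//   crc = col0[c & 0xFF] ^ col1[(c >> 8) & 0xFF]
//       ^ col2[(c >> 16) & 0xFF] ^ col3[c >> 24];
// where col0..col3 are the four table columns whose base addresses were
// pre-calculated into tc0..tc3.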

/**
 * @param crc   register containing existing CRC (32-bit)
 * @param buf   register pointing to input byte buffer (byte*)
 * @param len   register containing number of bytes
 * @param table register pointing to CRC table
 *
 * Uses R9..R12 as work registers. They must be saved/restored by the caller!
 */
void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table,
                                        Register t0,  Register t1,  Register t2,  Register t3,
                                        Register tc0, Register tc1, Register tc2, Register tc3,
                                        bool invertCRC) {
  assert_different_registers(crc, buf, len, table);

  Label L_mainLoop, L_tail;
  Register  tmp          = t0;
  Register  data         = t0;
  Register  tmp2         = t1;
  const int mainLoop_stepping  = 4;
  const int tailLoop_stepping  = 1;
  const int log_stepping       = exact_log2(mainLoop_stepping);
  const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
  const int complexThreshold   = 2*mainLoop_stepping;

  // Don't test for len <= 0 here. This pathological case should not occur anyway.
  // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles
  // for all well-behaved cases. The situation itself is detected and handled correctly
  // within update_byteLoop_crc32.
  assert(tailLoop_stepping == 1, "check tailLoop_stepping!");

  BLOCK_COMMENT("kernel_crc32_1word {");

  if (invertCRC) {
    nand(crc, crc, crc);                      // 1s complement of crc
  }

  // Check for short (<complexThreshold) buffer.
  cmpdi(CCR0, len, complexThreshold);
  blt(CCR0, L_tail);

  // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
  // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
  {
    // Align buf addr to mainLoop_stepping boundary.
    neg(tmp2, buf);                              // Calculate # preLoop iterations for alignment.
    rldicl(tmp2, tmp2, 0, 64-log_stepping);      // Keep only the lowest log_stepping bits (mask with 1s in bits 62..63).

    if (complexThreshold > mainLoop_stepping) {
      sub(len, len, tmp2);                       // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
    } else {
      sub(tmp, len, tmp2);                       // Remaining bytes for main loop.
      cmpdi(CCR0, tmp, mainLoop_stepping);
      blt(CCR0, L_tail);                         // For less than one mainLoop_stepping left, do only tail processing.
      mr(len, tmp);                              // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
    }
    update_byteLoop_crc32(crc, buf, tmp2, table, data, false);
  }

  srdi(tmp2, len, log_stepping);                 // #iterations for mainLoop
  andi(len, len, mainLoop_stepping-1);           // remaining bytes for tailLoop
  mtctr(tmp2);

#ifdef VM_LITTLE_ENDIAN
  Register crc_rv = crc;
#else
  Register crc_rv = tmp;                         // Load_reverse needs separate registers to work on.
                                                 // Occupies tmp, but frees up crc.
  load_reverse_32(crc_rv, crc);                  // Reverse byte order because we are dealing with big-endian data.
  tmp = crc;
#endif

  int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);

  align(mainLoop_alignment);                     // Octoword-aligned loop address. Shows 2% improvement.
  BIND(L_mainLoop);
    update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
    bdnz(L_mainLoop);

#ifndef VM_LITTLE_ENDIAN
  load_reverse_32(crc, crc_rv);                  // Reverse byte order because we are dealing with big-endian data.
  tmp = crc_rv;                                  // tmp uses its original register again.
#endif

  // Restore original table address for tailLoop.
  if (reconstructTableOffset != 0) {
    addi(table, table, -reconstructTableOffset);
  }

  // Process last few (<complexThreshold) bytes of buffer.
  BIND(L_tail);
  update_byteLoop_crc32(crc, buf, len, table, data, false);

  if (invertCRC) {
    nand(crc, crc, crc);                      // 1s complement of crc
  }
  BLOCK_COMMENT("} kernel_crc32_1word");
}

/**
 * @param crc             register containing existing CRC (32-bit)
 * @param buf             register pointing to input byte buffer (byte*)
 * @param len             register containing number of bytes
 * @param constants       register pointing to precomputed constants
 * @param t0-t6           temp registers
 */
void MacroAssembler::kernel_crc32_vpmsum(Register crc, Register buf, Register len, Register constants,
                                         Register t0, Register t1, Register t2, Register t3,
                                         Register t4, Register t5, Register t6, bool invertCRC) {
  assert_different_registers(crc, buf, len, constants);

  Label L_tail;

  BLOCK_COMMENT("kernel_crc32_vpmsum {");

  if (invertCRC) {
    nand(crc, crc, crc);                      // 1s complement of crc
  }

  // Enforce 32 bit.
  clrldi(len, len, 32);

  // Align if we have enough bytes for the fast version.
  const int alignment = 16,
            threshold = 32;
  Register prealign = t0;

  neg(prealign, buf);
  addi(t1, len, -threshold);
  andi(prealign, prealign, alignment - 1);
  cmpw(CCR0, t1, prealign);
  blt(CCR0, L_tail); // len - prealign < threshold?

  subf(len, prealign, len);
  update_byteLoop_crc32(crc, buf, prealign, constants, t2, false);

  // Calculate from first aligned address as far as possible.
  addi(constants, constants, CRC32_TABLE_SIZE); // Point to vector constants.
  kernel_crc32_vpmsum_aligned(crc, buf, len, constants, t0, t1, t2, t3, t4, t5, t6);
  addi(constants, constants, -CRC32_TABLE_SIZE); // Point to table again.

  // Remaining bytes.
  BIND(L_tail);
  update_byteLoop_crc32(crc, buf, len, constants, t2, false);

  if (invertCRC) {
    nand(crc, crc, crc);                      // 1s complement of crc
  }

  BLOCK_COMMENT("} kernel_crc32_vpmsum");
}
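
// The pre-alignment check above, in C (a sketch, not compiled):
//   size_t prealign = (0 - (uintptr_t)buf) & (alignment - 1); // bytes to the next 16-byte boundary
//   if ((int32_t)(len - threshold) < (int32_t)prealign) goto tail; // too short for the fast path
//   len -= prealign;  // the byte loop consumes the unaligned prefix first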

/**
 * @param crc             register containing existing CRC (32-bit)
 * @param buf             register pointing to input byte buffer (byte*)
 * @param len             register containing number of bytes (will get updated to remaining bytes)
 * @param constants       register pointing to CRC table for 128-bit aligned memory
 * @param t0-t6           temp registers
 */
void MacroAssembler::kernel_crc32_vpmsum_aligned(Register crc, Register buf, Register len, Register constants,
    Register t0, Register t1, Register t2, Register t3, Register t4, Register t5, Register t6) {

  // Save non-volatile vector registers (frameless).
  Register offset = t1;
  int offsetInt = 0;
  offsetInt -= 16; li(offset, offsetInt); stvx(VR20, offset, R1_SP);
  offsetInt -= 16; li(offset, offsetInt); stvx(VR21, offset, R1_SP);
  offsetInt -= 16; li(offset, offsetInt); stvx(VR22, offset, R1_SP);
  offsetInt -= 16; li(offset, offsetInt); stvx(VR23, offset, R1_SP);
  offsetInt -= 16; li(offset, offsetInt); stvx(VR24, offset, R1_SP);
  offsetInt -= 16; li(offset, offsetInt); stvx(VR25, offset, R1_SP);
#ifndef VM_LITTLE_ENDIAN
  offsetInt -= 16; li(offset, offsetInt); stvx(VR26, offset, R1_SP);
#endif
  offsetInt -= 8; std(R14, offsetInt, R1_SP);
  offsetInt -= 8; std(R15, offsetInt, R1_SP);

  // The implementation uses an inner loop which processes between 256 and 16 * unroll_factor
  // bytes per iteration. The basic scheme is:
  // lvx: load vector (Big Endian needs reversal)
  // vpmsumw: carry-less 32 bit multiplications with constant representing a large CRC shift
  // vxor: xor partial results together to get unroll_factor2 vectors

  // Outer loop performs the CRC shifts needed to combine the unroll_factor2 vectors.

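  // For reference (not compiled): vpmsumw roughly performs carry-less (GF(2))
  // multiplications of 32-bit lanes, xor-summing pairs of products into 64-bit
  // lanes. One such product, modeled in plain C:
  //   uint64_t clmul32(uint32_t a, uint32_t b) {
  //     uint64_t r = 0;
  //     for (int i = 0; i < 32; ++i) {
  //       if ((b >> i) & 1) r ^= (uint64_t)a << i;  // xor instead of add: no carries
  //     }
  //     return r;
  //   }
  // Multiplying a CRC remainder by a precomputed constant (x^N mod P) shifts it
  // by N bits in a single instruction.
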
  // Using 16 * unroll_factor / unroll_factor2 bytes for constants.
  const int unroll_factor = CRC32_UNROLL_FACTOR,
            unroll_factor2 = CRC32_UNROLL_FACTOR2;

  const int outer_consts_size = (unroll_factor2 - 1) * 16,
            inner_consts_size = (unroll_factor / unroll_factor2) * 16;

  // Support registers.
  Register offs[] = { noreg, t0, t1, t2, t3, t4, t5, t6 };
  Register num_bytes = R14,
           loop_count = R15,
           cur_const = crc; // will live in VCRC
  // Constant array for outer loop: unroll_factor2 - 1 registers,
  // Constant array for inner loop: unroll_factor / unroll_factor2 registers.
  VectorRegister consts0[] = { VR16, VR17, VR18, VR19, VR20, VR21, VR22 },
                 consts1[] = { VR23, VR24 };
  // Data register arrays: 2 arrays with unroll_factor2 registers.
  VectorRegister data0[] = { VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7 },
                 data1[] = { VR8, VR9, VR10, VR11, VR12, VR13, VR14, VR15 };

  VectorRegister VCRC = data0[0];
  VectorRegister Vc = VR25;
  VectorRegister swap_bytes = VR26; // Only for Big Endian.

  // We have at least 1 iteration (ensured by caller).
  Label L_outer_loop, L_inner_loop, L_last;

  // If supported set DSCR pre-fetch to deepest.
  if (VM_Version::has_mfdscr()) {
    load_const_optimized(t0, VM_Version::_dscr_val | 7);
    mtdscr(t0);
  }

  mtvrwz(VCRC, crc); // crc lives in VCRC now.

  for (int i = 1; i < unroll_factor2; ++i) {
    li(offs[i], 16 * i);
  }

  // Load consts for outer loop.
  lvx(consts0[0], constants);
  for (int i = 1; i < unroll_factor2 - 1; ++i) {
    lvx(consts0[i], offs[i], constants);
  }

  load_const_optimized(num_bytes, 16 * unroll_factor);

  // Reuse data registers outside of the loop.
  VectorRegister Vtmp = data1[0];
  VectorRegister Vtmp2 = data1[1];
  VectorRegister zeroes = data1[2];

  vspltisb(Vtmp, 0);
  vsldoi(VCRC, Vtmp, VCRC, 8); // 96 bit zeroes, 32 bit CRC.

  // Load vector for vpermxor (to xor both 64 bit parts together).
  lvsl(Vtmp, buf);   // 000102030405060708090a0b0c0d0e0f
  vspltisb(Vc, 4);
  vsl(Vc, Vtmp, Vc); // 00102030405060708090a0b0c0d0e0f0
  xxspltd(Vc->to_vsr(), Vc->to_vsr(), 0);
  vor(Vc, Vtmp, Vc); // 001122334455667708192a3b4c5d6e7f

#ifdef VM_LITTLE_ENDIAN
#define BE_swap_bytes(x)
#else
  vspltisb(Vtmp2, 0xf);
  vxor(swap_bytes, Vtmp, Vtmp2);
#define BE_swap_bytes(x) vperm(x, x, x, swap_bytes)
#endif

  cmpd(CCR0, len, num_bytes);
  blt(CCR0, L_last);

  addi(cur_const, constants, outer_consts_size); // Point to consts for inner loop.
  load_const_optimized(loop_count, unroll_factor / (2 * unroll_factor2) - 1); // One double-iteration peeled off.

  // ********** Main loop start **********
  align(32);
  bind(L_outer_loop);

  // Begin of unrolled first iteration (no xor).
  lvx(data1[0], buf);
  for (int i = 1; i < unroll_factor2 / 2; ++i) {
    lvx(data1[i], offs[i], buf);
  }
  vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
  lvx(consts1[0], cur_const);
  mtctr(loop_count);
  for (int i = 0; i < unroll_factor2 / 2; ++i) {
    BE_swap_bytes(data1[i]);
    if (i == 0) { vxor(data1[0], data1[0], VCRC); } // xor in previous CRC.
    lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf);
    vpmsumw(data0[i], data1[i], consts1[0]);
  }
  addi(buf, buf, 16 * unroll_factor2);
  subf(len, num_bytes, len);
  lvx(consts1[1], offs[1], cur_const);
  addi(cur_const, cur_const, 32);
  // Begin of unrolled second iteration (head).
  for (int i = 0; i < unroll_factor2 / 2; ++i) {
    BE_swap_bytes(data1[i + unroll_factor2 / 2]);
    if (i == 0) { lvx(data1[0], buf); } else { lvx(data1[i], offs[i], buf); }
    vpmsumw(data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[0]);
  }
  for (int i = 0; i < unroll_factor2 / 2; ++i) {
    BE_swap_bytes(data1[i]);
    lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf);
    vpmsumw(data1[i], data1[i], consts1[1]);
  }
  addi(buf, buf, 16 * unroll_factor2);

  // Generate the most performance-relevant code. Loads + half of the vpmsumw have been generated.
  // Double-iteration allows using the 2 constant registers alternatingly.
  align(32);
  bind(L_inner_loop);
  for (int j = 1; j < 3; ++j) { // j < unroll_factor / unroll_factor2 - 1 for complete unrolling.
    if (j & 1) {
      lvx(consts1[0], cur_const);
    } else {
      lvx(consts1[1], offs[1], cur_const);
      addi(cur_const, cur_const, 32);
    }
    for (int i = 0; i < unroll_factor2; ++i) {
      int idx = i + unroll_factor2 / 2, inc = 0; // For modulo-scheduled input.
      if (idx >= unroll_factor2) { idx -= unroll_factor2; inc = 1; }
      BE_swap_bytes(data1[idx]);
      vxor(data0[i], data0[i], data1[i]);
      if (i == 0) lvx(data1[0], buf); else lvx(data1[i], offs[i], buf);
      vpmsumw(data1[idx], data1[idx], consts1[(j + inc) & 1]);
    }
    addi(buf, buf, 16 * unroll_factor2);
  }
  bdnz(L_inner_loop);

  addi(cur_const, constants, outer_consts_size); // Reset.

  // Tail of last iteration (no loads).
  for (int i = 0; i < unroll_factor2 / 2; ++i) {
    BE_swap_bytes(data1[i + unroll_factor2 / 2]);
    vxor(data0[i], data0[i], data1[i]);
    vpmsumw(data1[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[1]);
  }
  for (int i = 0; i < unroll_factor2 / 2; ++i) {
    vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]); // First half of fixup shifts.
    vxor(data0[i + unroll_factor2 / 2], data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2]);
  }

  // The last data register is ok; the other ones need a fixup shift.
  for (int i = unroll_factor2 / 2; i < unroll_factor2 - 1; ++i) {
    vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]);
  }

  // Combine to 128 bit result vector VCRC = data0[0].
  for (int i = 1; i < unroll_factor2; i <<= 1) {
    for (int j = 0; j <= unroll_factor2 - 2*i; j += 2*i) {
      vxor(data0[j], data0[j], data0[j+i]);
    }
  }
  cmpd(CCR0, len, num_bytes);
  bge(CCR0, L_outer_loop);

  // Last chance with lower num_bytes.
  bind(L_last);
  srdi(loop_count, len, exact_log2(16 * 2 * unroll_factor2)); // Use double-iterations.
  // Point behind last const for inner loop.
  add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size);
  sldi(R0, loop_count, exact_log2(16 * 2)); // Bytes of constants to be used.
  clrrdi(num_bytes, len, exact_log2(16 * 2 * unroll_factor2));
  subf(cur_const, R0, cur_const); // Point to constant to be used first.

  addic_(loop_count, loop_count, -1); // One double-iteration peeled off.
  bgt(CCR0, L_outer_loop);
  // ********** Main loop end **********

  // Restore DSCR pre-fetch value.
  if (VM_Version::has_mfdscr()) {
    load_const_optimized(t0, VM_Version::_dscr_val);
    mtdscr(t0);
  }

  // ********** Simple loop for remaining 16 byte blocks **********
  {
    Label L_loop, L_done;

    srdi_(t0, len, 4); // 16 bytes per iteration
    clrldi(len, len, 64-4);
    beq(CCR0, L_done);

    // Point to const (same as last const for inner loop).
    add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size - 16);
    mtctr(t0);
    lvx(Vtmp2, cur_const);

    align(32);
    bind(L_loop);

    lvx(Vtmp, buf);
    addi(buf, buf, 16);
    vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
    BE_swap_bytes(Vtmp);
    vxor(VCRC, VCRC, Vtmp);
    vpmsumw(VCRC, VCRC, Vtmp2);
    bdnz(L_loop);

    bind(L_done);
  }
  // ********** Simple loop end **********
#undef BE_swap_bytes

  // Point to Barrett constants.
  add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size);

  vspltisb(zeroes, 0);

  // Combine to 64 bit result.
  vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.

  // Reduce to 32 bit CRC: Remainder by multiply-high.
  lvx(Vtmp, cur_const);
  vsldoi(Vtmp2, zeroes, VCRC, 12);  // Extract high 32 bit.
  vpmsumd(Vtmp2, Vtmp2, Vtmp);      // Multiply by inverse long poly.
  vsldoi(Vtmp2, zeroes, Vtmp2, 12); // Extract high 32 bit.
  vsldoi(Vtmp, zeroes, Vtmp, 8);
  vpmsumd(Vtmp2, Vtmp2, Vtmp);      // Multiply quotient by long poly.
  vxor(VCRC, VCRC, Vtmp2);          // Remainder fits into 32 bit.

  // Move result. len is already updated.
  vsldoi(VCRC, VCRC, zeroes, 8);
  mfvrd(crc, VCRC);

  // Restore non-volatile Vector registers (frameless).
  offsetInt = 0;
  offsetInt -= 16; li(offset, offsetInt); lvx(VR20, offset, R1_SP);
  offsetInt -= 16; li(offset, offsetInt); lvx(VR21, offset, R1_SP);
  offsetInt -= 16; li(offset, offsetInt); lvx(VR22, offset, R1_SP);
  offsetInt -= 16; li(offset, offsetInt); lvx(VR23, offset, R1_SP);
  offsetInt -= 16; li(offset, offsetInt); lvx(VR24, offset, R1_SP);
  offsetInt -= 16; li(offset, offsetInt); lvx(VR25, offset, R1_SP);
#ifndef VM_LITTLE_ENDIAN
  offsetInt -= 16; li(offset, offsetInt); lvx(VR26, offset, R1_SP);
#endif
  offsetInt -= 8;  ld(R14, offsetInt, R1_SP);
  offsetInt -= 8;  ld(R15, offsetInt, R1_SP);
}

void MacroAssembler::crc32(Register crc, Register buf, Register len, Register t0, Register t1, Register t2,
                           Register t3, Register t4, Register t5, Register t6, Register t7, bool is_crc32c) {
  load_const_optimized(t0, is_crc32c ? StubRoutines::crc32c_table_addr()
                                     : StubRoutines::crc_table_addr()   , R0);

  if (VM_Version::has_vpmsumb()) {
    kernel_crc32_vpmsum(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, !is_crc32c);
  } else {
    kernel_crc32_1word(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, t0, !is_crc32c);
  }
}

void MacroAssembler::kernel_crc32_singleByteReg(Register crc, Register val, Register table, bool invertCRC) {
  assert_different_registers(crc, val, table);

  BLOCK_COMMENT("kernel_crc32_singleByteReg:");
  if (invertCRC) {
    nand(crc, crc, crc);                // 1s complement of crc
  }

  update_byte_crc32(crc, val, table);

  if (invertCRC) {
    nand(crc, crc, crc);                // 1s complement of crc
  }
}

// dest_lo += src1 + src2
// dest_hi += carry1 + carry2
void MacroAssembler::add2_with_carry(Register dest_hi,
                                     Register dest_lo,
                                     Register src1, Register src2) {
  li(R0, 0);
  addc(dest_lo, dest_lo, src1);
  adde(dest_hi, dest_hi, R0);
  addc(dest_lo, dest_lo, src2);
  adde(dest_hi, dest_hi, R0);
}
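
// Semantics of add2_with_carry in C (a sketch using a 128-bit accumulator,
// not compiled):
//   unsigned __int128 sum = ((unsigned __int128)dest_hi << 64) | dest_lo;
//   sum += src1;
//   sum += src2;
//   dest_lo = (uint64_t)sum;
//   dest_hi = (uint64_t)(sum >> 64);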

// Multiply 64 bit by 64 bit first loop.
void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart,
                                           Register x_xstart,
                                           Register y, Register y_idx,
                                           Register z,
                                           Register carry,
                                           Register product_high, Register product,
                                           Register idx, Register kdx,
                                           Register tmp) {
  //  jlong carry, x[], y[], z[];
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //    huge_128 product = y[idx] * x[xstart] + carry;
  //    z[kdx] = (jlong)product;
  //    carry  = (jlong)(product >>> 64);
  //  }
  //  z[xstart] = carry;

  Label L_first_loop, L_first_loop_exit;
  Label L_one_x, L_one_y, L_multiply;

  addic_(xstart, xstart, -1);
  blt(CCR0, L_one_x);   // Special case: length of x is 1.

  // Load next two integers of x.
  sldi(tmp, xstart, LogBytesPerInt);
  ldx(x_xstart, x, tmp);
#ifdef VM_LITTLE_ENDIAN
  rldicl(x_xstart, x_xstart, 32, 0);
#endif

  align(32, 16);
  bind(L_first_loop);

  cmpdi(CCR0, idx, 1);
  blt(CCR0, L_first_loop_exit);
  addi(idx, idx, -2);
  beq(CCR0, L_one_y);

  // Load next two integers of y.
  sldi(tmp, idx, LogBytesPerInt);
  ldx(y_idx, y, tmp);
#ifdef VM_LITTLE_ENDIAN
  rldicl(y_idx, y_idx, 32, 0);
#endif


  bind(L_multiply);
  multiply64(product_high, product, x_xstart, y_idx);

  li(tmp, 0);
  addc(product, product, carry);         // Add carry to result.
  adde(product_high, product_high, tmp); // Add carry of the last addition.
  addi(kdx, kdx, -2);

  // Store result.
#ifdef VM_LITTLE_ENDIAN
  rldicl(product, product, 32, 0);
#endif
  sldi(tmp, kdx, LogBytesPerInt);
  stdx(product, z, tmp);
  mr_if_needed(carry, product_high);
  b(L_first_loop);


  bind(L_one_y); // Load one 32 bit portion of y as (0,value).

  lwz(y_idx, 0, y);
  b(L_multiply);


  bind(L_one_x); // Load one 32 bit portion of x as (0,value).

  lwz(x_xstart, 0, x);
  b(L_first_loop);

  bind(L_first_loop_exit);
}

// Multiply 64 bit by 64 bit and add 128 bit.
void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y,
                                            Register z, Register yz_idx,
                                            Register idx, Register carry,
                                            Register product_high, Register product,
                                            Register tmp, int offset) {

  //  huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
  //  z[kdx] = (jlong)product;

  sldi(tmp, idx, LogBytesPerInt);
  if (offset) {
    addi(tmp, tmp, offset);
  }
  ldx(yz_idx, y, tmp);
#ifdef VM_LITTLE_ENDIAN
  rldicl(yz_idx, yz_idx, 32, 0);
#endif

  multiply64(product_high, product, x_xstart, yz_idx);
  ldx(yz_idx, z, tmp);
#ifdef VM_LITTLE_ENDIAN
  rldicl(yz_idx, yz_idx, 32, 0);
#endif

  add2_with_carry(product_high, product, carry, yz_idx);

  sldi(tmp, idx, LogBytesPerInt);
  if (offset) {
    addi(tmp, tmp, offset);
  }
#ifdef VM_LITTLE_ENDIAN
  rldicl(product, product, 32, 0);
#endif
  stdx(product, z, tmp);
}

// Multiply 128 bit by 128 bit. Unrolled inner loop.
void MacroAssembler::multiply_128_x_128_loop(Register x_xstart,
                                             Register y, Register z,
                                             Register yz_idx, Register idx, Register carry,
                                             Register product_high, Register product,
                                             Register carry2, Register tmp) {

  //  jlong carry, x[], y[], z[];
  //  int kdx = ystart+1;
  //  for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
  //    huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
  //    z[kdx+idx+1] = (jlong)product;
  //    jlong carry2 = (jlong)(product >>> 64);
  //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
  //    z[kdx+idx] = (jlong)product;
  //    carry = (jlong)(product >>> 64);
  //  }
  //  idx += 2;
  //  if (idx > 0) {
  //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
  //    z[kdx+idx] = (jlong)product;
  //    carry = (jlong)(product >>> 64);
  //  }

  Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
  const Register jdx = R0;

  // Scale the index.
  srdi_(jdx, idx, 2);
  beq(CCR0, L_third_loop_exit);
  mtctr(jdx);

  align(32, 16);
  bind(L_third_loop);

  addi(idx, idx, -4);

  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8);
  mr_if_needed(carry2, product_high);

  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0);
  mr_if_needed(carry, product_high);
  bdnz(L_third_loop);

  bind(L_third_loop_exit);  // Handle any left-over operand parts.

  andi_(idx, idx, 0x3);
  beq(CCR0, L_post_third_loop_done);

  Label L_check_1;

  addic_(idx, idx, -2);
  blt(CCR0, L_check_1);

  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0);
  mr_if_needed(carry, product_high);

  bind(L_check_1);

  addi(idx, idx, 0x2);
  andi_(idx, idx, 0x1);
  addic_(idx, idx, -1);
  blt(CCR0, L_post_third_loop_done);

  sldi(tmp, idx, LogBytesPerInt);
  lwzx(yz_idx, y, tmp);
  multiply64(product_high, product, x_xstart, yz_idx);
  lwzx(yz_idx, z, tmp);

  add2_with_carry(product_high, product, yz_idx, carry);

  sldi(tmp, idx, LogBytesPerInt);
  stwx(product, z, tmp);
  srdi(product, product, 32);

  sldi(product_high, product_high, 32);
  orr(product, product, product_high);
  mr_if_needed(carry, product);

  bind(L_post_third_loop_done);
}   // multiply_128_x_128_loop

void MacroAssembler::muladd(Register out, Register in,
                            Register offset, Register len, Register k,
                            Register tmp1, Register tmp2, Register carry) {

  // Labels
  Label LOOP, SKIP;

  // Make sure length is positive.
  cmpdi  (CCR0,    len,     0);

  // Prepare variables
  subi   (offset,  offset,  4);
  li     (carry,   0);
  ble    (CCR0,    SKIP);

  mtctr  (len);
  subi   (len,     len,     1    );
  sldi   (len,     len,     2    );

  // Main loop
  bind(LOOP);
  lwzx   (tmp1,    len,     in   );
  lwzx   (tmp2,    offset,  out  );
  mulld  (tmp1,    tmp1,    k    );
  add    (tmp2,    carry,   tmp2 );
  add    (tmp2,    tmp1,    tmp2 );
  stwx   (tmp2,    offset,  out  );
  srdi   (carry,   tmp2,    32   );
  subi   (offset,  offset,  4    );
  subi   (len,     len,     4    );
  bdnz   (LOOP);
  bind(SKIP);
}
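
// Java-style reference of the loop above (cf. java.math.BigInteger.implMulAdd;
// a sketch, not compiled):
//   long kl = k & 0xFFFFFFFFL;
//   long carry = 0;
//   for (int j = len - 1; j >= 0; j--) {
//     long product = (in[j] & 0xFFFFFFFFL) * kl + (out[offset] & 0xFFFFFFFFL) + carry;
//     out[offset--] = (int)product;
//     carry = product >>> 32;
//   }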

void MacroAssembler::multiply_to_len(Register x, Register xlen,
                                     Register y, Register ylen,
                                     Register z, Register zlen,
                                     Register tmp1, Register tmp2,
                                     Register tmp3, Register tmp4,
                                     Register tmp5, Register tmp6,
                                     Register tmp7, Register tmp8,
                                     Register tmp9, Register tmp10,
                                     Register tmp11, Register tmp12,
                                     Register tmp13) {

  ShortBranchVerifier sbv(this);

  assert_different_registers(x, xlen, y, ylen, z, zlen,
                             tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
  assert_different_registers(x, xlen, y, ylen, z, zlen,
                             tmp1, tmp2, tmp3, tmp4, tmp5, tmp7);
  assert_different_registers(x, xlen, y, ylen, z, zlen,
                             tmp1, tmp2, tmp3, tmp4, tmp5, tmp8);

  const Register idx = tmp1;
  const Register kdx = tmp2;
  const Register xstart = tmp3;

  const Register y_idx = tmp4;
  const Register carry = tmp5;
  const Register product = tmp6;
  const Register product_high = tmp7;
  const Register x_xstart = tmp8;
  const Register tmp = tmp9;

  // First Loop.
  //
  //  final static long LONG_MASK = 0xffffffffL;
  //  int xstart = xlen - 1;
  //  int ystart = ylen - 1;
  //  long carry = 0;
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
  //    z[kdx] = (int)product;
  //    carry = product >>> 32;
  //  }
  //  z[xstart] = (int)carry;

  mr_if_needed(idx, ylen);        // idx = ylen
  mr_if_needed(kdx, zlen);        // kdx = xlen + ylen
  li(carry, 0);                   // carry = 0

  Label L_done;

  addic_(xstart, xlen, -1);
  blt(CCR0, L_done);

  multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z,
                        carry, product_high, product, idx, kdx, tmp);

  Label L_second_loop;

  cmpdi(CCR0, kdx, 0);
  beq(CCR0, L_second_loop);

  Label L_carry;

  addic_(kdx, kdx, -1);
  beq(CCR0, L_carry);

  // Store lower 32 bits of carry.
  sldi(tmp, kdx, LogBytesPerInt);
  stwx(carry, z, tmp);
  srdi(carry, carry, 32);
  addi(kdx, kdx, -1);


  bind(L_carry);

  // Store upper 32 bits of carry.
  sldi(tmp, kdx, LogBytesPerInt);
  stwx(carry, z, tmp);

  // Second and third (nested) loops.
  //
  //  for (int i = xstart-1; i >= 0; i--) { // Second loop
  //    carry = 0;
  //    for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
  //      long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
  //                     (z[k] & LONG_MASK) + carry;
  //      z[k] = (int)product;
  //      carry = product >>> 32;
  //    }
  //    z[i] = (int)carry;
  //  }
  //
  //  i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx

  bind(L_second_loop);

  li(carry, 0);                   // carry = 0;

  addic_(xstart, xstart, -1);     // i = xstart-1;
  blt(CCR0, L_done);

  Register zsave = tmp10;

  mr(zsave, z);


  Label L_last_x;

  sldi(tmp, xstart, LogBytesPerInt);
  add(z, z, tmp);                 // z = z + k - j
  addi(z, z, 4);
  addic_(xstart, xstart, -1);     // i = xstart-1;
  blt(CCR0, L_last_x);

  sldi(tmp, xstart, LogBytesPerInt);
  ldx(x_xstart, x, tmp);
#ifdef VM_LITTLE_ENDIAN
  rldicl(x_xstart, x_xstart, 32, 0);
#endif


  Label L_third_loop_prologue;

  bind(L_third_loop_prologue);

  Register xsave = tmp11;
  Register xlensave = tmp12;
  Register ylensave = tmp13;

  mr(xsave, x);
  mr(xlensave, xstart);
  mr(ylensave, ylen);


  multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen,
                          carry, product_high, product, x, tmp);

  mr(z, zsave);
  mr(x, xsave);
  mr(xlen, xlensave);   // This is the decrement of the loop counter!
  mr(ylen, ylensave);

  addi(tmp3, xlen, 1);
  sldi(tmp, tmp3, LogBytesPerInt);
  stwx(carry, z, tmp);
  addic_(tmp3, tmp3, -1);
  blt(CCR0, L_done);

  srdi(carry, carry, 32);
  sldi(tmp, tmp3, LogBytesPerInt);
  stwx(carry, z, tmp);
  b(L_second_loop);

  // Next infrequent code is moved outside loops.
  bind(L_last_x);

  lwz(x_xstart, 0, x);
  b(L_third_loop_prologue);

  bind(L_done);
}   // multiply_to_len

void MacroAssembler::asm_assert(bool check_equal, const char *msg, int id) {
#ifdef ASSERT
  Label ok;
  if (check_equal) {
    beq(CCR0, ok);
  } else {
    bne(CCR0, ok);
  }
  stop(msg, id);
  bind(ok);
#endif
}

void MacroAssembler::asm_assert_mems_zero(bool check_equal, int size, int mem_offset,
                                          Register mem_base, const char* msg, int id) {
#ifdef ASSERT
  switch (size) {
    case 4:
      lwz(R0, mem_offset, mem_base);
      cmpwi(CCR0, R0, 0);
      break;
    case 8:
      ld(R0, mem_offset, mem_base);
      cmpdi(CCR0, R0, 0);
      break;
    default:
      ShouldNotReachHere();
  }
  asm_assert(check_equal, msg, id);
#endif // ASSERT
}

void MacroAssembler::verify_thread() {
  if (VerifyThread) {
    unimplemented("'VerifyThread' currently not implemented on PPC");
  }
}

// READ: oop. KILL: R0. Volatile floats perhaps.
void MacroAssembler::verify_oop(Register oop, const char* msg) {
  if (!VerifyOops) {
    return;
  }

  address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
  const Register tmp = R11; // Will be preserved.
  const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
  save_volatile_gprs(R1_SP, -nbytes_save); // except R0

  mr_if_needed(R4_ARG2, oop);
  save_LR_CR(tmp); // save in old frame
  push_frame_reg_args(nbytes_save, tmp);
  // load FunctionDescriptor** / entry_address *
  load_const_optimized(tmp, fd, R0);
  // load FunctionDescriptor* / entry_address
  ld(tmp, 0, tmp);
  load_const_optimized(R3_ARG1, (address)msg, R0);
  // Call destination for its side effect.
  call_c(tmp);

  pop_frame();
  restore_LR_CR(tmp);
  restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
}

void MacroAssembler::verify_oop_addr(RegisterOrConstant offs, Register base, const char* msg) {
  if (!VerifyOops) {
    return;
  }

  address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
  const Register tmp = R11; // Will be preserved.
  const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
  save_volatile_gprs(R1_SP, -nbytes_save); // except R0

  ld(R4_ARG2, offs, base);
  save_LR_CR(tmp); // save in old frame
  push_frame_reg_args(nbytes_save, tmp);
  // load FunctionDescriptor** / entry_address *
  load_const_optimized(tmp, fd, R0);
  // load FunctionDescriptor* / entry_address
  ld(tmp, 0, tmp);
  load_const_optimized(R3_ARG1, (address)msg, R0);
  // Call destination for its side effect.
  call_c(tmp);

  pop_frame();
  restore_LR_CR(tmp);
  restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
}

const char* stop_types[] = {
  "stop",
  "untested",
  "unimplemented",
  "shouldnotreachhere"
};

static void stop_on_request(int tp, const char* msg) {
  tty->print("PPC assembly code requires stop: (%s) %s\n", stop_types[tp%/*stop_end*/4], msg);
  guarantee(false, "PPC assembly code requires stop: %s", msg);
}

// Call a C-function that prints output.
void MacroAssembler::stop(int type, const char* msg, int id) {
#ifndef PRODUCT
  block_comment(err_msg("stop: %s %s {", stop_types[type%stop_end], msg));
#else
  block_comment("stop {");
#endif

  // Setup arguments.
  load_const_optimized(R3_ARG1, type);
  load_const_optimized(R4_ARG2, (void *)msg, /*tmp=*/R0);
  call_VM_leaf(CAST_FROM_FN_PTR(address, stop_on_request), R3_ARG1, R4_ARG2);
  illtrap();
  emit_int32(id);
  block_comment("} stop;");
}

#ifndef PRODUCT
// Write pattern 0x0101010101010101 in memory region [low-before, high+after].
// Val, addr are temp registers.
// If low == addr, addr is killed.
// High is preserved.
void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) {
  if (!ZapMemory) return;

  assert_different_registers(low, val);

  BLOCK_COMMENT("zap memory region {");
  load_const_optimized(val, 0x0101010101010101);
  int size = before + after;
  if (low == high && size < 5 && size > 0) {
    int offset = -before*BytesPerWord;
    for (int i = 0; i < size; ++i) {
      std(val, offset, low);
      offset += (1*BytesPerWord);
    }
  } else {
    addi(addr, low, -before*BytesPerWord);
    assert_different_registers(high, val);
    if (after) addi(high, high, after * BytesPerWord);
    Label loop;
    bind(loop);
    std(val, 0, addr);
    addi(addr, addr, 8);
    cmpd(CCR6, addr, high);
    ble(CCR6, loop);
    if (after) addi(high, high, -after * BytesPerWord);  // Correct back to old value.
  }
  BLOCK_COMMENT("} zap memory region");
}

#endif // !PRODUCT

void SkipIfEqualZero::skip_to_label_if_equal_zero(MacroAssembler* masm, Register temp,
                                                  const bool* flag_addr, Label& label) {
  int simm16_offset = masm->load_const_optimized(temp, (address)flag_addr, R0, true);
  assert(sizeof(bool) == 1, "PowerPC ABI");
  masm->lbz(temp, simm16_offset, temp);
  masm->cmpwi(CCR0, temp, 0);
  masm->beq(CCR0, label);
}

SkipIfEqualZero::SkipIfEqualZero(MacroAssembler* masm, Register temp, const bool* flag_addr) : _masm(masm), _label() {
  skip_to_label_if_equal_zero(masm, temp, flag_addr, _label);
}

SkipIfEqualZero::~SkipIfEqualZero() {
  _masm->bind(_label);
}
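
// Typical usage of the RAII helper above (a sketch; 'flag' and 'temp_reg' are
// placeholders): the code emitted inside the scope is skipped at runtime
// whenever *flag_addr reads as zero.
//   {
//     SkipIfEqualZero skip(masm, temp_reg, &flag);
//     // ... code emitted here runs only if flag != 0 ...
//   } // Destructor binds the skip target label here.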