1 /*
   2  * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2012, 2018, SAP SE. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.inline.hpp"
  28 #include "compiler/disassembler.hpp"
  29 #include "gc/shared/collectedHeap.inline.hpp"
  30 #include "gc/shared/barrierSet.hpp"
  31 #include "gc/shared/barrierSetAssembler.hpp"
  32 #include "interpreter/interpreter.hpp"
  33 #include "memory/resourceArea.hpp"
  34 #include "nativeInst_ppc.hpp"
  35 #include "prims/methodHandles.hpp"
  36 #include "runtime/biasedLocking.hpp"
  37 #include "runtime/icache.hpp"
  38 #include "runtime/interfaceSupport.inline.hpp"
  39 #include "runtime/objectMonitor.hpp"
  40 #include "runtime/os.hpp"
  41 #include "runtime/safepoint.hpp"
  42 #include "runtime/safepointMechanism.hpp"
  43 #include "runtime/sharedRuntime.hpp"
  44 #include "runtime/stubRoutines.hpp"
  45 #include "utilities/macros.hpp"
  46 #ifdef COMPILER2
  47 #include "opto/intrinsicnode.hpp"
  48 #endif
  49 
  50 #ifdef PRODUCT
  51 #define BLOCK_COMMENT(str) // nothing
  52 #else
  53 #define BLOCK_COMMENT(str) block_comment(str)
  54 #endif
  55 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  56 
  57 #ifdef ASSERT
  58 // On RISC, there's no benefit to verifying instruction boundaries.
  59 bool AbstractAssembler::pd_check_instruction_mark() { return false; }
  60 #endif
  61 
  62 void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) {
  63   assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range");
  64   if (Assembler::is_simm(si31, 16)) {
  65     ld(d, si31, a);
  66     if (emit_filler_nop) nop();
  67   } else {
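    // The offset does not fit into a signed 16-bit displacement: add the
    // high part with addis and use the sign-extended low part as the ld
    // displacement, chosen such that (hi << 16) + (signed)lo == si31.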
  68     const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31);
  69     const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31);
  70     addis(d, a, hi);
  71     ld(d, lo, d);
  72   }
  73 }
  74 
  75 void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) {
  76   assert_different_registers(d, a);
  77   ld_largeoffset_unchecked(d, si31, a, emit_filler_nop);
  78 }
  79 
  80 void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base,
  81                                       size_t size_in_bytes, bool is_signed) {
  82   switch (size_in_bytes) {
  83   case  8:              ld(dst, offs, base);                         break;
  84   case  4:  is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break;
  85   case  2:  is_signed ? lha(dst, offs, base) : lhz(dst, offs, base); break;
  86   case  1:  lbz(dst, offs, base); if (is_signed) extsb(dst, dst);    break; // lba doesn't exist :(
  87   default:  ShouldNotReachHere();
  88   }
  89 }
  90 
  91 void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base,
  92                                        size_t size_in_bytes) {
  93   switch (size_in_bytes) {
  94   case  8:  std(dst, offs, base); break;
  95   case  4:  stw(dst, offs, base); break;
  96   case  2:  sth(dst, offs, base); break;
  97   case  1:  stb(dst, offs, base); break;
  98   default:  ShouldNotReachHere();
  99   }
 100 }
 101 
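// Pad with nops so that offset() % modulus == rem, but only if at most
// `max' bytes of padding are required (each nop is 4 bytes).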
 102 void MacroAssembler::align(int modulus, int max, int rem) {
 103   int padding = (rem + modulus - (offset() % modulus)) % modulus;
 104   if (padding > max) return;
 105   for (int c = (padding >> 2); c > 0; --c) { nop(); }
 106 }
 107 
 108 // Issue instructions that calculate the given address from the global TOC.
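// With hi16 and lo16 both set, the emitted sequence is
//   addis dst, R29_TOC, hi16(offset)
//   addi  dst, dst,     lo16(offset)
// where offset = addr - global TOC.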
 109 void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16,
 110                                                        bool add_relocation, bool emit_dummy_addr) {
 111   int offset = -1;
 112   if (emit_dummy_addr) {
 113     offset = -128; // dummy address
 114   } else if (addr != (address)(intptr_t)-1) {
 115     offset = MacroAssembler::offset_to_global_toc(addr);
 116   }
 117 
 118   if (hi16) {
 119     addis(dst, R29_TOC, MacroAssembler::largeoffset_si16_si16_hi(offset));
 120   }
 121   if (lo16) {
 122     if (add_relocation) {
 123       // Relocate at the addi to avoid confusion with a load from the method's TOC.
 124       relocate(internal_word_Relocation::spec(addr));
 125     }
 126     addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset));
 127   }
 128 }
 129 
 130 address MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) {
 131   const int offset = MacroAssembler::offset_to_global_toc(addr);
 132 
 133   const address inst2_addr = a;
 134   const int inst2 = *(int *)inst2_addr;
 135 
 136   // The relocation points to the second instruction, the addi,
 137   // and the addi reads and writes the same register dst.
 138   const int dst = inv_rt_field(inst2);
 139   assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");
 140 
 141   // Now, find the preceding addis which writes to dst.
 142   int inst1 = 0;
 143   address inst1_addr = inst2_addr - BytesPerInstWord;
 144   while (inst1_addr >= bound) {
 145     inst1 = *(int *) inst1_addr;
 146     if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
 147       // Stop, found the addis which writes dst.
 148       break;
 149     }
 150     inst1_addr -= BytesPerInstWord;
 151   }
 152 
 153   assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
 154   set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset));
 155   set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset));
 156   return inst1_addr;
 157 }
 158 
 159 address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) {
 160   const address inst2_addr = a;
 161   const int inst2 = *(int *)inst2_addr;
 162 
 163   // The relocation points to the second instruction, the addi,
 164   // and the addi reads and writes the same register dst.
 165   const int dst = inv_rt_field(inst2);
 166   assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");
 167 
 168   // Now, find the preceding addis which writes to dst.
 169   int inst1 = 0;
 170   address inst1_addr = inst2_addr - BytesPerInstWord;
 171   while (inst1_addr >= bound) {
 172     inst1 = *(int *) inst1_addr;
 173     if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
 174       // stop, found the addis which writes dst
 175       break;
 176     }
 177     inst1_addr -= BytesPerInstWord;
 178   }
 179 
 180   assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
 181 
 182   int offset = (get_imm(inst1_addr, 0) << 16) + get_imm(inst2_addr, 0);
 183   // -1 is a special case
 184   if (offset == -1) {
 185     return (address)(intptr_t)-1;
 186   } else {
 187     return global_toc() + offset;
 188   }
 189 }
 190 
 191 #ifdef _LP64
 192 // Patch compressed oops or klass constants.
 193 // Assembler sequence is
 194 // 1) compressed oops:
 195 //    lis  rx = const.hi
 196 //    ori rx = rx | const.lo
 197 // 2) compressed klass:
 198 //    lis  rx = const.hi
 199 //    clrldi rx = rx & 0xFFFFffff // clearMS32b, optional
 200 //    ori rx = rx | const.lo
 201 // The clrldi, if present, is skipped over when patching.
 202 address MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) {
 203   assert(UseCompressedOops, "Should only patch compressed oops");
 204 
 205   const address inst2_addr = a;
 206   const int inst2 = *(int *)inst2_addr;
 207 
 208   // The relocation points to the second instruction, the ori,
 209   // and the ori reads and writes the same register dst.
 210   const int dst = inv_rta_field(inst2);
 211   assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
 212   // Now, find the preceding addis which writes to dst.
 213   int inst1 = 0;
 214   address inst1_addr = inst2_addr - BytesPerInstWord;
 215   bool inst1_found = false;
 216   while (inst1_addr >= bound) {
 217     inst1 = *(int *)inst1_addr;
 218     if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
 219     inst1_addr -= BytesPerInstWord;
 220   }
 221   assert(inst1_found, "inst is not lis");
 222 
 223   int xc = (data >> 16) & 0xffff;
 224   int xd = (data >>  0) & 0xffff;
 225 
 226   set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo
 227   set_imm((int *)inst2_addr,        (xd)); // unsigned int
 228   return inst1_addr;
 229 }
 230 
 231 // Get compressed oop or klass constant.
 232 narrowOop MacroAssembler::get_narrow_oop(address a, address bound) {
 233   assert(UseCompressedOops, "Should only patch compressed oops");
 234 
 235   const address inst2_addr = a;
 236   const int inst2 = *(int *)inst2_addr;
 237 
 238   // The relocation points to the second instruction, the ori,
 239   // and the ori reads and writes the same register dst.
 240   const int dst = inv_rta_field(inst2);
 241   assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
 242   // Now, find the preceding lis which writes to dst.
 243   int inst1 = 0;
 244   address inst1_addr = inst2_addr - BytesPerInstWord;
 245   bool inst1_found = false;
 246 
 247   while (inst1_addr >= bound) {
 248     inst1 = *(int *) inst1_addr;
 249     if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
 250     inst1_addr -= BytesPerInstWord;
 251   }
 252   assert(inst1_found, "inst is not lis");
 253 
 254   uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff));
 255   uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16);
 256 
 257   return (int) (xl | xh);
 258 }
 259 #endif // _LP64
 260 
 261 // Returns true if successful.
 262 bool MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a,
 263                                                 Register toc, bool fixed_size) {
 264   int toc_offset = 0;
 265   // Use RelocationHolder::none for the constant pool entry, otherwise
 266   // we will end up with a failing NativeCall::verify(x) where x is
 267   // the address of the constant pool entry.
 268   // FIXME: We should insert relocation information for oops at the constant
 269   // pool entries instead of inserting it at the loads; patching of a constant
 270   // pool entry should be less expensive.
 271   address const_address = address_constant((address)a.value(), RelocationHolder::none);
 272   if (const_address == NULL) { return false; } // allocation failure
 273   // Relocate at the pc of the load.
 274   relocate(a.rspec());
 275   toc_offset = (int)(const_address - code()->consts()->start());
 276   ld_largeoffset_unchecked(dst, toc_offset, toc, fixed_size);
 277   return true;
 278 }
 279 
 280 bool MacroAssembler::is_load_const_from_method_toc_at(address a) {
 281   const address inst1_addr = a;
 282   const int inst1 = *(int *)inst1_addr;
 283 
 284   // The relocation points to the ld or the addis.
 285   return (is_ld(inst1)) ||
 286          (is_addis(inst1) && inv_ra_field(inst1) != 0);
 287 }
 288 
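// Return the offset of the constant pool entry addressed by a
// load_const_from_method_toc sequence: either a single ld (offset fits
// into 16 bits) or an addis/ld pair for larger offsets.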
 289 int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) {
 290   assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc");
 291 
 292   const address inst1_addr = a;
 293   const int inst1 = *(int *)inst1_addr;
 294 
 295   if (is_ld(inst1)) {
 296     return inv_d1_field(inst1);
 297   } else if (is_addis(inst1)) {
 298     const int dst = inv_rt_field(inst1);
 299 
 300     // Now, find the succeeding ld which reads and writes to dst.
 301     address inst2_addr = inst1_addr + BytesPerInstWord;
 302     int inst2 = 0;
 303     while (true) {
 304       inst2 = *(int *) inst2_addr;
 305       if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) {
 306         // Stop, found the ld which reads and writes dst.
 307         break;
 308       }
 309       inst2_addr += BytesPerInstWord;
 310     }
 311     return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2);
 312   }
 313   ShouldNotReachHere();
 314   return 0;
 315 }
 316 
 317 // Get the constant from a `load_const' sequence.
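// The two recognized forms correspond to load_const with and without a
// temp register; the second instruction (ori vs. lis) distinguishes them,
// which is why the immediates are read from different slots below.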
 318 long MacroAssembler::get_const(address a) {
 319   assert(is_load_const_at(a), "not a load of a constant");
 320   const int *p = (const int*) a;
 321   unsigned long x = (((unsigned long) (get_imm(a,0) & 0xffff)) << 48);
 322   if (is_ori(*(p+1))) {
 323     x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 32);
 324     x |= (((unsigned long) (get_imm(a,3) & 0xffff)) << 16);
 325     x |= (((unsigned long) (get_imm(a,4) & 0xffff)));
 326   } else if (is_lis(*(p+1))) {
 327     x |= (((unsigned long) (get_imm(a,2) & 0xffff)) << 32);
 328     x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 16);
 329     x |= (((unsigned long) (get_imm(a,3) & 0xffff)));
 330   } else {
 331     ShouldNotReachHere();
 332     return (long) 0;
 333   }
 334   return (long) x;
 335 }
 336 
 337 // Patch the 64-bit constant of a `load_const' sequence. This is a
 338 // low-level procedure: it neither flushes the instruction cache nor
 339 // is it MT-safe.
 340 void MacroAssembler::patch_const(address a, long x) {
 341   assert(is_load_const_at(a), "not a load of a constant");
 342   int *p = (int*) a;
 343   if (is_ori(*(p+1))) {
 344     set_imm(0 + p, (x >> 48) & 0xffff);
 345     set_imm(1 + p, (x >> 32) & 0xffff);
 346     set_imm(3 + p, (x >> 16) & 0xffff);
 347     set_imm(4 + p, x & 0xffff);
 348   } else if (is_lis(*(p+1))) {
 349     set_imm(0 + p, (x >> 48) & 0xffff);
 350     set_imm(2 + p, (x >> 32) & 0xffff);
 351     set_imm(1 + p, (x >> 16) & 0xffff);
 352     set_imm(3 + p, x & 0xffff);
 353   } else {
 354     ShouldNotReachHere();
 355   }
 356 }
 357 
 358 AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) {
 359   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
 360   int index = oop_recorder()->allocate_metadata_index(obj);
 361   RelocationHolder rspec = metadata_Relocation::spec(index);
 362   return AddressLiteral((address)obj, rspec);
 363 }
 364 
 365 AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) {
 366   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
 367   int index = oop_recorder()->find_index(obj);
 368   RelocationHolder rspec = metadata_Relocation::spec(index);
 369   return AddressLiteral((address)obj, rspec);
 370 }
 371 
 372 AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) {
 373   assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
 374   int oop_index = oop_recorder()->allocate_oop_index(obj);
 375   return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
 376 }
 377 
 378 AddressLiteral MacroAssembler::constant_oop_address(jobject obj) {
 379   assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
 380   int oop_index = oop_recorder()->find_index(obj);
 381   return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
 382 }
 383 
 384 RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
 385                                                       Register tmp, int offset) {
 386   intptr_t value = *delayed_value_addr;
 387   if (value != 0) {
 388     return RegisterOrConstant(value + offset);
 389   }
 390 
 391   // Load indirectly to solve generation ordering problem.
 392   // static address, no relocation
 393   int simm16_offset = load_const_optimized(tmp, delayed_value_addr, noreg, true);
 394   ld(tmp, simm16_offset, tmp); // must be aligned ((xa & 3) == 0)
 395 
 396   if (offset != 0) {
 397     addi(tmp, tmp, offset);
 398   }
 399 
 400   return RegisterOrConstant(tmp);
 401 }
 402 
 403 #ifndef PRODUCT
 404 void MacroAssembler::pd_print_patched_instruction(address branch) {
 405   Unimplemented(); // TODO: PPC port
 406 }
 407 #endif // ndef PRODUCT
 408 
 409 // Conditional far branch for destinations encodable in 24+2 bits.
 410 void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) {
 411 
 412   // If requested by flag optimize, relocate the bc_far as a
 413   // runtime_call and prepare for optimizing it when the code gets
 414   // relocated.
 415   if (optimize == bc_far_optimize_on_relocate) {
 416     relocate(relocInfo::runtime_call_type);
 417   }
 418 
 419   // variant 2:
 420   //
 421   //    b!cxx SKIP
 422   //    bxx   DEST
 423   //  SKIP:
 424   //
 425 
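  // Invert condition and branch hint so that the short conditional branch
  // jumps over the unconditional far branch exactly when the original
  // condition does not hold.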
 426   const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
 427                                                 opposite_bcond(inv_boint_bcond(boint)));
 428 
 429   // We emit two branches.
 430   // First, a conditional branch which jumps around the far branch.
 431   const address not_taken_pc = pc() + 2 * BytesPerInstWord;
 432   const address bc_pc        = pc();
 433   bc(opposite_boint, biint, not_taken_pc);
 434 
 435   const int bc_instr = *(int*)bc_pc;
 436   assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition");
 437   assert(opposite_boint == inv_bo_field(bc_instr), "postcondition");
 438   assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))),
 439                                      opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))),
 440          "postcondition");
 441   assert(biint == inv_bi_field(bc_instr), "postcondition");
 442 
 443   // Second, an unconditional far branch which jumps to dest.
 444   // Note: target(dest) remembers the current pc (see CodeSection::target)
 445   //       and returns the current pc if the label is not bound yet; when
 446   //       the label gets bound, the unconditional far branch will be patched.
 447   const address target_pc = target(dest);
 448   const address b_pc  = pc();
 449   b(target_pc);
 450 
 451   assert(not_taken_pc == pc(),                     "postcondition");
 452   assert(dest.is_bound() || target_pc == b_pc, "postcondition");
 453 }
 454 
 455 // 1 or 2 instructions
 456 void MacroAssembler::bc_far_optimized(int boint, int biint, Label& dest) {
 457   if (dest.is_bound() && is_within_range_of_bcxx(target(dest), pc())) {
 458     bc(boint, biint, dest);
 459   } else {
 460     bc_far(boint, biint, dest, MacroAssembler::bc_far_optimize_on_relocate);
 461   }
 462 }
 463 
 464 bool MacroAssembler::is_bc_far_at(address instruction_addr) {
 465   return is_bc_far_variant1_at(instruction_addr) ||
 466          is_bc_far_variant2_at(instruction_addr) ||
 467          is_bc_far_variant3_at(instruction_addr);
 468 }
 469 
 470 address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) {
 471   if (is_bc_far_variant1_at(instruction_addr)) {
 472     const address instruction_1_addr = instruction_addr;
 473     const int instruction_1 = *(int*)instruction_1_addr;
 474     return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr);
 475   } else if (is_bc_far_variant2_at(instruction_addr)) {
 476     const address instruction_2_addr = instruction_addr + 4;
 477     return bxx_destination(instruction_2_addr);
 478   } else if (is_bc_far_variant3_at(instruction_addr)) {
 479     return instruction_addr + 8;
 480   }
 481   // variant 4 ???
 482   ShouldNotReachHere();
 483   return NULL;
 484 }
 485 void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) {
 486 
 487   if (is_bc_far_variant3_at(instruction_addr)) {
 488     // variant 3, far cond branch to the next instruction, already patched to nops:
 489     //
 490     //    nop
 491     //    endgroup
 492     //  SKIP/DEST:
 493     //
 494     return;
 495   }
 496 
 497   // first, extract boint and biint from the current branch
 498   int boint = 0;
 499   int biint = 0;
 500 
 501   ResourceMark rm;
 502   const int code_size = 2 * BytesPerInstWord;
 503   CodeBuffer buf(instruction_addr, code_size);
 504   MacroAssembler masm(&buf);
 505   if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) {
 506     // Far branch to next instruction: Optimize it by patching nops (produce variant 3).
 507     masm.nop();
 508     masm.endgroup();
 509   } else {
 510     if (is_bc_far_variant1_at(instruction_addr)) {
 511       // variant 1, the 1st instruction contains the destination address:
 512       //
 513       //    bcxx  DEST
 514       //    nop
 515       //
 516       const int instruction_1 = *(int*)(instruction_addr);
 517       boint = inv_bo_field(instruction_1);
 518       biint = inv_bi_field(instruction_1);
 519     } else if (is_bc_far_variant2_at(instruction_addr)) {
 520       // variant 2, the 2nd instruction contains the destination address:
 521       //
 522       //    b!cxx SKIP
 523       //    bxx   DEST
 524       //  SKIP:
 525       //
 526       const int instruction_1 = *(int*)(instruction_addr);
 527       boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))),
 528           opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1))));
 529       biint = inv_bi_field(instruction_1);
 530     } else {
 531       // variant 4???
 532       ShouldNotReachHere();
 533     }
 534 
 535     // second, set the new branch destination and optimize the code
 536     if (dest != instruction_addr + 4 && // the bc_far is still unbound!
 537         masm.is_within_range_of_bcxx(dest, instruction_addr)) {
 538       // variant 1:
 539       //
 540       //    bcxx  DEST
 541       //    nop
 542       //
 543       masm.bc(boint, biint, dest);
 544       masm.nop();
 545     } else {
 546       // variant 2:
 547       //
 548       //    b!cxx SKIP
 549       //    bxx   DEST
 550       //  SKIP:
 551       //
 552       const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
 553                                                     opposite_bcond(inv_boint_bcond(boint)));
 554       const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord;
 555       masm.bc(opposite_boint, biint, not_taken_pc);
 556       masm.b(dest);
 557     }
 558   }
 559   ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
 560 }
 561 
 562 // Emit a patchable (NOT MT-safe) 64-bit absolute call/jump.
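// The emitted sequence is always 7 instructions long:
//   variant 2 (pc-relative):   b dest; 6 * nop     resp.   6 * nop; bl dest
//   variant 1b (toc-relative): mr R0,R11; addis/addi R11 (from global TOC);
//                              mtctr R11; mr R11,R0; nop; bctr resp. bctrl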
 563 void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) {
 564   // get current pc
 565   uint64_t start_pc = (uint64_t) pc();
 566 
 567   const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last
 568   const address pc_of_b  = (address) (start_pc + (0*BytesPerInstWord)); // b is first
 569 
 570   // relocate here
 571   if (rt != relocInfo::none) {
 572     relocate(rt);
 573   }
 574 
 575   if ( ReoptimizeCallSequences &&
 576        (( link && is_within_range_of_b(dest, pc_of_bl)) ||
 577         (!link && is_within_range_of_b(dest, pc_of_b)))) {
 578     // variant 2:
 579     // Emit an optimized, pc-relative call/jump.
 580 
 581     if (link) {
 582       // some padding
 583       nop();
 584       nop();
 585       nop();
 586       nop();
 587       nop();
 588       nop();
 589 
 590       // do the call
 591       assert(pc() == pc_of_bl, "just checking");
 592       bl(dest, relocInfo::none);
 593     } else {
 594       // do the jump
 595       assert(pc() == pc_of_b, "just checking");
 596       b(dest, relocInfo::none);
 597 
 598       // some padding
 599       nop();
 600       nop();
 601       nop();
 602       nop();
 603       nop();
 604       nop();
 605     }
 606 
 607     // Assert that we can identify the emitted call/jump.
 608     assert(is_bxx64_patchable_variant2_at((address)start_pc, link),
 609            "can't identify emitted call");
 610   } else {
 611     // variant 1:
 612     mr(R0, R11);  // spill R11 -> R0.
 613 
 614     // Load the destination address into CTR,
 615     // calculate destination relative to global toc.
 616     calculate_address_from_global_toc(R11, dest, true, true, false);
 617 
 618     mtctr(R11);
 619     mr(R11, R0);  // spill R11 <- R0.
 620     nop();
 621 
 622     // do the call/jump
 623     if (link) {
 624       bctrl();
 625     } else {
 626       bctr();
 627     }
 628     // Assert that we can identify the emitted call/jump.
 629     assert(is_bxx64_patchable_variant1b_at((address)start_pc, link),
 630            "can't identify emitted call");
 631   }
 632 
 633   // Assert that we can identify the emitted call/jump.
 634   assert(is_bxx64_patchable_at((address)start_pc, link),
 635          "can't identify emitted call");
 636   assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest,
 637          "wrong encoding of dest address");
 638 }
 639 
 640 // Identify a bxx64_patchable instruction.
 641 bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) {
 642   return is_bxx64_patchable_variant1b_at(instruction_addr, link)
 643     //|| is_bxx64_patchable_variant1_at(instruction_addr, link)
 644       || is_bxx64_patchable_variant2_at(instruction_addr, link);
 645 }
 646 
 647 // Does the call64_patchable instruction use a pc-relative encoding of
 648 // the call destination?
 649 bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) {
 650   // variant 2 is pc-relative
 651   return is_bxx64_patchable_variant2_at(instruction_addr, link);
 652 }
 653 
 654 // Identify variant 1.
 655 bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) {
 656   unsigned int* instr = (unsigned int*) instruction_addr;
 657   return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
 658     && is_mtctr(instr[5]) // mtctr
 659     && is_load_const_at(instruction_addr);
 660 }
 661 
 662 // Identify variant 1b: load destination relative to global toc.
 663 bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) {
 664   unsigned int* instr = (unsigned int*) instruction_addr;
 665   return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
 666     && is_mtctr(instr[3]) // mtctr
 667     && is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr);
 668 }
 669 
 670 // Identify variant 2.
 671 bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) {
 672   unsigned int* instr = (unsigned int*) instruction_addr;
 673   if (link) {
 674     return is_bl (instr[6])  // bl dest is last
 675       && is_nop(instr[0])  // nop
 676       && is_nop(instr[1])  // nop
 677       && is_nop(instr[2])  // nop
 678       && is_nop(instr[3])  // nop
 679       && is_nop(instr[4])  // nop
 680       && is_nop(instr[5]); // nop
 681   } else {
 682     return is_b  (instr[0])  // b  dest is first
 683       && is_nop(instr[1])  // nop
 684       && is_nop(instr[2])  // nop
 685       && is_nop(instr[3])  // nop
 686       && is_nop(instr[4])  // nop
 687       && is_nop(instr[5])  // nop
 688       && is_nop(instr[6]); // nop
 689   }
 690 }
 691 
 692 // Set dest address of a bxx64_patchable instruction.
 693 void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) {
 694   ResourceMark rm;
 695   int code_size = MacroAssembler::bxx64_patchable_size;
 696   CodeBuffer buf(instruction_addr, code_size);
 697   MacroAssembler masm(&buf);
 698   masm.bxx64_patchable(dest, relocInfo::none, link);
 699   ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
 700 }
 701 
 702 // Get dest address of a bxx64_patchable instruction.
 703 address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) {
 704   if (is_bxx64_patchable_variant1_at(instruction_addr, link)) {
 705     return (address) (unsigned long) get_const(instruction_addr);
 706   } else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) {
 707     unsigned int* instr = (unsigned int*) instruction_addr;
 708     if (link) {
 709       const int instr_idx = 6; // bl is last
 710       int branchoffset = branch_destination(instr[instr_idx], 0);
 711       return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
 712     } else {
 713       const int instr_idx = 0; // b is first
 714       int branchoffset = branch_destination(instr[instr_idx], 0);
 715       return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
 716     }
 717   // Load dest relative to global toc.
 718   } else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) {
 719     return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord,
 720                                                                instruction_addr);
 721   } else {
 722     ShouldNotReachHere();
 723     return NULL;
 724   }
 725 }
 726 
 727 // Uses ordering which corresponds to ABI:
 728 //    _savegpr0_14:  std  r14,-144(r1)
 729 //    _savegpr0_15:  std  r15,-136(r1)
 730 //    _savegpr0_16:  std  r16,-128(r1)
 731 void MacroAssembler::save_nonvolatile_gprs(Register dst, int offset) {
 732   std(R14, offset, dst);   offset += 8;
 733   std(R15, offset, dst);   offset += 8;
 734   std(R16, offset, dst);   offset += 8;
 735   std(R17, offset, dst);   offset += 8;
 736   std(R18, offset, dst);   offset += 8;
 737   std(R19, offset, dst);   offset += 8;
 738   std(R20, offset, dst);   offset += 8;
 739   std(R21, offset, dst);   offset += 8;
 740   std(R22, offset, dst);   offset += 8;
 741   std(R23, offset, dst);   offset += 8;
 742   std(R24, offset, dst);   offset += 8;
 743   std(R25, offset, dst);   offset += 8;
 744   std(R26, offset, dst);   offset += 8;
 745   std(R27, offset, dst);   offset += 8;
 746   std(R28, offset, dst);   offset += 8;
 747   std(R29, offset, dst);   offset += 8;
 748   std(R30, offset, dst);   offset += 8;
 749   std(R31, offset, dst);   offset += 8;
 750 
 751   stfd(F14, offset, dst);   offset += 8;
 752   stfd(F15, offset, dst);   offset += 8;
 753   stfd(F16, offset, dst);   offset += 8;
 754   stfd(F17, offset, dst);   offset += 8;
 755   stfd(F18, offset, dst);   offset += 8;
 756   stfd(F19, offset, dst);   offset += 8;
 757   stfd(F20, offset, dst);   offset += 8;
 758   stfd(F21, offset, dst);   offset += 8;
 759   stfd(F22, offset, dst);   offset += 8;
 760   stfd(F23, offset, dst);   offset += 8;
 761   stfd(F24, offset, dst);   offset += 8;
 762   stfd(F25, offset, dst);   offset += 8;
 763   stfd(F26, offset, dst);   offset += 8;
 764   stfd(F27, offset, dst);   offset += 8;
 765   stfd(F28, offset, dst);   offset += 8;
 766   stfd(F29, offset, dst);   offset += 8;
 767   stfd(F30, offset, dst);   offset += 8;
 768   stfd(F31, offset, dst);
 769 }
 770 
 771 // Uses ordering which corresponds to ABI:
 772 //    _restgpr0_14:  ld   r14,-144(r1)
 773 //    _restgpr0_15:  ld   r15,-136(r1)
 774 //    _restgpr0_16:  ld   r16,-128(r1)
 775 void MacroAssembler::restore_nonvolatile_gprs(Register src, int offset) {
 776   ld(R14, offset, src);   offset += 8;
 777   ld(R15, offset, src);   offset += 8;
 778   ld(R16, offset, src);   offset += 8;
 779   ld(R17, offset, src);   offset += 8;
 780   ld(R18, offset, src);   offset += 8;
 781   ld(R19, offset, src);   offset += 8;
 782   ld(R20, offset, src);   offset += 8;
 783   ld(R21, offset, src);   offset += 8;
 784   ld(R22, offset, src);   offset += 8;
 785   ld(R23, offset, src);   offset += 8;
 786   ld(R24, offset, src);   offset += 8;
 787   ld(R25, offset, src);   offset += 8;
 788   ld(R26, offset, src);   offset += 8;
 789   ld(R27, offset, src);   offset += 8;
 790   ld(R28, offset, src);   offset += 8;
 791   ld(R29, offset, src);   offset += 8;
 792   ld(R30, offset, src);   offset += 8;
 793   ld(R31, offset, src);   offset += 8;
 794 
 795   // FP registers
 796   lfd(F14, offset, src);   offset += 8;
 797   lfd(F15, offset, src);   offset += 8;
 798   lfd(F16, offset, src);   offset += 8;
 799   lfd(F17, offset, src);   offset += 8;
 800   lfd(F18, offset, src);   offset += 8;
 801   lfd(F19, offset, src);   offset += 8;
 802   lfd(F20, offset, src);   offset += 8;
 803   lfd(F21, offset, src);   offset += 8;
 804   lfd(F22, offset, src);   offset += 8;
 805   lfd(F23, offset, src);   offset += 8;
 806   lfd(F24, offset, src);   offset += 8;
 807   lfd(F25, offset, src);   offset += 8;
 808   lfd(F26, offset, src);   offset += 8;
 809   lfd(F27, offset, src);   offset += 8;
 810   lfd(F28, offset, src);   offset += 8;
 811   lfd(F29, offset, src);   offset += 8;
 812   lfd(F30, offset, src);   offset += 8;
 813   lfd(F31, offset, src);
 814 }
 815 
 816 // For verify_oops.
 817 void MacroAssembler::save_volatile_gprs(Register dst, int offset) {
 818   std(R2,  offset, dst);   offset += 8;
 819   std(R3,  offset, dst);   offset += 8;
 820   std(R4,  offset, dst);   offset += 8;
 821   std(R5,  offset, dst);   offset += 8;
 822   std(R6,  offset, dst);   offset += 8;
 823   std(R7,  offset, dst);   offset += 8;
 824   std(R8,  offset, dst);   offset += 8;
 825   std(R9,  offset, dst);   offset += 8;
 826   std(R10, offset, dst);   offset += 8;
 827   std(R11, offset, dst);   offset += 8;
 828   std(R12, offset, dst);   offset += 8;
 829 
 830   stfd(F0, offset, dst);   offset += 8;
 831   stfd(F1, offset, dst);   offset += 8;
 832   stfd(F2, offset, dst);   offset += 8;
 833   stfd(F3, offset, dst);   offset += 8;
 834   stfd(F4, offset, dst);   offset += 8;
 835   stfd(F5, offset, dst);   offset += 8;
 836   stfd(F6, offset, dst);   offset += 8;
 837   stfd(F7, offset, dst);   offset += 8;
 838   stfd(F8, offset, dst);   offset += 8;
 839   stfd(F9, offset, dst);   offset += 8;
 840   stfd(F10, offset, dst);  offset += 8;
 841   stfd(F11, offset, dst);  offset += 8;
 842   stfd(F12, offset, dst);  offset += 8;
 843   stfd(F13, offset, dst);
 844 }
 845 
 846 // For verify_oops.
 847 void MacroAssembler::restore_volatile_gprs(Register src, int offset) {
 848   ld(R2,  offset, src);   offset += 8;
 849   ld(R3,  offset, src);   offset += 8;
 850   ld(R4,  offset, src);   offset += 8;
 851   ld(R5,  offset, src);   offset += 8;
 852   ld(R6,  offset, src);   offset += 8;
 853   ld(R7,  offset, src);   offset += 8;
 854   ld(R8,  offset, src);   offset += 8;
 855   ld(R9,  offset, src);   offset += 8;
 856   ld(R10, offset, src);   offset += 8;
 857   ld(R11, offset, src);   offset += 8;
 858   ld(R12, offset, src);   offset += 8;
 859 
 860   lfd(F0, offset, src);   offset += 8;
 861   lfd(F1, offset, src);   offset += 8;
 862   lfd(F2, offset, src);   offset += 8;
 863   lfd(F3, offset, src);   offset += 8;
 864   lfd(F4, offset, src);   offset += 8;
 865   lfd(F5, offset, src);   offset += 8;
 866   lfd(F6, offset, src);   offset += 8;
 867   lfd(F7, offset, src);   offset += 8;
 868   lfd(F8, offset, src);   offset += 8;
 869   lfd(F9, offset, src);   offset += 8;
 870   lfd(F10, offset, src);  offset += 8;
 871   lfd(F11, offset, src);  offset += 8;
 872   lfd(F12, offset, src);  offset += 8;
 873   lfd(F13, offset, src);
 874 }
 875 
 876 void MacroAssembler::save_LR_CR(Register tmp) {
 877   mfcr(tmp);
 878   std(tmp, _abi(cr), R1_SP);
 879   mflr(tmp);
 880   std(tmp, _abi(lr), R1_SP);
 881   // Tmp must contain lr on exit! (see return_addr and prolog in ppc64.ad)
 882 }
 883 
 884 void MacroAssembler::restore_LR_CR(Register tmp) {
 885   assert(tmp != R1_SP, "must be distinct");
 886   ld(tmp, _abi(lr), R1_SP);
 887   mtlr(tmp);
 888   ld(tmp, _abi(cr), R1_SP);
 889   mtcr(tmp);
 890 }
 891 
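// Return the current pc by branch-and-linking to the immediately following
// instruction: the bl puts that address into LR, from where it is copied
// into `result'. Clobbers LR, hence "trash_LR".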
 892 address MacroAssembler::get_PC_trash_LR(Register result) {
 893   Label L;
 894   bl(L);
 895   bind(L);
 896   address lr_pc = pc();
 897   mflr(result);
 898   return lr_pc;
 899 }
 900 
 901 void MacroAssembler::resize_frame(Register offset, Register tmp) {
 902 #ifdef ASSERT
 903   assert_different_registers(offset, tmp, R1_SP);
 904   andi_(tmp, offset, frame::alignment_in_bytes-1);
 905   asm_assert_eq("resize_frame: unaligned", 0x204);
 906 #endif
 907 
 908   // tmp <- *(SP)
 909   ld(tmp, _abi(callers_sp), R1_SP);
 910   // addr <- SP + offset;
 911   // *(addr) <- tmp;
 912   // SP <- addr
 913   stdux(tmp, R1_SP, offset);
 914 }
 915 
 916 void MacroAssembler::resize_frame(int offset, Register tmp) {
 917   assert(is_simm(offset, 16), "too big an offset");
 918   assert_different_registers(tmp, R1_SP);
 919   assert((offset & (frame::alignment_in_bytes-1))==0, "resize_frame: unaligned");
 920   // tmp <- *(SP)
 921   ld(tmp, _abi(callers_sp), R1_SP);
 922   // addr <- SP + offset;
 923   // *(addr) <- tmp;
 924   // SP <- addr
 925   stdu(tmp, offset, R1_SP);
 926 }
 927 
 928 void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) {
 929   // (addr == tmp1) || (addr == tmp2) is allowed here!
 930   assert(tmp1 != tmp2, "must be distinct");
 931 
 932   // compute offset w.r.t. current stack pointer
 933   // tmp_1 <- addr - SP (!)
 934   subf(tmp1, R1_SP, addr);
 935 
 936   // atomically update SP keeping back link.
 937   resize_frame(tmp1/* offset */, tmp2/* tmp */);
 938 }
 939 
 940 void MacroAssembler::push_frame(Register bytes, Register tmp) {
 941 #ifdef ASSERT
 942   assert(bytes != R0, "r0 not allowed here");
 943   andi_(R0, bytes, frame::alignment_in_bytes-1);
 944   asm_assert_eq("push_frame(Reg, Reg): unaligned", 0x203);
 945 #endif
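  // Store the old SP as back link at the new stack top and update SP,
  // both in a single stdux.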
 946   neg(tmp, bytes);
 947   stdux(R1_SP, R1_SP, tmp);
 948 }
 949 
 950 // Push a frame of size `bytes'.
 951 void MacroAssembler::push_frame(unsigned int bytes, Register tmp) {
 952   long offset = align_addr(bytes, frame::alignment_in_bytes);
 953   if (is_simm(-offset, 16)) {
 954     stdu(R1_SP, -offset, R1_SP);
 955   } else {
 956     load_const_optimized(tmp, -offset);
 957     stdux(R1_SP, R1_SP, tmp);
 958   }
 959 }
 960 
 961 // Push a frame of size `bytes' plus abi_reg_args on top.
 962 void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) {
 963   push_frame(bytes + frame::abi_reg_args_size, tmp);
 964 }
 965 
 966 // Set up a new C frame with a spill area for non-volatile GPRs and
 967 // additional space for local variables.
 968 void MacroAssembler::push_frame_reg_args_nonvolatiles(unsigned int bytes,
 969                                                       Register tmp) {
 970   push_frame(bytes + frame::abi_reg_args_size + frame::spill_nonvolatiles_size, tmp);
 971 }
 972 
 973 // Pop current C frame.
 974 void MacroAssembler::pop_frame() {
 975   ld(R1_SP, _abi(callers_sp), R1_SP);
 976 }
 977 
 978 #if defined(ABI_ELFv2)
 979 address MacroAssembler::branch_to(Register r_function_entry, bool and_link) {
 980   // TODO(asmundak): make sure the caller uses R12 as function descriptor
 981   // most of the time.
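  // The ELFv2 ABI expects the entry point of the callee in R12, which the
  // callee's global entry point uses to establish its TOC pointer.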
 982   if (R12 != r_function_entry) {
 983     mr(R12, r_function_entry);
 984   }
 985   mtctr(R12);
 986   // Do a call or a branch.
 987   if (and_link) {
 988     bctrl();
 989   } else {
 990     bctr();
 991   }
 992   _last_calls_return_pc = pc();
 993 
 994   return _last_calls_return_pc;
 995 }
 996 
 997 // Call a C function via a function descriptor and use full C
 998 // calling conventions. Updates and returns _last_calls_return_pc.
 999 address MacroAssembler::call_c(Register r_function_entry) {
1000   return branch_to(r_function_entry, /*and_link=*/true);
1001 }
1002 
1003 // For tail calls: only branch, don't link, so callee returns to caller of this function.
1004 address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) {
1005   return branch_to(r_function_entry, /*and_link=*/false);
1006 }
1007 
1008 address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) {
1009   load_const(R12, function_entry, R0);
1010   return branch_to(R12,  /*and_link=*/true);
1011 }
1012 
1013 #else
1014 // Generic version of a call to C function via a function descriptor
1015 // with variable support for C calling conventions (TOC, ENV, etc.).
1016 // Updates and returns _last_calls_return_pc.
1017 address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call,
1018                                   bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) {
1019   // we emit standard ptrgl glue code here
1020   assert((function_descriptor != R0), "function_descriptor cannot be R0");
1021 
1022   // retrieve necessary entries from the function descriptor
1023   ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor);
1024   mtctr(R0);
1025 
1026   if (load_toc_of_callee) {
1027     ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor);
1028   }
1029   if (load_env_of_callee) {
1030     ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor);
1031   } else if (load_toc_of_callee) {
1032     li(R11, 0);
1033   }
1034 
1035   // do a call or a branch
1036   if (and_link) {
1037     bctrl();
1038   } else {
1039     bctr();
1040   }
1041   _last_calls_return_pc = pc();
1042 
1043   return _last_calls_return_pc;
1044 }
1045 
1046 // Call a C function via a function descriptor and use full C calling
1047 // conventions.
1048 // We don't use the TOC in generated code, so there is no need to save
1049 // and restore its value.
1050 address MacroAssembler::call_c(Register fd) {
1051   return branch_to(fd, /*and_link=*/true,
1052                        /*save toc=*/false,
1053                        /*restore toc=*/false,
1054                        /*load toc=*/true,
1055                        /*load env=*/true);
1056 }
1057 
1058 address MacroAssembler::call_c_and_return_to_caller(Register fd) {
1059   return branch_to(fd, /*and_link=*/false,
1060                        /*save toc=*/false,
1061                        /*restore toc=*/false,
1062                        /*load toc=*/true,
1063                        /*load env=*/true);
1064 }
1065 
1066 address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) {
1067   if (rt != relocInfo::none) {
1068     // this call needs to be relocatable
1069     if (!ReoptimizeCallSequences
1070         || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
1071         || fd == NULL   // support code-size estimation
1072         || !fd->is_friend_function()
1073         || fd->entry() == NULL) {
1074       // it's not a friend function as defined by class FunctionDescriptor,
1075       // so do a full call-c here.
1076       load_const(R11, (address)fd, R0);
1077 
1078       bool has_env = (fd != NULL && fd->env() != NULL);
1079       return branch_to(R11, /*and_link=*/true,
1080                             /*save toc=*/false,
1081                             /*restore toc=*/false,
1082                             /*load toc=*/true,
1083                             /*load env=*/has_env);
1084     } else {
1085       // It's a friend function. Load the entry point and don't care about
1086       // toc and env. Use an optimizable call instruction, but ensure the
1087       // same code-size as in the case of a non-friend function.
1088       nop();
1089       nop();
1090       nop();
1091       bl64_patchable(fd->entry(), rt);
1092       _last_calls_return_pc = pc();
1093       return _last_calls_return_pc;
1094     }
1095   } else {
1096     // This call does not need to be relocatable, do more aggressive
1097     // optimizations.
1098     if (!ReoptimizeCallSequences
1099       || !fd->is_friend_function()) {
1100       // It's not a friend function as defined by class FunctionDescriptor,
1101       // so do a full call-c here.
1102       load_const(R11, (address)fd, R0);
1103       return branch_to(R11, /*and_link=*/true,
1104                             /*save toc=*/false,
1105                             /*restore toc=*/false,
1106                             /*load toc=*/true,
1107                             /*load env=*/true);
1108     } else {
1109       // it's a friend function, load the entry point and don't care about
1110       // toc and env.
1111       address dest = fd->entry();
1112       if (is_within_range_of_b(dest, pc())) {
1113         bl(dest);
1114       } else {
1115         bl64_patchable(dest, rt);
1116       }
1117       _last_calls_return_pc = pc();
1118       return _last_calls_return_pc;
1119     }
1120   }
1121 }
1122 
1123 // Call a C function.  All constants needed reside in TOC.
1124 //
1125 // Read the address to call from the TOC.
1126 // Read env from TOC, if fd specifies an env.
1127 // Read new TOC from TOC.
1128 address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd,
1129                                          relocInfo::relocType rt, Register toc) {
1130   if (!ReoptimizeCallSequences
1131     || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
1132     || !fd->is_friend_function()) {
1133     // It's not a friend function as defined by class FunctionDescriptor,
1134     // so do a full call-c here.
1135     assert(fd->entry() != NULL, "function must be linked");
1136 
1137     AddressLiteral fd_entry(fd->entry());
1138     bool success = load_const_from_method_toc(R11, fd_entry, toc, /*fixed_size*/ true);
1139     mtctr(R11);
1140     if (fd->env() == NULL) {
1141       li(R11, 0);
1142       nop();
1143     } else {
1144       AddressLiteral fd_env(fd->env());
1145       success = success && load_const_from_method_toc(R11, fd_env, toc, /*fixed_size*/ true);
1146     }
1147     AddressLiteral fd_toc(fd->toc());
1148     // Set R2_TOC (load from toc)
1149     success = success && load_const_from_method_toc(R2_TOC, fd_toc, toc, /*fixed_size*/ true);
1150     bctrl();
1151     _last_calls_return_pc = pc();
1152     if (!success) { return NULL; }
1153   } else {
1154     // It's a friend function, load the entry point and don't care about
1155     // toc and env. Use an optimizable call instruction, but ensure the
1156     // same code-size as in the case of a non-friend function.
1157     nop();
1158     bl64_patchable(fd->entry(), rt);
1159     _last_calls_return_pc = pc();
1160   }
1161   return _last_calls_return_pc;
1162 }
1163 #endif // ABI_ELFv2
1164 
1165 void MacroAssembler::call_VM_base(Register oop_result,
1166                                   Register last_java_sp,
1167                                   address  entry_point,
1168                                   bool     check_exceptions) {
1169   BLOCK_COMMENT("call_VM {");
1170   // Determine last_java_sp register.
1171   if (!last_java_sp->is_valid()) {
1172     last_java_sp = R1_SP;
1173   }
1174   set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1);
1175 
1176   // ARG1 must hold thread address.
1177   mr(R3_ARG1, R16_thread);
1178 #if defined(ABI_ELFv2)
1179   address return_pc = call_c(entry_point, relocInfo::none);
1180 #else
1181   address return_pc = call_c((FunctionDescriptor*)entry_point, relocInfo::none);
1182 #endif
1183 
1184   reset_last_Java_frame();
1185 
1186   // Check for pending exceptions.
1187   if (check_exceptions) {
1188     // We don't check for exceptions here.
1189     ShouldNotReachHere();
1190   }
1191 
1192   // Get oop result if there is one and reset the value in the thread.
1193   if (oop_result->is_valid()) {
1194     get_vm_result(oop_result);
1195   }
1196 
1197   _last_calls_return_pc = return_pc;
1198   BLOCK_COMMENT("} call_VM");
1199 }
1200 
1201 void MacroAssembler::call_VM_leaf_base(address entry_point) {
1202   BLOCK_COMMENT("call_VM_leaf {");
1203 #if defined(ABI_ELFv2)
1204   call_c(entry_point, relocInfo::none);
1205 #else
1206   call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, entry_point), relocInfo::none);
1207 #endif
1208   BLOCK_COMMENT("} call_VM_leaf");
1209 }
1210 
1211 void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions) {
1212   call_VM_base(oop_result, noreg, entry_point, check_exceptions);
1213 }
1214 
1215 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1,
1216                              bool check_exceptions) {
1217   // R3_ARG1 is reserved for the thread.
1218   mr_if_needed(R4_ARG2, arg_1);
1219   call_VM(oop_result, entry_point, check_exceptions);
1220 }
1221 
1222 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2,
1223                              bool check_exceptions) {
1224   // R3_ARG1 is reserved for the thread
1225   mr_if_needed(R4_ARG2, arg_1);
1226   assert(arg_2 != R4_ARG2, "smashed argument");
1227   mr_if_needed(R5_ARG3, arg_2);
1228   call_VM(oop_result, entry_point, check_exceptions);
1229 }
1230 
1231 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3,
1232                              bool check_exceptions) {
1233   // R3_ARG1 is reserved for the thread
1234   mr_if_needed(R4_ARG2, arg_1);
1235   assert(arg_2 != R4_ARG2, "smashed argument");
1236   mr_if_needed(R5_ARG3, arg_2);
1237   mr_if_needed(R6_ARG4, arg_3);
1238   call_VM(oop_result, entry_point, check_exceptions);
1239 }
1240 
1241 void MacroAssembler::call_VM_leaf(address entry_point) {
1242   call_VM_leaf_base(entry_point);
1243 }
1244 
1245 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) {
1246   mr_if_needed(R3_ARG1, arg_1);
1247   call_VM_leaf(entry_point);
1248 }
1249 
1250 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) {
1251   mr_if_needed(R3_ARG1, arg_1);
1252   assert(arg_2 != R3_ARG1, "smashed argument");
1253   mr_if_needed(R4_ARG2, arg_2);
1254   call_VM_leaf(entry_point);
1255 }
1256 
1257 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) {
1258   mr_if_needed(R3_ARG1, arg_1);
1259   assert(arg_2 != R3_ARG1, "smashed argument");
1260   mr_if_needed(R4_ARG2, arg_2);
1261   assert(arg_3 != R3_ARG1 && arg_3 != R4_ARG2, "smashed argument");
1262   mr_if_needed(R5_ARG3, arg_3);
1263   call_VM_leaf(entry_point);
1264 }
1265 
1266 // Check whether instruction is a read access to the polling page
1267 // which was emitted by load_from_polling_page(..).
1268 bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext,
1269                                                address* polling_address_ptr) {
1270   if (!is_ld(instruction))
1271     return false; // It's not a ld. Fail.
1272 
1273   int rt = inv_rt_field(instruction);
1274   int ra = inv_ra_field(instruction);
1275   int ds = inv_ds_field(instruction);
1276   if (!(ds == 0 && ra != 0 && rt == 0)) {
1277     return false; // It's not a ld(r0, X, ra). Fail.
1278   }
1279 
1280   if (!ucontext) {
1281     // Set polling address.
1282     if (polling_address_ptr != NULL) {
1283       *polling_address_ptr = NULL;
1284     }
1285     return true; // No ucontext given. Can't check value of ra. Assume true.
1286   }
1287 
1288 #ifdef LINUX
1289   // Ucontext given. Check that register ra contains the address of
1290   // the safepoint polling page.
1291   ucontext_t* uc = (ucontext_t*) ucontext;
1292   // Set polling address.
1293   address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds;
1294   if (polling_address_ptr != NULL) {
1295     *polling_address_ptr = addr;
1296   }
1297   return os::is_poll_address(addr);
1298 #else
1299   // Not on Linux, ucontext must be NULL.
1300   ShouldNotReachHere();
1301   return false;
1302 #endif
1303 }
1304 
1305 bool MacroAssembler::is_memory_serialization(int instruction, JavaThread* thread, void* ucontext) {
1306 #ifdef LINUX
1307   ucontext_t* uc = (ucontext_t*) ucontext;
1308 
1309   if (is_stwx(instruction) || is_stwux(instruction)) {
1310     int ra = inv_ra_field(instruction);
1311     int rb = inv_rb_field(instruction);
1312 
1313     // look up content of ra and rb in ucontext
1314     address ra_val=(address)uc->uc_mcontext.regs->gpr[ra];
1315     long rb_val=(long)uc->uc_mcontext.regs->gpr[rb];
1316     return os::is_memory_serialize_page(thread, ra_val+rb_val);
1317   } else if (is_stw(instruction) || is_stwu(instruction)) {
1318     int ra = inv_ra_field(instruction);
1319     int d1 = inv_d1_field(instruction);
1320 
1321     // look up content of ra in ucontext
1322     address ra_val=(address)uc->uc_mcontext.regs->gpr[ra];
1323     return os::is_memory_serialize_page(thread, ra_val+d1);
1324   } else {
1325     return false;
1326   }
1327 #else
1328   // workaround not needed on !LINUX :-)
1329   ShouldNotCallThis();
1330   return false;
1331 #endif
1332 }
1333 
1334 void MacroAssembler::bang_stack_with_offset(int offset) {
1335   // When increasing the stack, the old stack pointer will be written
1336   // to the new top of stack according to the PPC64 abi.
1337   // Therefore, stack banging is not necessary when increasing
1338   // the stack by <= os::vm_page_size() bytes.
1339   // When increasing the stack by a larger amount, this method is
1340   // called repeatedly to bang the intermediate pages.
1341 
1342   // Stack grows down, caller passes positive offset.
1343   assert(offset > 0, "must bang with positive offset");
1344 
1345   long stdoffset = -offset;
1346 
1347   if (is_simm(stdoffset, 16)) {
1348     // Signed 16 bit offset, a simple std is ok.
1349     if (UseLoadInstructionsForStackBangingPPC64) {
1350       ld(R0, (int)(signed short)stdoffset, R1_SP);
1351     } else {
1352       std(R0, (int)(signed short)stdoffset, R1_SP);
1353     }
1354   } else if (is_simm(stdoffset, 31)) {
1355     const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset);
1356     const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset);
1357 
1358     Register tmp = R11;
1359     addis(tmp, R1_SP, hi);
1360     if (UseLoadInstructionsForStackBangingPPC64) {
1361       ld(R0,  lo, tmp);
1362     } else {
1363       std(R0, lo, tmp);
1364     }
1365   } else {
1366     ShouldNotReachHere();
1367   }
1368 }
1369 
1370 // If instruction is a stack bang of the form
1371 //    std    R0,    x(Ry),       (see bang_stack_with_offset())
1372 //    stdu   R1_SP, x(R1_SP),    (see push_frame(), resize_frame())
1373 // or stdux  R1_SP, Rx, R1_SP    (see push_frame(), resize_frame())
1374 // return the banged address. Otherwise, return 0.
1375 address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) {
1376 #ifdef LINUX
1377   ucontext_t* uc = (ucontext_t*) ucontext;
1378   int rs = inv_rs_field(instruction);
1379   int ra = inv_ra_field(instruction);
1380   if (   (is_ld(instruction)   && rs == 0 &&  UseLoadInstructionsForStackBangingPPC64)
1381       || (is_std(instruction)  && rs == 0 && !UseLoadInstructionsForStackBangingPPC64)
1382       || (is_stdu(instruction) && rs == 1)) {
1383     int ds = inv_ds_field(instruction);
1384     // return banged address
1385     return ds+(address)uc->uc_mcontext.regs->gpr[ra];
1386   } else if (is_stdux(instruction) && rs == 1) {
1387     int rb = inv_rb_field(instruction);
1388     address sp = (address)uc->uc_mcontext.regs->gpr[1];
1389     long rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
1390     return ra != 1 || rb_val >= 0 ? NULL         // not a stack bang
1391                                   : sp + rb_val; // banged address
1392   }
1393   return NULL; // not a stack bang
1394 #else
1395   // workaround not needed on !LINUX :-)
1396   ShouldNotCallThis();
1397   return NULL;
1398 #endif
1399 }
1400 
1401 void MacroAssembler::reserved_stack_check(Register return_pc) {
1402   // Test if reserved zone needs to be enabled.
1403   Label no_reserved_zone_enabling;
1404 
1405   ld_ptr(R0, JavaThread::reserved_stack_activation_offset(), R16_thread);
1406   cmpld(CCR0, R1_SP, R0);
1407   blt_predict_taken(CCR0, no_reserved_zone_enabling);
1408 
1409   // Enable reserved zone again, throw stack overflow exception.
1410   push_frame_reg_args(0, R0);
1411   call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), R16_thread);
1412   pop_frame();
1413   mtlr(return_pc);
1414   load_const_optimized(R0, StubRoutines::throw_delayed_StackOverflowError_entry());
1415   mtctr(R0);
1416   bctr();
1417 
1418   should_not_reach_here();
1419 
1420   bind(no_reserved_zone_enabling);
1421 }
1422 
1423 void MacroAssembler::getandsetd(Register dest_current_value, Register exchange_value, Register addr_base,
1424                                 bool cmpxchgx_hint) {
1425   Label retry;
1426   bind(retry);
1427   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1428   stdcx_(exchange_value, addr_base);
1429   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1430     bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1431   } else {
1432     bne(                  CCR0, retry); // StXcx_ sets CCR0.
1433   }
1434 }
1435 
1436 void MacroAssembler::getandaddd(Register dest_current_value, Register inc_value, Register addr_base,
1437                                 Register tmp, bool cmpxchgx_hint) {
1438   Label retry;
1439   bind(retry);
1440   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1441   add(tmp, dest_current_value, inc_value);
1442   stdcx_(tmp, addr_base);
1443   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1444     bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1445   } else {
1446     bne(                  CCR0, retry); // StXcx_ sets CCR0.
1447   }
1448 }
1449 
1450 // Word/sub-word atomic helper functions
1451 
1452 // Temps and addr_base are killed if size < 4 and the processor does not support the respective instructions.
1453 // Only signed types are supported with size < 4.
1454 // Atomic add always kills tmp1.
1455 void MacroAssembler::atomic_get_and_modify_generic(Register dest_current_value, Register exchange_value,
1456                                                    Register addr_base, Register tmp1, Register tmp2, Register tmp3,
1457                                                    bool cmpxchgx_hint, bool is_add, int size) {
1458   // Sub-word instructions are available since Power 8.
1459   // For older processors, instruction_type != size holds, and we
1460   // emulate the sub-word instructions by constructing a 4-byte value
1461   // that leaves the other bytes unchanged.
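     // In C-like pseudo code (hedged sketch), the setup below computes, for an unaligned
     // address 'a':  aligned_addr = a & ~3;  shift_amount = (a & 3) * 8  on little endian
     // (big endian uses the mirrored byte/short position), so that the interesting
     // byte/short can be extracted from and re-inserted into the aligned 32-bit word.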
1462   const int instruction_type = VM_Version::has_lqarx() ? size : 4;
1463 
1464   Label retry;
1465   Register shift_amount = noreg,
1466            val32 = dest_current_value,
1467            modval = is_add ? tmp1 : exchange_value;
1468 
1469   if (instruction_type != size) {
1470     assert_different_registers(tmp1, tmp2, tmp3, dest_current_value, exchange_value, addr_base);
1471     modval = tmp1;
1472     shift_amount = tmp2;
1473     val32 = tmp3;
1474     // Need some preparation: compute shift amount, align address. Note: shorts must be 2-byte aligned.
1475 #ifdef VM_LITTLE_ENDIAN
1476     rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
1477     clrrdi(addr_base, addr_base, 2);
1478 #else
1479     xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
1480     clrrdi(addr_base, addr_base, 2);
1481     rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
1482 #endif
1483   }
1484 
1485   // atomic emulation loop
1486   bind(retry);
1487 
1488   switch (instruction_type) {
1489     case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
1490     case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
1491     case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
1492     default: ShouldNotReachHere();
1493   }
1494 
1495   if (instruction_type != size) {
1496     srw(dest_current_value, val32, shift_amount);
1497   }
1498 
1499   if (is_add) { add(modval, dest_current_value, exchange_value); }
1500 
1501   if (instruction_type != size) {
1502     // Transform exchange value such that the replacement can be done by one xor instruction.
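         // I.e. (hedged sketch, where 'mask' is 0xff or 0xffff and 'new_value' is the added
         // or exchanged value, both shorthand for this comment only):
         //   modval = val32 ^ (((dest_current_value ^ new_value) & mask) << shift_amount)
         // so only the selected byte/short within the 4-byte word is replaced.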
1503     xorr(modval, dest_current_value, is_add ? modval : exchange_value);
1504     clrldi(modval, modval, (size == 1) ? 56 : 48);
1505     slw(modval, modval, shift_amount);
1506     xorr(modval, val32, modval);
1507   }
1508 
1509   switch (instruction_type) {
1510     case 4: stwcx_(modval, addr_base); break;
1511     case 2: sthcx_(modval, addr_base); break;
1512     case 1: stbcx_(modval, addr_base); break;
1513     default: ShouldNotReachHere();
1514   }
1515 
1516   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1517     bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1518   } else {
1519     bne(                  CCR0, retry); // StXcx_ sets CCR0.
1520   }
1521 
1522   // l?arx zero-extends, but Java wants byte/short values sign-extended.
1523   if (size == 1) {
1524     extsb(dest_current_value, dest_current_value);
1525   } else if (size == 2) {
1526     extsh(dest_current_value, dest_current_value);
1527   }
1528 }
1529 
1530 // Temps, addr_base and exchange_value are killed if size < 4 and the processor does not support the respective instructions.
1531 // Only signed types are supported with size < 4.
1532 void MacroAssembler::cmpxchg_loop_body(ConditionRegister flag, Register dest_current_value,
1533                                        Register compare_value, Register exchange_value,
1534                                        Register addr_base, Register tmp1, Register tmp2,
1535                                        Label &retry, Label &failed, bool cmpxchgx_hint, int size) {
1536   // Sub-word instructions are available since Power 8.
1537   // For older processors, instruction_type != size holds, and we
1538   // emulate the sub-word instructions by constructing a 4-byte value
1539   // that leaves the other bytes unchanged.
1540   const int instruction_type = VM_Version::has_lqarx() ? size : 4;
1541 
1542   Register shift_amount = noreg,
1543            val32 = dest_current_value,
1544            modval = exchange_value;
1545 
1546   if (instruction_type != size) {
1547     assert_different_registers(tmp1, tmp2, dest_current_value, compare_value, exchange_value, addr_base);
1548     shift_amount = tmp1;
1549     val32 = tmp2;
1550     modval = tmp2;
1551     // Need some preparation: compute shift amount, align address. Note: shorts must be 2-byte aligned.
1552 #ifdef VM_LITTLE_ENDIAN
1553     rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
1554     clrrdi(addr_base, addr_base, 2);
1555 #else
1556     xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
1557     clrrdi(addr_base, addr_base, 2);
1558     rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
1559 #endif
1560     // Transform exchange value such that the replacement can be done by one xor instruction.
1561     xorr(exchange_value, compare_value, exchange_value);
1562     clrldi(exchange_value, exchange_value, (size == 1) ? 56 : 48);
1563     slw(exchange_value, exchange_value, shift_amount);
1564   }
1565 
1566   // atomic emulation loop
1567   bind(retry);
1568 
1569   switch (instruction_type) {
1570     case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
1571     case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
1572     case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
1573     default: ShouldNotReachHere();
1574   }
1575 
1576   if (instruction_type != size) {
1577     srw(dest_current_value, val32, shift_amount);
1578   }
1579   if (size == 1) {
1580     extsb(dest_current_value, dest_current_value);
1581   } else if (size == 2) {
1582     extsh(dest_current_value, dest_current_value);
1583   }
1584 
1585   cmpw(flag, dest_current_value, compare_value);
1586   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1587     bne_predict_not_taken(flag, failed);
1588   } else {
1589     bne(                  flag, failed);
1590   }
1591   // branch to done  => (flag == ne), (dest_current_value != compare_value)
1592   // fall through    => (flag == eq), (dest_current_value == compare_value)
1593 
1594   if (instruction_type != size) {
1595     xorr(modval, val32, exchange_value);
1596   }
1597 
1598   switch (instruction_type) {
1599     case 4: stwcx_(modval, addr_base); break;
1600     case 2: sthcx_(modval, addr_base); break;
1601     case 1: stbcx_(modval, addr_base); break;
1602     default: ShouldNotReachHere();
1603   }
1604 }
1605 
1606 // CmpxchgX sets condition register to cmpX(current, compare).
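     // The generic word/sub-word compare-exchange below implements, roughly (hedged sketch,
     // ignoring the memory-ordering flags in 'semantics' and the weak/contention_hint variants):
     //   dest_current_value = *addr_base;
     //   if (dest_current_value == compare_value) { *addr_base = exchange_value; /* success */ }
     //   else                                     { /* failure */ }
     //   if (int_flag_success != noreg) int_flag_success = success ? 1 : 0;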
1607 void MacroAssembler::cmpxchg_generic(ConditionRegister flag, Register dest_current_value,
1608                                      Register compare_value, Register exchange_value,
1609                                      Register addr_base, Register tmp1, Register tmp2,
1610                                      int semantics, bool cmpxchgx_hint,
1611                                      Register int_flag_success, bool contention_hint, bool weak, int size) {
1612   Label retry;
1613   Label failed;
1614   Label done;
1615 
1616   // Save one branch if result is returned via register and
1617   // result register is different from the other ones.
1618   bool use_result_reg    = (int_flag_success != noreg);
1619   bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value &&
1620                             int_flag_success != exchange_value && int_flag_success != addr_base &&
1621                             int_flag_success != tmp1 && int_flag_success != tmp2);
1622   assert(!weak || flag == CCR0, "weak only supported with CCR0");
1623   assert(size == 1 || size == 2 || size == 4, "unsupported");
1624 
1625   if (use_result_reg && preset_result_reg) {
1626     li(int_flag_success, 0); // preset (assume cas failed)
1627   }
1628 
1629   // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1630   if (contention_hint) { // Don't try to reserve if cmp fails.
1631     switch (size) {
1632       case 1: lbz(dest_current_value, 0, addr_base); extsb(dest_current_value, dest_current_value); break;
1633       case 2: lha(dest_current_value, 0, addr_base); break;
1634       case 4: lwz(dest_current_value, 0, addr_base); break;
1635       default: ShouldNotReachHere();
1636     }
1637     cmpw(flag, dest_current_value, compare_value);
1638     bne(flag, failed);
1639   }
1640 
1641   // release/fence semantics
1642   if (semantics & MemBarRel) {
1643     release();
1644   }
1645 
1646   cmpxchg_loop_body(flag, dest_current_value, compare_value, exchange_value, addr_base, tmp1, tmp2,
1647                     retry, failed, cmpxchgx_hint, size);
1648   if (!weak || use_result_reg) {
1649     if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1650       bne_predict_not_taken(CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
1651     } else {
1652       bne(                  CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
1653     }
1654   }
1655   // fall through    => (flag == eq), (dest_current_value == compare_value), (swapped)
1656 
1657   // Result in register (must do this at the end because int_flag_success can be the
1658   // same register as one above).
1659   if (use_result_reg) {
1660     li(int_flag_success, 1);
1661   }
1662 
1663   if (semantics & MemBarFenceAfter) {
1664     fence();
1665   } else if (semantics & MemBarAcq) {
1666     isync();
1667   }
1668 
1669   if (use_result_reg && !preset_result_reg) {
1670     b(done);
1671   }
1672 
1673   bind(failed);
1674   if (use_result_reg && !preset_result_reg) {
1675     li(int_flag_success, 0);
1676   }
1677 
1678   bind(done);
1679   // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1680   // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1681 }
1682 
1683 // Performs atomic compare exchange:
1684 //   if (compare_value == *addr_base)
1685 //     *addr_base = exchange_value
1686 //     int_flag_success = 1;
1687 //   else
1688 //     int_flag_success = 0;
1689 //
1690 // ConditionRegister flag       = cmp(compare_value, *addr_base)
1691 // Register dest_current_value  = *addr_base
1692 // Register compare_value       Used to compare with value in memory
1693 // Register exchange_value      Written to memory if compare_value == *addr_base
1694 // Register addr_base           The memory location to compareXChange
1695 // Register int_flag_success    Set to 1 if exchange_value was written to *addr_base
1696 //
1697 // To avoid the costly compare-exchange, the value is tested beforehand.
1698 // Several special cases exist to avoid generating unnecessary code.
1699 //
1700 void MacroAssembler::cmpxchgd(ConditionRegister flag,
1701                               Register dest_current_value, RegisterOrConstant compare_value, Register exchange_value,
1702                               Register addr_base, int semantics, bool cmpxchgx_hint,
1703                               Register int_flag_success, Label* failed_ext, bool contention_hint, bool weak) {
1704   Label retry;
1705   Label failed_int;
1706   Label& failed = (failed_ext != NULL) ? *failed_ext : failed_int;
1707   Label done;
1708 
1709   // Save one branch if result is returned via register and result register is different from the other ones.
1710   bool use_result_reg    = (int_flag_success!=noreg);
1711   bool preset_result_reg = (int_flag_success!=dest_current_value && int_flag_success!=compare_value.register_or_noreg() &&
1712                             int_flag_success!=exchange_value && int_flag_success!=addr_base);
1713   assert(!weak || flag == CCR0, "weak only supported with CCR0");
1714   assert(int_flag_success == noreg || failed_ext == NULL, "cannot have both");
1715 
1716   if (use_result_reg && preset_result_reg) {
1717     li(int_flag_success, 0); // preset (assume cas failed)
1718   }
1719 
1720   // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1721   if (contention_hint) { // Don't try to reserve if cmp fails.
1722     ld(dest_current_value, 0, addr_base);
1723     cmpd(flag, compare_value, dest_current_value);
1724     bne(flag, failed);
1725   }
1726 
1727   // release/fence semantics
1728   if (semantics & MemBarRel) {
1729     release();
1730   }
1731 
1732   // atomic emulation loop
1733   bind(retry);
1734 
1735   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1736   cmpd(flag, compare_value, dest_current_value);
1737   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1738     bne_predict_not_taken(flag, failed);
1739   } else {
1740     bne(                  flag, failed);
1741   }
1742 
1743   stdcx_(exchange_value, addr_base);
1744   if (!weak || use_result_reg || failed_ext) {
1745     if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1746       bne_predict_not_taken(CCR0, weak ? failed : retry); // stXcx_ sets CCR0
1747     } else {
1748       bne(                  CCR0, weak ? failed : retry); // stXcx_ sets CCR0
1749     }
1750   }
1751 
1752   // result in register (must do this at the end because int_flag_success can be the same register as one above)
1753   if (use_result_reg) {
1754     li(int_flag_success, 1);
1755   }
1756 
1757   if (semantics & MemBarFenceAfter) {
1758     fence();
1759   } else if (semantics & MemBarAcq) {
1760     isync();
1761   }
1762 
1763   if (use_result_reg && !preset_result_reg) {
1764     b(done);
1765   }
1766 
1767   bind(failed_int);
1768   if (use_result_reg && !preset_result_reg) {
1769     li(int_flag_success, 0);
1770   }
1771 
1772   bind(done);
1773   // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1774   // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1775 }
1776 
1777 // Look up the method for a megamorphic invokeinterface call.
1778 // The target method is determined by <intf_klass, itable_index>.
1779 // The receiver klass is in recv_klass.
1780 // On success, the result will be in method_result, and execution falls through.
1781 // On failure, execution transfers to the given label.
1782 void MacroAssembler::lookup_interface_method(Register recv_klass,
1783                                              Register intf_klass,
1784                                              RegisterOrConstant itable_index,
1785                                              Register method_result,
1786                                              Register scan_temp,
1787                                              Register temp2,
1788                                              Label& L_no_such_interface,
1789                                              bool return_method) {
1790   assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);
1791 
1792   // Compute start of first itableOffsetEntry (which is at the end of the vtable).
1793   int vtable_base = in_bytes(Klass::vtable_start_offset());
1794   int itentry_off = itableMethodEntry::method_offset_in_bytes();
1795   int logMEsize   = exact_log2(itableMethodEntry::size() * wordSize);
1796   int scan_step   = itableOffsetEntry::size() * wordSize;
1797   int log_vte_size= exact_log2(vtableEntry::size_in_bytes());
1798 
1799   lwz(scan_temp, in_bytes(Klass::vtable_length_offset()), recv_klass);
1800   // %%% We should store the aligned, prescaled offset in the klassoop.
1801   // Then the next several instructions would fold away.
1802 
1803   sldi(scan_temp, scan_temp, log_vte_size);
1804   addi(scan_temp, scan_temp, vtable_base);
1805   add(scan_temp, recv_klass, scan_temp);
1806 
1807   // Adjust recv_klass by scaled itable_index, so we can free itable_index.
1808   if (return_method) {
1809     if (itable_index.is_register()) {
1810       Register itable_offset = itable_index.as_register();
1811       sldi(method_result, itable_offset, logMEsize);
1812       if (itentry_off) { addi(method_result, method_result, itentry_off); }
1813       add(method_result, method_result, recv_klass);
1814     } else {
1815       long itable_offset = (long)itable_index.as_constant();
1816       // static address, no relocation
1817       add_const_optimized(method_result, recv_klass, (itable_offset << logMEsize) + itentry_off, temp2);
1818     }
1819   }
1820 
1821   // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
1822   //   if (scan->interface() == intf) {
1823   //     result = (klass + scan->offset() + itable_index);
1824   //   }
1825   // }
1826   Label search, found_method;
1827 
1828   for (int peel = 1; peel >= 0; peel--) {
1829     // %%%% Could load both offset and interface in one ldx, if they were
1830     // in the opposite order. This would save a load.
1831     ld(temp2, itableOffsetEntry::interface_offset_in_bytes(), scan_temp);
1832 
1833     // Check that this entry is non-null. A null entry means that
1834     // the receiver class doesn't implement the interface, and wasn't the
1835     // same as when the caller was compiled.
1836     cmpd(CCR0, temp2, intf_klass);
1837 
1838     if (peel) {
1839       beq(CCR0, found_method);
1840     } else {
1841       bne(CCR0, search);
1842       // (invert the test to fall through to found_method...)
1843     }
1844 
1845     if (!peel) break;
1846 
1847     bind(search);
1848 
1849     cmpdi(CCR0, temp2, 0);
1850     beq(CCR0, L_no_such_interface);
1851     addi(scan_temp, scan_temp, scan_step);
1852   }
1853 
1854   bind(found_method);
1855 
1856   // Got a hit.
1857   if (return_method) {
1858     int ito_offset = itableOffsetEntry::offset_offset_in_bytes();
1859     lwz(scan_temp, ito_offset, scan_temp);
1860     ldx(method_result, scan_temp, method_result);
1861   }
1862 }
1863 
1864 // virtual method calling
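     // Roughly (hedged sketch): R19_method = recv_klass->vtable()[vtable_index].method();
     // Note that recv_klass is clobbered by the address computation below.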
1865 void MacroAssembler::lookup_virtual_method(Register recv_klass,
1866                                            RegisterOrConstant vtable_index,
1867                                            Register method_result) {
1868 
1869   assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg());
1870 
1871   const int base = in_bytes(Klass::vtable_start_offset());
1872   assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
1873 
1874   if (vtable_index.is_register()) {
1875     sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord);
1876     add(recv_klass, vtable_index.as_register(), recv_klass);
1877   } else {
1878     addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord);
1879   }
1880   ld(R19_method, base + vtableEntry::method_offset_in_bytes(), recv_klass);
1881 }
1882 
1883 /////////////////////////////////////////// subtype checking ////////////////////////////////////////////
1884 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
1885                                                    Register super_klass,
1886                                                    Register temp1_reg,
1887                                                    Register temp2_reg,
1888                                                    Label* L_success,
1889                                                    Label* L_failure,
1890                                                    Label* L_slow_path,
1891                                                    RegisterOrConstant super_check_offset) {
1892 
1893   const Register check_cache_offset = temp1_reg;
1894   const Register cached_super       = temp2_reg;
1895 
1896   assert_different_registers(sub_klass, super_klass, check_cache_offset, cached_super);
1897 
1898   int sco_offset = in_bytes(Klass::super_check_offset_offset());
1899   int sc_offset  = in_bytes(Klass::secondary_super_cache_offset());
1900 
1901   bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
1902   bool need_slow_path = (must_load_sco || super_check_offset.constant_or_zero() == sco_offset);
1903 
1904   Label L_fallthrough;
1905   int label_nulls = 0;
1906   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
1907   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
1908   if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
1909   assert(label_nulls <= 1 ||
1910          (L_slow_path == &L_fallthrough && label_nulls <= 2 && !need_slow_path),
1911          "at most one NULL in the batch, usually");
1912 
1913   // If the pointers are equal, we are done (e.g., String[] elements).
1914   // This self-check enables sharing of secondary supertype arrays among
1915   // non-primary types such as array-of-interface. Otherwise, each such
1916   // type would need its own customized secondary supers array (SSA).
1917   // We move this check to the front of the fast path because many
1918   // type checks are in fact trivially successful in this manner,
1919   // so we get a nicely predicted branch right at the start of the check.
1920   cmpd(CCR0, sub_klass, super_klass);
1921   beq(CCR0, *L_success);
1922 
1923   // Check the supertype display:
1924   if (must_load_sco) {
1925     // The super check offset is always positive...
1926     lwz(check_cache_offset, sco_offset, super_klass);
1927     super_check_offset = RegisterOrConstant(check_cache_offset);
1928     // super_check_offset is register.
1929     assert_different_registers(sub_klass, super_klass, cached_super, super_check_offset.as_register());
1930   }
1931   // The loaded value is the offset from KlassOopDesc.
1932 
1933   ld(cached_super, super_check_offset, sub_klass);
1934   cmpd(CCR0, cached_super, super_klass);
1935 
1936   // This check has worked decisively for primary supers.
1937   // Secondary supers are sought in the super_cache ('super_cache_addr').
1938   // (Secondary supers are interfaces and very deeply nested subtypes.)
1939   // This works in the same check above because of a tricky aliasing
1940   // between the super_cache and the primary super display elements.
1941   // (The 'super_check_addr' can address either, as the case requires.)
1942   // Note that the cache is updated below if it does not help us find
1943   // what we need immediately.
1944   // So if it was a primary super, we can just fail immediately.
1945   // Otherwise, it's the slow path for us (no success at this point).
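       // In pseudo code, the decision below is roughly (hedged sketch):
       //   if (*(sub_klass + super_check_offset) == super_klass)        goto L_success;
       //   else if (super_check_offset != secondary_super_cache_offset) goto L_failure;   // primary display miss
       //   else                                                         goto L_slow_path; // must scan secondary supers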
1946 
1947 #define FINAL_JUMP(label) if (&(label) != &L_fallthrough) { b(label); }
1948 
1949   if (super_check_offset.is_register()) {
1950     beq(CCR0, *L_success);
1951     cmpwi(CCR0, super_check_offset.as_register(), sc_offset);
1952     if (L_failure == &L_fallthrough) {
1953       beq(CCR0, *L_slow_path);
1954     } else {
1955       bne(CCR0, *L_failure);
1956       FINAL_JUMP(*L_slow_path);
1957     }
1958   } else {
1959     if (super_check_offset.as_constant() == sc_offset) {
1960       // Need a slow path; fast failure is impossible.
1961       if (L_slow_path == &L_fallthrough) {
1962         beq(CCR0, *L_success);
1963       } else {
1964         bne(CCR0, *L_slow_path);
1965         FINAL_JUMP(*L_success);
1966       }
1967     } else {
1968       // No slow path; it's a fast decision.
1969       if (L_failure == &L_fallthrough) {
1970         beq(CCR0, *L_success);
1971       } else {
1972         bne(CCR0, *L_failure);
1973         FINAL_JUMP(*L_success);
1974       }
1975     }
1976   }
1977 
1978   bind(L_fallthrough);
1979 #undef FINAL_JUMP
1980 }
1981 
1982 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
1983                                                    Register super_klass,
1984                                                    Register temp1_reg,
1985                                                    Register temp2_reg,
1986                                                    Label* L_success,
1987                                                    Register result_reg) {
1988   const Register array_ptr = temp1_reg; // current value from cache array
1989   const Register temp      = temp2_reg;
1990 
1991   assert_different_registers(sub_klass, super_klass, array_ptr, temp);
1992 
1993   int source_offset = in_bytes(Klass::secondary_supers_offset());
1994   int target_offset = in_bytes(Klass::secondary_super_cache_offset());
1995 
1996   int length_offset = Array<Klass*>::length_offset_in_bytes();
1997   int base_offset   = Array<Klass*>::base_offset_in_bytes();
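       // The emitted scan corresponds roughly to (hedged sketch):
       //   Array<Klass*>* ss = sub_klass->secondary_supers();
       //   for (int i = 0; i < ss->length(); i++) {
       //     if (ss->at(i) == super_klass) {
       //       sub_klass->set_secondary_super_cache(super_klass);  // hit: cache and succeed
       //     }
       //   }
       //   // falling out of the loop means a miss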
1998 
1999   Label hit, loop, failure, fallthru;
2000 
2001   ld(array_ptr, source_offset, sub_klass);
2002 
2003   // TODO: PPC port: assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated.");
2004   lwz(temp, length_offset, array_ptr);
2005   cmpwi(CCR0, temp, 0);
2006   beq(CCR0, result_reg!=noreg ? failure : fallthru); // length 0
2007 
2008   mtctr(temp); // load ctr
2009 
2010   bind(loop);
2011   // The entries in the table are Klass* and are no longer compressed oops.
2012   ld(temp, base_offset, array_ptr);
2013   cmpd(CCR0, temp, super_klass);
2014   beq(CCR0, hit);
2015   addi(array_ptr, array_ptr, BytesPerWord);
2016   bdnz(loop);
2017 
2018   bind(failure);
2019   if (result_reg!=noreg) li(result_reg, 1); // load non-zero result (indicates a miss)
2020   b(fallthru);
2021 
2022   bind(hit);
2023   std(super_klass, target_offset, sub_klass); // save result to cache
2024   if (result_reg != noreg) { li(result_reg, 0); } // load zero result (indicates a hit)
2025   if (L_success != NULL) { b(*L_success); }
2026   else if (result_reg == noreg) { blr(); } // return with CR0.eq if neither label nor result reg provided
2027 
2028   bind(fallthru);
2029 }
2030 
2031 // Try fast path, then go to slow one if not successful
2032 void MacroAssembler::check_klass_subtype(Register sub_klass,
2033                          Register super_klass,
2034                          Register temp1_reg,
2035                          Register temp2_reg,
2036                          Label& L_success) {
2037   Label L_failure;
2038   check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success, &L_failure);
2039   check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success);
2040   bind(L_failure); // Fallthru if not successful.
2041 }
2042 
2043 void MacroAssembler::check_method_handle_type(Register mtype_reg, Register mh_reg,
2044                                               Register temp_reg,
2045                                               Label& wrong_method_type) {
2046   assert_different_registers(mtype_reg, mh_reg, temp_reg);
2047   // Compare method type against that of the receiver.
2048   load_heap_oop_not_null(temp_reg, delayed_value(java_lang_invoke_MethodHandle::type_offset_in_bytes, temp_reg), mh_reg);
2049   cmpd(CCR0, temp_reg, mtype_reg);
2050   bne(CCR0, wrong_method_type);
2051 }
2052 
2053 RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot,
2054                                                    Register temp_reg,
2055                                                    int extra_slot_offset) {
2056   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
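       // I.e. (hedged): returns (arg_slot + extra_slot_offset) * Interpreter::stackElementSize,
       // either as a constant or computed into temp_reg.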
2057   int stackElementSize = Interpreter::stackElementSize;
2058   int offset = extra_slot_offset * stackElementSize;
2059   if (arg_slot.is_constant()) {
2060     offset += arg_slot.as_constant() * stackElementSize;
2061     return offset;
2062   } else {
2063     assert(temp_reg != noreg, "must specify");
2064     sldi(temp_reg, arg_slot.as_register(), exact_log2(stackElementSize));
2065     if (offset != 0)
2066       addi(temp_reg, temp_reg, offset);
2067     return temp_reg;
2068   }
2069 }
2070 
2071 // Supports temp2_reg = R0.
2072 void MacroAssembler::biased_locking_enter(ConditionRegister cr_reg, Register obj_reg,
2073                                           Register mark_reg, Register temp_reg,
2074                                           Register temp2_reg, Label& done, Label* slow_case) {
2075   assert(UseBiasedLocking, "why call this otherwise?");
2076 
2077 #ifdef ASSERT
2078   assert_different_registers(obj_reg, mark_reg, temp_reg, temp2_reg);
2079 #endif
2080 
2081   Label cas_label;
2082 
2083   // Branch to done if fast path fails and no slow_case provided.
2084   Label *slow_case_int = (slow_case != NULL) ? slow_case : &done;
2085 
2086   // Biased locking
2087   // See whether the lock is currently biased toward our thread and
2088   // whether the epoch is still valid
2089   // Note that the runtime guarantees sufficient alignment of JavaThread
2090   // pointers to allow age to be placed into low bits
2091   assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits,
2092          "biased locking makes assumptions about bit layout");
2093 
2094   if (PrintBiasedLockingStatistics) {
2095     load_const(temp2_reg, (address) BiasedLocking::total_entry_count_addr(), temp_reg);
2096     lwzx(temp_reg, temp2_reg);
2097     addi(temp_reg, temp_reg, 1);
2098     stwx(temp_reg, temp2_reg);
2099   }
2100 
2101   andi(temp_reg, mark_reg, markOopDesc::biased_lock_mask_in_place);
2102   cmpwi(cr_reg, temp_reg, markOopDesc::biased_lock_pattern);
2103   bne(cr_reg, cas_label);
2104 
2105   load_klass(temp_reg, obj_reg);
2106 
2107   load_const_optimized(temp2_reg, ~((int) markOopDesc::age_mask_in_place));
2108   ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
2109   orr(temp_reg, R16_thread, temp_reg);
2110   xorr(temp_reg, mark_reg, temp_reg);
2111   andr(temp_reg, temp_reg, temp2_reg);
2112   cmpdi(cr_reg, temp_reg, 0);
2113   if (PrintBiasedLockingStatistics) {
2114     Label l;
2115     bne(cr_reg, l);
2116     load_const(temp2_reg, (address) BiasedLocking::biased_lock_entry_count_addr());
2117     lwzx(mark_reg, temp2_reg);
2118     addi(mark_reg, mark_reg, 1);
2119     stwx(mark_reg, temp2_reg);
2120     // restore mark_reg
2121     ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg);
2122     bind(l);
2123   }
2124   beq(cr_reg, done);
2125 
2126   Label try_revoke_bias;
2127   Label try_rebias;
2128 
2129   // At this point we know that the header has the bias pattern and
2130   // that we are not the bias owner in the current epoch. We need to
2131   // figure out more details about the state of the header in order to
2132   // know what operations can be legally performed on the object's
2133   // header.
2134 
2135   // If the low three bits in the xor result aren't clear, that means
2136   // the prototype header is no longer biased and we have to revoke
2137   // the bias on this object.
2138   andi(temp2_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
2139   cmpwi(cr_reg, temp2_reg, 0);
2140   bne(cr_reg, try_revoke_bias);
2141 
2142   // Biasing is still enabled for this data type. See whether the
2143   // epoch of the current bias is still valid, meaning that the epoch
2144   // bits of the mark word are equal to the epoch bits of the
2145   // prototype header. (Note that the prototype header's epoch bits
2146   // only change at a safepoint.) If not, attempt to rebias the object
2147   // toward the current thread. Note that we must be absolutely sure
2148   // that the current epoch is invalid in order to do this because
2149   // otherwise the manipulations it performs on the mark word are
2150   // illegal.
2151 
2152   int shift_amount = 64 - markOopDesc::epoch_shift;
2153   // rotate epoch bits to right (little) end and set other bits to 0
2154   // [ big part | epoch | little part ] -> [ 0..0 | epoch ]
2155   rldicl_(temp2_reg, temp_reg, shift_amount, 64 - markOopDesc::epoch_bits);
2156   // branch if epoch bits are != 0, i.e. they differ, because the epoch has been incremented
2157   bne(CCR0, try_rebias);
2158 
2159   // The epoch of the current bias is still valid but we know nothing
2160   // about the owner; it might be set or it might be clear. Try to
2161   // acquire the bias of the object using an atomic operation. If this
2162   // fails we will go in to the runtime to revoke the object's bias.
2163   // Note that we first construct the presumed unbiased header so we
2164   // don't accidentally blow away another thread's valid bias.
2165   andi(mark_reg, mark_reg, (markOopDesc::biased_lock_mask_in_place |
2166                                 markOopDesc::age_mask_in_place |
2167                                 markOopDesc::epoch_mask_in_place));
2168   orr(temp_reg, R16_thread, mark_reg);
2169 
2170   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2171 
2172   // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg).
2173   cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
2174            /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
2175            /*where=*/obj_reg,
2176            MacroAssembler::MemBarAcq,
2177            MacroAssembler::cmpxchgx_hint_acquire_lock(),
2178            noreg, slow_case_int); // bail out if failed
2179 
2180   // If the biasing toward our thread failed, this means that
2181   // another thread succeeded in biasing it toward itself and we
2182   // need to revoke that bias. The revocation will occur in the
2183   // interpreter runtime in the slow case.
2184   if (PrintBiasedLockingStatistics) {
2185     load_const(temp2_reg, (address) BiasedLocking::anonymously_biased_lock_entry_count_addr(), temp_reg);
2186     lwzx(temp_reg, temp2_reg);
2187     addi(temp_reg, temp_reg, 1);
2188     stwx(temp_reg, temp2_reg);
2189   }
2190   b(done);
2191 
2192   bind(try_rebias);
2193   // At this point we know the epoch has expired, meaning that the
2194   // current "bias owner", if any, is actually invalid. Under these
2195   // circumstances _only_, we are allowed to use the current header's
2196   // value as the comparison value when doing the cas to acquire the
2197   // bias in the current epoch. In other words, we allow transfer of
2198   // the bias from one thread to another directly in this situation.
2199   load_klass(temp_reg, obj_reg);
2200   andi(temp2_reg, mark_reg, markOopDesc::age_mask_in_place);
2201   orr(temp2_reg, R16_thread, temp2_reg);
2202   ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
2203   orr(temp_reg, temp2_reg, temp_reg);
2204 
2205   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2206 
2207   cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
2208                  /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
2209                  /*where=*/obj_reg,
2210                  MacroAssembler::MemBarAcq,
2211                  MacroAssembler::cmpxchgx_hint_acquire_lock(),
2212                  noreg, slow_case_int); // bail out if failed
2213 
2214   // If the biasing toward our thread failed, this means that
2215   // another thread succeeded in biasing it toward itself and we
2216   // need to revoke that bias. The revocation will occur in the
2217   // interpreter runtime in the slow case.
2218   if (PrintBiasedLockingStatistics) {
2219     load_const(temp2_reg, (address) BiasedLocking::rebiased_lock_entry_count_addr(), temp_reg);
2220     lwzx(temp_reg, temp2_reg);
2221     addi(temp_reg, temp_reg, 1);
2222     stwx(temp_reg, temp2_reg);
2223   }
2224   b(done);
2225 
2226   bind(try_revoke_bias);
2227   // The prototype mark in the klass doesn't have the bias bit set any
2228   // more, indicating that objects of this data type are not supposed
2229   // to be biased any more. We are going to try to reset the mark of
2230   // this object to the prototype value and fall through to the
2231   // CAS-based locking scheme. Note that if our CAS fails, it means
2232   // that another thread raced us for the privilege of revoking the
2233   // bias of this particular object, so it's okay to continue in the
2234   // normal locking code.
2235   load_klass(temp_reg, obj_reg);
2236   ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
2237   andi(temp2_reg, mark_reg, markOopDesc::age_mask_in_place);
2238   orr(temp_reg, temp_reg, temp2_reg);
2239 
2240   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2241 
2242   // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg).
2243   cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
2244                  /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
2245                  /*where=*/obj_reg,
2246                  MacroAssembler::MemBarAcq,
2247                  MacroAssembler::cmpxchgx_hint_acquire_lock());
2248 
2249   // reload markOop in mark_reg before continuing with lightweight locking
2250   ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg);
2251 
2252   // Fall through to the normal CAS-based lock, because no matter what
2253   // the result of the above CAS, some thread must have succeeded in
2254   // removing the bias bit from the object's header.
2255   if (PrintBiasedLockingStatistics) {
2256     Label l;
2257     bne(cr_reg, l);
2258     load_const(temp2_reg, (address) BiasedLocking::revoked_lock_entry_count_addr(), temp_reg);
2259     lwzx(temp_reg, temp2_reg);
2260     addi(temp_reg, temp_reg, 1);
2261     stwx(temp_reg, temp2_reg);
2262     bind(l);
2263   }
2264 
2265   bind(cas_label);
2266 }
2267 
2268 void MacroAssembler::biased_locking_exit (ConditionRegister cr_reg, Register mark_addr, Register temp_reg, Label& done) {
2269   // Check for biased locking unlock case, which is a no-op
2270   // Note: we do not have to check the thread ID for two reasons.
2271   // First, the interpreter checks for IllegalMonitorStateException at
2272   // a higher level. Second, if the bias was revoked while we held the
2273   // lock, the object could not be rebiased toward another thread, so
2274   // the bias bit would be clear.
2275 
2276   ld(temp_reg, 0, mark_addr);
2277   andi(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
2278 
2279   cmpwi(cr_reg, temp_reg, markOopDesc::biased_lock_pattern);
2280   beq(cr_reg, done);
2281 }
2282 
2283 // allocation (for C1)
2284 void MacroAssembler::eden_allocate(
2285   Register obj,                      // result: pointer to object after successful allocation
2286   Register var_size_in_bytes,        // object size in bytes if unknown at compile time; invalid otherwise
2287   int      con_size_in_bytes,        // object size in bytes if   known at compile time
2288   Register t1,                       // temp register
2289   Register t2,                       // temp register
2290   Label&   slow_case                 // continuation point if fast allocation fails
2291 ) {
2292   b(slow_case);
2293 }
2294 
2295 void MacroAssembler::tlab_allocate(
2296   Register obj,                      // result: pointer to object after successful allocation
2297   Register var_size_in_bytes,        // object size in bytes if unknown at compile time; invalid otherwise
2298   int      con_size_in_bytes,        // object size in bytes if   known at compile time
2299   Register t1,                       // temp register
2300   Label&   slow_case                 // continuation point if fast allocation fails
2301 ) {
2302   // make sure arguments make sense
2303   assert_different_registers(obj, var_size_in_bytes, t1);
2304   assert(0 <= con_size_in_bytes && is_simm13(con_size_in_bytes), "illegal object size");
2305   assert((con_size_in_bytes & MinObjAlignmentInBytesMask) == 0, "object size is not multiple of alignment");
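       // The fast path below corresponds roughly to (hedged sketch):
       //   HeapWord* obj     = thread->tlab().top();
       //   HeapWord* new_top = obj + size_in_bytes;
       //   if (new_top > thread->tlab().end()) goto slow_case;
       //   thread->tlab().set_top(new_top);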
2306 
2307   const Register new_top = t1;
2308   //verify_tlab(); not implemented
2309 
2310   ld(obj, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
2311   ld(R0, in_bytes(JavaThread::tlab_end_offset()), R16_thread);
2312   if (var_size_in_bytes == noreg) {
2313     addi(new_top, obj, con_size_in_bytes);
2314   } else {
2315     add(new_top, obj, var_size_in_bytes);
2316   }
2317   cmpld(CCR0, new_top, R0);
2318   bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_case);
2319 
2320 #ifdef ASSERT
2321   // make sure new free pointer is properly aligned
2322   {
2323     Label L;
2324     andi_(R0, new_top, MinObjAlignmentInBytesMask);
2325     beq(CCR0, L);
2326     stop("updated TLAB free is not properly aligned", 0x934);
2327     bind(L);
2328   }
2329 #endif // ASSERT
2330 
2331   // update the tlab top pointer
2332   std(new_top, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
2333   //verify_tlab(); not implemented
2334 }
2335 void MacroAssembler::incr_allocated_bytes(RegisterOrConstant size_in_bytes, Register t1, Register t2) {
2336   unimplemented("incr_allocated_bytes");
2337 }
2338 
2339 address MacroAssembler::emit_trampoline_stub(int destination_toc_offset,
2340                                              int insts_call_instruction_offset, Register Rtoc) {
2341   // Start the stub.
2342   address stub = start_a_stub(64);
2343   if (stub == NULL) { return NULL; } // CodeCache full: bail out
2344 
2345   // Create a trampoline stub relocation which relates this trampoline stub
2346   // with the call instruction at insts_call_instruction_offset in the
2347   // instructions code-section.
2348   relocate(trampoline_stub_Relocation::spec(code()->insts()->start() + insts_call_instruction_offset));
2349   const int stub_start_offset = offset();
2350 
2351   // For java_to_interp stubs we use R11_scratch1 as scratch register
2352   // and in call trampoline stubs we use R12_scratch2. This way we
2353   // can distinguish them (see is_NativeCallTrampolineStub_at()).
2354   Register reg_scratch = R12_scratch2;
2355 
2356   // Now, create the trampoline stub's code:
2357   // - load the TOC
2358   // - load the call target from the constant pool
2359   // - call
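       // The emitted stub is roughly (hedged sketch):
       //     calculate_address_from_global_toc(R12, method_toc())  // only if Rtoc was not passed in
       //     ld    R12, destination_toc_offset(Rtoc)               // load call target from constant pool
       //     mtctr R12
       //     bctr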
2360   if (Rtoc == noreg) {
2361     calculate_address_from_global_toc(reg_scratch, method_toc());
2362     Rtoc = reg_scratch;
2363   }
2364 
2365   ld_largeoffset_unchecked(reg_scratch, destination_toc_offset, Rtoc, false);
2366   mtctr(reg_scratch);
2367   bctr();
2368 
2369   const address stub_start_addr = addr_at(stub_start_offset);
2370 
2371   // Assert that the encoded destination_toc_offset can be identified and that it is correct.
2372   assert(destination_toc_offset == NativeCallTrampolineStub_at(stub_start_addr)->destination_toc_offset(),
2373          "encoded offset into the constant pool must match");
2374   // The stub must not exceed trampoline_stub_size.
2375   assert((uint)(offset() - stub_start_offset) <= trampoline_stub_size, "should be good size");
2376   assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");
2377 
2378   // End the stub.
2379   end_a_stub();
2380   return stub;
2381 }
2382 
2383 // TM on PPC64.
2384 void MacroAssembler::atomic_inc_ptr(Register addr, Register result, int simm16) {
2385   Label retry;
2386   bind(retry);
2387   ldarx(result, addr, /*hint*/ false);
2388   addi(result, result, simm16);
2389   stdcx_(result, addr);
2390   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
2391     bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
2392   } else {
2393     bne(                  CCR0, retry); // stXcx_ sets CCR0
2394   }
2395 }
2396 
2397 void MacroAssembler::atomic_ori_int(Register addr, Register result, int uimm16) {
2398   Label retry;
2399   bind(retry);
2400   lwarx(result, addr, /*hint*/ false);
2401   ori(result, result, uimm16);
2402   stwcx_(result, addr);
2403   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
2404     bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
2405   } else {
2406     bne(                  CCR0, retry); // stXcx_ sets CCR0
2407   }
2408 }
2409 
2410 #if INCLUDE_RTM_OPT
2411 
2412 // Update rtm_counters based on abort status
2413 // input: abort_status
2414 //        rtm_counters (RTMLockingCounters*)
2415 void MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters_Reg) {
2416   // Mapping to keep PreciseRTMLockingStatistics similar to x86.
2417   // x86 ppc (! means inverted, ? means not the same)
2418   //  0   31  Set if abort caused by XABORT instruction.
2419   //  1  ! 7  If set, the transaction may succeed on a retry. This bit is always clear if bit 0 is set.
2420   //  2   13  Set if another logical processor conflicted with a memory address that was part of the transaction that aborted.
2421   //  3   10  Set if an internal buffer overflowed.
2422   //  4  ?12  Set if a debug breakpoint was hit.
2423   //  5  ?32  Set if an abort occurred during execution of a nested transaction.
2424   const  int tm_failure_bit[] = {Assembler::tm_tabort, // Note: Seems like signal handler sets this, too.
2425                                  Assembler::tm_failure_persistent, // inverted: transient
2426                                  Assembler::tm_trans_cf,
2427                                  Assembler::tm_footprint_of,
2428                                  Assembler::tm_non_trans_cf,
2429                                  Assembler::tm_suspended};
2430   const bool tm_failure_inv[] = {false, true, false, false, false, false};
2431   assert(sizeof(tm_failure_bit)/sizeof(int) == RTMLockingCounters::ABORT_STATUS_LIMIT, "adapt mapping!");
2432 
2433   const Register addr_Reg = R0;
2434   // Keep track of offset to where rtm_counters_Reg had pointed to.
2435   int counters_offs = RTMLockingCounters::abort_count_offset();
2436   addi(addr_Reg, rtm_counters_Reg, counters_offs);
2437   const Register temp_Reg = rtm_counters_Reg;
2438 
2439   //atomic_inc_ptr(addr_Reg, temp_Reg); We don't increment atomically
2440   ldx(temp_Reg, addr_Reg);
2441   addi(temp_Reg, temp_Reg, 1);
2442   stdx(temp_Reg, addr_Reg);
2443 
2444   if (PrintPreciseRTMLockingStatistics) {
2445     int counters_offs_delta = RTMLockingCounters::abortX_count_offset() - counters_offs;
2446 
2447     //mftexasr(abort_status); done by caller
2448     for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
2449       counters_offs += counters_offs_delta;
2450       li(temp_Reg, counters_offs_delta); // can't use addi with R0
2451       add(addr_Reg, addr_Reg, temp_Reg); // point to next counter
2452       counters_offs_delta = sizeof(uintx);
2453 
2454       Label check_abort;
2455       rldicr_(temp_Reg, abort_status, tm_failure_bit[i], 0);
2456       if (tm_failure_inv[i]) {
2457         bne(CCR0, check_abort);
2458       } else {
2459         beq(CCR0, check_abort);
2460       }
2461       //atomic_inc_ptr(addr_Reg, temp_Reg); We don't increment atomically
2462       ldx(temp_Reg, addr_Reg);
2463       addi(temp_Reg, temp_Reg, 1);
2464       stdx(temp_Reg, addr_Reg);
2465       bind(check_abort);
2466     }
2467   }
2468   li(temp_Reg, -counters_offs); // can't use addi with R0
2469   add(rtm_counters_Reg, addr_Reg, temp_Reg); // restore
2470 }
2471 
2472 // Branch if (random & (count-1) != 0), count is 2^n
2473 // tmp and CR0 are killed
2474 void MacroAssembler::branch_on_random_using_tb(Register tmp, int count, Label& brLabel) {
2475   mftb(tmp);
2476   andi_(tmp, tmp, count-1);
2477   bne(CCR0, brLabel);
2478 }
2479 
2480 // Perform abort ratio calculation, set no_rtm bit if high ratio.
2481 // input:  rtm_counters_Reg (RTMLockingCounters* address) - KILLED
2482 void MacroAssembler::rtm_abort_ratio_calculation(Register rtm_counters_Reg,
2483                                                  RTMLockingCounters* rtm_counters,
2484                                                  Metadata* method_data) {
2485   Label L_done, L_check_always_rtm1, L_check_always_rtm2;
2486 
2487   if (RTMLockingCalculationDelay > 0) {
2488     // Delay calculation.
2489     ld(rtm_counters_Reg, (RegisterOrConstant)(intptr_t)RTMLockingCounters::rtm_calculation_flag_addr());
2490     cmpdi(CCR0, rtm_counters_Reg, 0);
2491     beq(CCR0, L_done);
2492     load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload
2493   }
2494   // Abort ratio calculation only if abort_count >= RTMAbortThreshold.
2495   //   Aborted transactions = abort_count * 100
2496   //   All transactions = total_count *  RTMTotalCountIncrRate
2497   //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
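       // As C-like pseudo code (hedged sketch of the code below; set_rtm_state stands for
       // or-ing the state bits into the MDO's rtm_state, done only if method_data != NULL):
       //   if (abort_count >= RTMAbortThreshold &&
       //       abort_count * 100 >= total_count * RTMTotalCountIncrRate * RTMAbortRatio) {
       //     mdo->set_rtm_state(NoRTM);
       //   } else if (total_count >= RTMLockingThreshold / RTMTotalCountIncrRate) {
       //     mdo->set_rtm_state(UseRTM);
       //   }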
2498   ld(R0, RTMLockingCounters::abort_count_offset(), rtm_counters_Reg);
2499   if (is_simm(RTMAbortThreshold, 16)) {   // cmpdi can handle 16bit immediate only.
2500     cmpdi(CCR0, R0, RTMAbortThreshold);
2501     blt(CCR0, L_check_always_rtm2);  // reload of rtm_counters_Reg not necessary
2502   } else {
2503     load_const_optimized(rtm_counters_Reg, RTMAbortThreshold);
2504     cmpd(CCR0, R0, rtm_counters_Reg);
2505     blt(CCR0, L_check_always_rtm1);  // reload of rtm_counters_Reg required
2506   }
2507   mulli(R0, R0, 100);
2508 
2509   const Register tmpReg = rtm_counters_Reg;
2510   ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg);
2511   mulli(tmpReg, tmpReg, RTMTotalCountIncrRate); // allowable range: int16
2512   mulli(tmpReg, tmpReg, RTMAbortRatio);         // allowable range: int16
2513   cmpd(CCR0, R0, tmpReg);
2514   blt(CCR0, L_check_always_rtm1); // jump to reload
2515   if (method_data != NULL) {
2516     // Set rtm_state to "no rtm" in MDO.
2517     // Not using a metadata relocation. Method and Class Loader are kept alive anyway.
2518     // (See nmethod::metadata_do and CodeBuffer::finalize_oop_references.)
2519     load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg);
2520     atomic_ori_int(R0, tmpReg, NoRTM);
2521   }
2522   b(L_done);
2523 
2524   bind(L_check_always_rtm1);
2525   load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload
2526   bind(L_check_always_rtm2);
2527   ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg);
2528   int64_t thresholdValue = RTMLockingThreshold / RTMTotalCountIncrRate;
2529   if (is_simm(thresholdValue, 16)) {   // cmpdi can handle 16bit immediate only.
2530     cmpdi(CCR0, tmpReg, thresholdValue);
2531   } else {
2532     load_const_optimized(R0, thresholdValue);
2533     cmpd(CCR0, tmpReg, R0);
2534   }
2535   blt(CCR0, L_done);
2536   if (method_data != NULL) {
2537     // Set rtm_state to "always rtm" in MDO.
2538     // Not using a metadata relocation. See above.
2539     load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg);
2540     atomic_ori_int(R0, tmpReg, UseRTM);
2541   }
2542   bind(L_done);
2543 }
2544 
2545 // Update counters and perform abort ratio calculation.
2546 // input: abort_status_Reg
2547 void MacroAssembler::rtm_profiling(Register abort_status_Reg, Register temp_Reg,
2548                                    RTMLockingCounters* rtm_counters,
2549                                    Metadata* method_data,
2550                                    bool profile_rtm) {
2551 
2552   assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2553   // Update rtm counters based on state at abort.
2554   // Reads abort_status_Reg, updates flags.
2555   assert_different_registers(abort_status_Reg, temp_Reg);
2556   load_const_optimized(temp_Reg, (address)rtm_counters, R0);
2557   rtm_counters_update(abort_status_Reg, temp_Reg);
2558   if (profile_rtm) {
2559     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2560     rtm_abort_ratio_calculation(temp_Reg, rtm_counters, method_data);
2561   }
2562 }
2563 
2564 // Retry on abort if abort's status indicates non-persistent failure.
2565 // inputs: retry_count_Reg
2566 //       : abort_status_Reg
2567 // output: retry_count_Reg decremented by 1
2568 void MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg,
2569                                              Label& retryLabel, Label* checkRetry) {
2570   Label doneRetry;
2571   rldicr_(R0, abort_status_Reg, tm_failure_persistent, 0);
2572   bne(CCR0, doneRetry);
2573   if (checkRetry) { bind(*checkRetry); }
2574   addic_(retry_count_Reg, retry_count_Reg, -1);
2575   blt(CCR0, doneRetry);
2576   b(retryLabel);
2577   bind(doneRetry);
2578 }
2579 
2580 // Spin and retry if lock is busy.
2581 // inputs: owner_addr_Reg (monitor address)
2582 //       : retry_count_Reg
2583 // output: retry_count_Reg decremented by 1
2584 // CTR is killed
2585 void MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register owner_addr_Reg, Label& retryLabel) {
2586   Label SpinLoop, doneRetry, doRetry;
2587   addic_(retry_count_Reg, retry_count_Reg, -1);
2588   blt(CCR0, doneRetry);
2589 
2590   if (RTMSpinLoopCount > 1) {
2591     li(R0, RTMSpinLoopCount);
2592     mtctr(R0);
2593   }
2594 
2595   // low thread priority
2596   smt_prio_low();
2597   bind(SpinLoop);
2598 
2599   if (RTMSpinLoopCount > 1) {
2600     bdz(doRetry);
2601     ld(R0, 0, owner_addr_Reg);
2602     cmpdi(CCR0, R0, 0);
2603     bne(CCR0, SpinLoop);
2604   }
2605 
2606   bind(doRetry);
2607 
2608   // restore thread priority to default in userspace
2609 #ifdef LINUX
2610   smt_prio_medium_low();
2611 #else
2612   smt_prio_medium();
2613 #endif
2614 
2615   b(retryLabel);
2616 
2617   bind(doneRetry);
2618 }
2619 
2620 // Use RTM for normal stack locks.
2621 // Input: objReg (object to lock)
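     // Roughly (hedged sketch of the speculative path below):
     //   if (mark_word is an inflated monitor)  goto IsInflated;
     //   tbegin();                                             // start transaction
     //   if (transaction failed to start)       goto L_on_abort;
     //   if (obj->mark() is unlocked)           goto DONE_LABEL;  // lock elided inside the transaction
     //   tend() or tabort();                                      // object is locked: give up / retry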
2622 void MacroAssembler::rtm_stack_locking(ConditionRegister flag,
2623                                        Register obj, Register mark_word, Register tmp,
2624                                        Register retry_on_abort_count_Reg,
2625                                        RTMLockingCounters* stack_rtm_counters,
2626                                        Metadata* method_data, bool profile_rtm,
2627                                        Label& DONE_LABEL, Label& IsInflated) {
2628   assert(UseRTMForStackLocks, "why call this otherwise?");
2629   assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
2630   Label L_rtm_retry, L_decrement_retry, L_on_abort;
2631 
2632   if (RTMRetryCount > 0) {
2633     load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
2634     bind(L_rtm_retry);
2635   }
2636   andi_(R0, mark_word, markOopDesc::monitor_value);  // inflated vs stack-locked|neutral|biased
2637   bne(CCR0, IsInflated);
2638 
2639   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2640     Label L_noincrement;
2641     if (RTMTotalCountIncrRate > 1) {
2642       branch_on_random_using_tb(tmp, RTMTotalCountIncrRate, L_noincrement);
2643     }
2644     assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
2645     load_const_optimized(tmp, (address)stack_rtm_counters->total_count_addr(), R0);
2646     //atomic_inc_ptr(tmp, /*temp, will be reloaded*/mark_word); We don't increment atomically
2647     ldx(mark_word, tmp);
2648     addi(mark_word, mark_word, 1);
2649     stdx(mark_word, tmp);
2650     bind(L_noincrement);
2651   }
2652   tbegin_();
2653   beq(CCR0, L_on_abort);
2654   ld(mark_word, oopDesc::mark_offset_in_bytes(), obj);         // Reload in transaction, conflicts need to be tracked.
2655   andi(R0, mark_word, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
2656   cmpwi(flag, R0, markOopDesc::unlocked_value);                // bits = 001 unlocked
2657   beq(flag, DONE_LABEL);                                       // all done if unlocked
2658 
2659   if (UseRTMXendForLockBusy) {
2660     tend_();
2661     b(L_decrement_retry);
2662   } else {
2663     tabort_();
2664   }
2665   bind(L_on_abort);
2666   const Register abort_status_Reg = tmp;
2667   mftexasr(abort_status_Reg);
2668   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2669     rtm_profiling(abort_status_Reg, /*temp*/mark_word, stack_rtm_counters, method_data, profile_rtm);
2670   }
2671   ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // reload
2672   if (RTMRetryCount > 0) {
2673     // Retry on lock abort if abort status is not permanent.
2674     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry, &L_decrement_retry);
2675   } else {
2676     bind(L_decrement_retry);
2677   }
2678 }
2679 
2680 // Use RTM for inflating locks
2681 // inputs: obj       (object to lock)
2682 //         mark_word (current header - KILLED)
2683 //         boxReg    (on-stack box address (displaced header location) - KILLED)
2684 void MacroAssembler::rtm_inflated_locking(ConditionRegister flag,
2685                                           Register obj, Register mark_word, Register boxReg,
2686                                           Register retry_on_busy_count_Reg, Register retry_on_abort_count_Reg,
2687                                           RTMLockingCounters* rtm_counters,
2688                                           Metadata* method_data, bool profile_rtm,
2689                                           Label& DONE_LABEL) {
2690   assert(UseRTMLocking, "why call this otherwise?");
2691   Label L_rtm_retry, L_decrement_retry, L_on_abort;
2692   // Clean monitor_value bit to get valid pointer.
2693   int owner_offset = ObjectMonitor::owner_offset_in_bytes() - markOopDesc::monitor_value;
2694 
2695   // Store non-null, using boxReg instead of (intptr_t)markOopDesc::unused_mark().
2696   std(boxReg, BasicLock::displaced_header_offset_in_bytes(), boxReg);
2697   const Register tmpReg = boxReg;
2698   const Register owner_addr_Reg = mark_word;
2699   addi(owner_addr_Reg, mark_word, owner_offset);
2700 
2701   if (RTMRetryCount > 0) {
2702     load_const_optimized(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy.
2703     load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort.
2704     bind(L_rtm_retry);
2705   }
2706   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2707     Label L_noincrement;
2708     if (RTMTotalCountIncrRate > 1) {
2709       branch_on_random_using_tb(R0, RTMTotalCountIncrRate, L_noincrement);
2710     }
2711     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2712     load_const(R0, (address)rtm_counters->total_count_addr(), tmpReg);
2713     //atomic_inc_ptr(R0, tmpReg); We don't increment atomically
2714     ldx(tmpReg, R0);
2715     addi(tmpReg, tmpReg, 1);
2716     stdx(tmpReg, R0);
2717     bind(L_noincrement);
2718   }
2719   tbegin_();
2720   beq(CCR0, L_on_abort);
2721   // We don't reload mark word. Will only be reset at safepoint.
2722   ld(R0, 0, owner_addr_Reg); // Load in transaction, conflicts need to be tracked.
2723   cmpdi(flag, R0, 0);
2724   beq(flag, DONE_LABEL);
2725 
2726   if (UseRTMXendForLockBusy) {
2727     tend_();
2728     b(L_decrement_retry);
2729   } else {
2730     tabort_();
2731   }
2732   bind(L_on_abort);
2733   const Register abort_status_Reg = tmpReg;
2734   mftexasr(abort_status_Reg);
2735   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2736     rtm_profiling(abort_status_Reg, /*temp*/ owner_addr_Reg, rtm_counters, method_data, profile_rtm);
2737     // Restore owner_addr_Reg
2738     ld(mark_word, oopDesc::mark_offset_in_bytes(), obj);
2739 #ifdef ASSERT
2740     andi_(R0, mark_word, markOopDesc::monitor_value);
2741     asm_assert_ne("must be inflated", 0xa754); // Deflating only allowed at safepoint.
2742 #endif
2743     addi(owner_addr_Reg, mark_word, owner_offset);
2744   }
2745   if (RTMRetryCount > 0) {
2746     // Retry on lock abort if abort status is not permanent.
2747     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
2748   }
2749 
2750   // Appears unlocked - try to swing _owner from null to non-null.
2751   cmpxchgd(flag, /*current val*/ R0, (intptr_t)0, /*new val*/ R16_thread, owner_addr_Reg,
2752            MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2753            MacroAssembler::cmpxchgx_hint_acquire_lock(), noreg, &L_decrement_retry, true);
2754 
2755   if (RTMRetryCount > 0) {
2756     // success done else retry
2757     b(DONE_LABEL);
2758     bind(L_decrement_retry);
2759     // Spin and retry if lock is busy.
2760     rtm_retry_lock_on_busy(retry_on_busy_count_Reg, owner_addr_Reg, L_rtm_retry);
2761   } else {
2762     bind(L_decrement_retry);
2763   }
2764 }
2765 
2766 #endif //  INCLUDE_RTM_OPT
2767 
2768 // "The box" is the space on the stack where we copy the object mark.
2769 void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register oop, Register box,
2770                                                Register temp, Register displaced_header, Register current_header,
2771                                                bool try_bias,
2772                                                RTMLockingCounters* rtm_counters,
2773                                                RTMLockingCounters* stack_rtm_counters,
2774                                                Metadata* method_data,
2775                                                bool use_rtm, bool profile_rtm) {
2776   assert_different_registers(oop, box, temp, displaced_header, current_header);
2777   assert(flag != CCR0, "bad condition register");
2778   Label cont;
2779   Label object_has_monitor;
2780   Label cas_failed;
2781 
2782   // Load markOop from object into displaced_header.
2783   ld(displaced_header, oopDesc::mark_offset_in_bytes(), oop);
2784 
2785 
2786   // Always do locking in runtime.
2787   if (EmitSync & 0x01) {
2788     cmpdi(flag, oop, 0); // Oop can't be 0 here => always false.
2789     return;
2790   }
2791 
2792   if (try_bias) {
2793     biased_locking_enter(flag, oop, displaced_header, temp, current_header, cont);
2794   }
2795 
2796 #if INCLUDE_RTM_OPT
2797   if (UseRTMForStackLocks && use_rtm) {
2798     rtm_stack_locking(flag, oop, displaced_header, temp, /*temp*/ current_header,
2799                       stack_rtm_counters, method_data, profile_rtm,
2800                       cont, object_has_monitor);
2801   }
2802 #endif // INCLUDE_RTM_OPT
2803 
2804   // Handle existing monitor.
2805   if ((EmitSync & 0x02) == 0) {
2806     // The object has an existing monitor iff (mark & monitor_value) != 0.
2807     andi_(temp, displaced_header, markOopDesc::monitor_value);
2808     bne(CCR0, object_has_monitor);
2809   }
2810 
2811   // Set displaced_header to be (markOop of object | UNLOCK_VALUE).
2812   ori(displaced_header, displaced_header, markOopDesc::unlocked_value);
2813 
2814   // Load Compare Value application register.
2815 
2816   // Initialize the box. (Must happen before we update the object mark!)
2817   std(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2818 
2819   // Must fence, otherwise, preceding store(s) may float below cmpxchg.
2820   // Compare object markOop with mark and if equal exchange scratch1 with object markOop.
2821   cmpxchgd(/*flag=*/flag,
2822            /*current_value=*/current_header,
2823            /*compare_value=*/displaced_header,
2824            /*exchange_value=*/box,
2825            /*where=*/oop,
2826            MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2827            MacroAssembler::cmpxchgx_hint_acquire_lock(),
2828            noreg,
2829            &cas_failed,
2830            /*check without membar and ldarx first*/true);
2831   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2832 
2833   // If the compare-and-exchange succeeded, then we found an unlocked
2834   // object and we have now locked it.
2835   b(cont);
2836 
2837   bind(cas_failed);
2838   // We did not see an unlocked object so try the fast recursive case.
2839 
2840   // Check if the owner is self by comparing the value in the markOop of object
2841   // (current_header) with the stack pointer.
2842   sub(current_header, current_header, R1_SP);
2843   load_const_optimized(temp, ~(os::vm_page_size()-1) | markOopDesc::lock_mask_in_place);
2844 
2845   and_(R0/*==0?*/, current_header, temp);
2846   // If the condition is true we are done (cont) and hence we can store 0 as the
2847   // displaced header in the box, which indicates that it is a recursive lock.
2848   mcrf(flag,CCR0);
2849   std(R0/*==0, perhaps*/, BasicLock::displaced_header_offset_in_bytes(), box);
2850 
2851   // Handle existing monitor.
2852   if ((EmitSync & 0x02) == 0) {
2853     b(cont);
2854 
2855     bind(object_has_monitor);
2856     // The object's monitor m is unlocked iff m->owner == NULL,
2857     // otherwise m->owner may contain a thread or a stack address.
2858 
2859 #if INCLUDE_RTM_OPT
2860     // Use the same RTM locking code in 32- and 64-bit VM.
2861     if (use_rtm) {
2862       rtm_inflated_locking(flag, oop, displaced_header, box, temp, /*temp*/ current_header,
2863                            rtm_counters, method_data, profile_rtm, cont);
2864     } else {
2865 #endif // INCLUDE_RTM_OPT
2866 
2867     // Try to CAS m->owner from NULL to current thread.
2868     addi(temp, displaced_header, ObjectMonitor::owner_offset_in_bytes()-markOopDesc::monitor_value);
2869     cmpxchgd(/*flag=*/flag,
2870              /*current_value=*/current_header,
2871              /*compare_value=*/(intptr_t)0,
2872              /*exchange_value=*/R16_thread,
2873              /*where=*/temp,
2874              MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2875              MacroAssembler::cmpxchgx_hint_acquire_lock());
2876 
2877     // Store a non-null value into the box.
2878     std(box, BasicLock::displaced_header_offset_in_bytes(), box);
2879 
2880 #   ifdef ASSERT
2881     bne(flag, cont);
2882     // We have acquired the monitor, check some invariants.
2883     addi(/*monitor=*/temp, temp, -ObjectMonitor::owner_offset_in_bytes());
2884     // Invariant 1: _recursions should be 0.
2885     //assert(ObjectMonitor::recursions_size_in_bytes() == 8, "unexpected size");
2886     asm_assert_mem8_is_zero(ObjectMonitor::recursions_offset_in_bytes(), temp,
2887                             "monitor->_recursions should be 0", -1);
2888 #   endif
2889 
2890 #if INCLUDE_RTM_OPT
2891     } // use_rtm()
2892 #endif
2893   }
2894 
2895   bind(cont);
2896   // flag == EQ indicates success
2897   // flag == NE indicates failure
2898 }
2899 
2900 void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register oop, Register box,
2901                                                  Register temp, Register displaced_header, Register current_header,
2902                                                  bool try_bias, bool use_rtm) {
2903   assert_different_registers(oop, box, temp, displaced_header, current_header);
2904   assert(flag != CCR0, "bad condition register");
2905   Label cont;
2906   Label object_has_monitor;
2907 
2908   // Always do locking in runtime.
2909   if (EmitSync & 0x01) {
2910     cmpdi(flag, oop, 0); // Oop can't be 0 here => always false.
2911     return;
2912   }
2913 
2914   if (try_bias) {
2915     biased_locking_exit(flag, oop, current_header, cont);
2916   }
2917 
2918 #if INCLUDE_RTM_OPT
2919   if (UseRTMForStackLocks && use_rtm) {
2920     assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
2921     Label L_regular_unlock;
2922     ld(current_header, oopDesc::mark_offset_in_bytes(), oop);         // fetch markword
2923     andi(R0, current_header, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
2924     cmpwi(flag, R0, markOopDesc::unlocked_value);                     // bits = 001 unlocked
2925     bne(flag, L_regular_unlock);                                      // else RegularLock
2926     tend_();                                                          // otherwise end...
2927     b(cont);                                                          // ... and we're done
2928     bind(L_regular_unlock);
2929   }
2930 #endif
2931 
2932   // Find the lock address and load the displaced header from the stack.
2933   ld(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2934 
2935   // If the displaced header is 0, we have a recursive unlock.
2936   cmpdi(flag, displaced_header, 0);
2937   beq(flag, cont);
2938 
2939   // Handle existing monitor.
2940   if ((EmitSync & 0x02) == 0) {
2941     // The object has an existing monitor iff (mark & monitor_value) != 0.
2942     RTM_OPT_ONLY( if (!(UseRTMForStackLocks && use_rtm)) ) // skip load if already done
2943     ld(current_header, oopDesc::mark_offset_in_bytes(), oop);
2944     andi_(R0, current_header, markOopDesc::monitor_value);
2945     bne(CCR0, object_has_monitor);
2946   }
2947 
2948   // Check if it is still a lightweight lock: this is true if we see
2949   // the stack address of the basicLock in the markOop of the object.
2950   // Cmpxchg sets flag to cmpd(current_header, box).
2951   cmpxchgd(/*flag=*/flag,
2952            /*current_value=*/current_header,
2953            /*compare_value=*/box,
2954            /*exchange_value=*/displaced_header,
2955            /*where=*/oop,
2956            MacroAssembler::MemBarRel,
2957            MacroAssembler::cmpxchgx_hint_release_lock(),
2958            noreg,
2959            &cont);
2960 
2961   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2962 
2963   // Handle existing monitor.
2964   if ((EmitSync & 0x02) == 0) {
2965     b(cont);
2966 
2967     bind(object_has_monitor);
2968     addi(current_header, current_header, -markOopDesc::monitor_value); // monitor
2969     ld(temp,             ObjectMonitor::owner_offset_in_bytes(), current_header);
2970 
2971     // It's inflated.
2972 #if INCLUDE_RTM_OPT
2973     if (use_rtm) {
2974       Label L_regular_inflated_unlock;
2975       // Clean monitor_value bit to get valid pointer
2976       cmpdi(flag, temp, 0);
2977       bne(flag, L_regular_inflated_unlock);
2978       tend_();
2979       b(cont);
2980       bind(L_regular_inflated_unlock);
2981     }
2982 #endif
2983 
2984     ld(displaced_header, ObjectMonitor::recursions_offset_in_bytes(), current_header);
2985     xorr(temp, R16_thread, temp);      // Will be 0 if we are the owner.
2986     orr(temp, temp, displaced_header); // Will be 0 if there are 0 recursions.
2987     cmpdi(flag, temp, 0);
2988     bne(flag, cont);
2989 
2990     ld(temp,             ObjectMonitor::EntryList_offset_in_bytes(), current_header);
2991     ld(displaced_header, ObjectMonitor::cxq_offset_in_bytes(), current_header);
2992     orr(temp, temp, displaced_header); // Will be 0 if both are 0.
2993     cmpdi(flag, temp, 0);
2994     bne(flag, cont);
2995     release();
2996     std(temp, ObjectMonitor::owner_offset_in_bytes(), current_header);
2997   }
2998 
2999   bind(cont);
3000   // flag == EQ indicates success
3001   // flag == NE indicates failure
3002 }
3003 
3004 // Write serialization page so VM thread can do a pseudo remote membar.
3005 // We use the current thread pointer to calculate a thread specific
3006 // offset to write to within the page. This minimizes bus traffic
3007 // due to cache line collision.
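// Illustrative C sketch of the store performed below (comment only):
//   uintptr_t off = ((uintptr_t)thread >> os::get_serialize_page_shift_count())
//                   & (os::vm_page_size() - sizeof(int));
//   release();  // memory barrier
//   *(volatile int*)((char*)os::get_memory_serialize_page() + off) = <R0>;  // stored value is irrelevant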
3008 void MacroAssembler::serialize_memory(Register thread, Register tmp1, Register tmp2) {
3009   srdi(tmp2, thread, os::get_serialize_page_shift_count());
3010 
3011   int mask = os::vm_page_size() - sizeof(int);
3012   if (Assembler::is_simm(mask, 16)) {
3013     andi(tmp2, tmp2, mask);
3014   } else {
3015     lis(tmp1, (int)((signed short) (mask >> 16)));
3016     ori(tmp1, tmp1, mask & 0x0000ffff);
3017     andr(tmp2, tmp2, tmp1);
3018   }
3019 
3020   load_const(tmp1, (long) os::get_memory_serialize_page());
3021   release();
3022   stwx(R0, tmp1, tmp2);
3023 }
3024 
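// Branches to slow_path if a safepoint is pending: with thread-local polling, the
// armed polling word has poll_bit set; otherwise the global safepoint state is
// compared against _not_synchronized (descriptive comment).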
3025 void MacroAssembler::safepoint_poll(Label& slow_path, Register temp_reg) {
3026   if (SafepointMechanism::uses_thread_local_poll()) {
3027     ld(temp_reg, in_bytes(Thread::polling_page_offset()), R16_thread);
3028     // Armed page has poll_bit set.
3029     andi_(temp_reg, temp_reg, SafepointMechanism::poll_bit());
3030   } else {
3031     lwz(temp_reg, (RegisterOrConstant)(intptr_t)SafepointSynchronize::address_of_state());
3032     cmpwi(CCR0, temp_reg, SafepointSynchronize::_not_synchronized);
3033   }
3034   bne(CCR0, slow_path);
3035 }
3036 
3037 void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2, bool needs_frame) {
3038   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3039   bs->resolve_jobject(this, value, tmp1, tmp2, needs_frame);
3040 }
3041 
3042 // Values for last_Java_pc, and last_Java_sp must comply to the rules
3043 // in frame_ppc.hpp.
3044 void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) {
3045   // Always set last_Java_pc and flags first because once last_Java_sp
3046   // is visible, has_last_Java_frame is true and users will look at the
3047   // rest of the fields. (Note: flags should always be zero before we
3048   // get here, so they don't need to be set.)
3049 
3050   // Verify that last_Java_pc was zeroed on return to Java
3051   asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread,
3052                           "last_Java_pc not zeroed before leaving Java", 0x200);
3053 
3054   // When returning from calling out from Java mode the frame anchor's
3055   // last_Java_pc will always be set to NULL. It is set here so that
3056   // if we are doing a call to native (not VM) that we capture the
3057   // known pc and don't have to rely on the native call having a
3058   // standard frame linkage where we can find the pc.
3059   if (last_Java_pc != noreg)
3060     std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
3061 
3062   // Set last_Java_sp last.
3063   std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
3064 }
3065 
3066 void MacroAssembler::reset_last_Java_frame(void) {
3067   asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()),
3068                              R16_thread, "SP was not set, still zero", 0x202);
3069 
3070   BLOCK_COMMENT("reset_last_Java_frame {");
3071   li(R0, 0);
3072 
3073   // _last_Java_sp = 0
3074   std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
3075 
3076   // _last_Java_pc = 0
3077   std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
3078   BLOCK_COMMENT("} reset_last_Java_frame");
3079 }
3080 
3081 void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1) {
3082   assert_different_registers(sp, tmp1);
3083 
3084   // sp points to a TOP_IJAVA_FRAME, retrieve frame's PC via
3085   // TOP_IJAVA_FRAME_ABI.
3086   // FIXME: assert that we really have a TOP_IJAVA_FRAME here!
3087   address entry = pc();
3088   load_const_optimized(tmp1, entry);
3089 
3090   set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1);
3091 }
3092 
3093 void MacroAssembler::get_vm_result(Register oop_result) {
3094   // Read:
3095   //   R16_thread
3096   //   R16_thread->in_bytes(JavaThread::vm_result_offset())
3097   //
3098   // Updated:
3099   //   oop_result
3100   //   R16_thread->in_bytes(JavaThread::vm_result_offset())
3101 
3102   verify_thread();
3103 
3104   ld(oop_result, in_bytes(JavaThread::vm_result_offset()), R16_thread);
3105   li(R0, 0);
3106   std(R0, in_bytes(JavaThread::vm_result_offset()), R16_thread);
3107 
3108   verify_oop(oop_result);
3109 }
3110 
3111 void MacroAssembler::get_vm_result_2(Register metadata_result) {
3112   // Read:
3113   //   R16_thread
3114   //   R16_thread->in_bytes(JavaThread::vm_result_2_offset())
3115   //
3116   // Updated:
3117   //   metadata_result
3118   //   R16_thread->in_bytes(JavaThread::vm_result_2_offset())
3119 
3120   ld(metadata_result, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
3121   li(R0, 0);
3122   std(R0, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
3123 }
3124 
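// Compressed class pointer encoding (sketch): narrow = (klass - narrow_klass_base) >> narrow_klass_shift;
// the subtraction and/or shift below are skipped when base or shift is 0.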
3125 Register MacroAssembler::encode_klass_not_null(Register dst, Register src) {
3126   Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided.
3127   if (Universe::narrow_klass_base() != 0) {
3128     // Use dst as temp if it is free.
3129     sub_const_optimized(dst, current, Universe::narrow_klass_base(), R0);
3130     current = dst;
3131   }
3132   if (Universe::narrow_klass_shift() != 0) {
3133     srdi(dst, current, Universe::narrow_klass_shift());
3134     current = dst;
3135   }
3136   return current;
3137 }
3138 
3139 void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) {
3140   if (UseCompressedClassPointers) {
3141     Register compressedKlass = encode_klass_not_null(ck, klass);
3142     stw(compressedKlass, oopDesc::klass_offset_in_bytes(), dst_oop);
3143   } else {
3144     std(klass, oopDesc::klass_offset_in_bytes(), dst_oop);
3145   }
3146 }
3147 
3148 void MacroAssembler::store_klass_gap(Register dst_oop, Register val) {
3149   if (UseCompressedClassPointers) {
3150     if (val == noreg) {
3151       val = R0;
3152       li(val, 0);
3153     }
3154     stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop); // klass gap if compressed
3155   }
3156 }
3157 
3158 int MacroAssembler::instr_size_for_decode_klass_not_null() {
3159   if (!UseCompressedClassPointers) return 0;
3160   int num_instrs = 1;  // shift or move
3161   if (Universe::narrow_klass_base() != 0) num_instrs = 7;  // shift + load const + add
3162   return num_instrs * BytesPerInstWord;
3163 }
3164 
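// Inverse of the encoding above (sketch): klass = (narrow << narrow_klass_shift) + narrow_klass_base.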
3165 void MacroAssembler::decode_klass_not_null(Register dst, Register src) {
3166   assert(dst != R0, "Dst reg may not be R0, as R0 is used here.");
3167   if (src == noreg) src = dst;
3168   Register shifted_src = src;
3169   if (Universe::narrow_klass_shift() != 0 ||
3170       (Universe::narrow_klass_base() == 0 && src != dst)) {  // Move required.
3171     shifted_src = dst;
3172     sldi(shifted_src, src, Universe::narrow_klass_shift());
3173   }
3174   if (Universe::narrow_klass_base() != 0) {
3175     add_const_optimized(dst, shifted_src, Universe::narrow_klass_base(), R0);
3176   }
3177 }
3178 
3179 void MacroAssembler::load_klass(Register dst, Register src) {
3180   if (UseCompressedClassPointers) {
3181     lwz(dst, oopDesc::klass_offset_in_bytes(), src);
3182     // Attention: no null check here!
3183     decode_klass_not_null(dst, dst);
3184   } else {
3185     ld(dst, oopDesc::klass_offset_in_bytes(), src);
3186   }
3187 }
3188 
3189 // ((OopHandle)result).resolve();
3190 void MacroAssembler::resolve_oop_handle(Register result) {
3191   // OopHandle::resolve is an indirection.
3192   ld(result, 0, result);
3193 }
3194 
3195 void MacroAssembler::load_mirror_from_const_method(Register mirror, Register const_method) {
3196   ld(mirror, in_bytes(ConstMethod::constants_offset()), const_method);
3197   ld(mirror, ConstantPool::pool_holder_offset_in_bytes(), mirror);
3198   ld(mirror, in_bytes(Klass::java_mirror_offset()), mirror);
3199   resolve_oop_handle(mirror);
3200 }
3201 
3202 // Clear Array
3203 // For very short arrays. tmp == R0 is allowed.
3204 void MacroAssembler::clear_memory_unrolled(Register base_ptr, int cnt_dwords, Register tmp, int offset) {
3205   if (cnt_dwords > 0) { li(tmp, 0); }
3206   for (int i = 0; i < cnt_dwords; ++i) { std(tmp, offset + i * 8, base_ptr); }
3207 }
3208 
3209 // Version for constant short array length. Kills base_ptr. tmp == R0 is allowed.
3210 void MacroAssembler::clear_memory_constlen(Register base_ptr, int cnt_dwords, Register tmp) {
3211   if (cnt_dwords < 8) {
3212     clear_memory_unrolled(base_ptr, cnt_dwords, tmp);
3213     return;
3214   }
3215 
3216   Label loop;
3217   const long loopcnt   = cnt_dwords >> 1,
3218              remainder = cnt_dwords & 1;
3219 
3220   li(tmp, loopcnt);
3221   mtctr(tmp);
3222   li(tmp, 0);
3223   bind(loop);
3224     std(tmp, 0, base_ptr);
3225     std(tmp, 8, base_ptr);
3226     addi(base_ptr, base_ptr, 16);
3227     bdnz(loop);
3228   if (remainder) { std(tmp, 0, base_ptr); }
3229 }
3230 
3231 // Kills both input registers. tmp == R0 is allowed.
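// Algorithm sketch (comment only): clear single dwords until base_ptr is cache-line
// aligned, then clear whole cache lines with dcbz, then clear the remaining dwords.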
3232 void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp, long const_cnt) {
3233   // Procedure for large arrays (uses data cache block zero instruction).
3234     Label startloop, fast, fastloop, small_rest, restloop, done;
3235     const int cl_size         = VM_Version::L1_data_cache_line_size(),
3236               cl_dwords       = cl_size >> 3,
3237               cl_dw_addr_bits = exact_log2(cl_dwords),
3238               dcbz_min        = 1,  // Min count of dcbz executions, needs to be >0.
3239               min_cnt         = ((dcbz_min + 1) << cl_dw_addr_bits) - 1;
3240 
3241   if (const_cnt >= 0) {
3242     // Constant case.
3243     if (const_cnt < min_cnt) {
3244       clear_memory_constlen(base_ptr, const_cnt, tmp);
3245       return;
3246     }
3247     load_const_optimized(cnt_dwords, const_cnt, tmp);
3248   } else {
3249     // cnt_dwords already loaded in register. Need to check size.
3250     cmpdi(CCR1, cnt_dwords, min_cnt); // Big enough? (ensure >= dcbz_min lines included).
3251     blt(CCR1, small_rest);
3252   }
3253     rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits); // Extract dword offset within first cache line.
3254     beq(CCR0, fast);                                  // Already 128byte aligned.
3255 
3256     subfic(tmp, tmp, cl_dwords);
3257     mtctr(tmp);                        // Set ctr to hit 128byte boundary (0<ctr<cl_dwords).
3258     subf(cnt_dwords, tmp, cnt_dwords); // rest.
3259     li(tmp, 0);
3260 
3261   bind(startloop);                     // Clear at the beginning to reach 128byte boundary.
3262     std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
3263     addi(base_ptr, base_ptr, 8);
3264     bdnz(startloop);
3265 
3266   bind(fast);                                  // Clear 128byte blocks.
3267     srdi(tmp, cnt_dwords, cl_dw_addr_bits);    // Loop count for 128byte loop (>0).
3268     andi(cnt_dwords, cnt_dwords, cl_dwords-1); // Rest in dwords.
3269     mtctr(tmp);                                // Load counter.
3270 
3271   bind(fastloop);
3272     dcbz(base_ptr);                    // Clear 128byte aligned block.
3273     addi(base_ptr, base_ptr, cl_size);
3274     bdnz(fastloop);
3275 
3276   bind(small_rest);
3277     cmpdi(CCR0, cnt_dwords, 0);        // size 0?
3278     beq(CCR0, done);                   // rest == 0
3279     li(tmp, 0);
3280     mtctr(cnt_dwords);                 // Load counter.
3281 
3282   bind(restloop);                      // Clear rest.
3283     std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
3284     addi(base_ptr, base_ptr, 8);
3285     bdnz(restloop);
3286 
3287   bind(done);
3288 }
3289 
3290 /////////////////////////////////////////// String intrinsics ////////////////////////////////////////////
3291 
3292 #ifdef COMPILER2
3293 // Intrinsics for CompactStrings
3294 
3295 // Compress char[] to byte[] by compressing 16 bytes at once.
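// Illustrative per-iteration sketch (comment only; the emitted code below does the
// same packing with rotate/insert instructions on 8-byte register images):
//   jchar c[8]; memcpy(c, src, 16);
//   for (int i = 0; i < 8; i++) {
//     if (c[i] > 0xFF) goto failure;   // not latin1
//     dst[i] = (jbyte)c[i];
//   }
//   src += 16; dst += 8;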
3296 void MacroAssembler::string_compress_16(Register src, Register dst, Register cnt,
3297                                         Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5,
3298                                         Label& Lfailure) {
3299 
3300   const Register tmp0 = R0;
3301   assert_different_registers(src, dst, cnt, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5);
3302   Label Lloop, Lslow;
3303 
3304   // Check if cnt >= 8 (= 16 bytes)
3305   lis(tmp1, 0xFF);                // tmp1 = 0x00FF00FF00FF00FF
3306   srwi_(tmp2, cnt, 3);
3307   beq(CCR0, Lslow);
3308   ori(tmp1, tmp1, 0xFF);
3309   rldimi(tmp1, tmp1, 32, 0);
3310   mtctr(tmp2);
3311 
3312   // 2x unrolled loop
3313   bind(Lloop);
3314   ld(tmp2, 0, src);               // _0_1_2_3 (Big Endian)
3315   ld(tmp4, 8, src);               // _4_5_6_7
3316 
3317   orr(tmp0, tmp2, tmp4);
3318   rldicl(tmp3, tmp2, 6*8, 64-24); // _____1_2
3319   rldimi(tmp2, tmp2, 2*8, 2*8);   // _0_2_3_3
3320   rldicl(tmp5, tmp4, 6*8, 64-24); // _____5_6
3321   rldimi(tmp4, tmp4, 2*8, 2*8);   // _4_6_7_7
3322 
3323   andc_(tmp0, tmp0, tmp1);
3324   bne(CCR0, Lfailure);            // Not latin1.
3325   addi(src, src, 16);
3326 
3327   rlwimi(tmp3, tmp2, 0*8, 24, 31);// _____1_3
3328   srdi(tmp2, tmp2, 3*8);          // ____0_2_
3329   rlwimi(tmp5, tmp4, 0*8, 24, 31);// _____5_7
3330   srdi(tmp4, tmp4, 3*8);          // ____4_6_
3331 
3332   orr(tmp2, tmp2, tmp3);          // ____0123
3333   orr(tmp4, tmp4, tmp5);          // ____4567
3334 
3335   stw(tmp2, 0, dst);
3336   stw(tmp4, 4, dst);
3337   addi(dst, dst, 8);
3338   bdnz(Lloop);
3339 
3340   bind(Lslow);                    // Fallback to slow version
3341 }
3342 
3343 // Compress char[] to byte[]. cnt must be positive int.
3344 void MacroAssembler::string_compress(Register src, Register dst, Register cnt, Register tmp, Label& Lfailure) {
3345   Label Lloop;
3346   mtctr(cnt);
3347 
3348   bind(Lloop);
3349   lhz(tmp, 0, src);
3350   cmplwi(CCR0, tmp, 0xff);
3351   bgt(CCR0, Lfailure);            // Not latin1.
3352   addi(src, src, 2);
3353   stb(tmp, 0, dst);
3354   addi(dst, dst, 1);
3355   bdnz(Lloop);
3356 }
3357 
3358 // Inflate byte[] to char[] by inflating 16 bytes at once.
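// Illustrative per-iteration sketch (comment only):
//   for (int i = 0; i < 8; i++) { dst[i] = (jchar)(unsigned char)src[i]; }
//   src += 8; dst += 8;   // 8 bytes consumed, 8 chars (16 bytes) produced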
3359 void MacroAssembler::string_inflate_16(Register src, Register dst, Register cnt,
3360                                        Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) {
3361   const Register tmp0 = R0;
3362   assert_different_registers(src, dst, cnt, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5);
3363   Label Lloop, Lslow;
3364 
3365   // Check if cnt >= 8
3366   srwi_(tmp2, cnt, 3);
3367   beq(CCR0, Lslow);
3368   lis(tmp1, 0xFF);                // tmp1 = 0x00FF00FF
3369   ori(tmp1, tmp1, 0xFF);
3370   mtctr(tmp2);
3371 
3372   // 2x unrolled loop
3373   bind(Lloop);
3374   lwz(tmp2, 0, src);              // ____0123 (Big Endian)
3375   lwz(tmp4, 4, src);              // ____4567
3376   addi(src, src, 8);
3377 
3378   rldicl(tmp3, tmp2, 7*8, 64-8);  // _______2
3379   rlwimi(tmp2, tmp2, 3*8, 16, 23);// ____0113
3380   rldicl(tmp5, tmp4, 7*8, 64-8);  // _______6
3381   rlwimi(tmp4, tmp4, 3*8, 16, 23);// ____4557
3382 
3383   andc(tmp0, tmp2, tmp1);         // ____0_1_
3384   rlwimi(tmp2, tmp3, 2*8, 0, 23); // _____2_3
3385   andc(tmp3, tmp4, tmp1);         // ____4_5_
3386   rlwimi(tmp4, tmp5, 2*8, 0, 23); // _____6_7
3387 
3388   rldimi(tmp2, tmp0, 3*8, 0*8);   // _0_1_2_3
3389   rldimi(tmp4, tmp3, 3*8, 0*8);   // _4_5_6_7
3390 
3391   std(tmp2, 0, dst);
3392   std(tmp4, 8, dst);
3393   addi(dst, dst, 16);
3394   bdnz(Lloop);
3395 
3396   bind(Lslow);                    // Fallback to slow version
3397 }
3398 
3399 // Inflate byte[] to char[]. cnt must be positive int.
3400 void MacroAssembler::string_inflate(Register src, Register dst, Register cnt, Register tmp) {
3401   Label Lloop;
3402   mtctr(cnt);
3403 
3404   bind(Lloop);
3405   lbz(tmp, 0, src);
3406   addi(src, src, 1);
3407   sth(tmp, 0, dst);
3408   addi(dst, dst, 2);
3409   bdnz(Lloop);
3410 }
3411 
3412 void MacroAssembler::string_compare(Register str1, Register str2,
3413                                     Register cnt1, Register cnt2,
3414                                     Register tmp1, Register result, int ae) {
3415   const Register tmp0 = R0,
3416                  diff = tmp1;
3417 
3418   assert_different_registers(str1, str2, cnt1, cnt2, tmp0, tmp1, result);
3419   Label Ldone, Lslow, Lloop, Lreturn_diff;
3420 
3421   // Note: Making use of the fact that compareTo(a, b) == -compareTo(b, a)
3422   // we interchange str1 and str2 in the UL case and negate the result.
3423   // Like this, str1 is always latin1 encoded, except for the UU case.
3424   // In addition, we need to zero-extend the counts (sign extension is equivalent since they are non-negative).
3425 
3426   if (ae == StrIntrinsicNode::UU) {
3427     srwi(cnt1, cnt1, 1);
3428   } else {
3429     clrldi(cnt1, cnt1, 32);
3430   }
3431 
3432   if (ae != StrIntrinsicNode::LL) {
3433     srwi(cnt2, cnt2, 1);
3434   } else {
3435     clrldi(cnt2, cnt2, 32);
3436   }
3437 
3438   // See if the lengths are different, and calculate min in cnt1.
3439   // Save diff in case we need it for a tie-breaker.
3440   subf_(diff, cnt2, cnt1); // diff = cnt1 - cnt2
3441   // if (diff > 0) { cnt1 = cnt2; }
3442   if (VM_Version::has_isel()) {
3443     isel(cnt1, CCR0, Assembler::greater, /*invert*/ false, cnt2);
3444   } else {
3445     Label Lskip;
3446     blt(CCR0, Lskip);
3447     mr(cnt1, cnt2);
3448     bind(Lskip);
3449   }
3450 
3451   // Rename registers
3452   Register chr1 = result;
3453   Register chr2 = tmp0;
3454 
3455   // Compare multiple characters in fast loop (only implemented for same encoding).
3456   int stride1 = 8, stride2 = 8;
3457   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3458     int log2_chars_per_iter = (ae == StrIntrinsicNode::LL) ? 3 : 2;
3459     Label Lfastloop, Lskipfast;
3460 
3461     srwi_(tmp0, cnt1, log2_chars_per_iter);
3462     beq(CCR0, Lskipfast);
3463     rldicl(cnt2, cnt1, 0, 64 - log2_chars_per_iter); // Remaining characters.
3464     li(cnt1, 1 << log2_chars_per_iter); // Initialize for failure case: Rescan characters from current iteration.
3465     mtctr(tmp0);
3466 
3467     bind(Lfastloop);
3468     ld(chr1, 0, str1);
3469     ld(chr2, 0, str2);
3470     cmpd(CCR0, chr1, chr2);
3471     bne(CCR0, Lslow);
3472     addi(str1, str1, stride1);
3473     addi(str2, str2, stride2);
3474     bdnz(Lfastloop);
3475     mr(cnt1, cnt2); // Remaining characters.
3476     bind(Lskipfast);
3477   }
3478 
3479   // Loop which searches the first difference character by character.
3480   cmpwi(CCR0, cnt1, 0);
3481   beq(CCR0, Lreturn_diff);
3482   bind(Lslow);
3483   mtctr(cnt1);
3484 
3485   switch (ae) {
3486     case StrIntrinsicNode::LL: stride1 = 1; stride2 = 1; break;
3487     case StrIntrinsicNode::UL: // fallthru (see comment above)
3488     case StrIntrinsicNode::LU: stride1 = 1; stride2 = 2; break;
3489     case StrIntrinsicNode::UU: stride1 = 2; stride2 = 2; break;
3490     default: ShouldNotReachHere(); break;
3491   }
3492 
3493   bind(Lloop);
3494   if (stride1 == 1) { lbz(chr1, 0, str1); } else { lhz(chr1, 0, str1); }
3495   if (stride2 == 1) { lbz(chr2, 0, str2); } else { lhz(chr2, 0, str2); }
3496   subf_(result, chr2, chr1); // result = chr1 - chr2
3497   bne(CCR0, Ldone);
3498   addi(str1, str1, stride1);
3499   addi(str2, str2, stride2);
3500   bdnz(Lloop);
3501 
3502   // If strings are equal up to min length, return the length difference.
3503   bind(Lreturn_diff);
3504   mr(result, diff);
3505 
3506   // Otherwise, return the difference between the first mismatched chars.
3507   bind(Ldone);
3508   if (ae == StrIntrinsicNode::UL) {
3509     neg(result, result); // Negate result (see note above).
3510   }
3511 }
3512 
3513 void MacroAssembler::array_equals(bool is_array_equ, Register ary1, Register ary2,
3514                                   Register limit, Register tmp1, Register result, bool is_byte) {
3515   const Register tmp0 = R0;
3516   assert_different_registers(ary1, ary2, limit, tmp0, tmp1, result);
3517   Label Ldone, Lskiploop, Lloop, Lfastloop, Lskipfast;
3518   bool limit_needs_shift = false;
3519 
3520   if (is_array_equ) {
3521     const int length_offset = arrayOopDesc::length_offset_in_bytes();
3522     const int base_offset   = arrayOopDesc::base_offset_in_bytes(is_byte ? T_BYTE : T_CHAR);
3523 
3524     // Return true if the same array.
3525     cmpd(CCR0, ary1, ary2);
3526     beq(CCR0, Lskiploop);
3527 
3528     // Return false if one of them is NULL.
3529     cmpdi(CCR0, ary1, 0);
3530     cmpdi(CCR1, ary2, 0);
3531     li(result, 0);
3532     cror(CCR0, Assembler::equal, CCR1, Assembler::equal);
3533     beq(CCR0, Ldone);
3534 
3535     // Load the lengths of arrays.
3536     lwz(limit, length_offset, ary1);
3537     lwz(tmp0, length_offset, ary2);
3538 
3539     // Return false if the two arrays are not equal length.
3540     cmpw(CCR0, limit, tmp0);
3541     bne(CCR0, Ldone);
3542 
3543     // Load array addresses.
3544     addi(ary1, ary1, base_offset);
3545     addi(ary2, ary2, base_offset);
3546   } else {
3547     limit_needs_shift = !is_byte;
3548     li(result, 0); // Assume not equal.
3549   }
3550 
3551   // Rename registers
3552   Register chr1 = tmp0;
3553   Register chr2 = tmp1;
3554 
3555   // Compare 8 bytes per iteration in fast loop.
3556   const int log2_chars_per_iter = is_byte ? 3 : 2;
3557 
3558   srwi_(tmp0, limit, log2_chars_per_iter + (limit_needs_shift ? 1 : 0));
3559   beq(CCR0, Lskipfast);
3560   mtctr(tmp0);
3561 
3562   bind(Lfastloop);
3563   ld(chr1, 0, ary1);
3564   ld(chr2, 0, ary2);
3565   addi(ary1, ary1, 8);
3566   addi(ary2, ary2, 8);
3567   cmpd(CCR0, chr1, chr2);
3568   bne(CCR0, Ldone);
3569   bdnz(Lfastloop);
3570 
3571   bind(Lskipfast);
3572   rldicl_(limit, limit, limit_needs_shift ? 64 - 1 : 0, 64 - log2_chars_per_iter); // Remaining characters.
3573   beq(CCR0, Lskiploop);
3574   mtctr(limit);
3575 
3576   // Character by character.
3577   bind(Lloop);
3578   if (is_byte) {
3579     lbz(chr1, 0, ary1);
3580     lbz(chr2, 0, ary2);
3581     addi(ary1, ary1, 1);
3582     addi(ary2, ary2, 1);
3583   } else {
3584     lhz(chr1, 0, ary1);
3585     lhz(chr2, 0, ary2);
3586     addi(ary1, ary1, 2);
3587     addi(ary2, ary2, 2);
3588   }
3589   cmpw(CCR0, chr1, chr2);
3590   bne(CCR0, Ldone);
3591   bdnz(Lloop);
3592 
3593   bind(Lskiploop);
3594   li(result, 1); // All characters are equal.
3595   bind(Ldone);
3596 }
3597 
3598 void MacroAssembler::string_indexof(Register result, Register haystack, Register haycnt,
3599                                     Register needle, ciTypeArray* needle_values, Register needlecnt, int needlecntval,
3600                                     Register tmp1, Register tmp2, Register tmp3, Register tmp4, int ae) {
3601 
3602   // Ensure 0<needlecnt<=haycnt in ideal graph as prerequisite!
3603   Label L_TooShort, L_Found, L_NotFound, L_End;
3604   Register last_addr = haycnt, // Kill haycnt at the beginning.
3605   addr      = tmp1,
3606   n_start   = tmp2,
3607   ch1       = tmp3,
3608   ch2       = R0;
3609 
3610   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
3611   const int h_csize = (ae == StrIntrinsicNode::LL) ? 1 : 2;
3612   const int n_csize = (ae == StrIntrinsicNode::UU) ? 2 : 1;
3613 
3614   // **************************************************************************************************
3615   // Prepare for main loop: optimized for needle count >=2, bail out otherwise.
3616   // **************************************************************************************************
3617 
3618   // Compute last haystack addr to use if no match gets found.
3619   clrldi(haycnt, haycnt, 32);         // Ensure positive int is valid as 64 bit value.
3620   addi(addr, haystack, -h_csize);     // Accesses use pre-increment.
3621   if (needlecntval == 0) { // variable needlecnt
3622    cmpwi(CCR6, needlecnt, 2);
3623    clrldi(needlecnt, needlecnt, 32);  // Ensure positive int is valid as 64 bit value.
3624    blt(CCR6, L_TooShort);             // Variable needlecnt: handle short needle separately.
3625   }
3626 
3627   if (n_csize == 2) { lwz(n_start, 0, needle); } else { lhz(n_start, 0, needle); } // Load first 2 characters of needle.
3628 
3629   if (needlecntval == 0) { // variable needlecnt
3630    subf(ch1, needlecnt, haycnt);      // Last character index to compare is haycnt-needlecnt.
3631    addi(needlecnt, needlecnt, -2);    // Rest of needle.
3632   } else { // constant needlecnt
3633   guarantee(needlecntval != 1, "IndexOf with single-character needle must be handled separately");
3634   assert((needlecntval & 0x7fff) == needlecntval, "wrong immediate");
3635    addi(ch1, haycnt, -needlecntval);  // Last character index to compare is haycnt-needlecnt.
3636    if (needlecntval > 3) { li(needlecnt, needlecntval - 2); } // Rest of needle.
3637   }
3638 
3639   if (h_csize == 2) { slwi(ch1, ch1, 1); } // Scale to number of bytes.
3640 
3641   if (ae == StrIntrinsicNode::UL) {
3642    srwi(tmp4, n_start, 1*8);          // ___0
3643    rlwimi(n_start, tmp4, 2*8, 0, 23); // _0_1
3644   }
3645 
3646   add(last_addr, haystack, ch1);      // Point to last address to compare (haystack+2*(haycnt-needlecnt)).
3647 
3648   // Main Loop (now we have at least 2 characters).
3649   Label L_OuterLoop, L_InnerLoop, L_FinalCheck, L_Comp1, L_Comp2;
3650   bind(L_OuterLoop); // Search for 1st 2 characters.
3651   Register addr_diff = tmp4;
3652    subf(addr_diff, addr, last_addr);  // Difference between already checked address and last address to check.
3653    addi(addr, addr, h_csize);         // This is the new address we want to use for comparing.
3654    srdi_(ch2, addr_diff, h_csize);
3655    beq(CCR0, L_FinalCheck);           // 2 characters left?
3656    mtctr(ch2);                        // num of characters / 2
3657   bind(L_InnerLoop);                  // Main work horse (2x unrolled search loop)
3658    if (h_csize == 2) {                // Load 2 characters of haystack (ignore alignment).
3659     lwz(ch1, 0, addr);
3660     lwz(ch2, 2, addr);
3661    } else {
3662     lhz(ch1, 0, addr);
3663     lhz(ch2, 1, addr);
3664    }
3665    cmpw(CCR0, ch1, n_start);          // Compare 2 characters (1 would be sufficient but try to reduce branches to CompLoop).
3666    cmpw(CCR1, ch2, n_start);
3667    beq(CCR0, L_Comp1);                // Did we find the needle start?
3668    beq(CCR1, L_Comp2);
3669    addi(addr, addr, 2 * h_csize);
3670    bdnz(L_InnerLoop);
3671   bind(L_FinalCheck);
3672    andi_(addr_diff, addr_diff, h_csize); // Remaining characters not covered by InnerLoop: (num of characters) & 1.
3673    beq(CCR0, L_NotFound);
3674    if (h_csize == 2) { lwz(ch1, 0, addr); } else { lhz(ch1, 0, addr); } // One position left at which we have to compare.
3675    cmpw(CCR1, ch1, n_start);
3676    beq(CCR1, L_Comp1);
3677   bind(L_NotFound);
3678    li(result, -1);                    // not found
3679    b(L_End);
3680 
3681    // **************************************************************************************************
3682    // Special Case: unfortunately, the variable needle case can be called with needlecnt<2
3683    // **************************************************************************************************
3684   if (needlecntval == 0) {           // We have to handle these cases separately.
3685   Label L_OneCharLoop;
3686   bind(L_TooShort);
3687    mtctr(haycnt);
3688    if (n_csize == 2) { lhz(n_start, 0, needle); } else { lbz(n_start, 0, needle); } // First character of needle
3689   bind(L_OneCharLoop);
3690    if (h_csize == 2) { lhzu(ch1, 2, addr); } else { lbzu(ch1, 1, addr); }
3691    cmpw(CCR1, ch1, n_start);
3692    beq(CCR1, L_Found);               // Did we find the one character needle?
3693    bdnz(L_OneCharLoop);
3694    li(result, -1);                   // Not found.
3695    b(L_End);
3696   }
3697 
3698   // **************************************************************************************************
3699   // Regular Case Part II: compare rest of needle (first 2 characters have been compared already)
3700   // **************************************************************************************************
3701 
3702   // Compare the rest
3703   bind(L_Comp2);
3704    addi(addr, addr, h_csize);        // First comparison has failed, 2nd one hit.
3705   bind(L_Comp1);                     // Addr points to possible needle start.
3706   if (needlecntval != 2) {           // Const needlecnt==2?
3707    if (needlecntval != 3) {
3708     if (needlecntval == 0) { beq(CCR6, L_Found); } // Variable needlecnt==2?
3709     Register n_ind = tmp4,
3710              h_ind = n_ind;
3711     li(n_ind, 2 * n_csize);          // First 2 characters are already compared, use index 2.
3712     mtctr(needlecnt);                // Decremented by 2, still > 0.
3713    Label L_CompLoop;
3714    bind(L_CompLoop);
3715     if (ae == StrIntrinsicNode::UL) {
3716       h_ind = ch1;
3717       sldi(h_ind, n_ind, 1);
3718     }
3719     if (n_csize == 2) { lhzx(ch2, needle, n_ind); } else { lbzx(ch2, needle, n_ind); }
3720     if (h_csize == 2) { lhzx(ch1, addr, h_ind); } else { lbzx(ch1, addr, h_ind); }
3721     cmpw(CCR1, ch1, ch2);
3722     bne(CCR1, L_OuterLoop);
3723     addi(n_ind, n_ind, n_csize);
3724     bdnz(L_CompLoop);
3725    } else { // No loop required if there's only one needle character left.
3726     if (n_csize == 2) { lhz(ch2, 2 * 2, needle); } else { lbz(ch2, 2 * 1, needle); }
3727     if (h_csize == 2) { lhz(ch1, 2 * 2, addr); } else { lbz(ch1, 2 * 1, addr); }
3728     cmpw(CCR1, ch1, ch2);
3729     bne(CCR1, L_OuterLoop);
3730    }
3731   }
3732   // Return index ...
3733   bind(L_Found);
3734    subf(result, haystack, addr);     // relative to haystack, ...
3735    if (h_csize == 2) { srdi(result, result, 1); } // in characters.
3736   bind(L_End);
3737 } // string_indexof
3738 
3739 void MacroAssembler::string_indexof_char(Register result, Register haystack, Register haycnt,
3740                                          Register needle, jchar needleChar, Register tmp1, Register tmp2, bool is_byte) {
3741   assert_different_registers(haystack, haycnt, needle, tmp1, tmp2);
3742 
3743   Label L_InnerLoop, L_FinalCheck, L_Found1, L_Found2, L_NotFound, L_End;
3744   Register addr = tmp1,
3745            ch1 = tmp2,
3746            ch2 = R0;
3747 
3748   const int h_csize = is_byte ? 1 : 2;
3749 
3750 //4:
3751    srwi_(tmp2, haycnt, 1);   // Shift right by exact_log2(UNROLL_FACTOR).
3752    mr(addr, haystack);
3753    beq(CCR0, L_FinalCheck);
3754    mtctr(tmp2);              // Move to count register.
3755 //8:
3756   bind(L_InnerLoop);         // Main work horse (2x unrolled search loop).
3757    if (!is_byte) {
3758     lhz(ch1, 0, addr);
3759     lhz(ch2, 2, addr);
3760    } else {
3761     lbz(ch1, 0, addr);
3762     lbz(ch2, 1, addr);
3763    }
3764    (needle != R0) ? cmpw(CCR0, ch1, needle) : cmplwi(CCR0, ch1, (unsigned int)needleChar);
3765    (needle != R0) ? cmpw(CCR1, ch2, needle) : cmplwi(CCR1, ch2, (unsigned int)needleChar);
3766    beq(CCR0, L_Found1);      // Did we find the needle?
3767    beq(CCR1, L_Found2);
3768    addi(addr, addr, 2 * h_csize);
3769    bdnz(L_InnerLoop);
3770 //16:
3771   bind(L_FinalCheck);
3772    andi_(R0, haycnt, 1);
3773    beq(CCR0, L_NotFound);
3774    if (!is_byte) { lhz(ch1, 0, addr); } else { lbz(ch1, 0, addr); } // One position left at which we have to compare.
3775    (needle != R0) ? cmpw(CCR1, ch1, needle) : cmplwi(CCR1, ch1, (unsigned int)needleChar);
3776    beq(CCR1, L_Found1);
3777 //21:
3778   bind(L_NotFound);
3779    li(result, -1);           // Not found.
3780    b(L_End);
3781 
3782   bind(L_Found2);
3783    addi(addr, addr, h_csize);
3784 //24:
3785   bind(L_Found1);            // Return index ...
3786    subf(result, haystack, addr); // relative to haystack, ...
3787    if (!is_byte) { srdi(result, result, 1); } // in characters.
3788   bind(L_End);
3789 } // string_indexof_char
3790 
3791 
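// Sets result to 1 if any byte in [src, src+cnt) has its most significant bit (0x80)
// set, otherwise to 0. cnt is a byte count (descriptive comment).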
3792 void MacroAssembler::has_negatives(Register src, Register cnt, Register result,
3793                                    Register tmp1, Register tmp2) {
3794   const Register tmp0 = R0;
3795   assert_different_registers(src, result, cnt, tmp0, tmp1, tmp2);
3796   Label Lfastloop, Lslow, Lloop, Lnoneg, Ldone;
3797 
3798   // Check if cnt >= 16 (bytes; the fast loop processes 16 bytes per iteration)
3799   lis(tmp1, (int)(short)0x8080);  // tmp1 = 0x8080808080808080
3800   srwi_(tmp2, cnt, 4);
3801   li(result, 1);                  // Assume there's a negative byte.
3802   beq(CCR0, Lslow);
3803   ori(tmp1, tmp1, 0x8080);
3804   rldimi(tmp1, tmp1, 32, 0);
3805   mtctr(tmp2);
3806 
3807   // 2x unrolled loop
3808   bind(Lfastloop);
3809   ld(tmp2, 0, src);
3810   ld(tmp0, 8, src);
3811 
3812   orr(tmp0, tmp2, tmp0);
3813 
3814   and_(tmp0, tmp0, tmp1);
3815   bne(CCR0, Ldone);               // Found negative byte.
3816   addi(src, src, 16);
3817 
3818   bdnz(Lfastloop);
3819 
3820   bind(Lslow);                    // Fallback to slow version
3821   rldicl_(tmp0, cnt, 0, 64-4);
3822   beq(CCR0, Lnoneg);
3823   mtctr(tmp0);
3824   bind(Lloop);
3825   lbz(tmp0, 0, src);
3826   addi(src, src, 1);
3827   andi_(tmp0, tmp0, 0x80);
3828   bne(CCR0, Ldone);               // Found negative byte.
3829   bdnz(Lloop);
3830   bind(Lnoneg);
3831   li(result, 0);
3832 
3833   bind(Ldone);
3834 }
3835 
3836 #endif // COMPILER2
3837 
3838 // Helpers for Intrinsic Emitters
3839 //
3840 // Reverse the byte order of a 32-bit value in a register
3841 //   src: 0x44556677
3842 //   dst: 0x77665544
3843 // Three steps to obtain the result:
3844 //  1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word
3845 //     into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared.
3846 //     This value initializes dst.
3847 //  2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost
3848 //     byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go.
3849 //     This value is mask inserted into dst with a [0..23] mask of 1s.
3850 //  3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position.
3851 //     This value is mask inserted into dst with a [8..15] mask of 1s.
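// Equivalent C computation (illustrative comment only):
//   dst = ((src >> 24) & 0x000000ff) | ((src >>  8) & 0x0000ff00) |
//         ((src <<  8) & 0x00ff0000) | ((src << 24) & 0xff000000);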
3852 void MacroAssembler::load_reverse_32(Register dst, Register src) {
3853   assert_different_registers(dst, src);
3854 
3855   rldicl(dst, src, (4+1)*8, 56);       // Rotate byte 4 into position 7 (rightmost), clear all to the left.
3856   rlwimi(dst, src,     3*8,  0, 23);   // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone.
3857   rlwimi(dst, src,     1*8,  8, 15);   // Insert byte 6 into position 5, leave the rest alone.
3858 }
3859 
3860 // Calculate the column addresses of the crc32 lookup table into distinct registers.
3861 // This loop-invariant calculation is moved out of the loop body, reducing the loop
3862 // body size from 20 to 16 instructions.
3863 // Returns the offset that was used to calculate the address of column tc3.
3864 // Due to register shortage, setting tc3 may overwrite table. With the return offset
3865 // at hand, the original table address can be easily reconstructed.
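// Assumed table layout (sketch): 8 columns of CRC32_COLUMN_SIZE 4-byte entries each;
// the address of column i is table + i * 4 * CRC32_COLUMN_SIZE, which is what the
// ix* offsets below encode.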
3866 int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) {
3867 
3868 #ifdef VM_LITTLE_ENDIAN
3869   // This is what we implement (the DOLIT4 part):
3870 // =========================================================================
3871   // #define DOLIT4 c ^= *buf4++; \
3872   //         c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^ \
3873   //             crc_table[1][(c >> 16) & 0xff] ^ crc_table[0][c >> 24]
3874   // #define DOLIT32 DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4
3875 // =========================================================================
3876   const int ix0 = 3*(4*CRC32_COLUMN_SIZE);
3877   const int ix1 = 2*(4*CRC32_COLUMN_SIZE);
3878   const int ix2 = 1*(4*CRC32_COLUMN_SIZE);
3879   const int ix3 = 0*(4*CRC32_COLUMN_SIZE);
3880 #else
3881   // This is what we implement (the DOBIG4 part):
3882   // =========================================================================
3883   // #define DOBIG4 c ^= *++buf4; \
3884   //         c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \
3885   //             crc_table[6][(c >> 16) & 0xff] ^ crc_table[7][c >> 24]
3886   // #define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4
3887   // =========================================================================
3888   const int ix0 = 4*(4*CRC32_COLUMN_SIZE);
3889   const int ix1 = 5*(4*CRC32_COLUMN_SIZE);
3890   const int ix2 = 6*(4*CRC32_COLUMN_SIZE);
3891   const int ix3 = 7*(4*CRC32_COLUMN_SIZE);
3892 #endif
3893   assert_different_registers(table, tc0, tc1, tc2);
3894   assert(table == tc3, "must be!");
3895 
3896   addi(tc0, table, ix0);
3897   addi(tc1, table, ix1);
3898   addi(tc2, table, ix2);
3899   if (ix3 != 0) addi(tc3, table, ix3);
3900 
3901   return ix3;
3902 }
3903 
3904 /**
3905  * uint32_t crc;
3906  * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
3907  */
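// The table index is formed as (byte & 0xff) << 2, i.e. a byte offset into a table of
// 4-byte entries; the rlwinm below computes exactly that (illustrative note).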
3908 void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) {
3909   assert_different_registers(crc, table, tmp);
3910   assert_different_registers(val, table);
3911 
3912   if (crc == val) {                   // Must rotate first to use the unmodified value.
3913     rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
3914                                       // As we use a word (4-byte) instruction, we have to adapt the mask bit positions.
3915     srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
3916   } else {
3917     srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
3918     rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
3919   }
3920   lwzx(tmp, table, tmp);
3921   xorr(crc, crc, tmp);
3922 }
3923 
3924 /**
3925  * uint32_t crc;
3926  * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
3927  */
3928 void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
3929   fold_byte_crc32(crc, crc, table, tmp);
3930 }
3931 
3932 /**
3933  * Emits code to update CRC-32 with a byte value according to constants in table.
3934  *
3935  * @param [in,out]crc   Register containing the crc.
3936  * @param [in]val       Register containing the byte to fold into the CRC.
3937  * @param [in]table     Register containing the table of crc constants.
3938  *
3939  * uint32_t crc;
3940  * val = crc_table[(val ^ crc) & 0xFF];
3941  * crc = val ^ (crc >> 8);
3942  */
3943 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
3944   BLOCK_COMMENT("update_byte_crc32:");
3945   xorr(val, val, crc);
3946   fold_byte_crc32(crc, val, table, val);
3947 }
3948 
3949 /**
3950  * @param crc   register containing existing CRC (32-bit)
3951  * @param buf   register pointing to input byte buffer (byte*)
3952  * @param len   register containing number of bytes
3953  * @param table register pointing to CRC table
3954  */
3955 void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table,
3956                                            Register data, bool loopAlignment) {
3957   assert_different_registers(crc, buf, len, table, data);
3958 
3959   Label L_mainLoop, L_done;
3960   const int mainLoop_stepping  = 1;
3961   const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4;
3962 
3963   // Process all bytes in a single-byte loop.
3964   clrldi_(len, len, 32);                         // Enforce 32 bit. Anything to do?
3965   beq(CCR0, L_done);
3966 
3967   mtctr(len);
3968   align(mainLoop_alignment);
3969   BIND(L_mainLoop);
3970     lbz(data, 0, buf);                           // Byte from buffer, zero-extended.
3971     addi(buf, buf, mainLoop_stepping);           // Advance buffer position.
3972     update_byte_crc32(crc, data, table);
3973     bdnz(L_mainLoop);                            // Iterate.
3974 
3975   bind(L_done);
3976 }
3977 
3978 /**
3979  * Emits code to update CRC-32 with a 4-byte value according to constants in table
3980  * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c
3981  */
3982 // A note on the lookup table address(es):
3983 // The lookup table consists of two sets of four columns each.
3984 // The columns {0..3} are used for little-endian machines.
3985 // The columns {4..7} are used for big-endian machines.
3986 // To save the effort of adding the column offset to the table address each time
3987 // a table element is looked up, it is possible to pass the pre-calculated
3988 // column addresses.
3989 // Uses R9..R12 as work registers. Must be saved/restored by caller, if necessary.
3990 void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc,
3991                                         Register t0,  Register t1,  Register t2,  Register t3,
3992                                         Register tc0, Register tc1, Register tc2, Register tc3) {
3993   assert_different_registers(crc, t3);
3994 
3995   // XOR crc with next four bytes of buffer.
3996   lwz(t3, bufDisp, buf);
3997   if (bufInc != 0) {
3998     addi(buf, buf, bufInc);
3999   }
4000   xorr(t3, t3, crc);
4001 
4002   // Chop t3 (crc ^ next word) into 4 single-byte pieces, shifted left 2 bits, to form the table indices.
4003   rlwinm(t0, t3,  2,         24-2, 31-2);  // ((t3 >>  0) & 0xff) << 2
4004   rlwinm(t1, t3,  32+(2- 8), 24-2, 31-2);  // ((t3 >>  8) & 0xff) << 2
4005   rlwinm(t2, t3,  32+(2-16), 24-2, 31-2);  // ((t3 >> 16) & 0xff) << 2
4006   rlwinm(t3, t3,  32+(2-24), 24-2, 31-2);  // ((t3 >> 24) & 0xff) << 2
4007 
4008   // Use the pre-calculated column addresses.
4009   // Load pre-calculated table values.
4010   lwzx(t0, tc0, t0);
4011   lwzx(t1, tc1, t1);
4012   lwzx(t2, tc2, t2);
4013   lwzx(t3, tc3, t3);
4014 
4015   // Calculate new crc from table values.
4016   xorr(t0,  t0, t1);
4017   xorr(t2,  t2, t3);
4018   xorr(crc, t0, t2);  // Now crc contains the final checksum value.
4019 }
4020 
4021 /**
4022  * @param crc   register containing existing CRC (32-bit)
4023  * @param buf   register pointing to input byte buffer (byte*)
4024  * @param len   register containing number of bytes
4025  * @param table register pointing to CRC table
4026  *
4027  * Uses R9..R12 as work registers. Must be saved/restored by caller!
4028  */
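     // Structure of the emitted code (a sketch):
     //   1. Byte loop (update_byteLoop_crc32) until buf is aligned to mainLoop_stepping (8 bytes).
     //   2. Main loop: two update_1word_crc32 steps per iteration, i.e. 8 bytes per iteration.
     //   3. Byte loop for the remaining (< 2 * mainLoop_stepping) tail bytes.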
4029 void MacroAssembler::kernel_crc32_2word(Register crc, Register buf, Register len, Register table,
4030                                         Register t0,  Register t1,  Register t2,  Register t3,
4031                                         Register tc0, Register tc1, Register tc2, Register tc3,
4032                                         bool invertCRC) {
4033   assert_different_registers(crc, buf, len, table);
4034 
4035   Label L_mainLoop, L_tail;
4036   Register  tmp  = t0;
4037   Register  data = t0;
4038   Register  tmp2 = t1;
4039   const int mainLoop_stepping  = 8;
4040   const int tailLoop_stepping  = 1;
4041   const int log_stepping       = exact_log2(mainLoop_stepping);
4042   const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
4043   const int complexThreshold   = 2*mainLoop_stepping;
4044 
4045   // Don't test for len <= 0 here. This pathological case should not occur anyway.
4046   // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles
4047   // for all well-behaved cases. The situation itself is detected and handled correctly
4048   // within update_byteLoop_crc32.
4049   assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
4050 
4051   BLOCK_COMMENT("kernel_crc32_2word {");
4052 
4053   if (invertCRC) {
4054     nand(crc, crc, crc);                      // 1s complement of crc
4055   }
4056 
4057   // Check for short (<mainLoop_stepping) buffer.
4058   cmpdi(CCR0, len, complexThreshold);
4059   blt(CCR0, L_tail);
4060 
4061   // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
4062   // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
4063   {
4064     // Align buf addr to mainLoop_stepping boundary.
4065     neg(tmp2, buf);                           // Calculate # preLoop iterations for alignment.
4066     rldicl(tmp2, tmp2, 0, 64-log_stepping);   // Keep only the low log_stepping bits (AND with a mask that has 1s in bits 61..63).
4067 
4068     if (complexThreshold > mainLoop_stepping) {
4069       sub(len, len, tmp2);                       // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4070     } else {
4071       sub(tmp, len, tmp2);                       // Remaining bytes for main loop.
4072       cmpdi(CCR0, tmp, mainLoop_stepping);
4073       blt(CCR0, L_tail);                         // For less than one mainloop_stepping left, do only tail processing
4074       mr(len, tmp);                              // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4075     }
4076     update_byteLoop_crc32(crc, buf, tmp2, table, data, false);
4077   }
4078 
4079   srdi(tmp2, len, log_stepping);                 // #iterations for mainLoop
4080   andi(len, len, mainLoop_stepping-1);           // remaining bytes for tailLoop
4081   mtctr(tmp2);
4082 
4083 #ifdef VM_LITTLE_ENDIAN
4084   Register crc_rv = crc;
4085 #else
4086   Register crc_rv = tmp;                         // Load_reverse needs separate registers to work on.
4087                                                  // Occupies tmp, but frees up crc.
4088   load_reverse_32(crc_rv, crc);                  // Reverse byte order because we are dealing with big-endian data.
4089   tmp = crc;
4090 #endif
4091 
4092   int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
4093 
4094   align(mainLoop_alignment);                     // Octoword-aligned loop address. Shows 2% improvement.
4095   BIND(L_mainLoop);
4096     update_1word_crc32(crc_rv, buf, table, 0, 0, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
4097     update_1word_crc32(crc_rv, buf, table, 4, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
4098     bdnz(L_mainLoop);
4099 
4100 #ifndef VM_LITTLE_ENDIAN
4101   load_reverse_32(crc, crc_rv);                  // Reverse byte order because we are dealing with big-endian data.
4102   tmp = crc_rv;                                  // tmp uses its original register again.
4103 #endif
4104 
4105   // Restore original table address for tailLoop.
4106   if (reconstructTableOffset != 0) {
4107     addi(table, table, -reconstructTableOffset);
4108   }
4109 
4110   // Process last few (<complexThreshold) bytes of buffer.
4111   BIND(L_tail);
4112   update_byteLoop_crc32(crc, buf, len, table, data, false);
4113 
4114   if (invertCRC) {
4115     nand(crc, crc, crc);                      // 1s complement of crc
4116   }
4117   BLOCK_COMMENT("} kernel_crc32_2word");
4118 }
4119 
4120 /**
4121  * @param crc   register containing existing CRC (32-bit)
4122  * @param buf   register pointing to input byte buffer (byte*)
4123  * @param len   register containing number of bytes
4124  * @param table register pointing to CRC table
4125  *
4126  * Uses R9..R12 as work registers. Must be saved/restored by caller!
4127  */
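     // Same structure as kernel_crc32_2word above, but the main loop performs a single
     // update_1word_crc32 step per iteration, i.e. consumes 4 bytes per iteration.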
4128 void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table,
4129                                         Register t0,  Register t1,  Register t2,  Register t3,
4130                                         Register tc0, Register tc1, Register tc2, Register tc3,
4131                                         bool invertCRC) {
4132   assert_different_registers(crc, buf, len, table);
4133 
4134   Label L_mainLoop, L_tail;
4135   Register  tmp          = t0;
4136   Register  data         = t0;
4137   Register  tmp2         = t1;
4138   const int mainLoop_stepping  = 4;
4139   const int tailLoop_stepping  = 1;
4140   const int log_stepping       = exact_log2(mainLoop_stepping);
4141   const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
4142   const int complexThreshold   = 2*mainLoop_stepping;
4143 
4144   // Don't test for len <= 0 here. This pathological case should not occur anyway.
4145   // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles
4146   // for all well-behaved cases. The situation itself is detected and handled correctly
4147   // within update_byteLoop_crc32.
4148   assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
4149 
4150   BLOCK_COMMENT("kernel_crc32_1word {");
4151 
4152   if (invertCRC) {
4153     nand(crc, crc, crc);                      // 1s complement of crc
4154   }
4155 
4156   // Check for short (<mainLoop_stepping) buffer.
4157   cmpdi(CCR0, len, complexThreshold);
4158   blt(CCR0, L_tail);
4159 
4160   // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
4161   // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
4162   {
4163     // Align buf addr to mainLoop_stepping boundary.
4164     neg(tmp2, buf);                              // Calculate # preLoop iterations for alignment.
4165     rldicl(tmp2, tmp2, 0, 64-log_stepping);      // Keep only the low log_stepping bits (AND with a mask that has 1s in bits 62..63).
4166 
4167     if (complexThreshold > mainLoop_stepping) {
4168       sub(len, len, tmp2);                       // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4169     } else {
4170       sub(tmp, len, tmp2);                       // Remaining bytes for main loop.
4171       cmpdi(CCR0, tmp, mainLoop_stepping);
4172       blt(CCR0, L_tail);                         // For less than one mainloop_stepping left, do only tail processing
4173       mr(len, tmp);                              // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4174     }
4175     update_byteLoop_crc32(crc, buf, tmp2, table, data, false);
4176   }
4177 
4178   srdi(tmp2, len, log_stepping);                 // #iterations for mainLoop
4179   andi(len, len, mainLoop_stepping-1);           // remaining bytes for tailLoop
4180   mtctr(tmp2);
4181 
4182 #ifdef VM_LITTLE_ENDIAN
4183   Register crc_rv = crc;
4184 #else
4185   Register crc_rv = tmp;                         // Load_reverse needs separate registers to work on.
4186                                                  // Occupies tmp, but frees up crc.
4187   load_reverse_32(crc_rv, crc);                  // Reverse byte order because we are dealing with big-endian data.
4188   tmp = crc;
4189 #endif
4190 
4191   int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
4192 
4193   align(mainLoop_alignment);                     // Octoword-aligned loop address. Shows 2% improvement.
4194   BIND(L_mainLoop);
4195     update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
4196     bdnz(L_mainLoop);
4197 
4198 #ifndef VM_LITTLE_ENDIAN
4199   load_reverse_32(crc, crc_rv);                  // Reverse byte order because we are dealing with big-endian data.
4200   tmp = crc_rv;                                  // tmp uses its original register again.
4201 #endif
4202 
4203   // Restore original table address for tailLoop.
4204   if (reconstructTableOffset != 0) {
4205     addi(table, table, -reconstructTableOffset);
4206   }
4207 
4208   // Process last few (<complexThreshold) bytes of buffer.
4209   BIND(L_tail);
4210   update_byteLoop_crc32(crc, buf, len, table, data, false);
4211 
4212   if (invertCRC) {
4213     nand(crc, crc, crc);                      // 1s complement of crc
4214   }
4215   BLOCK_COMMENT("} kernel_crc32_1word");
4216 }
4217 
4218 /**
4219  * @param crc   register containing existing CRC (32-bit)
4220  * @param buf   register pointing to input byte buffer (byte*)
4221  * @param len   register containing number of bytes
4222  * @param table register pointing to CRC table
4223  *
4224  * Uses R7_ARG5, R8_ARG6 as work registers.
4225  */
4226 void MacroAssembler::kernel_crc32_1byte(Register crc, Register buf, Register len, Register table,
4227                                         Register t0,  Register t1,  Register t2,  Register t3,
4228                                         bool invertCRC) {
4229   assert_different_registers(crc, buf, len, table);
4230 
4231   Register  data = t0;                   // Holds the current byte to be folded into crc.
4232 
4233   BLOCK_COMMENT("kernel_crc32_1byte {");
4234 
4235   if (invertCRC) {
4236     nand(crc, crc, crc);                      // 1s complement of crc
4237   }
4238 
4239   // Process all bytes in a single-byte loop.
4240   update_byteLoop_crc32(crc, buf, len, table, data, true);
4241 
4242   if (invertCRC) {
4243     nand(crc, crc, crc);                      // 1s complement of crc
4244   }
4245   BLOCK_COMMENT("} kernel_crc32_1byte");
4246 }
4247 
4248 /**
4249  * @param crc             register containing existing CRC (32-bit)
4250  * @param buf             register pointing to input byte buffer (byte*)
4251  * @param len             register containing number of bytes
4252  * @param table           register pointing to CRC table
4253  * @param constants       register pointing to CRC table for 128-bit aligned memory
4254  * @param barretConstants register pointing to table for barrett reduction
4255  * @param t0-t4           temp registers
4256  */
4257 void MacroAssembler::kernel_crc32_1word_vpmsum(Register crc, Register buf, Register len, Register table,
4258                                                Register constants, Register barretConstants,
4259                                                Register t0, Register t1, Register t2, Register t3, Register t4,
4260                                                bool invertCRC) {
4261   assert_different_registers(crc, buf, len, table);
4262 
4263   Label L_alignedHead, L_tail;
4264 
4265   BLOCK_COMMENT("kernel_crc32_1word_vpmsum {");
4266 
4267   // 1. ~c
4268   if (invertCRC) {
4269     nand(crc, crc, crc);                      // 1s complement of crc
4270   }
4271 
4272   // 2. use kernel_crc32_1word for short len
4273   clrldi(len, len, 32);
4274   cmpdi(CCR0, len, 512);
4275   blt(CCR0, L_tail);
4276 
4277   // 3. calculate from 0 to first aligned address
4278   const int alignment = 16;
4279   Register prealign = t0;
4280 
4281   andi_(prealign, buf, alignment - 1);
4282   beq(CCR0, L_alignedHead);
4283   subfic(prealign, prealign, alignment);
4284 
4285   subf(len, prealign, len);
4286   update_byteLoop_crc32(crc, buf, prealign, table, t2, false);
4287 
4288   // 4. calculate from first aligned address as far as possible
4289   BIND(L_alignedHead);
4290   kernel_crc32_1word_aligned(crc, buf, len, constants, barretConstants, t0, t1, t2, t3, t4);
4291 
4292   // 5. remaining bytes
4293   BIND(L_tail);
4294   Register tc0 = t4;
4295   Register tc1 = constants;
4296   Register tc2 = barretConstants;
4297   kernel_crc32_1word(crc, buf, len, table, t0, t1, t2, t3, tc0, tc1, tc2, table, false);
4298 
4299   // 6. ~c
4300   if (invertCRC) {
4301     nand(crc, crc, crc);                      // 1s complement of crc
4302   }
4303 
4304   BLOCK_COMMENT("} kernel_crc32_1word_vpmsum");
4305 }
4306 
4307 /**
4308  * @param crc             register containing existing CRC (32-bit)
4309  * @param buf             register pointing to input byte buffer (byte*)
4310  * @param len             register containing number of bytes (will get updated to remaining bytes)
4311  * @param constants       register pointing to CRC table for 128-bit aligned memory
4312  * @param barretConstants register pointing to table for barrett reduction
4313  * @param t0-t4           temp registers
4314  * Precondition: len should be >= 512. Otherwise, nothing will be done.
4315  */
4316 void MacroAssembler::kernel_crc32_1word_aligned(Register crc, Register buf, Register len,
4317     Register constants, Register barretConstants,
4318     Register t0, Register t1, Register t2, Register t3, Register t4) {
4319 
4320   // Save non-volatile vector registers (frameless).
4321   Register offset = t1;
4322   int offsetInt = 0;
4323   offsetInt -= 16; li(offset, offsetInt); stvx(VR20, offset, R1_SP);
4324   offsetInt -= 16; li(offset, offsetInt); stvx(VR21, offset, R1_SP);
4325   offsetInt -= 16; li(offset, offsetInt); stvx(VR22, offset, R1_SP);
4326   offsetInt -= 16; li(offset, offsetInt); stvx(VR23, offset, R1_SP);
4327   offsetInt -= 16; li(offset, offsetInt); stvx(VR24, offset, R1_SP);
4328   offsetInt -= 16; li(offset, offsetInt); stvx(VR25, offset, R1_SP);
4329 #ifndef VM_LITTLE_ENDIAN
4330   offsetInt -= 16; li(offset, offsetInt); stvx(VR26, offset, R1_SP);
4331 #endif
4332   offsetInt -= 8; std(R14, offsetInt, R1_SP);
4333   offsetInt -= 8; std(R15, offsetInt, R1_SP);
4334   offsetInt -= 8; std(R16, offsetInt, R1_SP);
4335   offsetInt -= 8; std(R17, offsetInt, R1_SP);
4336 
4337   // Implementation uses an inner loop which uses between 256 and 16 * unroll_factor
4338   // bytes per iteration. The basic scheme is:
4339   // lvx: load vector (Big Endian needs reversal)
4340   // vpmsumw: carry-less 32 bit multiplications with constant representing a large CRC shift
4341   // vxor: xor partial results together to get unroll_factor2 vectors
4342 
4343   // Outer loop performs the CRC shifts needed to combine the unroll_factor2 vectors.
4344 
4345   // Using 16 * unroll_factor / unroll_factor2 bytes for constants.
4346   const int unroll_factor = 2048;
4347   const int unroll_factor2 = 8;
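       // With these values, one inner-loop double-iteration consumes 2 * 16 * unroll_factor2
       // = 256 bytes, and one outer-loop round covers 16 * unroll_factor = 32768 bytes before
       // the partial results are folded together.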
4348 
4349   // Support registers.
4350   Register offs[] = { noreg, t0, t1, t2, t3, t4, crc /* will live in VCRC */, R14 };
4351   Register num_bytes = R15,
4352            loop_count = R16,
4353            cur_const = R17;
4354   // Constant array for outer loop: unroll_factor2 - 1 registers,
4355   // Constant array for inner loop: unroll_factor / unroll_factor2 registers.
4356   VectorRegister consts0[] = { VR16, VR17, VR18, VR19, VR20, VR21, VR22 },
4357                  consts1[] = { VR23, VR24 };
4358   // Data register arrays: 2 arrays with unroll_factor2 registers.
4359   VectorRegister data0[] = { VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7 },
4360                  data1[] = { VR8, VR9, VR10, VR11, VR12, VR13, VR14, VR15 };
4361 
4362   VectorRegister VCRC = data0[0];
4363   VectorRegister Vc = VR25;
4364   VectorRegister swap_bytes = VR26; // Only for Big Endian.
4365 
4366   // We have at least 1 iteration (ensured by caller).
4367   Label L_outer_loop, L_inner_loop, L_last;
4368 
4369   // If supported set DSCR pre-fetch to deepest.
4370   if (VM_Version::has_mfdscr()) {
4371     load_const_optimized(t0, VM_Version::_dscr_val | 7);
4372     mtdscr(t0);
4373   }
4374 
4375   mtvrwz(VCRC, crc); // crc lives in VCRC now.
4376 
4377   for (int i = 1; i < unroll_factor2; ++i) {
4378     li(offs[i], 16 * i);
4379   }
4380 
4381   // Load consts for outer loop
4382   lvx(consts0[0], constants);
4383   for (int i = 1; i < unroll_factor2 - 1; ++i) {
4384     lvx(consts0[i], offs[i], constants);
4385   }
4386   addi(constants, constants, (unroll_factor2 - 1) * 16);
4387 
4388   load_const_optimized(num_bytes, 16 * unroll_factor);
4389   load_const_optimized(loop_count, unroll_factor / (2 * unroll_factor2) - 1); // One double-iteration peeled off.
4390 
4391   // Reuse data registers outside of the loop.
4392   VectorRegister Vtmp = data1[0];
4393   VectorRegister Vtmp2 = data1[1];
4394   VectorRegister zeroes = data1[2];
4395 
4396   vspltisb(Vtmp, 0);
4397   vsldoi(VCRC, Vtmp, VCRC, 8); // 96 bit zeroes, 32 bit CRC.
4398 
4399   // Load vector for vpermxor (to xor both 64 bit parts together)
4400   lvsl(Vtmp, buf);   // 000102030405060708090a0b0c0d0e0f
4401   vspltisb(Vc, 4);
4402   vsl(Vc, Vtmp, Vc); // 00102030405060708090a0b0c0d0e0f0
4403   xxspltd(Vc->to_vsr(), Vc->to_vsr(), 0);
4404   vor(Vc, Vtmp, Vc); // 001122334455667708192a3b4c5d6e7f
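       // With this control vector, vpermxor(x, x, x, Vc) xors the upper and the lower 64-bit
       // half of x. It is used below to fold the 128-bit partial result to 64 bits.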
4405 
4406 #ifdef VM_LITTLE_ENDIAN
4407 #define BE_swap_bytes(x)
4408 #else
4409   vspltisb(Vtmp2, 0xf);
4410   vxor(swap_bytes, Vtmp, Vtmp2);
4411 #define BE_swap_bytes(x) vperm(x, x, x, swap_bytes)
4412 #endif
4413 
4414   cmpd(CCR0, len, num_bytes);
4415   blt(CCR0, L_last);
4416 
4417   // ********** Main loop start **********
4418   align(32);
4419   bind(L_outer_loop);
4420 
4421   // Begin of unrolled first iteration (no xor).
4422   lvx(data1[0], buf);
4423   mr(cur_const, constants);
4424   for (int i = 1; i < unroll_factor2 / 2; ++i) {
4425     lvx(data1[i], offs[i], buf);
4426   }
4427   vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
4428   lvx(consts1[0], cur_const);
4429   mtctr(loop_count);
4430   for (int i = 0; i < unroll_factor2 / 2; ++i) {
4431     BE_swap_bytes(data1[i]);
4432     if (i == 0) { vxor(data1[0], data1[0], VCRC); } // xor in previous CRC.
4433     lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf);
4434     vpmsumw(data0[i], data1[i], consts1[0]);
4435   }
4436   addi(buf, buf, 16 * unroll_factor2);
4437   subf(len, num_bytes, len);
4438   lvx(consts1[1], offs[1], cur_const);
4439   addi(cur_const, cur_const, 32);
4440   // Begin of unrolled second iteration (head).
4441   for (int i = 0; i < unroll_factor2 / 2; ++i) {
4442     BE_swap_bytes(data1[i + unroll_factor2 / 2]);
4443     if (i == 0) { lvx(data1[0], buf); } else { lvx(data1[i], offs[i], buf); }
4444     vpmsumw(data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[0]);
4445   }
4446   for (int i = 0; i < unroll_factor2 / 2; ++i) {
4447     BE_swap_bytes(data1[i]);
4448     lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf);
4449     vpmsumw(data1[i], data1[i], consts1[1]);
4450   }
4451   addi(buf, buf, 16 * unroll_factor2);
4452 
4453   // Generate the most performance-relevant code. Loads + half of the vpmsumw have been generated above.
4454   // Double-iteration allows using the 2 constant registers alternatingly.
4455   align(32);
4456   bind(L_inner_loop);
4457   for (int j = 1; j < 3; ++j) { // j < unroll_factor / unroll_factor2 - 1 for complete unrolling.
4458     if (j & 1) {
4459       lvx(consts1[0], cur_const);
4460     } else {
4461       lvx(consts1[1], offs[1], cur_const);
4462       addi(cur_const, cur_const, 32);
4463     }
4464     for (int i = 0; i < unroll_factor2; ++i) {
4465       int idx = i + unroll_factor2 / 2, inc = 0; // For modulo-scheduled input.
4466       if (idx >= unroll_factor2) { idx -= unroll_factor2; inc = 1; }
4467       BE_swap_bytes(data1[idx]);
4468       vxor(data0[i], data0[i], data1[i]);
4469       if (i == 0) lvx(data1[0], buf); else lvx(data1[i], offs[i], buf);
4470       vpmsumw(data1[idx], data1[idx], consts1[(j + inc) & 1]);
4471     }
4472     addi(buf, buf, 16 * unroll_factor2);
4473   }
4474   bdnz(L_inner_loop);
4475 
4476   // Tail of last iteration (no loads).
4477   for (int i = 0; i < unroll_factor2 / 2; ++i) {
4478     BE_swap_bytes(data1[i + unroll_factor2 / 2]);
4479     vxor(data0[i], data0[i], data1[i]);
4480     vpmsumw(data1[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[1]);
4481   }
4482   for (int i = 0; i < unroll_factor2 / 2; ++i) {
4483     vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]); // First half of fixup shifts.
4484     vxor(data0[i + unroll_factor2 / 2], data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2]);
4485   }
4486 
4487   // Last data register is ok, other ones need fixup shift.
4488   for (int i = unroll_factor2 / 2; i < unroll_factor2 - 1; ++i) {
4489     vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]);
4490   }
4491 
4492   // Combine to 128 bit result vector VCRC = data0[0].
4493   for (int i = 1; i < unroll_factor2; i<<=1) {
4494     for (int j = 0; j <= unroll_factor2 - 2*i; j+=2*i) {
4495       vxor(data0[j], data0[j], data0[j+i]);
4496     }
4497   }
4498   cmpd(CCR0, len, num_bytes);
4499   bge(CCR0, L_outer_loop);
4500 
4501   // Last chance with lower num_bytes.
4502   bind(L_last);
4503   srdi(loop_count, len, exact_log2(16 * 2 * unroll_factor2)); // Use double-iterations.
4504   add_const_optimized(constants, constants, 16 * (unroll_factor / unroll_factor2)); // Point behind last one.
4505   sldi(R0, loop_count, exact_log2(16 * 2)); // Bytes of constants to be used.
4506   clrrdi(num_bytes, len, exact_log2(16 * 2 * unroll_factor2));
4507   subf(constants, R0, constants); // Point to constant to be used first.
4508 
4509   addic_(loop_count, loop_count, -1); // One double-iteration peeled off.
4510   bgt(CCR0, L_outer_loop);
4511   // ********** Main loop end **********
4512 #undef BE_swap_bytes
4513 
4514   // Restore DSCR pre-fetch value.
4515   if (VM_Version::has_mfdscr()) {
4516     load_const_optimized(t0, VM_Version::_dscr_val);
4517     mtdscr(t0);
4518   }
4519 
4520   vspltisb(zeroes, 0);
4521 
4522   // Combine to 64 bit result.
4523   vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
4524 
4525   // Reduce to 32 bit CRC: Remainder by multiply-high.
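       // A sketch of the Barrett-style reduction below (assuming barretConstants provides
       // floor(x^64 / P(x)) and the CRC polynomial P(x)):
       //   q    = high_half(VCRC) * floor(x^64 / P(x));  // carry-less multiply, quotient estimate
       //   VCRC = VCRC ^ (high_half(q) * P(x));          // subtract q * P(x); remainder is the 32-bit CRC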
4526   lvx(Vtmp, barretConstants);
4527   vsldoi(Vtmp2, zeroes, VCRC, 12);  // Extract high 32 bit.
4528   vpmsumd(Vtmp2, Vtmp2, Vtmp);      // Multiply by inverse long poly.
4529   vsldoi(Vtmp2, zeroes, Vtmp2, 12); // Extract high 32 bit.
4530   vsldoi(Vtmp, zeroes, Vtmp, 8);
4531   vpmsumd(Vtmp2, Vtmp2, Vtmp);      // Multiply quotient by long poly.
4532   vxor(VCRC, VCRC, Vtmp2);          // Remainder fits into 32 bit.
4533 
4534   // Move result. len is already updated.
4535   vsldoi(VCRC, VCRC, zeroes, 8);
4536   mfvrd(crc, VCRC);
4537 
4538   // Restore non-volatile Vector registers (frameless).
4539   offsetInt = 0;
4540   offsetInt -= 16; li(offset, offsetInt); lvx(VR20, offset, R1_SP);
4541   offsetInt -= 16; li(offset, offsetInt); lvx(VR21, offset, R1_SP);
4542   offsetInt -= 16; li(offset, offsetInt); lvx(VR22, offset, R1_SP);
4543   offsetInt -= 16; li(offset, offsetInt); lvx(VR23, offset, R1_SP);
4544   offsetInt -= 16; li(offset, offsetInt); lvx(VR24, offset, R1_SP);
4545   offsetInt -= 16; li(offset, offsetInt); lvx(VR25, offset, R1_SP);
4546 #ifndef VM_LITTLE_ENDIAN
4547   offsetInt -= 16; li(offset, offsetInt); lvx(VR26, offset, R1_SP);
4548 #endif
4549   offsetInt -= 8;  ld(R14, offsetInt, R1_SP);
4550   offsetInt -= 8;  ld(R15, offsetInt, R1_SP);
4551   offsetInt -= 8;  ld(R16, offsetInt, R1_SP);
4552   offsetInt -= 8;  ld(R17, offsetInt, R1_SP);
4553 }
4554 
4555 void MacroAssembler::kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp, bool invertCRC) {
4556   assert_different_registers(crc, buf, /* len,  not used!! */ table, tmp);
4557 
4558   BLOCK_COMMENT("kernel_crc32_singleByte:");
4559   if (invertCRC) {
4560     nand(crc, crc, crc);                // 1s complement of crc
4561   }
4562 
4563   lbz(tmp, 0, buf);                     // Byte from buffer, zero-extended.
4564   update_byte_crc32(crc, tmp, table);
4565 
4566   if (invertCRC) {
4567     nand(crc, crc, crc);                // 1s complement of crc
4568   }
4569 }
4570 
4571 void MacroAssembler::kernel_crc32_singleByteReg(Register crc, Register val, Register table, bool invertCRC) {
4572   assert_different_registers(crc, val, table);
4573 
4574   BLOCK_COMMENT("kernel_crc32_singleByteReg:");
4575   if (invertCRC) {
4576     nand(crc, crc, crc);                // 1s complement of crc
4577   }
4578 
4579   update_byte_crc32(crc, val, table);
4580 
4581   if (invertCRC) {
4582     nand(crc, crc, crc);                // 1s complement of crc
4583   }
4584 }
4585 
4586 // dest_lo += src1 + src2
4587 // dest_hi += carry1 + carry2
4588 void MacroAssembler::add2_with_carry(Register dest_hi,
4589                                      Register dest_lo,
4590                                      Register src1, Register src2) {
4591   li(R0, 0);
4592   addc(dest_lo, dest_lo, src1);
4593   adde(dest_hi, dest_hi, R0);
4594   addc(dest_lo, dest_lo, src2);
4595   adde(dest_hi, dest_hi, R0);
4596 }
4597 
4598 // Multiply 64 bit by 64 bit first loop.
4599 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart,
4600                                            Register x_xstart,
4601                                            Register y, Register y_idx,
4602                                            Register z,
4603                                            Register carry,
4604                                            Register product_high, Register product,
4605                                            Register idx, Register kdx,
4606                                            Register tmp) {
4607   //  jlong carry, x[], y[], z[];
4608   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
4609   //    huge_128 product = y[idx] * x[xstart] + carry;
4610   //    z[kdx] = (jlong)product;
4611   //    carry  = (jlong)(product >>> 64);
4612   //  }
4613   //  z[xstart] = carry;
4614 
4615   Label L_first_loop, L_first_loop_exit;
4616   Label L_one_x, L_one_y, L_multiply;
4617 
4618   addic_(xstart, xstart, -1);
4619   blt(CCR0, L_one_x);   // Special case: length of x is 1.
4620 
4621   // Load next two integers of x.
4622   sldi(tmp, xstart, LogBytesPerInt);
4623   ldx(x_xstart, x, tmp);
4624 #ifdef VM_LITTLE_ENDIAN
4625   rldicl(x_xstart, x_xstart, 32, 0);
4626 #endif
4627 
4628   align(32, 16);
4629   bind(L_first_loop);
4630 
4631   cmpdi(CCR0, idx, 1);
4632   blt(CCR0, L_first_loop_exit);
4633   addi(idx, idx, -2);
4634   beq(CCR0, L_one_y);
4635 
4636   // Load next two integers of y.
4637   sldi(tmp, idx, LogBytesPerInt);
4638   ldx(y_idx, y, tmp);
4639 #ifdef VM_LITTLE_ENDIAN
4640   rldicl(y_idx, y_idx, 32, 0);
4641 #endif
4642 
4643 
4644   bind(L_multiply);
4645   multiply64(product_high, product, x_xstart, y_idx);
4646 
4647   li(tmp, 0);
4648   addc(product, product, carry);         // Add carry to result.
4649   adde(product_high, product_high, tmp); // Add carry of the last addition.
4650   addi(kdx, kdx, -2);
4651 
4652   // Store result.
4653 #ifdef VM_LITTLE_ENDIAN
4654   rldicl(product, product, 32, 0);
4655 #endif
4656   sldi(tmp, kdx, LogBytesPerInt);
4657   stdx(product, z, tmp);
4658   mr_if_needed(carry, product_high);
4659   b(L_first_loop);
4660 
4661 
4662   bind(L_one_y); // Load one 32 bit portion of y as (0,value).
4663 
4664   lwz(y_idx, 0, y);
4665   b(L_multiply);
4666 
4667 
4668   bind(L_one_x); // Load one 32 bit portion of x as (0,value).
4669 
4670   lwz(x_xstart, 0, x);
4671   b(L_first_loop);
4672 
4673   bind(L_first_loop_exit);
4674 }
4675 
4676 // Multiply 64 bit by 64 bit and add 128 bit.
4677 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y,
4678                                             Register z, Register yz_idx,
4679                                             Register idx, Register carry,
4680                                             Register product_high, Register product,
4681                                             Register tmp, int offset) {
4682 
4683   //  huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
4684   //  z[kdx] = (jlong)product;
4685 
4686   sldi(tmp, idx, LogBytesPerInt);
4687   if (offset) {
4688     addi(tmp, tmp, offset);
4689   }
4690   ldx(yz_idx, y, tmp);
4691 #ifdef VM_LITTLE_ENDIAN
4692   rldicl(yz_idx, yz_idx, 32, 0);
4693 #endif
4694 
4695   multiply64(product_high, product, x_xstart, yz_idx);
4696   ldx(yz_idx, z, tmp);
4697 #ifdef VM_LITTLE_ENDIAN
4698   rldicl(yz_idx, yz_idx, 32, 0);
4699 #endif
4700 
4701   add2_with_carry(product_high, product, carry, yz_idx);
4702 
4703   sldi(tmp, idx, LogBytesPerInt);
4704   if (offset) {
4705     addi(tmp, tmp, offset);
4706   }
4707 #ifdef VM_LITTLE_ENDIAN
4708   rldicl(product, product, 32, 0);
4709 #endif
4710   stdx(product, z, tmp);
4711 }
4712 
4713 // Multiply 128 bit by 128 bit. Unrolled inner loop.
4714 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart,
4715                                              Register y, Register z,
4716                                              Register yz_idx, Register idx, Register carry,
4717                                              Register product_high, Register product,
4718                                              Register carry2, Register tmp) {
4719 
4720   //  jlong carry, x[], y[], z[];
4721   //  int kdx = ystart+1;
4722   //  for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
4723   //    huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
4724   //    z[kdx+idx+1] = (jlong)product;
4725   //    jlong carry2 = (jlong)(product >>> 64);
4726   //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
4727   //    z[kdx+idx] = (jlong)product;
4728   //    carry = (jlong)(product >>> 64);
4729   //  }
4730   //  idx += 2;
4731   //  if (idx > 0) {
4732   //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
4733   //    z[kdx+idx] = (jlong)product;
4734   //    carry = (jlong)(product >>> 64);
4735   //  }
4736 
4737   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
4738   const Register jdx = R0;
4739 
4740   // Scale the index.
4741   srdi_(jdx, idx, 2);
4742   beq(CCR0, L_third_loop_exit);
4743   mtctr(jdx);
4744 
4745   align(32, 16);
4746   bind(L_third_loop);
4747 
4748   addi(idx, idx, -4);
4749 
4750   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8);
4751   mr_if_needed(carry2, product_high);
4752 
4753   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0);
4754   mr_if_needed(carry, product_high);
4755   bdnz(L_third_loop);
4756 
4757   bind(L_third_loop_exit);  // Handle any left-over operand parts.
4758 
4759   andi_(idx, idx, 0x3);
4760   beq(CCR0, L_post_third_loop_done);
4761 
4762   Label L_check_1;
4763 
4764   addic_(idx, idx, -2);
4765   blt(CCR0, L_check_1);
4766 
4767   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0);
4768   mr_if_needed(carry, product_high);
4769 
4770   bind(L_check_1);
4771 
4772   addi(idx, idx, 0x2);
4773   andi_(idx, idx, 0x1);
4774   addic_(idx, idx, -1);
4775   blt(CCR0, L_post_third_loop_done);
4776 
4777   sldi(tmp, idx, LogBytesPerInt);
4778   lwzx(yz_idx, y, tmp);
4779   multiply64(product_high, product, x_xstart, yz_idx);
4780   lwzx(yz_idx, z, tmp);
4781 
4782   add2_with_carry(product_high, product, yz_idx, carry);
4783 
4784   sldi(tmp, idx, LogBytesPerInt);
4785   stwx(product, z, tmp);
4786   srdi(product, product, 32);
4787 
4788   sldi(product_high, product_high, 32);
4789   orr(product, product, product_high);
4790   mr_if_needed(carry, product);
4791 
4792   bind(L_post_third_loop_done);
4793 }   // multiply_128_x_128_loop
4794 
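     // A sketch of the Java-style loop the emitted code corresponds to (the generated code
     // walks both arrays backwards in 32-bit steps; byte offsets are handled in registers):
     //   long carry = 0;
     //   for (int j = len - 1; j >= 0; j--, offset--) {
     //     long product = (in[j] & 0xffffffffL) * (k & 0xffffffffL)
     //                  + (out[offset] & 0xffffffffL) + carry;
     //     out[offset] = (int)product;
     //     carry = product >>> 32;
     //   }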
4795 void MacroAssembler::muladd(Register out, Register in,
4796                             Register offset, Register len, Register k,
4797                             Register tmp1, Register tmp2, Register carry) {
4798 
4799   // Labels
4800   Label LOOP, SKIP;
4801 
4802   // Make sure length is positive.
4803   cmpdi  (CCR0,    len,     0);
4804 
4805   // Prepare variables
4806   subi   (offset,  offset,  4);
4807   li     (carry,   0);
4808   ble    (CCR0,    SKIP);
4809 
4810   mtctr  (len);
4811   subi   (len,     len,     1    );
4812   sldi   (len,     len,     2    );
4813 
4814   // Main loop
4815   bind(LOOP);
4816   lwzx   (tmp1,    len,     in   );
4817   lwzx   (tmp2,    offset,  out  );
4818   mulld  (tmp1,    tmp1,    k    );
4819   add    (tmp2,    carry,   tmp2 );
4820   add    (tmp2,    tmp1,    tmp2 );
4821   stwx   (tmp2,    offset,  out  );
4822   srdi   (carry,   tmp2,    32   );
4823   subi   (offset,  offset,  4    );
4824   subi   (len,     len,     4    );
4825   bdnz   (LOOP);
4826   bind(SKIP);
4827 }
4828 
4829 void MacroAssembler::multiply_to_len(Register x, Register xlen,
4830                                      Register y, Register ylen,
4831                                      Register z, Register zlen,
4832                                      Register tmp1, Register tmp2,
4833                                      Register tmp3, Register tmp4,
4834                                      Register tmp5, Register tmp6,
4835                                      Register tmp7, Register tmp8,
4836                                      Register tmp9, Register tmp10,
4837                                      Register tmp11, Register tmp12,
4838                                      Register tmp13) {
4839 
4840   ShortBranchVerifier sbv(this);
4841 
4842   assert_different_registers(x, xlen, y, ylen, z, zlen,
4843                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
4844   assert_different_registers(x, xlen, y, ylen, z, zlen,
4845                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp7);
4846   assert_different_registers(x, xlen, y, ylen, z, zlen,
4847                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp8);
4848 
4849   const Register idx = tmp1;
4850   const Register kdx = tmp2;
4851   const Register xstart = tmp3;
4852 
4853   const Register y_idx = tmp4;
4854   const Register carry = tmp5;
4855   const Register product = tmp6;
4856   const Register product_high = tmp7;
4857   const Register x_xstart = tmp8;
4858   const Register tmp = tmp9;
4859 
4860   // First Loop.
4861   //
4862   //  final static long LONG_MASK = 0xffffffffL;
4863   //  int xstart = xlen - 1;
4864   //  int ystart = ylen - 1;
4865   //  long carry = 0;
4866   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
4867   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
4868   //    z[kdx] = (int)product;
4869   //    carry = product >>> 32;
4870   //  }
4871   //  z[xstart] = (int)carry;
4872 
4873   mr_if_needed(idx, ylen);        // idx = ylen
4874   mr_if_needed(kdx, zlen);        // kdx = xlen + ylen
4875   li(carry, 0);                   // carry = 0
4876 
4877   Label L_done;
4878 
4879   addic_(xstart, xlen, -1);
4880   blt(CCR0, L_done);
4881 
4882   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z,
4883                         carry, product_high, product, idx, kdx, tmp);
4884 
4885   Label L_second_loop;
4886 
4887   cmpdi(CCR0, kdx, 0);
4888   beq(CCR0, L_second_loop);
4889 
4890   Label L_carry;
4891 
4892   addic_(kdx, kdx, -1);
4893   beq(CCR0, L_carry);
4894 
4895   // Store lower 32 bits of carry.
4896   sldi(tmp, kdx, LogBytesPerInt);
4897   stwx(carry, z, tmp);
4898   srdi(carry, carry, 32);
4899   addi(kdx, kdx, -1);
4900 
4901 
4902   bind(L_carry);
4903 
4904   // Store upper 32 bits of carry.
4905   sldi(tmp, kdx, LogBytesPerInt);
4906   stwx(carry, z, tmp);
4907 
4908   // Second and third (nested) loops.
4909   //
4910   //  for (int i = xstart-1; i >= 0; i--) { // Second loop
4911   //    carry = 0;
4912   //    for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
4913   //      long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
4914   //                     (z[k] & LONG_MASK) + carry;
4915   //      z[k] = (int)product;
4916   //      carry = product >>> 32;
4917   //    }
4918   //    z[i] = (int)carry;
4919   //  }
4920   //
4921   //  i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx
4922 
4923   bind(L_second_loop);
4924 
4925   li(carry, 0);                   // carry = 0;
4926 
4927   addic_(xstart, xstart, -1);     // i = xstart-1;
4928   blt(CCR0, L_done);
4929 
4930   Register zsave = tmp10;
4931 
4932   mr(zsave, z);
4933 
4934 
4935   Label L_last_x;
4936 
4937   sldi(tmp, xstart, LogBytesPerInt);
4938   add(z, z, tmp);                 // z = z + k - j
4939   addi(z, z, 4);
4940   addic_(xstart, xstart, -1);     // i = xstart-1;
4941   blt(CCR0, L_last_x);
4942 
4943   sldi(tmp, xstart, LogBytesPerInt);
4944   ldx(x_xstart, x, tmp);
4945 #ifdef VM_LITTLE_ENDIAN
4946   rldicl(x_xstart, x_xstart, 32, 0);
4947 #endif
4948 
4949 
4950   Label L_third_loop_prologue;
4951 
4952   bind(L_third_loop_prologue);
4953 
4954   Register xsave = tmp11;
4955   Register xlensave = tmp12;
4956   Register ylensave = tmp13;
4957 
4958   mr(xsave, x);
4959   mr(xlensave, xstart);
4960   mr(ylensave, ylen);
4961 
4962 
4963   multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen,
4964                           carry, product_high, product, x, tmp);
4965 
4966   mr(z, zsave);
4967   mr(x, xsave);
4968   mr(xlen, xlensave);   // This is the decrement of the loop counter!
4969   mr(ylen, ylensave);
4970 
4971   addi(tmp3, xlen, 1);
4972   sldi(tmp, tmp3, LogBytesPerInt);
4973   stwx(carry, z, tmp);
4974   addic_(tmp3, tmp3, -1);
4975   blt(CCR0, L_done);
4976 
4977   srdi(carry, carry, 32);
4978   sldi(tmp, tmp3, LogBytesPerInt);
4979   stwx(carry, z, tmp);
4980   b(L_second_loop);
4981 
4982   // The following infrequently executed code has been moved outside the loops.
4983   bind(L_last_x);
4984 
4985   lwz(x_xstart, 0, x);
4986   b(L_third_loop_prologue);
4987 
4988   bind(L_done);
4989 }   // multiply_to_len
4990 
4991 void MacroAssembler::asm_assert(bool check_equal, const char *msg, int id) {
4992 #ifdef ASSERT
4993   Label ok;
4994   if (check_equal) {
4995     beq(CCR0, ok);
4996   } else {
4997     bne(CCR0, ok);
4998   }
4999   stop(msg, id);
5000   bind(ok);
5001 #endif
5002 }
5003 
5004 void MacroAssembler::asm_assert_mems_zero(bool check_equal, int size, int mem_offset,
5005                                           Register mem_base, const char* msg, int id) {
5006 #ifdef ASSERT
5007   switch (size) {
5008     case 4:
5009       lwz(R0, mem_offset, mem_base);
5010       cmpwi(CCR0, R0, 0);
5011       break;
5012     case 8:
5013       ld(R0, mem_offset, mem_base);
5014       cmpdi(CCR0, R0, 0);
5015       break;
5016     default:
5017       ShouldNotReachHere();
5018   }
5019   asm_assert(check_equal, msg, id);
5020 #endif // ASSERT
5021 }
5022 
5023 void MacroAssembler::verify_thread() {
5024   if (VerifyThread) {
5025     unimplemented("'VerifyThread' currently not implemented on PPC");
5026   }
5027 }
5028 
5029 // READ: oop. KILL: R0. May also clobber volatile float registers.
5030 void MacroAssembler::verify_oop(Register oop, const char* msg) {
5031   if (!VerifyOops) {
5032     return;
5033   }
5034 
5035   address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
5036   const Register tmp = R11; // Will be preserved.
5037   const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
5038   save_volatile_gprs(R1_SP, -nbytes_save); // except R0
5039 
5040   mr_if_needed(R4_ARG2, oop);
5041   save_LR_CR(tmp); // save in old frame
5042   push_frame_reg_args(nbytes_save, tmp);
5043   // load FunctionDescriptor** / entry_address *
5044   load_const_optimized(tmp, fd, R0);
5045   // load FunctionDescriptor* / entry_address
5046   ld(tmp, 0, tmp);
5047   load_const_optimized(R3_ARG1, (address)msg, R0);
5048   // Call destination for its side effect.
5049   call_c(tmp);
5050 
5051   pop_frame();
5052   restore_LR_CR(tmp);
5053   restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
5054 }
5055 
5056 void MacroAssembler::verify_oop_addr(RegisterOrConstant offs, Register base, const char* msg) {
5057   if (!VerifyOops) {
5058     return;
5059   }
5060 
5061   address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
5062   const Register tmp = R11; // Will be preserved.
5063   const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
5064   save_volatile_gprs(R1_SP, -nbytes_save); // except R0
5065 
5066   ld(R4_ARG2, offs, base);
5067   save_LR_CR(tmp); // save in old frame
5068   push_frame_reg_args(nbytes_save, tmp);
5069   // load FunctionDescriptor** / entry_address *
5070   load_const_optimized(tmp, fd, R0);
5071   // load FunctionDescriptor* / entry_address
5072   ld(tmp, 0, tmp);
5073   load_const_optimized(R3_ARG1, (address)msg, R0);
5074   // Call destination for its side effect.
5075   call_c(tmp);
5076 
5077   pop_frame();
5078   restore_LR_CR(tmp);
5079   restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
5080 }
5081 
5082 const char* stop_types[] = {
5083   "stop",
5084   "untested",
5085   "unimplemented",
5086   "shouldnotreachhere"
5087 };
5088 
5089 static void stop_on_request(int tp, const char* msg) {
5090   tty->print("PPC assembly code requires stop: (%s) %s\n", stop_types[tp%/*stop_end*/4], msg);
5091   guarantee(false, "PPC assembly code requires stop: %s", msg);
5092 }
5093 
5094 // Call a C-function that prints output.
5095 void MacroAssembler::stop(int type, const char* msg, int id) {
5096 #ifndef PRODUCT
5097   block_comment(err_msg("stop: %s %s {", stop_types[type%stop_end], msg));
5098 #else
5099   block_comment("stop {");
5100 #endif
5101 
5102   // setup arguments
5103   load_const_optimized(R3_ARG1, type);
5104   load_const_optimized(R4_ARG2, (void *)msg, /*tmp=*/R0);
5105   call_VM_leaf(CAST_FROM_FN_PTR(address, stop_on_request), R3_ARG1, R4_ARG2);
5106   illtrap();
5107   emit_int32(id);
5108   block_comment("} stop;");
5109 }
5110 
5111 #ifndef PRODUCT
5112 // Write pattern 0x0101010101010101 in memory region [low-before, high+after].
5113 // Val, addr are temp registers.
5114 // If low == addr, addr is killed.
5115 // High is preserved.
5116 void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) {
5117   if (!ZapMemory) return;
5118 
5119   assert_different_registers(low, val);
5120 
5121   BLOCK_COMMENT("zap memory region {");
5122   load_const_optimized(val, 0x0101010101010101);
5123   int size = before + after;
5124   if (low == high && size < 5 && size > 0) {
5125     int offset = -before*BytesPerWord;
5126     for (int i = 0; i < size; ++i) {
5127       std(val, offset, low);
5128       offset += (1*BytesPerWord);
5129     }
5130   } else {
5131     addi(addr, low, -before*BytesPerWord);
5132     assert_different_registers(high, val);
5133     if (after) addi(high, high, after * BytesPerWord);
5134     Label loop;
5135     bind(loop);
5136     std(val, 0, addr);
5137     addi(addr, addr, 8);
5138     cmpd(CCR6, addr, high);
5139     ble(CCR6, loop);
5140     if (after) addi(high, high, -after * BytesPerWord);  // Correct back to old value.
5141   }
5142   BLOCK_COMMENT("} zap memory region");
5143 }
5144 
5145 #endif // !PRODUCT
5146 
5147 void SkipIfEqualZero::skip_to_label_if_equal_zero(MacroAssembler* masm, Register temp,
5148                                                   const bool* flag_addr, Label& label) {
5149   int simm16_offset = masm->load_const_optimized(temp, (address)flag_addr, R0, true);
5150   assert(sizeof(bool) == 1, "PowerPC ABI");
5151   masm->lbz(temp, simm16_offset, temp);
5152   masm->cmpwi(CCR0, temp, 0);
5153   masm->beq(CCR0, label);
5154 }
5155 
5156 SkipIfEqualZero::SkipIfEqualZero(MacroAssembler* masm, Register temp, const bool* flag_addr) : _masm(masm), _label() {
5157   skip_to_label_if_equal_zero(masm, temp, flag_addr, _label);
5158 }
5159 
5160 SkipIfEqualZero::~SkipIfEqualZero() {
5161   _masm->bind(_label);
5162 }