1 /*
   2  * Copyright (c) 1997, 2017, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2012, 2017, SAP SE. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.inline.hpp"
  28 #include "compiler/disassembler.hpp"
  29 #include "gc/shared/cardTableModRefBS.hpp"
  30 #include "gc/shared/collectedHeap.inline.hpp"
  31 #include "interpreter/interpreter.hpp"
  32 #include "memory/resourceArea.hpp"
  33 #include "nativeInst_ppc.hpp"
  34 #include "prims/methodHandles.hpp"
  35 #include "runtime/biasedLocking.hpp"
  36 #include "runtime/icache.hpp"
  37 #include "runtime/interfaceSupport.hpp"
  38 #include "runtime/objectMonitor.hpp"
  39 #include "runtime/os.hpp"
  40 #include "runtime/safepoint.hpp"
  41 #include "runtime/safepointMechanism.hpp"
  42 #include "runtime/sharedRuntime.hpp"
  43 #include "runtime/stubRoutines.hpp"
  44 #include "utilities/macros.hpp"
  45 #if INCLUDE_ALL_GCS
  46 #include "gc/g1/g1CollectedHeap.inline.hpp"
  47 #include "gc/g1/g1SATBCardTableModRefBS.hpp"
  48 #include "gc/g1/heapRegion.hpp"
  49 #endif // INCLUDE_ALL_GCS
  50 #ifdef COMPILER2
  51 #include "opto/intrinsicnode.hpp"
  52 #endif
  53 
  54 #ifdef PRODUCT
  55 #define BLOCK_COMMENT(str) // nothing
  56 #else
  57 #define BLOCK_COMMENT(str) block_comment(str)
  58 #endif
  59 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  60 
  61 #ifdef ASSERT
  62 // On RISC, there's no benefit to verifying instruction boundaries.
  63 bool AbstractAssembler::pd_check_instruction_mark() { return false; }
  64 #endif
  65 
  66 void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) {
  67   assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range");
  68   if (Assembler::is_simm(si31, 16)) {
  69     ld(d, si31, a);
  70     if (emit_filler_nop) nop();
  71   } else {
  72     const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31);
  73     const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31);
  74     addis(d, a, hi);
  75     ld(d, lo, d);
  76   }
  77 }
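
// For illustration: the hi/lo split used above has to satisfy
// (hi << 16) + (short)lo == si31, since addis shifts its immediate left by
// 16 bits and the following ld sign-extends its 16-bit displacement. E.g.
// for si31 = 0x0001A345, lo = 0xA345 sign-extends to -0x5CBB, so hi must be
// 0x0002 (0x20000 - 0x5CBB == 0x1A345).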
  78 
  79 void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) {
  80   assert_different_registers(d, a);
  81   ld_largeoffset_unchecked(d, si31, a, emit_filler_nop);
  82 }
  83 
  84 void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base,
  85                                       size_t size_in_bytes, bool is_signed) {
  86   switch (size_in_bytes) {
  87   case  8:              ld(dst, offs, base);                         break;
  88   case  4:  is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break;
  89   case  2:  is_signed ? lha(dst, offs, base) : lhz(dst, offs, base); break;
  90   case  1:  lbz(dst, offs, base); if (is_signed) extsb(dst, dst);    break; // lba doesn't exist :(
  91   default:  ShouldNotReachHere();
  92   }
  93 }
  94 
  95 void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base,
  96                                        size_t size_in_bytes) {
  97   switch (size_in_bytes) {
  98   case  8:  std(dst, offs, base); break;
  99   case  4:  stw(dst, offs, base); break;
 100   case  2:  sth(dst, offs, base); break;
 101   case  1:  stb(dst, offs, base); break;
 102   default:  ShouldNotReachHere();
 103   }
 104 }
 105 
 106 void MacroAssembler::align(int modulus, int max, int rem) {
 107   int padding = (rem + modulus - (offset() % modulus)) % modulus;
 108   if (padding > max) return;
 109   for (int c = (padding >> 2); c > 0; --c) { nop(); }
 110 }
 111 
 112 // Issue instructions that calculate given TOC from global TOC.
 113 void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16,
 114                                                        bool add_relocation, bool emit_dummy_addr) {
 115   int offset = -1;
 116   if (emit_dummy_addr) {
 117     offset = -128; // dummy address
 118   } else if (addr != (address)(intptr_t)-1) {
 119     offset = MacroAssembler::offset_to_global_toc(addr);
 120   }
 121 
 122   if (hi16) {
 123     addis(dst, R29_TOC, MacroAssembler::largeoffset_si16_si16_hi(offset));
 124   }
 125   if (lo16) {
 126     if (add_relocation) {
 127       // Relocate at the addi to avoid confusion with a load from the method's TOC.
 128       relocate(internal_word_Relocation::spec(addr));
 129     }
 130     addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset));
 131   }
 132 }
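
// When both hi16 and lo16 are requested, the sequence emitted above is
//   addis dst, R29_TOC, offset_hi
//   addi  dst, dst,     offset_lo
// where offset is the distance of addr from the global TOC (or the dummy
// value -128, to be patched later).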
 133 
 134 address MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) {
 135   const int offset = MacroAssembler::offset_to_global_toc(addr);
 136 
 137   const address inst2_addr = a;
 138   const int inst2 = *(int *)inst2_addr;
 139 
 140   // The relocation points to the second instruction, the addi,
 141   // and the addi reads and writes the same register dst.
 142   const int dst = inv_rt_field(inst2);
 143   assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");
 144 
 145   // Now, find the preceding addis which writes to dst.
 146   int inst1 = 0;
 147   address inst1_addr = inst2_addr - BytesPerInstWord;
 148   while (inst1_addr >= bound) {
 149     inst1 = *(int *) inst1_addr;
 150     if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
 151       // Stop, found the addis which writes dst.
 152       break;
 153     }
 154     inst1_addr -= BytesPerInstWord;
 155   }
 156 
 157   assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
 158   set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset));
 159   set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset));
 160   return inst1_addr;
 161 }
 162 
 163 address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) {
 164   const address inst2_addr = a;
 165   const int inst2 = *(int *)inst2_addr;
 166 
 167   // The relocation points to the second instruction, the addi,
 168   // and the addi reads and writes the same register dst.
 169   const int dst = inv_rt_field(inst2);
 170   assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");
 171 
 172   // Now, find the preceding addis which writes to dst.
 173   int inst1 = 0;
 174   address inst1_addr = inst2_addr - BytesPerInstWord;
 175   while (inst1_addr >= bound) {
 176     inst1 = *(int *) inst1_addr;
 177     if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
 178       // stop, found the addis which writes dst
 179       break;
 180     }
 181     inst1_addr -= BytesPerInstWord;
 182   }
 183 
 184   assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
 185 
 186   int offset = (get_imm(inst1_addr, 0) << 16) + get_imm(inst2_addr, 0);
 187   // -1 is a special case
 188   if (offset == -1) {
 189     return (address)(intptr_t)-1;
 190   } else {
 191     return global_toc() + offset;
 192   }
 193 }
 194 
 195 #ifdef _LP64
 196 // Patch compressed oops or klass constants.
 197 // Assembler sequence is
 198 // 1) compressed oops:
 199 //    lis  rx = const.hi
 200 //    ori rx = rx | const.lo
 201 // 2) compressed klass:
 202 //    lis  rx = const.hi
 203 //    clrldi rx = rx & 0xFFFFffff // clearMS32b, optional
 204 //    ori rx = rx | const.lo
// The clrldi, if present, is skipped over; only the lis and the ori are patched.
 206 address MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) {
 207   assert(UseCompressedOops, "Should only patch compressed oops");
 208 
 209   const address inst2_addr = a;
 210   const int inst2 = *(int *)inst2_addr;
 211 
 212   // The relocation points to the second instruction, the ori,
 213   // and the ori reads and writes the same register dst.
 214   const int dst = inv_rta_field(inst2);
 215   assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
  // Now, find the preceding lis which writes to dst.
 217   int inst1 = 0;
 218   address inst1_addr = inst2_addr - BytesPerInstWord;
 219   bool inst1_found = false;
 220   while (inst1_addr >= bound) {
 221     inst1 = *(int *)inst1_addr;
 222     if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
 223     inst1_addr -= BytesPerInstWord;
 224   }
 225   assert(inst1_found, "inst is not lis");
 226 
 227   int xc = (data >> 16) & 0xffff;
 228   int xd = (data >>  0) & 0xffff;
 229 
 230   set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo
 231   set_imm((int *)inst2_addr,        (xd)); // unsigned int
 232   return inst1_addr;
 233 }
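
// For illustration: given a (hypothetical) narrowOop value 0x89ABCDEF, the
// code above patches 0x89AB into the immediate field of the lis and 0xCDEF
// into the immediate field of the ori; an optional clrldi in between is
// left untouched.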
 234 
 235 // Get compressed oop or klass constant.
 236 narrowOop MacroAssembler::get_narrow_oop(address a, address bound) {
 237   assert(UseCompressedOops, "Should only patch compressed oops");
 238 
 239   const address inst2_addr = a;
 240   const int inst2 = *(int *)inst2_addr;
 241 
 242   // The relocation points to the second instruction, the ori,
 243   // and the ori reads and writes the same register dst.
 244   const int dst = inv_rta_field(inst2);
 245   assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
 246   // Now, find the preceding lis which writes to dst.
 247   int inst1 = 0;
 248   address inst1_addr = inst2_addr - BytesPerInstWord;
 249   bool inst1_found = false;
 250 
 251   while (inst1_addr >= bound) {
 252     inst1 = *(int *) inst1_addr;
 253     if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break;}
 254     inst1_addr -= BytesPerInstWord;
 255   }
 256   assert(inst1_found, "inst is not lis");
 257 
 258   uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff));
 259   uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16);
 260 
 261   return (int) (xl | xh);
 262 }
 263 #endif // _LP64
 264 
 265 // Returns true if successful.
 266 bool MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a,
 267                                                 Register toc, bool fixed_size) {
 268   int toc_offset = 0;
 269   // Use RelocationHolder::none for the constant pool entry, otherwise
 270   // we will end up with a failing NativeCall::verify(x) where x is
 271   // the address of the constant pool entry.
 272   // FIXME: We should insert relocation information for oops at the constant
 273   // pool entries instead of inserting it at the loads; patching of a constant
 274   // pool entry should be less expensive.
 275   address const_address = address_constant((address)a.value(), RelocationHolder::none);
 276   if (const_address == NULL) { return false; } // allocation failure
 277   // Relocate at the pc of the load.
 278   relocate(a.rspec());
 279   toc_offset = (int)(const_address - code()->consts()->start());
 280   ld_largeoffset_unchecked(dst, toc_offset, toc, fixed_size);
 281   return true;
 282 }
 283 
 284 bool MacroAssembler::is_load_const_from_method_toc_at(address a) {
 285   const address inst1_addr = a;
 286   const int inst1 = *(int *)inst1_addr;
 287 
  // The relocation points to the ld or the addis.
  return (is_ld(inst1)) ||
         (is_addis(inst1) && inv_ra_field(inst1) != 0);
 291 }
 292 
 293 int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) {
 294   assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc");
 295 
 296   const address inst1_addr = a;
 297   const int inst1 = *(int *)inst1_addr;
 298 
 299   if (is_ld(inst1)) {
 300     return inv_d1_field(inst1);
 301   } else if (is_addis(inst1)) {
 302     const int dst = inv_rt_field(inst1);
 303 
 304     // Now, find the succeeding ld which reads and writes to dst.
 305     address inst2_addr = inst1_addr + BytesPerInstWord;
 306     int inst2 = 0;
 307     while (true) {
 308       inst2 = *(int *) inst2_addr;
 309       if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) {
 310         // Stop, found the ld which reads and writes dst.
 311         break;
 312       }
 313       inst2_addr += BytesPerInstWord;
 314     }
 315     return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2);
 316   }
 317   ShouldNotReachHere();
 318   return 0;
 319 }
 320 
 321 // Get the constant from a `load_const' sequence.
 322 long MacroAssembler::get_const(address a) {
 323   assert(is_load_const_at(a), "not a load of a constant");
 324   const int *p = (const int*) a;
 325   unsigned long x = (((unsigned long) (get_imm(a,0) & 0xffff)) << 48);
 326   if (is_ori(*(p+1))) {
 327     x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 32);
 328     x |= (((unsigned long) (get_imm(a,3) & 0xffff)) << 16);
 329     x |= (((unsigned long) (get_imm(a,4) & 0xffff)));
 330   } else if (is_lis(*(p+1))) {
 331     x |= (((unsigned long) (get_imm(a,2) & 0xffff)) << 32);
 332     x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 16);
 333     x |= (((unsigned long) (get_imm(a,3) & 0xffff)));
 334   } else {
 335     ShouldNotReachHere();
 336     return (long) 0;
 337   }
 338   return (long) x;
 339 }
 340 
// Patch the 64 bit constant of a `load_const' sequence. This is a low-level
// procedure. It neither flushes the instruction cache nor is it MT-safe.
 344 void MacroAssembler::patch_const(address a, long x) {
 345   assert(is_load_const_at(a), "not a load of a constant");
 346   int *p = (int*) a;
 347   if (is_ori(*(p+1))) {
 348     set_imm(0 + p, (x >> 48) & 0xffff);
 349     set_imm(1 + p, (x >> 32) & 0xffff);
 350     set_imm(3 + p, (x >> 16) & 0xffff);
 351     set_imm(4 + p, x & 0xffff);
 352   } else if (is_lis(*(p+1))) {
 353     set_imm(0 + p, (x >> 48) & 0xffff);
 354     set_imm(2 + p, (x >> 32) & 0xffff);
 355     set_imm(1 + p, (x >> 16) & 0xffff);
 356     set_imm(3 + p, x & 0xffff);
 357   } else {
 358     ShouldNotReachHere();
 359   }
 360 }
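
// As can be read off get_const() and patch_const() above, the two
// load_const layouts keep their four 16-bit immediates in these instruction
// slots (slot 2 of the first form is the shift and carries no immediate):
//   lis/ori form (one register) : slots 0, 1, 3, 4 hold bits 63-48, 47-32, 31-16, 15-0
//   lis/lis form (two registers): slots 0, 2, 1, 3 hold bits 63-48, 47-32, 31-16, 15-0
// E.g. patching x = 0x1122334455667788 writes 0x1122, 0x3344, 0x5566, 0x7788
// into those slots, in that bit order.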
 361 
 362 AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) {
 363   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
 364   int index = oop_recorder()->allocate_metadata_index(obj);
 365   RelocationHolder rspec = metadata_Relocation::spec(index);
 366   return AddressLiteral((address)obj, rspec);
 367 }
 368 
 369 AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) {
 370   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
 371   int index = oop_recorder()->find_index(obj);
 372   RelocationHolder rspec = metadata_Relocation::spec(index);
 373   return AddressLiteral((address)obj, rspec);
 374 }
 375 
 376 AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) {
 377   assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
 378   int oop_index = oop_recorder()->allocate_oop_index(obj);
 379   return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
 380 }
 381 
 382 AddressLiteral MacroAssembler::constant_oop_address(jobject obj) {
 383   assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
 384   int oop_index = oop_recorder()->find_index(obj);
 385   return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
 386 }
 387 
 388 RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
 389                                                       Register tmp, int offset) {
 390   intptr_t value = *delayed_value_addr;
 391   if (value != 0) {
 392     return RegisterOrConstant(value + offset);
 393   }
 394 
 395   // Load indirectly to solve generation ordering problem.
 396   // static address, no relocation
 397   int simm16_offset = load_const_optimized(tmp, delayed_value_addr, noreg, true);
 398   ld(tmp, simm16_offset, tmp); // must be aligned ((xa & 3) == 0)
 399 
 400   if (offset != 0) {
 401     addi(tmp, tmp, offset);
 402   }
 403 
 404   return RegisterOrConstant(tmp);
 405 }
 406 
 407 #ifndef PRODUCT
 408 void MacroAssembler::pd_print_patched_instruction(address branch) {
 409   Unimplemented(); // TODO: PPC port
 410 }
 411 #endif // ndef PRODUCT
 412 
 413 // Conditional far branch for destinations encodable in 24+2 bits.
 414 void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) {
 415 
 416   // If requested by flag optimize, relocate the bc_far as a
 417   // runtime_call and prepare for optimizing it when the code gets
 418   // relocated.
 419   if (optimize == bc_far_optimize_on_relocate) {
 420     relocate(relocInfo::runtime_call_type);
 421   }
 422 
 423   // variant 2:
 424   //
 425   //    b!cxx SKIP
 426   //    bxx   DEST
 427   //  SKIP:
 428   //
 429 
 430   const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
 431                                                 opposite_bcond(inv_boint_bcond(boint)));
 432 
 433   // We emit two branches.
 434   // First, a conditional branch which jumps around the far branch.
 435   const address not_taken_pc = pc() + 2 * BytesPerInstWord;
 436   const address bc_pc        = pc();
 437   bc(opposite_boint, biint, not_taken_pc);
 438 
 439   const int bc_instr = *(int*)bc_pc;
 440   assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition");
 441   assert(opposite_boint == inv_bo_field(bc_instr), "postcondition");
 442   assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))),
 443                                      opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))),
 444          "postcondition");
 445   assert(biint == inv_bi_field(bc_instr), "postcondition");
 446 
 447   // Second, an unconditional far branch which jumps to dest.
 448   // Note: target(dest) remembers the current pc (see CodeSection::target)
 449   //       and returns the current pc if the label is not bound yet; when
 450   //       the label gets bound, the unconditional far branch will be patched.
 451   const address target_pc = target(dest);
 452   const address b_pc  = pc();
 453   b(target_pc);
 454 
 455   assert(not_taken_pc == pc(),                     "postcondition");
 456   assert(dest.is_bound() || target_pc == b_pc, "postcondition");
 457 }
 458 
 459 // 1 or 2 instructions
 460 void MacroAssembler::bc_far_optimized(int boint, int biint, Label& dest) {
 461   if (dest.is_bound() && is_within_range_of_bcxx(target(dest), pc())) {
 462     bc(boint, biint, dest);
 463   } else {
 464     bc_far(boint, biint, dest, MacroAssembler::bc_far_optimize_on_relocate);
 465   }
 466 }
 467 
 468 bool MacroAssembler::is_bc_far_at(address instruction_addr) {
 469   return is_bc_far_variant1_at(instruction_addr) ||
 470          is_bc_far_variant2_at(instruction_addr) ||
 471          is_bc_far_variant3_at(instruction_addr);
 472 }
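
// The three bc_far code shapes handled here (see set_dest_of_bc_far_at
// below) are:
//   variant 1:   bcxx  DEST     // conditional branch directly to DEST
//                nop
//   variant 2:   b!cxx SKIP     // inverted condition jumps around ...
//                bxx   DEST     // ... an unconditional far branch
//              SKIP:
//   variant 3:   nop            // branch to the next instruction,
//                endgroup       // patched away entirely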
 473 
 474 address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) {
 475   if (is_bc_far_variant1_at(instruction_addr)) {
 476     const address instruction_1_addr = instruction_addr;
 477     const int instruction_1 = *(int*)instruction_1_addr;
 478     return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr);
 479   } else if (is_bc_far_variant2_at(instruction_addr)) {
 480     const address instruction_2_addr = instruction_addr + 4;
 481     return bxx_destination(instruction_2_addr);
 482   } else if (is_bc_far_variant3_at(instruction_addr)) {
 483     return instruction_addr + 8;
 484   }
 485   // variant 4 ???
 486   ShouldNotReachHere();
 487   return NULL;
}

void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) {
 491   if (is_bc_far_variant3_at(instruction_addr)) {
 492     // variant 3, far cond branch to the next instruction, already patched to nops:
 493     //
 494     //    nop
 495     //    endgroup
 496     //  SKIP/DEST:
 497     //
 498     return;
 499   }
 500 
 501   // first, extract boint and biint from the current branch
 502   int boint = 0;
 503   int biint = 0;
 504 
 505   ResourceMark rm;
 506   const int code_size = 2 * BytesPerInstWord;
 507   CodeBuffer buf(instruction_addr, code_size);
 508   MacroAssembler masm(&buf);
 509   if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) {
 510     // Far branch to next instruction: Optimize it by patching nops (produce variant 3).
 511     masm.nop();
 512     masm.endgroup();
 513   } else {
 514     if (is_bc_far_variant1_at(instruction_addr)) {
 515       // variant 1, the 1st instruction contains the destination address:
 516       //
 517       //    bcxx  DEST
 518       //    nop
 519       //
 520       const int instruction_1 = *(int*)(instruction_addr);
 521       boint = inv_bo_field(instruction_1);
 522       biint = inv_bi_field(instruction_1);
 523     } else if (is_bc_far_variant2_at(instruction_addr)) {
 524       // variant 2, the 2nd instruction contains the destination address:
 525       //
 526       //    b!cxx SKIP
 527       //    bxx   DEST
 528       //  SKIP:
 529       //
 530       const int instruction_1 = *(int*)(instruction_addr);
 531       boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))),
 532           opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1))));
 533       biint = inv_bi_field(instruction_1);
 534     } else {
 535       // variant 4???
 536       ShouldNotReachHere();
 537     }
 538 
 539     // second, set the new branch destination and optimize the code
 540     if (dest != instruction_addr + 4 && // the bc_far is still unbound!
 541         masm.is_within_range_of_bcxx(dest, instruction_addr)) {
 542       // variant 1:
 543       //
 544       //    bcxx  DEST
 545       //    nop
 546       //
 547       masm.bc(boint, biint, dest);
 548       masm.nop();
 549     } else {
 550       // variant 2:
 551       //
 552       //    b!cxx SKIP
 553       //    bxx   DEST
 554       //  SKIP:
 555       //
 556       const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
 557                                                     opposite_bcond(inv_boint_bcond(boint)));
 558       const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord;
 559       masm.bc(opposite_boint, biint, not_taken_pc);
 560       masm.b(dest);
 561     }
 562   }
 563   ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
 564 }
 565 
// Emit a NOT MT-safe patchable 64 bit absolute call/jump.
 567 void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) {
 568   // get current pc
 569   uint64_t start_pc = (uint64_t) pc();
 570 
 571   const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last
 572   const address pc_of_b  = (address) (start_pc + (0*BytesPerInstWord)); // b is first
 573 
 574   // relocate here
 575   if (rt != relocInfo::none) {
 576     relocate(rt);
 577   }
 578 
 579   if ( ReoptimizeCallSequences &&
 580        (( link && is_within_range_of_b(dest, pc_of_bl)) ||
 581         (!link && is_within_range_of_b(dest, pc_of_b)))) {
 582     // variant 2:
 583     // Emit an optimized, pc-relative call/jump.
 584 
 585     if (link) {
 586       // some padding
 587       nop();
 588       nop();
 589       nop();
 590       nop();
 591       nop();
 592       nop();
 593 
 594       // do the call
 595       assert(pc() == pc_of_bl, "just checking");
 596       bl(dest, relocInfo::none);
 597     } else {
 598       // do the jump
 599       assert(pc() == pc_of_b, "just checking");
 600       b(dest, relocInfo::none);
 601 
 602       // some padding
 603       nop();
 604       nop();
 605       nop();
 606       nop();
 607       nop();
 608       nop();
 609     }
 610 
 611     // Assert that we can identify the emitted call/jump.
 612     assert(is_bxx64_patchable_variant2_at((address)start_pc, link),
 613            "can't identify emitted call");
 614   } else {
 615     // variant 1:
 616     mr(R0, R11);  // spill R11 -> R0.
 617 
 618     // Load the destination address into CTR,
 619     // calculate destination relative to global toc.
 620     calculate_address_from_global_toc(R11, dest, true, true, false);
 621 
 622     mtctr(R11);
 623     mr(R11, R0);  // spill R11 <- R0.
 624     nop();
 625 
 626     // do the call/jump
 627     if (link) {
 628       bctrl();
    } else {
 630       bctr();
 631     }
 632     // Assert that we can identify the emitted call/jump.
 633     assert(is_bxx64_patchable_variant1b_at((address)start_pc, link),
 634            "can't identify emitted call");
 635   }
 636 
 637   // Assert that we can identify the emitted call/jump.
 638   assert(is_bxx64_patchable_at((address)start_pc, link),
 639          "can't identify emitted call");
 640   assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest,
 641          "wrong encoding of dest address");
 642 }
 643 
 644 // Identify a bxx64_patchable instruction.
 645 bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) {
  return is_bxx64_patchable_variant1b_at(instruction_addr, link)
      //|| is_bxx64_patchable_variant1_at(instruction_addr, link)
      || is_bxx64_patchable_variant2_at(instruction_addr, link);
 649 }
 650 
 651 // Does the call64_patchable instruction use a pc-relative encoding of
 652 // the call destination?
 653 bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) {
 654   // variant 2 is pc-relative
 655   return is_bxx64_patchable_variant2_at(instruction_addr, link);
 656 }
 657 
 658 // Identify variant 1.
 659 bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) {
 660   unsigned int* instr = (unsigned int*) instruction_addr;
  return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
    && is_mtctr(instr[5]) // mtctr
    && is_load_const_at(instruction_addr);
 664 }
 665 
 666 // Identify variant 1b: load destination relative to global toc.
 667 bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) {
 668   unsigned int* instr = (unsigned int*) instruction_addr;
 669   return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
 670     && is_mtctr(instr[3]) // mtctr
 671     && is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr);
 672 }
 673 
 674 // Identify variant 2.
 675 bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) {
 676   unsigned int* instr = (unsigned int*) instruction_addr;
 677   if (link) {
 678     return is_bl (instr[6])  // bl dest is last
 679       && is_nop(instr[0])  // nop
 680       && is_nop(instr[1])  // nop
 681       && is_nop(instr[2])  // nop
 682       && is_nop(instr[3])  // nop
 683       && is_nop(instr[4])  // nop
 684       && is_nop(instr[5]); // nop
 685   } else {
 686     return is_b  (instr[0])  // b  dest is first
 687       && is_nop(instr[1])  // nop
 688       && is_nop(instr[2])  // nop
 689       && is_nop(instr[3])  // nop
 690       && is_nop(instr[4])  // nop
 691       && is_nop(instr[5])  // nop
 692       && is_nop(instr[6]); // nop
 693   }
 694 }
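
// For reference, the instruction layouts matched above (word indices in
// brackets):
//   variant 1b:          [0] mr R0,R11  [1] addis  [2] addi (global-TOC-relative
//                        address)  [3] mtctr  [4] mr R11,R0  [5] nop  [6] bctr/bctrl
//   variant 2 (link):    [0..5] nop  [6] bl DEST
//   variant 2 (no link): [0] b DEST  [1..6] nop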
 695 
 696 // Set dest address of a bxx64_patchable instruction.
 697 void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) {
 698   ResourceMark rm;
 699   int code_size = MacroAssembler::bxx64_patchable_size;
 700   CodeBuffer buf(instruction_addr, code_size);
 701   MacroAssembler masm(&buf);
 702   masm.bxx64_patchable(dest, relocInfo::none, link);
 703   ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
 704 }
 705 
 706 // Get dest address of a bxx64_patchable instruction.
 707 address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) {
 708   if (is_bxx64_patchable_variant1_at(instruction_addr, link)) {
 709     return (address) (unsigned long) get_const(instruction_addr);
 710   } else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) {
 711     unsigned int* instr = (unsigned int*) instruction_addr;
 712     if (link) {
 713       const int instr_idx = 6; // bl is last
 714       int branchoffset = branch_destination(instr[instr_idx], 0);
 715       return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
 716     } else {
 717       const int instr_idx = 0; // b is first
 718       int branchoffset = branch_destination(instr[instr_idx], 0);
 719       return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
 720     }
 721   // Load dest relative to global toc.
 722   } else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) {
 723     return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord,
 724                                                                instruction_addr);
 725   } else {
 726     ShouldNotReachHere();
 727     return NULL;
 728   }
 729 }
 730 
 731 // Uses ordering which corresponds to ABI:
 732 //    _savegpr0_14:  std  r14,-144(r1)
 733 //    _savegpr0_15:  std  r15,-136(r1)
 734 //    _savegpr0_16:  std  r16,-128(r1)
 735 void MacroAssembler::save_nonvolatile_gprs(Register dst, int offset) {
 736   std(R14, offset, dst);   offset += 8;
 737   std(R15, offset, dst);   offset += 8;
 738   std(R16, offset, dst);   offset += 8;
 739   std(R17, offset, dst);   offset += 8;
 740   std(R18, offset, dst);   offset += 8;
 741   std(R19, offset, dst);   offset += 8;
 742   std(R20, offset, dst);   offset += 8;
 743   std(R21, offset, dst);   offset += 8;
 744   std(R22, offset, dst);   offset += 8;
 745   std(R23, offset, dst);   offset += 8;
 746   std(R24, offset, dst);   offset += 8;
 747   std(R25, offset, dst);   offset += 8;
 748   std(R26, offset, dst);   offset += 8;
 749   std(R27, offset, dst);   offset += 8;
 750   std(R28, offset, dst);   offset += 8;
 751   std(R29, offset, dst);   offset += 8;
 752   std(R30, offset, dst);   offset += 8;
 753   std(R31, offset, dst);   offset += 8;
 754 
 755   stfd(F14, offset, dst);   offset += 8;
 756   stfd(F15, offset, dst);   offset += 8;
 757   stfd(F16, offset, dst);   offset += 8;
 758   stfd(F17, offset, dst);   offset += 8;
 759   stfd(F18, offset, dst);   offset += 8;
 760   stfd(F19, offset, dst);   offset += 8;
 761   stfd(F20, offset, dst);   offset += 8;
 762   stfd(F21, offset, dst);   offset += 8;
 763   stfd(F22, offset, dst);   offset += 8;
 764   stfd(F23, offset, dst);   offset += 8;
 765   stfd(F24, offset, dst);   offset += 8;
 766   stfd(F25, offset, dst);   offset += 8;
 767   stfd(F26, offset, dst);   offset += 8;
 768   stfd(F27, offset, dst);   offset += 8;
 769   stfd(F28, offset, dst);   offset += 8;
 770   stfd(F29, offset, dst);   offset += 8;
 771   stfd(F30, offset, dst);   offset += 8;
 772   stfd(F31, offset, dst);
 773 }
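
// The code above stores 18 GPRs (R14-R31) followed by 18 FPRs (F14-F31),
// i.e. 36 slots of 8 bytes = 288 bytes starting at `offset' in dst;
// restore_nonvolatile_gprs() below reads them back in the same order.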
 774 
 775 // Uses ordering which corresponds to ABI:
 776 //    _restgpr0_14:  ld   r14,-144(r1)
 777 //    _restgpr0_15:  ld   r15,-136(r1)
 778 //    _restgpr0_16:  ld   r16,-128(r1)
 779 void MacroAssembler::restore_nonvolatile_gprs(Register src, int offset) {
 780   ld(R14, offset, src);   offset += 8;
 781   ld(R15, offset, src);   offset += 8;
 782   ld(R16, offset, src);   offset += 8;
 783   ld(R17, offset, src);   offset += 8;
 784   ld(R18, offset, src);   offset += 8;
 785   ld(R19, offset, src);   offset += 8;
 786   ld(R20, offset, src);   offset += 8;
 787   ld(R21, offset, src);   offset += 8;
 788   ld(R22, offset, src);   offset += 8;
 789   ld(R23, offset, src);   offset += 8;
 790   ld(R24, offset, src);   offset += 8;
 791   ld(R25, offset, src);   offset += 8;
 792   ld(R26, offset, src);   offset += 8;
 793   ld(R27, offset, src);   offset += 8;
 794   ld(R28, offset, src);   offset += 8;
 795   ld(R29, offset, src);   offset += 8;
 796   ld(R30, offset, src);   offset += 8;
 797   ld(R31, offset, src);   offset += 8;
 798 
 799   // FP registers
 800   lfd(F14, offset, src);   offset += 8;
 801   lfd(F15, offset, src);   offset += 8;
 802   lfd(F16, offset, src);   offset += 8;
 803   lfd(F17, offset, src);   offset += 8;
 804   lfd(F18, offset, src);   offset += 8;
 805   lfd(F19, offset, src);   offset += 8;
 806   lfd(F20, offset, src);   offset += 8;
 807   lfd(F21, offset, src);   offset += 8;
 808   lfd(F22, offset, src);   offset += 8;
 809   lfd(F23, offset, src);   offset += 8;
 810   lfd(F24, offset, src);   offset += 8;
 811   lfd(F25, offset, src);   offset += 8;
 812   lfd(F26, offset, src);   offset += 8;
 813   lfd(F27, offset, src);   offset += 8;
 814   lfd(F28, offset, src);   offset += 8;
 815   lfd(F29, offset, src);   offset += 8;
 816   lfd(F30, offset, src);   offset += 8;
 817   lfd(F31, offset, src);
 818 }
 819 
 820 // For verify_oops.
 821 void MacroAssembler::save_volatile_gprs(Register dst, int offset) {
 822   std(R2,  offset, dst);   offset += 8;
 823   std(R3,  offset, dst);   offset += 8;
 824   std(R4,  offset, dst);   offset += 8;
 825   std(R5,  offset, dst);   offset += 8;
 826   std(R6,  offset, dst);   offset += 8;
 827   std(R7,  offset, dst);   offset += 8;
 828   std(R8,  offset, dst);   offset += 8;
 829   std(R9,  offset, dst);   offset += 8;
 830   std(R10, offset, dst);   offset += 8;
 831   std(R11, offset, dst);   offset += 8;
 832   std(R12, offset, dst);   offset += 8;
 833 
 834   stfd(F0, offset, dst);   offset += 8;
 835   stfd(F1, offset, dst);   offset += 8;
 836   stfd(F2, offset, dst);   offset += 8;
 837   stfd(F3, offset, dst);   offset += 8;
 838   stfd(F4, offset, dst);   offset += 8;
 839   stfd(F5, offset, dst);   offset += 8;
 840   stfd(F6, offset, dst);   offset += 8;
 841   stfd(F7, offset, dst);   offset += 8;
 842   stfd(F8, offset, dst);   offset += 8;
 843   stfd(F9, offset, dst);   offset += 8;
 844   stfd(F10, offset, dst);  offset += 8;
 845   stfd(F11, offset, dst);  offset += 8;
 846   stfd(F12, offset, dst);  offset += 8;
 847   stfd(F13, offset, dst);
 848 }
 849 
 850 // For verify_oops.
 851 void MacroAssembler::restore_volatile_gprs(Register src, int offset) {
 852   ld(R2,  offset, src);   offset += 8;
 853   ld(R3,  offset, src);   offset += 8;
 854   ld(R4,  offset, src);   offset += 8;
 855   ld(R5,  offset, src);   offset += 8;
 856   ld(R6,  offset, src);   offset += 8;
 857   ld(R7,  offset, src);   offset += 8;
 858   ld(R8,  offset, src);   offset += 8;
 859   ld(R9,  offset, src);   offset += 8;
 860   ld(R10, offset, src);   offset += 8;
 861   ld(R11, offset, src);   offset += 8;
 862   ld(R12, offset, src);   offset += 8;
 863 
 864   lfd(F0, offset, src);   offset += 8;
 865   lfd(F1, offset, src);   offset += 8;
 866   lfd(F2, offset, src);   offset += 8;
 867   lfd(F3, offset, src);   offset += 8;
 868   lfd(F4, offset, src);   offset += 8;
 869   lfd(F5, offset, src);   offset += 8;
 870   lfd(F6, offset, src);   offset += 8;
 871   lfd(F7, offset, src);   offset += 8;
 872   lfd(F8, offset, src);   offset += 8;
 873   lfd(F9, offset, src);   offset += 8;
 874   lfd(F10, offset, src);  offset += 8;
 875   lfd(F11, offset, src);  offset += 8;
 876   lfd(F12, offset, src);  offset += 8;
 877   lfd(F13, offset, src);
 878 }
 879 
 880 void MacroAssembler::save_LR_CR(Register tmp) {
 881   mfcr(tmp);
 882   std(tmp, _abi(cr), R1_SP);
 883   mflr(tmp);
 884   std(tmp, _abi(lr), R1_SP);
 885   // Tmp must contain lr on exit! (see return_addr and prolog in ppc64.ad)
 886 }
 887 
 888 void MacroAssembler::restore_LR_CR(Register tmp) {
 889   assert(tmp != R1_SP, "must be distinct");
 890   ld(tmp, _abi(lr), R1_SP);
 891   mtlr(tmp);
 892   ld(tmp, _abi(cr), R1_SP);
 893   mtcr(tmp);
 894 }
 895 
 896 address MacroAssembler::get_PC_trash_LR(Register result) {
 897   Label L;
 898   bl(L);
 899   bind(L);
 900   address lr_pc = pc();
 901   mflr(result);
 902   return lr_pc;
 903 }
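
// Note: the bl to the immediately following label deposits the address of
// the next instruction in LR; mflr then copies that PC into `result'.
// LR is clobbered in the process, hence the name.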
 904 
 905 void MacroAssembler::resize_frame(Register offset, Register tmp) {
 906 #ifdef ASSERT
 907   assert_different_registers(offset, tmp, R1_SP);
 908   andi_(tmp, offset, frame::alignment_in_bytes-1);
 909   asm_assert_eq("resize_frame: unaligned", 0x204);
 910 #endif
 911 
 912   // tmp <- *(SP)
 913   ld(tmp, _abi(callers_sp), R1_SP);
 914   // addr <- SP + offset;
 915   // *(addr) <- tmp;
 916   // SP <- addr
 917   stdux(tmp, R1_SP, offset);
 918 }
 919 
 920 void MacroAssembler::resize_frame(int offset, Register tmp) {
 921   assert(is_simm(offset, 16), "too big an offset");
 922   assert_different_registers(tmp, R1_SP);
 923   assert((offset & (frame::alignment_in_bytes-1))==0, "resize_frame: unaligned");
 924   // tmp <- *(SP)
 925   ld(tmp, _abi(callers_sp), R1_SP);
 926   // addr <- SP + offset;
 927   // *(addr) <- tmp;
 928   // SP <- addr
 929   stdu(tmp, offset, R1_SP);
 930 }
 931 
 932 void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) {
 933   // (addr == tmp1) || (addr == tmp2) is allowed here!
 934   assert(tmp1 != tmp2, "must be distinct");
 935 
 936   // compute offset w.r.t. current stack pointer
 937   // tmp_1 <- addr - SP (!)
 938   subf(tmp1, R1_SP, addr);
 939 
 940   // atomically update SP keeping back link.
 941   resize_frame(tmp1/* offset */, tmp2/* tmp */);
 942 }
 943 
 944 void MacroAssembler::push_frame(Register bytes, Register tmp) {
 945 #ifdef ASSERT
 946   assert(bytes != R0, "r0 not allowed here");
 947   andi_(R0, bytes, frame::alignment_in_bytes-1);
 948   asm_assert_eq("push_frame(Reg, Reg): unaligned", 0x203);
 949 #endif
 950   neg(tmp, bytes);
 951   stdux(R1_SP, R1_SP, tmp);
 952 }
 953 
 954 // Push a frame of size `bytes'.
 955 void MacroAssembler::push_frame(unsigned int bytes, Register tmp) {
 956   long offset = align_addr(bytes, frame::alignment_in_bytes);
 957   if (is_simm(-offset, 16)) {
 958     stdu(R1_SP, -offset, R1_SP);
 959   } else {
 960     load_const_optimized(tmp, -offset);
 961     stdux(R1_SP, R1_SP, tmp);
 962   }
 963 }
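
// In both cases a single stdu/stdux does the push atomically: it stores the
// old SP (the back link) at the new top of stack and updates R1_SP to point
// there, as the PPC64 ABI requires.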
 964 
 965 // Push a frame of size `bytes' plus abi_reg_args on top.
 966 void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) {
 967   push_frame(bytes + frame::abi_reg_args_size, tmp);
 968 }
 969 
// Set up a new C frame with a spill area for non-volatile GPRs and
 971 // additional space for local variables.
 972 void MacroAssembler::push_frame_reg_args_nonvolatiles(unsigned int bytes,
 973                                                       Register tmp) {
 974   push_frame(bytes + frame::abi_reg_args_size + frame::spill_nonvolatiles_size, tmp);
 975 }
 976 
 977 // Pop current C frame.
 978 void MacroAssembler::pop_frame() {
 979   ld(R1_SP, _abi(callers_sp), R1_SP);
 980 }
 981 
 982 #if defined(ABI_ELFv2)
 983 address MacroAssembler::branch_to(Register r_function_entry, bool and_link) {
 984   // TODO(asmundak): make sure the caller uses R12 as function descriptor
  // most of the time.
 986   if (R12 != r_function_entry) {
 987     mr(R12, r_function_entry);
 988   }
 989   mtctr(R12);
 990   // Do a call or a branch.
 991   if (and_link) {
 992     bctrl();
 993   } else {
 994     bctr();
 995   }
 996   _last_calls_return_pc = pc();
 997 
 998   return _last_calls_return_pc;
 999 }
1000 
1001 // Call a C function via a function descriptor and use full C
1002 // calling conventions. Updates and returns _last_calls_return_pc.
1003 address MacroAssembler::call_c(Register r_function_entry) {
1004   return branch_to(r_function_entry, /*and_link=*/true);
1005 }
1006 
1007 // For tail calls: only branch, don't link, so callee returns to caller of this function.
1008 address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) {
1009   return branch_to(r_function_entry, /*and_link=*/false);
1010 }
1011 
1012 address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) {
1013   load_const(R12, function_entry, R0);
1014   return branch_to(R12,  /*and_link=*/true);
1015 }
1016 
1017 #else
1018 // Generic version of a call to C function via a function descriptor
1019 // with variable support for C calling conventions (TOC, ENV, etc.).
1020 // Updates and returns _last_calls_return_pc.
1021 address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call,
1022                                   bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) {
1023   // we emit standard ptrgl glue code here
1024   assert((function_descriptor != R0), "function_descriptor cannot be R0");
1025 
1026   // retrieve necessary entries from the function descriptor
1027   ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor);
1028   mtctr(R0);
1029 
1030   if (load_toc_of_callee) {
1031     ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor);
1032   }
1033   if (load_env_of_callee) {
1034     ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor);
1035   } else if (load_toc_of_callee) {
1036     li(R11, 0);
1037   }
1038 
1039   // do a call or a branch
1040   if (and_link) {
1041     bctrl();
1042   } else {
1043     bctr();
1044   }
1045   _last_calls_return_pc = pc();
1046 
1047   return _last_calls_return_pc;
1048 }
1049 
1050 // Call a C function via a function descriptor and use full C calling
1051 // conventions.
1052 // We don't use the TOC in generated code, so there is no need to save
1053 // and restore its value.
1054 address MacroAssembler::call_c(Register fd) {
1055   return branch_to(fd, /*and_link=*/true,
1056                        /*save toc=*/false,
1057                        /*restore toc=*/false,
1058                        /*load toc=*/true,
1059                        /*load env=*/true);
1060 }
1061 
1062 address MacroAssembler::call_c_and_return_to_caller(Register fd) {
1063   return branch_to(fd, /*and_link=*/false,
1064                        /*save toc=*/false,
1065                        /*restore toc=*/false,
1066                        /*load toc=*/true,
1067                        /*load env=*/true);
1068 }
1069 
1070 address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) {
1071   if (rt != relocInfo::none) {
1072     // this call needs to be relocatable
1073     if (!ReoptimizeCallSequences
1074         || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
1075         || fd == NULL   // support code-size estimation
1076         || !fd->is_friend_function()
1077         || fd->entry() == NULL) {
1078       // it's not a friend function as defined by class FunctionDescriptor,
1079       // so do a full call-c here.
1080       load_const(R11, (address)fd, R0);
1081 
1082       bool has_env = (fd != NULL && fd->env() != NULL);
1083       return branch_to(R11, /*and_link=*/true,
1084                             /*save toc=*/false,
1085                             /*restore toc=*/false,
1086                             /*load toc=*/true,
1087                             /*load env=*/has_env);
1088     } else {
1089       // It's a friend function. Load the entry point and don't care about
1090       // toc and env. Use an optimizable call instruction, but ensure the
1091       // same code-size as in the case of a non-friend function.
1092       nop();
1093       nop();
1094       nop();
1095       bl64_patchable(fd->entry(), rt);
1096       _last_calls_return_pc = pc();
1097       return _last_calls_return_pc;
1098     }
1099   } else {
1100     // This call does not need to be relocatable, do more aggressive
1101     // optimizations.
1102     if (!ReoptimizeCallSequences
1103       || !fd->is_friend_function()) {
1104       // It's not a friend function as defined by class FunctionDescriptor,
1105       // so do a full call-c here.
1106       load_const(R11, (address)fd, R0);
1107       return branch_to(R11, /*and_link=*/true,
1108                             /*save toc=*/false,
1109                             /*restore toc=*/false,
1110                             /*load toc=*/true,
1111                             /*load env=*/true);
1112     } else {
1113       // it's a friend function, load the entry point and don't care about
1114       // toc and env.
1115       address dest = fd->entry();
1116       if (is_within_range_of_b(dest, pc())) {
1117         bl(dest);
1118       } else {
1119         bl64_patchable(dest, rt);
1120       }
1121       _last_calls_return_pc = pc();
1122       return _last_calls_return_pc;
1123     }
1124   }
1125 }
1126 
1127 // Call a C function.  All constants needed reside in TOC.
1128 //
1129 // Read the address to call from the TOC.
1130 // Read env from TOC, if fd specifies an env.
1131 // Read new TOC from TOC.
1132 address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd,
1133                                          relocInfo::relocType rt, Register toc) {
1134   if (!ReoptimizeCallSequences
1135     || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
1136     || !fd->is_friend_function()) {
1137     // It's not a friend function as defined by class FunctionDescriptor,
1138     // so do a full call-c here.
1139     assert(fd->entry() != NULL, "function must be linked");
1140 
1141     AddressLiteral fd_entry(fd->entry());
1142     bool success = load_const_from_method_toc(R11, fd_entry, toc, /*fixed_size*/ true);
1143     mtctr(R11);
1144     if (fd->env() == NULL) {
1145       li(R11, 0);
1146       nop();
1147     } else {
1148       AddressLiteral fd_env(fd->env());
1149       success = success && load_const_from_method_toc(R11, fd_env, toc, /*fixed_size*/ true);
1150     }
1151     AddressLiteral fd_toc(fd->toc());
1152     // Set R2_TOC (load from toc)
1153     success = success && load_const_from_method_toc(R2_TOC, fd_toc, toc, /*fixed_size*/ true);
1154     bctrl();
1155     _last_calls_return_pc = pc();
1156     if (!success) { return NULL; }
1157   } else {
1158     // It's a friend function, load the entry point and don't care about
1159     // toc and env. Use an optimizable call instruction, but ensure the
1160     // same code-size as in the case of a non-friend function.
1161     nop();
1162     bl64_patchable(fd->entry(), rt);
1163     _last_calls_return_pc = pc();
1164   }
1165   return _last_calls_return_pc;
1166 }
1167 #endif // ABI_ELFv2
1168 
1169 void MacroAssembler::call_VM_base(Register oop_result,
1170                                   Register last_java_sp,
1171                                   address  entry_point,
1172                                   bool     check_exceptions) {
1173   BLOCK_COMMENT("call_VM {");
1174   // Determine last_java_sp register.
1175   if (!last_java_sp->is_valid()) {
1176     last_java_sp = R1_SP;
1177   }
1178   set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1);
1179 
1180   // ARG1 must hold thread address.
1181   mr(R3_ARG1, R16_thread);
1182 #if defined(ABI_ELFv2)
1183   address return_pc = call_c(entry_point, relocInfo::none);
1184 #else
1185   address return_pc = call_c((FunctionDescriptor*)entry_point, relocInfo::none);
1186 #endif
1187 
1188   reset_last_Java_frame();
1189 
1190   // Check for pending exceptions.
1191   if (check_exceptions) {
1192     // We don't check for exceptions here.
1193     ShouldNotReachHere();
1194   }
1195 
1196   // Get oop result if there is one and reset the value in the thread.
1197   if (oop_result->is_valid()) {
1198     get_vm_result(oop_result);
1199   }
1200 
1201   _last_calls_return_pc = return_pc;
1202   BLOCK_COMMENT("} call_VM");
1203 }
1204 
1205 void MacroAssembler::call_VM_leaf_base(address entry_point) {
1206   BLOCK_COMMENT("call_VM_leaf {");
1207 #if defined(ABI_ELFv2)
1208   call_c(entry_point, relocInfo::none);
1209 #else
1210   call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, entry_point), relocInfo::none);
1211 #endif
1212   BLOCK_COMMENT("} call_VM_leaf");
1213 }
1214 
1215 void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions) {
1216   call_VM_base(oop_result, noreg, entry_point, check_exceptions);
1217 }
1218 
1219 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1,
1220                              bool check_exceptions) {
1221   // R3_ARG1 is reserved for the thread.
1222   mr_if_needed(R4_ARG2, arg_1);
1223   call_VM(oop_result, entry_point, check_exceptions);
1224 }
1225 
1226 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2,
1227                              bool check_exceptions) {
1228   // R3_ARG1 is reserved for the thread
1229   mr_if_needed(R4_ARG2, arg_1);
1230   assert(arg_2 != R4_ARG2, "smashed argument");
1231   mr_if_needed(R5_ARG3, arg_2);
1232   call_VM(oop_result, entry_point, check_exceptions);
1233 }
1234 
1235 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3,
1236                              bool check_exceptions) {
1237   // R3_ARG1 is reserved for the thread
1238   mr_if_needed(R4_ARG2, arg_1);
1239   assert(arg_2 != R4_ARG2, "smashed argument");
1240   mr_if_needed(R5_ARG3, arg_2);
1241   mr_if_needed(R6_ARG4, arg_3);
1242   call_VM(oop_result, entry_point, check_exceptions);
1243 }
1244 
1245 void MacroAssembler::call_VM_leaf(address entry_point) {
1246   call_VM_leaf_base(entry_point);
1247 }
1248 
1249 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) {
1250   mr_if_needed(R3_ARG1, arg_1);
1251   call_VM_leaf(entry_point);
1252 }
1253 
1254 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) {
1255   mr_if_needed(R3_ARG1, arg_1);
1256   assert(arg_2 != R3_ARG1, "smashed argument");
1257   mr_if_needed(R4_ARG2, arg_2);
1258   call_VM_leaf(entry_point);
1259 }
1260 
1261 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) {
1262   mr_if_needed(R3_ARG1, arg_1);
1263   assert(arg_2 != R3_ARG1, "smashed argument");
1264   mr_if_needed(R4_ARG2, arg_2);
1265   assert(arg_3 != R3_ARG1 && arg_3 != R4_ARG2, "smashed argument");
1266   mr_if_needed(R5_ARG3, arg_3);
1267   call_VM_leaf(entry_point);
1268 }
1269 
1270 // Check whether instruction is a read access to the polling page
1271 // which was emitted by load_from_polling_page(..).
1272 bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext,
1273                                                address* polling_address_ptr) {
1274   if (!is_ld(instruction))
1275     return false; // It's not a ld. Fail.
1276 
1277   int rt = inv_rt_field(instruction);
1278   int ra = inv_ra_field(instruction);
1279   int ds = inv_ds_field(instruction);
1280   if (!(ds == 0 && ra != 0 && rt == 0)) {
1281     return false; // It's not a ld(r0, X, ra). Fail.
1282   }
1283 
1284   if (!ucontext) {
1285     // Set polling address.
1286     if (polling_address_ptr != NULL) {
1287       *polling_address_ptr = NULL;
1288     }
1289     return true; // No ucontext given. Can't check value of ra. Assume true.
1290   }
1291 
1292 #ifdef LINUX
1293   // Ucontext given. Check that register ra contains the address of
  // the safepoint polling page.
1295   ucontext_t* uc = (ucontext_t*) ucontext;
1296   // Set polling address.
1297   address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds;
1298   if (polling_address_ptr != NULL) {
1299     *polling_address_ptr = addr;
1300   }
1301   return os::is_poll_address(addr);
1302 #else
1303   // Not on Linux, ucontext must be NULL.
1304   ShouldNotReachHere();
1305   return false;
1306 #endif
1307 }
1308 
1309 bool MacroAssembler::is_memory_serialization(int instruction, JavaThread* thread, void* ucontext) {
1310 #ifdef LINUX
1311   ucontext_t* uc = (ucontext_t*) ucontext;
1312 
1313   if (is_stwx(instruction) || is_stwux(instruction)) {
1314     int ra = inv_ra_field(instruction);
1315     int rb = inv_rb_field(instruction);
1316 
1317     // look up content of ra and rb in ucontext
1318     address ra_val=(address)uc->uc_mcontext.regs->gpr[ra];
1319     long rb_val=(long)uc->uc_mcontext.regs->gpr[rb];
1320     return os::is_memory_serialize_page(thread, ra_val+rb_val);
1321   } else if (is_stw(instruction) || is_stwu(instruction)) {
1322     int ra = inv_ra_field(instruction);
1323     int d1 = inv_d1_field(instruction);
1324 
1325     // look up content of ra in ucontext
1326     address ra_val=(address)uc->uc_mcontext.regs->gpr[ra];
1327     return os::is_memory_serialize_page(thread, ra_val+d1);
1328   } else {
1329     return false;
1330   }
1331 #else
1332   // workaround not needed on !LINUX :-)
1333   ShouldNotCallThis();
1334   return false;
1335 #endif
1336 }
1337 
1338 void MacroAssembler::bang_stack_with_offset(int offset) {
1339   // When increasing the stack, the old stack pointer will be written
1340   // to the new top of stack according to the PPC64 abi.
1341   // Therefore, stack banging is not necessary when increasing
1342   // the stack by <= os::vm_page_size() bytes.
1343   // When increasing the stack by a larger amount, this method is
1344   // called repeatedly to bang the intermediate pages.
1345 
1346   // Stack grows down, caller passes positive offset.
1347   assert(offset > 0, "must bang with positive offset");
1348 
1349   long stdoffset = -offset;
1350 
1351   if (is_simm(stdoffset, 16)) {
1352     // Signed 16 bit offset, a simple std is ok.
1353     if (UseLoadInstructionsForStackBangingPPC64) {
1354       ld(R0, (int)(signed short)stdoffset, R1_SP);
1355     } else {
1356       std(R0,(int)(signed short)stdoffset, R1_SP);
1357     }
1358   } else if (is_simm(stdoffset, 31)) {
1359     const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset);
1360     const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset);
1361 
1362     Register tmp = R11;
1363     addis(tmp, R1_SP, hi);
1364     if (UseLoadInstructionsForStackBangingPPC64) {
1365       ld(R0,  lo, tmp);
1366     } else {
1367       std(R0, lo, tmp);
1368     }
1369   } else {
1370     ShouldNotReachHere();
1371   }
1372 }
1373 
1374 // If instruction is a stack bang of the form
1375 //    std    R0,    x(Ry),       (see bang_stack_with_offset())
1376 //    stdu   R1_SP, x(R1_SP),    (see push_frame(), resize_frame())
1377 // or stdux  R1_SP, Rx, R1_SP    (see push_frame(), resize_frame())
1378 // return the banged address. Otherwise, return 0.
1379 address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) {
1380 #ifdef LINUX
1381   ucontext_t* uc = (ucontext_t*) ucontext;
1382   int rs = inv_rs_field(instruction);
1383   int ra = inv_ra_field(instruction);
1384   if (   (is_ld(instruction)   && rs == 0 &&  UseLoadInstructionsForStackBangingPPC64)
1385       || (is_std(instruction)  && rs == 0 && !UseLoadInstructionsForStackBangingPPC64)
1386       || (is_stdu(instruction) && rs == 1)) {
1387     int ds = inv_ds_field(instruction);
1388     // return banged address
1389     return ds+(address)uc->uc_mcontext.regs->gpr[ra];
1390   } else if (is_stdux(instruction) && rs == 1) {
1391     int rb = inv_rb_field(instruction);
1392     address sp = (address)uc->uc_mcontext.regs->gpr[1];
1393     long rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
1394     return ra != 1 || rb_val >= 0 ? NULL         // not a stack bang
1395                                   : sp + rb_val; // banged address
1396   }
1397   return NULL; // not a stack bang
1398 #else
1399   // workaround not needed on !LINUX :-)
1400   ShouldNotCallThis();
1401   return NULL;
1402 #endif
1403 }
1404 
1405 void MacroAssembler::reserved_stack_check(Register return_pc) {
1406   // Test if reserved zone needs to be enabled.
1407   Label no_reserved_zone_enabling;
1408 
1409   ld_ptr(R0, JavaThread::reserved_stack_activation_offset(), R16_thread);
1410   cmpld(CCR0, R1_SP, R0);
1411   blt_predict_taken(CCR0, no_reserved_zone_enabling);
1412 
1413   // Enable reserved zone again, throw stack overflow exception.
1414   push_frame_reg_args(0, R0);
1415   call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), R16_thread);
1416   pop_frame();
1417   mtlr(return_pc);
1418   load_const_optimized(R0, StubRoutines::throw_delayed_StackOverflowError_entry());
1419   mtctr(R0);
1420   bctr();
1421 
1422   should_not_reach_here();
1423 
1424   bind(no_reserved_zone_enabling);
1425 }
1426 
1427 void MacroAssembler::getandsetd(Register dest_current_value, Register exchange_value, Register addr_base,
1428                                 bool cmpxchgx_hint) {
1429   Label retry;
1430   bind(retry);
1431   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1432   stdcx_(exchange_value, addr_base);
1433   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1434     bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1435   } else {
1436     bne(                  CCR0, retry); // StXcx_ sets CCR0.
1437   }
1438 }
1439 
1440 void MacroAssembler::getandaddd(Register dest_current_value, Register inc_value, Register addr_base,
1441                                 Register tmp, bool cmpxchgx_hint) {
1442   Label retry;
1443   bind(retry);
1444   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1445   add(tmp, dest_current_value, inc_value);
1446   stdcx_(tmp, addr_base);
1447   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1448     bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1449   } else {
1450     bne(                  CCR0, retry); // StXcx_ sets CCR0.
1451   }
1452 }
1453 
1454 // Word/sub-word atomic helper functions
1455 
1456 // Temps and addr_base are killed if size < 4 and processor does not support respective instructions.
1457 // Only signed types are supported with size < 4.
1458 // Atomic add always kills tmp1.
1459 void MacroAssembler::atomic_get_and_modify_generic(Register dest_current_value, Register exchange_value,
1460                                                    Register addr_base, Register tmp1, Register tmp2, Register tmp3,
1461                                                    bool cmpxchgx_hint, bool is_add, int size) {
1462   // Sub-word instructions are available since Power 8.
1463   // For older processors, instruction_type != size holds, and we
1464   // emulate the sub-word instructions by constructing a 4-byte value
1465   // that leaves the other bytes unchanged.
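       // Rough equivalent of the emulation below (hedged sketch, not emitted code):
       //   mask    = (size == 1) ? 0xff : 0xffff;
       //   old     = (*aligned_addr >> shift) & mask;                 // current sub-word
       //   new_val = is_add ? old + exchange_value : exchange_value;  // desired sub-word
       //   *aligned_addr ^= (uint32_t)((old ^ new_val) & mask) << shift;  // replace sub-word only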
1466   const int instruction_type = VM_Version::has_lqarx() ? size : 4;
1467 
1468   Label retry;
1469   Register shift_amount = noreg,
1470            val32 = dest_current_value,
1471            modval = is_add ? tmp1 : exchange_value;
1472 
1473   if (instruction_type != size) {
1474     assert_different_registers(tmp1, tmp2, tmp3, dest_current_value, exchange_value, addr_base);
1475     modval = tmp1;
1476     shift_amount = tmp2;
1477     val32 = tmp3;
1478     // Need some preparation: Compute shift amount, align address. Note: shorts must be 2-byte aligned.
1479 #ifdef VM_LITTLE_ENDIAN
1480     rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
1481     clrrdi(addr_base, addr_base, 2);
1482 #else
1483     xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
1484     clrrdi(addr_base, addr_base, 2);
1485     rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
1486 #endif
1487   }
1488 
1489   // atomic emulation loop
1490   bind(retry);
1491 
1492   switch (instruction_type) {
1493     case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
1494     case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
1495     case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
1496     default: ShouldNotReachHere();
1497   }
1498 
1499   if (instruction_type != size) {
1500     srw(dest_current_value, val32, shift_amount);
1501   }
1502 
1503   if (is_add) { add(modval, dest_current_value, exchange_value); }
1504 
1505   if (instruction_type != size) {
1506     // Transform exchange value such that the replacement can be done by one xor instruction.
1507     xorr(modval, dest_current_value, is_add ? modval : exchange_value);
1508     clrldi(modval, modval, (size == 1) ? 56 : 48);
1509     slw(modval, modval, shift_amount);
1510     xorr(modval, val32, modval);
1511   }
1512 
1513   switch (instruction_type) {
1514     case 4: stwcx_(modval, addr_base); break;
1515     case 2: sthcx_(modval, addr_base); break;
1516     case 1: stbcx_(modval, addr_base); break;
1517     default: ShouldNotReachHere();
1518   }
1519 
1520   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1521     bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1522   } else {
1523     bne(                  CCR0, retry); // StXcx_ sets CCR0.
1524   }
1525 
1526   // l?arx zero-extends, but Java wants byte/short values sign-extended.
1527   if (size == 1) {
1528     extsb(dest_current_value, dest_current_value);
1529   } else if (size == 2) {
1530     extsh(dest_current_value, dest_current_value);
1531   }
1532 }
1533 
1534 // Temps, addr_base and exchange_value are killed if size < 4 and processor does not support respective instructions.
1535 // Only signed types are supported with size < 4.
1536 void MacroAssembler::cmpxchg_loop_body(ConditionRegister flag, Register dest_current_value,
1537                                        Register compare_value, Register exchange_value,
1538                                        Register addr_base, Register tmp1, Register tmp2,
1539                                        Label &retry, Label &failed, bool cmpxchgx_hint, int size) {
1540   // Sub-word instructions are available since Power 8.
1541   // For older processors, instruction_type != size holds, and we
1542   // emulate the sub-word instructions by constructing a 4-byte value
1543   // that leaves the other bytes unchanged.
1544   const int instruction_type = VM_Version::has_lqarx() ? size : 4;
1545 
1546   Register shift_amount = noreg,
1547            val32 = dest_current_value,
1548            modval = exchange_value;
1549 
1550   if (instruction_type != size) {
1551     assert_different_registers(tmp1, tmp2, dest_current_value, compare_value, exchange_value, addr_base);
1552     shift_amount = tmp1;
1553     val32 = tmp2;
1554     modval = tmp2;
1555     // Need some preparation: Compute shift amount, align address. Note: shorts must be 2-byte aligned.
1556 #ifdef VM_LITTLE_ENDIAN
1557     rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
1558     clrrdi(addr_base, addr_base, 2);
1559 #else
1560     xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
1561     clrrdi(addr_base, addr_base, 2);
1562     rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
1563 #endif
1564     // Transform exchange value such that the replacement can be done by one xor instruction.
1565     xorr(exchange_value, compare_value, exchange_value);
1566     clrldi(exchange_value, exchange_value, (size == 1) ? 56 : 48);
1567     slw(exchange_value, exchange_value, shift_amount);
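         // Hedged note: exchange_value now holds ((compare_value ^ exchange_value) & mask) << shift,
         // so the xorr(modval, val32, exchange_value) below swaps in the new sub-word while leaving
         // the other bytes of the 4-byte word unchanged.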
1568   }
1569 
1570   // atomic emulation loop
1571   bind(retry);
1572 
1573   switch (instruction_type) {
1574     case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
1575     case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
1576     case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
1577     default: ShouldNotReachHere();
1578   }
1579 
1580   if (instruction_type != size) {
1581     srw(dest_current_value, val32, shift_amount);
1582   }
1583   if (size == 1) {
1584     extsb(dest_current_value, dest_current_value);
1585   } else if (size == 2) {
1586     extsh(dest_current_value, dest_current_value);
1587   }
1588 
1589   cmpw(flag, dest_current_value, compare_value);
1590   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1591     bne_predict_not_taken(flag, failed);
1592   } else {
1593     bne(                  flag, failed);
1594   }
1595   // branch to done  => (flag == ne), (dest_current_value != compare_value)
1596   // fall through    => (flag == eq), (dest_current_value == compare_value)
1597 
1598   if (instruction_type != size) {
1599     xorr(modval, val32, exchange_value);
1600   }
1601 
1602   switch (instruction_type) {
1603     case 4: stwcx_(modval, addr_base); break;
1604     case 2: sthcx_(modval, addr_base); break;
1605     case 1: stbcx_(modval, addr_base); break;
1606     default: ShouldNotReachHere();
1607   }
1608 }
1609 
1610 // CmpxchgX sets condition register to cmpX(current, compare).
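     // Memory ordering (hedged restatement of the code below): MemBarRel emits a release barrier
     // before the update; after a successful update, MemBarFenceAfter emits a full fence,
     // otherwise MemBarAcq emits an isync.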
1611 void MacroAssembler::cmpxchg_generic(ConditionRegister flag, Register dest_current_value,
1612                                      Register compare_value, Register exchange_value,
1613                                      Register addr_base, Register tmp1, Register tmp2,
1614                                      int semantics, bool cmpxchgx_hint,
1615                                      Register int_flag_success, bool contention_hint, bool weak, int size) {
1616   Label retry;
1617   Label failed;
1618   Label done;
1619 
1620   // Save one branch if result is returned via register and
1621   // result register is different from the other ones.
1622   bool use_result_reg    = (int_flag_success != noreg);
1623   bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value &&
1624                             int_flag_success != exchange_value && int_flag_success != addr_base &&
1625                             int_flag_success != tmp1 && int_flag_success != tmp2);
1626   assert(!weak || flag == CCR0, "weak only supported with CCR0");
1627   assert(size == 1 || size == 2 || size == 4, "unsupported");
1628 
1629   if (use_result_reg && preset_result_reg) {
1630     li(int_flag_success, 0); // preset (assume cas failed)
1631   }
1632 
1633   // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1634   if (contention_hint) { // Don't try to reserve if cmp fails.
1635     switch (size) {
1636       case 1: lbz(dest_current_value, 0, addr_base); extsb(dest_current_value, dest_current_value); break;
1637       case 2: lha(dest_current_value, 0, addr_base); break;
1638       case 4: lwz(dest_current_value, 0, addr_base); break;
1639       default: ShouldNotReachHere();
1640     }
1641     cmpw(flag, dest_current_value, compare_value);
1642     bne(flag, failed);
1643   }
1644 
1645   // release/fence semantics
1646   if (semantics & MemBarRel) {
1647     release();
1648   }
1649 
1650   cmpxchg_loop_body(flag, dest_current_value, compare_value, exchange_value, addr_base, tmp1, tmp2,
1651                     retry, failed, cmpxchgx_hint, size);
1652   if (!weak || use_result_reg) {
1653     if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1654       bne_predict_not_taken(CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
1655     } else {
1656       bne(                  CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
1657     }
1658   }
1659   // fall through    => (flag == eq), (dest_current_value == compare_value), (swapped)
1660 
1661   // Result in register (must do this at the end because int_flag_success can be the
1662   // same register as one above).
1663   if (use_result_reg) {
1664     li(int_flag_success, 1);
1665   }
1666 
1667   if (semantics & MemBarFenceAfter) {
1668     fence();
1669   } else if (semantics & MemBarAcq) {
1670     isync();
1671   }
1672 
1673   if (use_result_reg && !preset_result_reg) {
1674     b(done);
1675   }
1676 
1677   bind(failed);
1678   if (use_result_reg && !preset_result_reg) {
1679     li(int_flag_success, 0);
1680   }
1681 
1682   bind(done);
1683   // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1684   // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1685 }
1686 
1687 // Performs an atomic compare-exchange:
1688 //   if (compare_value == *addr_base)
1689 //     *addr_base = exchange_value
1690 //     int_flag_success = 1;
1691 //   else
1692 //     int_flag_success = 0;
1693 //
1694 // ConditionRegister flag       = cmp(compare_value, *addr_base)
1695 // Register dest_current_value  = *addr_base
1696 // Register compare_value       Used to compare with value in memory
1697 // Register exchange_value      Written to memory if compare_value == *addr_base
1698 // Register addr_base           The memory location to compareXChange
1699 // Register int_flag_success    Set to 1 if exchange_value was written to *addr_base
1700 //
1701 // To avoid the costly compare-exchange, the value can be tested beforehand (contention hint).
1702 // Several special cases exist to avoid emitting unnecessary code.
1703 //
1704 void MacroAssembler::cmpxchgd(ConditionRegister flag,
1705                               Register dest_current_value, RegisterOrConstant compare_value, Register exchange_value,
1706                               Register addr_base, int semantics, bool cmpxchgx_hint,
1707                               Register int_flag_success, Label* failed_ext, bool contention_hint, bool weak) {
1708   Label retry;
1709   Label failed_int;
1710   Label& failed = (failed_ext != NULL) ? *failed_ext : failed_int;
1711   Label done;
1712 
1713   // Save one branch if result is returned via register and result register is different from the other ones.
1714   bool use_result_reg    = (int_flag_success!=noreg);
1715   bool preset_result_reg = (int_flag_success!=dest_current_value && int_flag_success!=compare_value.register_or_noreg() &&
1716                             int_flag_success!=exchange_value && int_flag_success!=addr_base);
1717   assert(!weak || flag == CCR0, "weak only supported with CCR0");
1718   assert(int_flag_success == noreg || failed_ext == NULL, "cannot have both");
1719 
1720   if (use_result_reg && preset_result_reg) {
1721     li(int_flag_success, 0); // preset (assume cas failed)
1722   }
1723 
1724   // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1725   if (contention_hint) { // Don't try to reserve if cmp fails.
1726     ld(dest_current_value, 0, addr_base);
1727     cmpd(flag, compare_value, dest_current_value);
1728     bne(flag, failed);
1729   }
1730 
1731   // release/fence semantics
1732   if (semantics & MemBarRel) {
1733     release();
1734   }
1735 
1736   // atomic emulation loop
1737   bind(retry);
1738 
1739   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1740   cmpd(flag, compare_value, dest_current_value);
1741   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1742     bne_predict_not_taken(flag, failed);
1743   } else {
1744     bne(                  flag, failed);
1745   }
1746 
1747   stdcx_(exchange_value, addr_base);
1748   if (!weak || use_result_reg || failed_ext) {
1749     if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1750       bne_predict_not_taken(CCR0, weak ? failed : retry); // stXcx_ sets CCR0
1751     } else {
1752       bne(                  CCR0, weak ? failed : retry); // stXcx_ sets CCR0
1753     }
1754   }
1755 
1756   // result in register (must do this at the end because int_flag_success can be the same register as one above)
1757   if (use_result_reg) {
1758     li(int_flag_success, 1);
1759   }
1760 
1761   if (semantics & MemBarFenceAfter) {
1762     fence();
1763   } else if (semantics & MemBarAcq) {
1764     isync();
1765   }
1766 
1767   if (use_result_reg && !preset_result_reg) {
1768     b(done);
1769   }
1770 
1771   bind(failed_int);
1772   if (use_result_reg && !preset_result_reg) {
1773     li(int_flag_success, 0);
1774   }
1775 
1776   bind(done);
1777   // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1778   // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1779 }
1780 
1781 // Look up the method for a megamorphic invokeinterface call.
1782 // The target method is determined by <intf_klass, itable_index>.
1783 // The receiver klass is in recv_klass.
1784 // On success, the result will be in method_result, and execution falls through.
1785 // On failure, execution transfers to the given label.
1786 void MacroAssembler::lookup_interface_method(Register recv_klass,
1787                                              Register intf_klass,
1788                                              RegisterOrConstant itable_index,
1789                                              Register method_result,
1790                                              Register scan_temp,
1791                                              Register sethi_temp,
1792                                              Label& L_no_such_interface) {
1793   assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);
1794   assert(itable_index.is_constant() || itable_index.as_register() == method_result,
1795          "caller must use same register for non-constant itable index as for method");
1796 
1797   // Compute start of first itableOffsetEntry (which is at the end of the vtable).
1798   int vtable_base = in_bytes(Klass::vtable_start_offset());
1799   int itentry_off = itableMethodEntry::method_offset_in_bytes();
1800   int logMEsize   = exact_log2(itableMethodEntry::size() * wordSize);
1801   int scan_step   = itableOffsetEntry::size() * wordSize;
1802   int log_vte_size= exact_log2(vtableEntry::size_in_bytes());
1803 
1804   lwz(scan_temp, in_bytes(Klass::vtable_length_offset()), recv_klass);
1805   // %%% We should store the aligned, prescaled offset in the klassoop.
1806   // Then the next several instructions would fold away.
1807 
1808   sldi(scan_temp, scan_temp, log_vte_size);
1809   addi(scan_temp, scan_temp, vtable_base);
1810   add(scan_temp, recv_klass, scan_temp);
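       // scan_temp now holds recv_klass + vtable_base + vtable_length * vtableEntry::size_in_bytes(),
       // i.e. the address of the first itableOffsetEntry (restating the three instructions above).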
1811 
1812   // Adjust recv_klass by scaled itable_index, so we can free itable_index.
1813   if (itable_index.is_register()) {
1814     Register itable_offset = itable_index.as_register();
1815     sldi(itable_offset, itable_offset, logMEsize);
1816     if (itentry_off) addi(itable_offset, itable_offset, itentry_off);
1817     add(recv_klass, itable_offset, recv_klass);
1818   } else {
1819     long itable_offset = (long)itable_index.as_constant();
1820     load_const_optimized(sethi_temp, (itable_offset<<logMEsize)+itentry_off); // static address, no relocation
1821     add(recv_klass, sethi_temp, recv_klass);
1822   }
1823 
1824   // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
1825   //   if (scan->interface() == intf) {
1826   //     result = (klass + scan->offset() + itable_index);
1827   //   }
1828   // }
1829   Label search, found_method;
1830 
1831   for (int peel = 1; peel >= 0; peel--) {
1832     // %%%% Could load both offset and interface in one ldx, if they were
1833     // in the opposite order. This would save a load.
1834     ld(method_result, itableOffsetEntry::interface_offset_in_bytes(), scan_temp);
1835 
1836     // Check that this entry is non-null. A null entry means that
1837     // the receiver class doesn't implement the interface, and wasn't the
1838     // same as when the caller was compiled.
1839     cmpd(CCR0, method_result, intf_klass);
1840 
1841     if (peel) {
1842       beq(CCR0, found_method);
1843     } else {
1844       bne(CCR0, search);
1845       // (invert the test to fall through to found_method...)
1846     }
1847 
1848     if (!peel) break;
1849 
1850     bind(search);
1851 
1852     cmpdi(CCR0, method_result, 0);
1853     beq(CCR0, L_no_such_interface);
1854     addi(scan_temp, scan_temp, scan_step);
1855   }
1856 
1857   bind(found_method);
1858 
1859   // Got a hit.
1860   int ito_offset = itableOffsetEntry::offset_offset_in_bytes();
1861   lwz(scan_temp, ito_offset, scan_temp);
1862   ldx(method_result, scan_temp, recv_klass);
1863 }
1864 
1865 // virtual method calling
1866 void MacroAssembler::lookup_virtual_method(Register recv_klass,
1867                                            RegisterOrConstant vtable_index,
1868                                            Register method_result) {
1869 
1870   assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg());
1871 
1872   const int base = in_bytes(Klass::vtable_start_offset());
1873   assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
1874 
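       // Hedged sketch of the load below:
       //   R19_method = *((address)recv_klass + base + vtable_index * wordSize
       //                  + vtableEntry::method_offset_in_bytes());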
1875   if (vtable_index.is_register()) {
1876     sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord);
1877     add(recv_klass, vtable_index.as_register(), recv_klass);
1878   } else {
1879     addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord);
1880   }
1881   ld(R19_method, base + vtableEntry::method_offset_in_bytes(), recv_klass);
1882 }
1883 
1884 /////////////////////////////////////////// subtype checking ////////////////////////////////////////////
1885 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
1886                                                    Register super_klass,
1887                                                    Register temp1_reg,
1888                                                    Register temp2_reg,
1889                                                    Label* L_success,
1890                                                    Label* L_failure,
1891                                                    Label* L_slow_path,
1892                                                    RegisterOrConstant super_check_offset) {
1893 
1894   const Register check_cache_offset = temp1_reg;
1895   const Register cached_super       = temp2_reg;
1896 
1897   assert_different_registers(sub_klass, super_klass, check_cache_offset, cached_super);
1898 
1899   int sco_offset = in_bytes(Klass::super_check_offset_offset());
1900   int sc_offset  = in_bytes(Klass::secondary_super_cache_offset());
1901 
1902   bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
1903   bool need_slow_path = (must_load_sco || super_check_offset.constant_or_zero() == sco_offset);
1904 
1905   Label L_fallthrough;
1906   int label_nulls = 0;
1907   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
1908   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
1909   if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
1910   assert(label_nulls <= 1 ||
1911          (L_slow_path == &L_fallthrough && label_nulls <= 2 && !need_slow_path),
1912          "at most one NULL in the batch, usually");
1913 
1914   // If the pointers are equal, we are done (e.g., String[] elements).
1915   // This self-check enables sharing of secondary supertype arrays among
1916   // non-primary types such as array-of-interface. Otherwise, each such
1917   // type would need its own customized secondary supertype array (SSA).
1918   // We move this check to the front of the fast path because many
1919   // type checks are in fact trivially successful in this manner,
1920   // so we get a nicely predicted branch right at the start of the check.
1921   cmpd(CCR0, sub_klass, super_klass);
1922   beq(CCR0, *L_success);
1923 
1924   // Check the supertype display:
1925   if (must_load_sco) {
1926     // The super check offset is always positive...
1927     lwz(check_cache_offset, sco_offset, super_klass);
1928     super_check_offset = RegisterOrConstant(check_cache_offset);
1929     // super_check_offset is register.
1930     assert_different_registers(sub_klass, super_klass, cached_super, super_check_offset.as_register());
1931   }
1932   // The loaded value is the offset from KlassOopDesc.
1933 
1934   ld(cached_super, super_check_offset, sub_klass);
1935   cmpd(CCR0, cached_super, super_klass);
1936 
1937   // This check has worked decisively for primary supers.
1938   // Secondary supers are sought in the super_cache ('super_cache_addr').
1939   // (Secondary supers are interfaces and very deeply nested subtypes.)
1940   // This works in the same check above because of a tricky aliasing
1941   // between the super_cache and the primary super display elements.
1942   // (The 'super_check_addr' can address either, as the case requires.)
1943   // Note that the cache is updated below if it does not help us find
1944   // what we need immediately.
1945   // So if it was a primary super, we can just fail immediately.
1946   // Otherwise, it's the slow path for us (no success at this point).
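       // Summary of the dispatch below (hedged restatement): cached_super == super_klass -> L_success;
       // otherwise, if super_check_offset is the secondary_super_cache offset -> L_slow_path,
       // else -> L_failure.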
1947 
1948 #define FINAL_JUMP(label) if (&(label) != &L_fallthrough) { b(label); }
1949 
1950   if (super_check_offset.is_register()) {
1951     beq(CCR0, *L_success);
1952     cmpwi(CCR0, super_check_offset.as_register(), sc_offset);
1953     if (L_failure == &L_fallthrough) {
1954       beq(CCR0, *L_slow_path);
1955     } else {
1956       bne(CCR0, *L_failure);
1957       FINAL_JUMP(*L_slow_path);
1958     }
1959   } else {
1960     if (super_check_offset.as_constant() == sc_offset) {
1961       // Need a slow path; fast failure is impossible.
1962       if (L_slow_path == &L_fallthrough) {
1963         beq(CCR0, *L_success);
1964       } else {
1965         bne(CCR0, *L_slow_path);
1966         FINAL_JUMP(*L_success);
1967       }
1968     } else {
1969       // No slow path; it's a fast decision.
1970       if (L_failure == &L_fallthrough) {
1971         beq(CCR0, *L_success);
1972       } else {
1973         bne(CCR0, *L_failure);
1974         FINAL_JUMP(*L_success);
1975       }
1976     }
1977   }
1978 
1979   bind(L_fallthrough);
1980 #undef FINAL_JUMP
1981 }
1982 
1983 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
1984                                                    Register super_klass,
1985                                                    Register temp1_reg,
1986                                                    Register temp2_reg,
1987                                                    Label* L_success,
1988                                                    Register result_reg) {
1989   const Register array_ptr = temp1_reg; // current value from cache array
1990   const Register temp      = temp2_reg;
1991 
1992   assert_different_registers(sub_klass, super_klass, array_ptr, temp);
1993 
1994   int source_offset = in_bytes(Klass::secondary_supers_offset());
1995   int target_offset = in_bytes(Klass::secondary_super_cache_offset());
1996 
1997   int length_offset = Array<Klass*>::length_offset_in_bytes();
1998   int base_offset   = Array<Klass*>::base_offset_in_bytes();
1999 
2000   Label hit, loop, failure, fallthru;
2001 
2002   ld(array_ptr, source_offset, sub_klass);
2003 
2004   // TODO: PPC port: assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated.");
2005   lwz(temp, length_offset, array_ptr);
2006   cmpwi(CCR0, temp, 0);
2007   beq(CCR0, result_reg!=noreg ? failure : fallthru); // length 0
2008 
2009   mtctr(temp); // load ctr
2010 
2011   bind(loop);
2012   // Entries in the table (Klass*) are no longer compressed.
2013   ld(temp, base_offset, array_ptr);
2014   cmpd(CCR0, temp, super_klass);
2015   beq(CCR0, hit);
2016   addi(array_ptr, array_ptr, BytesPerWord);
2017   bdnz(loop);
2018 
2019   bind(failure);
2020   if (result_reg!=noreg) li(result_reg, 1); // load non-zero result (indicates a miss)
2021   b(fallthru);
2022 
2023   bind(hit);
2024   std(super_klass, target_offset, sub_klass); // save result to cache
2025   if (result_reg != noreg) { li(result_reg, 0); } // load zero result (indicates a hit)
2026   if (L_success != NULL) { b(*L_success); }
2027   else if (result_reg == noreg) { blr(); } // return with CR0.eq if neither label nor result reg provided
2028 
2029   bind(fallthru);
2030 }
2031 
2032 // Try fast path, then go to slow one if not successful
2033 void MacroAssembler::check_klass_subtype(Register sub_klass,
2034                          Register super_klass,
2035                          Register temp1_reg,
2036                          Register temp2_reg,
2037                          Label& L_success) {
2038   Label L_failure;
2039   check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success, &L_failure);
2040   check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success);
2041   bind(L_failure); // Fallthru if not successful.
2042 }
2043 
2044 void MacroAssembler::check_method_handle_type(Register mtype_reg, Register mh_reg,
2045                                               Register temp_reg,
2046                                               Label& wrong_method_type) {
2047   assert_different_registers(mtype_reg, mh_reg, temp_reg);
2048   // Compare method type against that of the receiver.
2049   load_heap_oop_not_null(temp_reg, delayed_value(java_lang_invoke_MethodHandle::type_offset_in_bytes, temp_reg), mh_reg);
2050   cmpd(CCR0, temp_reg, mtype_reg);
2051   bne(CCR0, wrong_method_type);
2052 }
2053 
2054 RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot,
2055                                                    Register temp_reg,
2056                                                    int extra_slot_offset) {
2057   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
2058   int stackElementSize = Interpreter::stackElementSize;
2059   int offset = extra_slot_offset * stackElementSize;
2060   if (arg_slot.is_constant()) {
2061     offset += arg_slot.as_constant() * stackElementSize;
2062     return offset;
2063   } else {
2064     assert(temp_reg != noreg, "must specify");
2065     sldi(temp_reg, arg_slot.as_register(), exact_log2(stackElementSize));
2066     if (offset != 0)
2067       addi(temp_reg, temp_reg, offset);
2068     return temp_reg;
2069   }
2070 }
2071 
2072 // Supports temp2_reg = R0.
2073 void MacroAssembler::biased_locking_enter(ConditionRegister cr_reg, Register obj_reg,
2074                                           Register mark_reg, Register temp_reg,
2075                                           Register temp2_reg, Label& done, Label* slow_case) {
2076   assert(UseBiasedLocking, "why call this otherwise?");
2077 
2078 #ifdef ASSERT
2079   assert_different_registers(obj_reg, mark_reg, temp_reg, temp2_reg);
2080 #endif
2081 
2082   Label cas_label;
2083 
2084   // Branch to done if fast path fails and no slow_case provided.
2085   Label *slow_case_int = (slow_case != NULL) ? slow_case : &done;
2086 
2087   // Biased locking
2088   // See whether the lock is currently biased toward our thread and
2089   // whether the epoch is still valid
2090   // Note that the runtime guarantees sufficient alignment of JavaThread
2091   // pointers to allow age to be placed into low bits
2092   assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits,
2093          "biased locking makes assumptions about bit layout");
2094 
2095   if (PrintBiasedLockingStatistics) {
2096     load_const(temp2_reg, (address) BiasedLocking::total_entry_count_addr(), temp_reg);
2097     lwzx(temp_reg, temp2_reg);
2098     addi(temp_reg, temp_reg, 1);
2099     stwx(temp_reg, temp2_reg);
2100   }
2101 
2102   andi(temp_reg, mark_reg, markOopDesc::biased_lock_mask_in_place);
2103   cmpwi(cr_reg, temp_reg, markOopDesc::biased_lock_pattern);
2104   bne(cr_reg, cas_label);
2105 
2106   load_klass(temp_reg, obj_reg);
2107 
2108   load_const_optimized(temp2_reg, ~((int) markOopDesc::age_mask_in_place));
2109   ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
2110   orr(temp_reg, R16_thread, temp_reg);
2111   xorr(temp_reg, mark_reg, temp_reg);
2112   andr(temp_reg, temp_reg, temp2_reg);
2113   cmpdi(cr_reg, temp_reg, 0);
2114   if (PrintBiasedLockingStatistics) {
2115     Label l;
2116     bne(cr_reg, l);
2117     load_const(temp2_reg, (address) BiasedLocking::biased_lock_entry_count_addr());
2118     lwzx(mark_reg, temp2_reg);
2119     addi(mark_reg, mark_reg, 1);
2120     stwx(mark_reg, temp2_reg);
2121     // restore mark_reg
2122     ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg);
2123     bind(l);
2124   }
2125   beq(cr_reg, done);
2126 
2127   Label try_revoke_bias;
2128   Label try_rebias;
2129 
2130   // At this point we know that the header has the bias pattern and
2131   // that we are not the bias owner in the current epoch. We need to
2132   // figure out more details about the state of the header in order to
2133   // know what operations can be legally performed on the object's
2134   // header.
2135 
2136   // If the low three bits in the xor result aren't clear, that means
2137   // the prototype header is no longer biased and we have to revoke
2138   // the bias on this object.
2139   andi(temp2_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
2140   cmpwi(cr_reg, temp2_reg, 0);
2141   bne(cr_reg, try_revoke_bias);
2142 
2143   // Biasing is still enabled for this data type. See whether the
2144   // epoch of the current bias is still valid, meaning that the epoch
2145   // bits of the mark word are equal to the epoch bits of the
2146   // prototype header. (Note that the prototype header's epoch bits
2147   // only change at a safepoint.) If not, attempt to rebias the object
2148   // toward the current thread. Note that we must be absolutely sure
2149   // that the current epoch is invalid in order to do this because
2150   // otherwise the manipulations it performs on the mark word are
2151   // illegal.
2152 
2153   int shift_amount = 64 - markOopDesc::epoch_shift;
2154   // rotate epoch bits to right (little) end and set other bits to 0
2155   // [ big part | epoch | little part ] -> [ 0..0 | epoch ]
2156   rldicl_(temp2_reg, temp_reg, shift_amount, 64 - markOopDesc::epoch_bits);
2157   // branch if epoch bits are != 0, i.e. they differ, because the epoch has been incremented
2158   bne(CCR0, try_rebias);
2159 
2160   // The epoch of the current bias is still valid but we know nothing
2161   // about the owner; it might be set or it might be clear. Try to
2162   // acquire the bias of the object using an atomic operation. If this
2163   // fails we will go in to the runtime to revoke the object's bias.
2164   // Note that we first construct the presumed unbiased header so we
2165   // don't accidentally blow away another thread's valid bias.
2166   andi(mark_reg, mark_reg, (markOopDesc::biased_lock_mask_in_place |
2167                                 markOopDesc::age_mask_in_place |
2168                                 markOopDesc::epoch_mask_in_place));
2169   orr(temp_reg, R16_thread, mark_reg);
2170 
2171   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2172 
2173   // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg).
2174   cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
2175            /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
2176            /*where=*/obj_reg,
2177            MacroAssembler::MemBarAcq,
2178            MacroAssembler::cmpxchgx_hint_acquire_lock(),
2179            noreg, slow_case_int); // bail out if failed
2180 
2181   // If the biasing toward our thread failed, this means that
2182   // another thread succeeded in biasing it toward itself and we
2183   // need to revoke that bias. The revocation will occur in the
2184   // interpreter runtime in the slow case.
2185   if (PrintBiasedLockingStatistics) {
2186     load_const(temp2_reg, (address) BiasedLocking::anonymously_biased_lock_entry_count_addr(), temp_reg);
2187     lwzx(temp_reg, temp2_reg);
2188     addi(temp_reg, temp_reg, 1);
2189     stwx(temp_reg, temp2_reg);
2190   }
2191   b(done);
2192 
2193   bind(try_rebias);
2194   // At this point we know the epoch has expired, meaning that the
2195   // current "bias owner", if any, is actually invalid. Under these
2196   // circumstances _only_, we are allowed to use the current header's
2197   // value as the comparison value when doing the cas to acquire the
2198   // bias in the current epoch. In other words, we allow transfer of
2199   // the bias from one thread to another directly in this situation.
2200   load_klass(temp_reg, obj_reg);
2201   andi(temp2_reg, mark_reg, markOopDesc::age_mask_in_place);
2202   orr(temp2_reg, R16_thread, temp2_reg);
2203   ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
2204   orr(temp_reg, temp2_reg, temp_reg);
2205 
2206   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2207 
2208   cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
2209                  /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
2210                  /*where=*/obj_reg,
2211                  MacroAssembler::MemBarAcq,
2212                  MacroAssembler::cmpxchgx_hint_acquire_lock(),
2213                  noreg, slow_case_int); // bail out if failed
2214 
2215   // If the biasing toward our thread failed, this means that
2216   // another thread succeeded in biasing it toward itself and we
2217   // need to revoke that bias. The revocation will occur in the
2218   // interpreter runtime in the slow case.
2219   if (PrintBiasedLockingStatistics) {
2220     load_const(temp2_reg, (address) BiasedLocking::rebiased_lock_entry_count_addr(), temp_reg);
2221     lwzx(temp_reg, temp2_reg);
2222     addi(temp_reg, temp_reg, 1);
2223     stwx(temp_reg, temp2_reg);
2224   }
2225   b(done);
2226 
2227   bind(try_revoke_bias);
2228   // The prototype mark in the klass doesn't have the bias bit set any
2229   // more, indicating that objects of this data type are not supposed
2230   // to be biased any more. We are going to try to reset the mark of
2231   // this object to the prototype value and fall through to the
2232   // CAS-based locking scheme. Note that if our CAS fails, it means
2233   // that another thread raced us for the privilege of revoking the
2234   // bias of this particular object, so it's okay to continue in the
2235   // normal locking code.
2236   load_klass(temp_reg, obj_reg);
2237   ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
2238   andi(temp2_reg, mark_reg, markOopDesc::age_mask_in_place);
2239   orr(temp_reg, temp_reg, temp2_reg);
2240 
2241   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2242 
2243   // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg).
2244   cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
2245                  /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
2246                  /*where=*/obj_reg,
2247                  MacroAssembler::MemBarAcq,
2248                  MacroAssembler::cmpxchgx_hint_acquire_lock());
2249 
2250   // reload markOop in mark_reg before continuing with lightweight locking
2251   ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg);
2252 
2253   // Fall through to the normal CAS-based lock, because no matter what
2254   // the result of the above CAS, some thread must have succeeded in
2255   // removing the bias bit from the object's header.
2256   if (PrintBiasedLockingStatistics) {
2257     Label l;
2258     bne(cr_reg, l);
2259     load_const(temp2_reg, (address) BiasedLocking::revoked_lock_entry_count_addr(), temp_reg);
2260     lwzx(temp_reg, temp2_reg);
2261     addi(temp_reg, temp_reg, 1);
2262     stwx(temp_reg, temp2_reg);
2263     bind(l);
2264   }
2265 
2266   bind(cas_label);
2267 }
2268 
2269 void MacroAssembler::biased_locking_exit (ConditionRegister cr_reg, Register mark_addr, Register temp_reg, Label& done) {
2270   // Check for biased locking unlock case, which is a no-op
2271   // Note: we do not have to check the thread ID for two reasons.
2272   // First, the interpreter checks for IllegalMonitorStateException at
2273   // a higher level. Second, if the bias was revoked while we held the
2274   // lock, the object could not be rebiased toward another thread, so
2275   // the bias bit would be clear.
2276 
2277   ld(temp_reg, 0, mark_addr);
2278   andi(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
2279 
2280   cmpwi(cr_reg, temp_reg, markOopDesc::biased_lock_pattern);
2281   beq(cr_reg, done);
2282 }
2283 
2284 // allocation (for C1)
2285 void MacroAssembler::eden_allocate(
2286   Register obj,                      // result: pointer to object after successful allocation
2287   Register var_size_in_bytes,        // object size in bytes if unknown at compile time; invalid otherwise
2288   int      con_size_in_bytes,        // object size in bytes if   known at compile time
2289   Register t1,                       // temp register
2290   Register t2,                       // temp register
2291   Label&   slow_case                 // continuation point if fast allocation fails
2292 ) {
2293   b(slow_case);
2294 }
2295 
2296 void MacroAssembler::tlab_allocate(
2297   Register obj,                      // result: pointer to object after successful allocation
2298   Register var_size_in_bytes,        // object size in bytes if unknown at compile time; invalid otherwise
2299   int      con_size_in_bytes,        // object size in bytes if   known at compile time
2300   Register t1,                       // temp register
2301   Label&   slow_case                 // continuation point if fast allocation fails
2302 ) {
2303   // make sure arguments make sense
2304   assert_different_registers(obj, var_size_in_bytes, t1);
2305   assert(0 <= con_size_in_bytes && is_simm13(con_size_in_bytes), "illegal object size");
2306   assert((con_size_in_bytes & MinObjAlignmentInBytesMask) == 0, "object size is not multiple of alignment");
2307 
2308   const Register new_top = t1;
2309   //verify_tlab(); not implemented
2310 
2311   ld(obj, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
2312   ld(R0, in_bytes(JavaThread::tlab_end_offset()), R16_thread);
2313   if (var_size_in_bytes == noreg) {
2314     addi(new_top, obj, con_size_in_bytes);
2315   } else {
2316     add(new_top, obj, var_size_in_bytes);
2317   }
2318   cmpld(CCR0, new_top, R0);
2319   bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_case);
2320 
2321 #ifdef ASSERT
2322   // make sure new free pointer is properly aligned
2323   {
2324     Label L;
2325     andi_(R0, new_top, MinObjAlignmentInBytesMask);
2326     beq(CCR0, L);
2327     stop("updated TLAB free is not properly aligned", 0x934);
2328     bind(L);
2329   }
2330 #endif // ASSERT
2331 
2332   // update the tlab top pointer
2333   std(new_top, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
2334   //verify_tlab(); not implemented
2335 }
2336 void MacroAssembler::tlab_refill(Label& retry_tlab, Label& try_eden, Label& slow_case) {
2337   unimplemented("tlab_refill");
2338 }
2339 void MacroAssembler::incr_allocated_bytes(RegisterOrConstant size_in_bytes, Register t1, Register t2) {
2340   unimplemented("incr_allocated_bytes");
2341 }
2342 
2343 address MacroAssembler::emit_trampoline_stub(int destination_toc_offset,
2344                                              int insts_call_instruction_offset, Register Rtoc) {
2345   // Start the stub.
2346   address stub = start_a_stub(64);
2347   if (stub == NULL) { return NULL; } // CodeCache full: bail out
2348 
2349   // Create a trampoline stub relocation which relates this trampoline stub
2350   // with the call instruction at insts_call_instruction_offset in the
2351   // instructions code-section.
2352   relocate(trampoline_stub_Relocation::spec(code()->insts()->start() + insts_call_instruction_offset));
2353   const int stub_start_offset = offset();
2354 
2355   // For java_to_interp stubs we use R11_scratch1 as scratch register
2356   // and in call trampoline stubs we use R12_scratch2. This way we
2357   // can distinguish them (see is_NativeCallTrampolineStub_at()).
2358   Register reg_scratch = R12_scratch2;
2359 
2360   // Now, create the trampoline stub's code:
2361   // - load the TOC
2362   // - load the call target from the constant pool
2363   // - call
2364   if (Rtoc == noreg) {
2365     calculate_address_from_global_toc(reg_scratch, method_toc());
2366     Rtoc = reg_scratch;
2367   }
2368 
2369   ld_largeoffset_unchecked(reg_scratch, destination_toc_offset, Rtoc, false);
2370   mtctr(reg_scratch);
2371   bctr();
2372 
2373   const address stub_start_addr = addr_at(stub_start_offset);
2374 
2375   // Assert that the encoded destination_toc_offset can be identified and that it is correct.
2376   assert(destination_toc_offset == NativeCallTrampolineStub_at(stub_start_addr)->destination_toc_offset(),
2377          "encoded offset into the constant pool must match");
2378   // Trampoline_stub_size should be good.
2379   assert((uint)(offset() - stub_start_offset) <= trampoline_stub_size, "should be good size");
2380   assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");
2381 
2382   // End the stub.
2383   end_a_stub();
2384   return stub;
2385 }
2386 
2387 // TM on PPC64.
2388 void MacroAssembler::atomic_inc_ptr(Register addr, Register result, int simm16) {
2389   Label retry;
2390   bind(retry);
2391   ldarx(result, addr, /*hint*/ false);
2392   addi(result, result, simm16);
2393   stdcx_(result, addr);
2394   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
2395     bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
2396   } else {
2397     bne(                  CCR0, retry); // stXcx_ sets CCR0
2398   }
2399 }
2400 
2401 void MacroAssembler::atomic_ori_int(Register addr, Register result, int uimm16) {
2402   Label retry;
2403   bind(retry);
2404   lwarx(result, addr, /*hint*/ false);
2405   ori(result, result, uimm16);
2406   stwcx_(result, addr);
2407   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
2408     bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
2409   } else {
2410     bne(                  CCR0, retry); // stXcx_ sets CCR0
2411   }
2412 }
2413 
2414 #if INCLUDE_RTM_OPT
2415 
2416 // Update rtm_counters based on abort status
2417 // input: abort_status
2418 //        rtm_counters (RTMLockingCounters*)
2419 void MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters_Reg) {
2420   // Mapping to keep PreciseRTMLockingStatistics similar to x86.
2421   // x86 ppc (! means inverted, ? means not the same)
2422   //  0   31  Set if abort caused by XABORT instruction.
2423   //  1  ! 7  If set, the transaction may succeed on a retry. This bit is always clear if bit 0 is set.
2424   //  2   13  Set if another logical processor conflicted with a memory address that was part of the transaction that aborted.
2425   //  3   10  Set if an internal buffer overflowed.
2426   //  4  ?12  Set if a debug breakpoint was hit.
2427   //  5  ?32  Set if an abort occurred during execution of a nested transaction.
2428   const  int tm_failure_bit[] = {Assembler::tm_tabort, // Note: Seems like signal handler sets this, too.
2429                                  Assembler::tm_failure_persistent, // inverted: transient
2430                                  Assembler::tm_trans_cf,
2431                                  Assembler::tm_footprint_of,
2432                                  Assembler::tm_non_trans_cf,
2433                                  Assembler::tm_suspended};
2434   const bool tm_failure_inv[] = {false, true, false, false, false, false};
2435   assert(sizeof(tm_failure_bit)/sizeof(int) == RTMLockingCounters::ABORT_STATUS_LIMIT, "adapt mapping!");
2436 
2437   const Register addr_Reg = R0;
2438   // Keep track of offset to where rtm_counters_Reg had pointed to.
2439   int counters_offs = RTMLockingCounters::abort_count_offset();
2440   addi(addr_Reg, rtm_counters_Reg, counters_offs);
2441   const Register temp_Reg = rtm_counters_Reg;
2442 
2443   //atomic_inc_ptr(addr_Reg, temp_Reg); We don't increment atomically
2444   ldx(temp_Reg, addr_Reg);
2445   addi(temp_Reg, temp_Reg, 1);
2446   stdx(temp_Reg, addr_Reg);
2447 
2448   if (PrintPreciseRTMLockingStatistics) {
2449     int counters_offs_delta = RTMLockingCounters::abortX_count_offset() - counters_offs;
2450 
2451     //mftexasr(abort_status); done by caller
2452     for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
2453       counters_offs += counters_offs_delta;
2454       li(temp_Reg, counters_offs_delta); // can't use addi with R0
2455       add(addr_Reg, addr_Reg, temp_Reg); // point to next counter
2456       counters_offs_delta = sizeof(uintx);
2457 
2458       Label check_abort;
2459       rldicr_(temp_Reg, abort_status, tm_failure_bit[i], 0);
2460       if (tm_failure_inv[i]) {
2461         bne(CCR0, check_abort);
2462       } else {
2463         beq(CCR0, check_abort);
2464       }
2465       //atomic_inc_ptr(addr_Reg, temp_Reg); We don't increment atomically
2466       ldx(temp_Reg, addr_Reg);
2467       addi(temp_Reg, temp_Reg, 1);
2468       stdx(temp_Reg, addr_Reg);
2469       bind(check_abort);
2470     }
2471   }
2472   li(temp_Reg, -counters_offs); // can't use addi with R0
2473   add(rtm_counters_Reg, addr_Reg, temp_Reg); // restore
2474 }
2475 
2476 // Branch if (random & (count-1) != 0), count is 2^n
2477 // tmp and CR0 are killed
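     // Hedged note: assuming roughly uniform low timebase bits, the branch is taken with
     // probability (count-1)/count, so the fall-through path runs about once per count calls.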
2478 void MacroAssembler::branch_on_random_using_tb(Register tmp, int count, Label& brLabel) {
2479   mftb(tmp);
2480   andi_(tmp, tmp, count-1);
2481   bne(CCR0, brLabel);
2482 }
2483 
2484 // Perform abort ratio calculation, set no_rtm bit if high ratio.
2485 // input:  rtm_counters_Reg (RTMLockingCounters* address) - KILLED
2486 void MacroAssembler::rtm_abort_ratio_calculation(Register rtm_counters_Reg,
2487                                                  RTMLockingCounters* rtm_counters,
2488                                                  Metadata* method_data) {
2489   Label L_done, L_check_always_rtm1, L_check_always_rtm2;
2490 
2491   if (RTMLockingCalculationDelay > 0) {
2492     // Delay calculation.
2493     ld(rtm_counters_Reg, (RegisterOrConstant)(intptr_t)RTMLockingCounters::rtm_calculation_flag_addr());
2494     cmpdi(CCR0, rtm_counters_Reg, 0);
2495     beq(CCR0, L_done);
2496     load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload
2497   }
2498   // Abort ratio calculation only if abort_count > RTMAbortThreshold.
2499   //   Aborted transactions = abort_count * 100
2500   //   All transactions = total_count *  RTMTotalCountIncrRate
2501   //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
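       //   I.e. the code below sets no_rtm if abort_count * 100 >= total_count * RTMTotalCountIncrRate * RTMAbortRatio.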
2502   ld(R0, RTMLockingCounters::abort_count_offset(), rtm_counters_Reg);
2503   if (is_simm(RTMAbortThreshold, 16)) {   // cmpdi can handle 16bit immediate only.
2504     cmpdi(CCR0, R0, RTMAbortThreshold);
2505     blt(CCR0, L_check_always_rtm2);  // reload of rtm_counters_Reg not necessary
2506   } else {
2507     load_const_optimized(rtm_counters_Reg, RTMAbortThreshold);
2508     cmpd(CCR0, R0, rtm_counters_Reg);
2509     blt(CCR0, L_check_always_rtm1);  // reload of rtm_counters_Reg required
2510   }
2511   mulli(R0, R0, 100);
2512 
2513   const Register tmpReg = rtm_counters_Reg;
2514   ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg);
2515   mulli(tmpReg, tmpReg, RTMTotalCountIncrRate); // allowable range: int16
2516   mulli(tmpReg, tmpReg, RTMAbortRatio);         // allowable range: int16
2517   cmpd(CCR0, R0, tmpReg);
2518   blt(CCR0, L_check_always_rtm1); // jump to reload
2519   if (method_data != NULL) {
2520     // Set rtm_state to "no rtm" in MDO.
2521     // Not using a metadata relocation. Method and Class Loader are kept alive anyway.
2522     // (See nmethod::metadata_do and CodeBuffer::finalize_oop_references.)
2523     load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg);
2524     atomic_ori_int(R0, tmpReg, NoRTM);
2525   }
2526   b(L_done);
2527 
2528   bind(L_check_always_rtm1);
2529   load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload
2530   bind(L_check_always_rtm2);
2531   ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg);
2532   int64_t thresholdValue = RTMLockingThreshold / RTMTotalCountIncrRate;
2533   if (is_simm(thresholdValue, 16)) {   // cmpdi can handle 16bit immediate only.
2534     cmpdi(CCR0, tmpReg, thresholdValue);
2535   } else {
2536     load_const_optimized(R0, thresholdValue);
2537     cmpd(CCR0, tmpReg, R0);
2538   }
2539   blt(CCR0, L_done);
2540   if (method_data != NULL) {
2541     // Set rtm_state to "always rtm" in MDO.
2542     // Not using a metadata relocation. See above.
2543     load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg);
2544     atomic_ori_int(R0, tmpReg, UseRTM);
2545   }
2546   bind(L_done);
2547 }
2548 
2549 // Update counters and perform abort ratio calculation.
2550 // input: abort_status_Reg
2551 void MacroAssembler::rtm_profiling(Register abort_status_Reg, Register temp_Reg,
2552                                    RTMLockingCounters* rtm_counters,
2553                                    Metadata* method_data,
2554                                    bool profile_rtm) {
2555 
2556   assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2557   // Update rtm counters based on state at abort.
2558   // Reads abort_status_Reg, updates flags.
2559   assert_different_registers(abort_status_Reg, temp_Reg);
2560   load_const_optimized(temp_Reg, (address)rtm_counters, R0);
2561   rtm_counters_update(abort_status_Reg, temp_Reg);
2562   if (profile_rtm) {
2563     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2564     rtm_abort_ratio_calculation(temp_Reg, rtm_counters, method_data);
2565   }
2566 }
2567 
2568 // Retry on abort if abort's status indicates non-persistent failure.
2569 // inputs: retry_count_Reg
2570 //       : abort_status_Reg
2571 // output: retry_count_Reg decremented by 1
2572 void MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg,
2573                                              Label& retryLabel, Label* checkRetry) {
2574   Label doneRetry;
2575   rldicr_(R0, abort_status_Reg, tm_failure_persistent, 0);
2576   bne(CCR0, doneRetry);
2577   if (checkRetry) { bind(*checkRetry); }
2578   addic_(retry_count_Reg, retry_count_Reg, -1);
2579   blt(CCR0, doneRetry);
2580   smt_yield(); // Can't use wait(). No permission (SIGILL).
2581   b(retryLabel);
2582   bind(doneRetry);
2583 }
2584 
2585 // Spin and retry if lock is busy.
2586 // inputs: owner_addr_Reg (monitor address)
2587 //       : retry_count_Reg
2588 // output: retry_count_Reg decremented by 1
2589 // CTR is killed
2590 void MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register owner_addr_Reg, Label& retryLabel) {
2591   Label SpinLoop, doneRetry;
2592   addic_(retry_count_Reg, retry_count_Reg, -1);
2593   blt(CCR0, doneRetry);
2594 
2595   if (RTMSpinLoopCount > 1) {
2596     li(R0, RTMSpinLoopCount);
2597     mtctr(R0);
2598   }
2599 
2600   bind(SpinLoop);
2601   smt_yield(); // Can't use waitrsv(). No permission (SIGILL).
2602 
2603   if (RTMSpinLoopCount > 1) {
2604     bdz(retryLabel);
2605     ld(R0, 0, owner_addr_Reg);
2606     cmpdi(CCR0, R0, 0);
2607     bne(CCR0, SpinLoop);
2608   }
2609 
2610   b(retryLabel);
2611 
2612   bind(doneRetry);
2613 }
2614 
2615 // Use RTM for normal stack locks.
2616 // Input: objReg (object to lock)
2617 void MacroAssembler::rtm_stack_locking(ConditionRegister flag,
2618                                        Register obj, Register mark_word, Register tmp,
2619                                        Register retry_on_abort_count_Reg,
2620                                        RTMLockingCounters* stack_rtm_counters,
2621                                        Metadata* method_data, bool profile_rtm,
2622                                        Label& DONE_LABEL, Label& IsInflated) {
2623   assert(UseRTMForStackLocks, "why call this otherwise?");
2624   assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
2625   Label L_rtm_retry, L_decrement_retry, L_on_abort;
2626 
2627   if (RTMRetryCount > 0) {
2628     load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
2629     bind(L_rtm_retry);
2630   }
2631   andi_(R0, mark_word, markOopDesc::monitor_value);  // inflated vs stack-locked|neutral|biased
2632   bne(CCR0, IsInflated);
2633 
2634   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2635     Label L_noincrement;
2636     if (RTMTotalCountIncrRate > 1) {
2637       branch_on_random_using_tb(tmp, RTMTotalCountIncrRate, L_noincrement);
2638     }
2639     assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
2640     load_const_optimized(tmp, (address)stack_rtm_counters->total_count_addr(), R0);
2641     //atomic_inc_ptr(tmp, /*temp, will be reloaded*/mark_word); We don't increment atomically
2642     ldx(mark_word, tmp);
2643     addi(mark_word, mark_word, 1);
2644     stdx(mark_word, tmp);
2645     bind(L_noincrement);
2646   }
2647   tbegin_();
2648   beq(CCR0, L_on_abort);
2649   ld(mark_word, oopDesc::mark_offset_in_bytes(), obj);         // Reload in transaction, conflicts need to be tracked.
2650   andi(R0, mark_word, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
2651   cmpwi(flag, R0, markOopDesc::unlocked_value);                // bits = 001 unlocked
2652   beq(flag, DONE_LABEL);                                       // all done if unlocked
2653 
2654   if (UseRTMXendForLockBusy) {
2655     tend_();
2656     b(L_decrement_retry);
2657   } else {
2658     tabort_();
2659   }
2660   bind(L_on_abort);
2661   const Register abort_status_Reg = tmp;
2662   mftexasr(abort_status_Reg);
2663   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2664     rtm_profiling(abort_status_Reg, /*temp*/mark_word, stack_rtm_counters, method_data, profile_rtm);
2665   }
2666   ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // reload
2667   if (RTMRetryCount > 0) {
2668     // Retry on lock abort if abort status is not permanent.
2669     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry, &L_decrement_retry);
2670   } else {
2671     bind(L_decrement_retry);
2672   }
2673 }
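
// Rough shape of the transactional stack-lock fast path above (editorial sketch,
// not emitted code; names are descriptive only):
//
//   if (mark & monitor_value) goto IsInflated;               // monitor already inflated
//   if (transaction begins successfully) {
//     mark = obj->mark();                                    // reload inside the transaction
//     if ((mark & lock_bits) == unlocked_value) goto DONE;   // speculatively hold the lock
//     end or abort the transaction;                          // object is locked by someone
//   }
//   read texasr, profile the abort, and retry if the failure looks transient.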
2674 
2675 // Use RTM for inflated locks (the monitor is already inflated).
2676 // inputs: obj       (object to lock)
2677 //         mark_word (current header - KILLED)
2678 //         boxReg    (on-stack box address (displaced header location) - KILLED)
2679 void MacroAssembler::rtm_inflated_locking(ConditionRegister flag,
2680                                           Register obj, Register mark_word, Register boxReg,
2681                                           Register retry_on_busy_count_Reg, Register retry_on_abort_count_Reg,
2682                                           RTMLockingCounters* rtm_counters,
2683                                           Metadata* method_data, bool profile_rtm,
2684                                           Label& DONE_LABEL) {
2685   assert(UseRTMLocking, "why call this otherwise?");
2686   Label L_rtm_retry, L_decrement_retry, L_on_abort;
2687   // Clean monitor_value bit to get valid pointer.
2688   int owner_offset = ObjectMonitor::owner_offset_in_bytes() - markOopDesc::monitor_value;
2689 
2690   // Store non-null, using boxReg instead of (intptr_t)markOopDesc::unused_mark().
2691   std(boxReg, BasicLock::displaced_header_offset_in_bytes(), boxReg);
2692   const Register tmpReg = boxReg;
2693   const Register owner_addr_Reg = mark_word;
2694   addi(owner_addr_Reg, mark_word, owner_offset);
2695 
2696   if (RTMRetryCount > 0) {
2697     load_const_optimized(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy.
2698     load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort.
2699     bind(L_rtm_retry);
2700   }
2701   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2702     Label L_noincrement;
2703     if (RTMTotalCountIncrRate > 1) {
2704       branch_on_random_using_tb(R0, RTMTotalCountIncrRate, L_noincrement);
2705     }
2706     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2707     load_const(R0, (address)rtm_counters->total_count_addr(), tmpReg);
2708     //atomic_inc_ptr(R0, tmpReg); We don't increment atomically
2709     ldx(tmpReg, R0);
2710     addi(tmpReg, tmpReg, 1);
2711     stdx(tmpReg, R0);
2712     bind(L_noincrement);
2713   }
2714   tbegin_();
2715   beq(CCR0, L_on_abort);
2716   // We don't reload mark word. Will only be reset at safepoint.
2717   ld(R0, 0, owner_addr_Reg); // Load in transaction, conflicts need to be tracked.
2718   cmpdi(flag, R0, 0);
2719   beq(flag, DONE_LABEL);
2720 
2721   if (UseRTMXendForLockBusy) {
2722     tend_();
2723     b(L_decrement_retry);
2724   } else {
2725     tabort_();
2726   }
2727   bind(L_on_abort);
2728   const Register abort_status_Reg = tmpReg;
2729   mftexasr(abort_status_Reg);
2730   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2731     rtm_profiling(abort_status_Reg, /*temp*/ owner_addr_Reg, rtm_counters, method_data, profile_rtm);
2732     // Restore owner_addr_Reg
2733     ld(mark_word, oopDesc::mark_offset_in_bytes(), obj);
2734 #ifdef ASSERT
2735     andi_(R0, mark_word, markOopDesc::monitor_value);
2736     asm_assert_ne("must be inflated", 0xa754); // Deflating only allowed at safepoint.
2737 #endif
2738     addi(owner_addr_Reg, mark_word, owner_offset);
2739   }
2740   if (RTMRetryCount > 0) {
2741     // Retry on lock abort if abort status is not permanent.
2742     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
2743   }
2744 
2745   // Appears unlocked - try to swing _owner from null to non-null.
2746   cmpxchgd(flag, /*current val*/ R0, (intptr_t)0, /*new val*/ R16_thread, owner_addr_Reg,
2747            MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2748            MacroAssembler::cmpxchgx_hint_acquire_lock(), noreg, &L_decrement_retry, true);
2749 
2750   if (RTMRetryCount > 0) {
2751     // Success: done. A failing cmpxchg above has already branched to L_decrement_retry.
2752     b(DONE_LABEL);
2753     bind(L_decrement_retry);
2754     // Spin and retry if lock is busy.
2755     rtm_retry_lock_on_busy(retry_on_busy_count_Reg, owner_addr_Reg, L_rtm_retry);
2756   } else {
2757     bind(L_decrement_retry);
2758   }
2759 }
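
// Rough shape of the transactional monitor-lock path above (editorial sketch only):
//
//   box->displaced_header = non-null;                        // mark box as "inflated path"
//   owner_addr = monitor + owner_offset;
//   if (transaction begins successfully) {
//     if (*owner_addr == NULL) goto DONE;                    // speculatively own the monitor
//     end or abort the transaction;                          // monitor is owned
//   }
//   profile the abort, retry transient aborts, then fall back to
//   CAS(*owner_addr: NULL -> current thread), spinning while the owner is non-null.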
2760 
2761 #endif //  INCLUDE_RTM_OPT
2762 
2763 // "The box" is the space on the stack where we copy the object mark.
2764 void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register oop, Register box,
2765                                                Register temp, Register displaced_header, Register current_header,
2766                                                bool try_bias,
2767                                                RTMLockingCounters* rtm_counters,
2768                                                RTMLockingCounters* stack_rtm_counters,
2769                                                Metadata* method_data,
2770                                                bool use_rtm, bool profile_rtm) {
2771   assert_different_registers(oop, box, temp, displaced_header, current_header);
2772   assert(flag != CCR0, "bad condition register");
2773   Label cont;
2774   Label object_has_monitor;
2775   Label cas_failed;
2776 
2777   // Load markOop from object into displaced_header.
2778   ld(displaced_header, oopDesc::mark_offset_in_bytes(), oop);
2779 
2780 
2781   // Always do locking in runtime.
2782   if (EmitSync & 0x01) {
2783     cmpdi(flag, oop, 0); // Oop can't be 0 here => always false.
2784     return;
2785   }
2786 
2787   if (try_bias) {
2788     biased_locking_enter(flag, oop, displaced_header, temp, current_header, cont);
2789   }
2790 
2791 #if INCLUDE_RTM_OPT
2792   if (UseRTMForStackLocks && use_rtm) {
2793     rtm_stack_locking(flag, oop, displaced_header, temp, /*temp*/ current_header,
2794                       stack_rtm_counters, method_data, profile_rtm,
2795                       cont, object_has_monitor);
2796   }
2797 #endif // INCLUDE_RTM_OPT
2798 
2799   // Handle existing monitor.
2800   if ((EmitSync & 0x02) == 0) {
2801     // The object has an existing monitor iff (mark & monitor_value) != 0.
2802     andi_(temp, displaced_header, markOopDesc::monitor_value);
2803     bne(CCR0, object_has_monitor);
2804   }
2805 
2806   // Set displaced_header to be (markOop of object | UNLOCK_VALUE).
2807   ori(displaced_header, displaced_header, markOopDesc::unlocked_value);
2808 
2809   // The displaced header is the compare value for the cmpxchg below.
2810 
2811   // Initialize the box. (Must happen before we update the object mark!)
2812   std(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2813 
2814   // Must fence, otherwise, preceding store(s) may float below cmpxchg.
2815   // Compare object markOop with mark and if equal exchange scratch1 with object markOop.
2816   cmpxchgd(/*flag=*/flag,
2817            /*current_value=*/current_header,
2818            /*compare_value=*/displaced_header,
2819            /*exchange_value=*/box,
2820            /*where=*/oop,
2821            MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2822            MacroAssembler::cmpxchgx_hint_acquire_lock(),
2823            noreg,
2824            &cas_failed,
2825            /*check without membar and ldarx first*/true);
2826   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2827 
2828   // If the compare-and-exchange succeeded, then we found an unlocked
2829   // object and we have now locked it.
2830   b(cont);
2831 
2832   bind(cas_failed);
2833   // We did not see an unlocked object so try the fast recursive case.
2834 
2835   // Check if the owner is self by comparing the value in the markOop of object
2836   // (current_header) with the stack pointer.
2837   sub(current_header, current_header, R1_SP);
2838   load_const_optimized(temp, ~(os::vm_page_size()-1) | markOopDesc::lock_mask_in_place);
2839 
2840   and_(R0/*==0?*/, current_header, temp);
2841   // If the result is zero, the mark word held a stack address inside our own
2842   // frame, so this is a recursive lock: store 0 as the displaced header in the box.
2843   mcrf(flag, CCR0);
2844   std(R0/*==0, perhaps*/, BasicLock::displaced_header_offset_in_bytes(), box);
2845 
2846   // Handle existing monitor.
2847   if ((EmitSync & 0x02) == 0) {
2848     b(cont);
2849 
2850     bind(object_has_monitor);
2851     // The object's monitor m is unlocked iff m->owner == NULL,
2852     // otherwise m->owner may contain a thread or a stack address.
2853 
2854 #if INCLUDE_RTM_OPT
2855     // Use the same RTM locking code in 32- and 64-bit VM.
2856     if (use_rtm) {
2857       rtm_inflated_locking(flag, oop, displaced_header, box, temp, /*temp*/ current_header,
2858                            rtm_counters, method_data, profile_rtm, cont);
2859     } else {
2860 #endif // INCLUDE_RTM_OPT
2861 
2862     // Try to CAS m->owner from NULL to current thread.
2863     addi(temp, displaced_header, ObjectMonitor::owner_offset_in_bytes()-markOopDesc::monitor_value);
2864     cmpxchgd(/*flag=*/flag,
2865              /*current_value=*/current_header,
2866              /*compare_value=*/(intptr_t)0,
2867              /*exchange_value=*/R16_thread,
2868              /*where=*/temp,
2869              MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2870              MacroAssembler::cmpxchgx_hint_acquire_lock());
2871 
2872     // Store a non-null value into the box.
2873     std(box, BasicLock::displaced_header_offset_in_bytes(), box);
2874 
2875 #   ifdef ASSERT
2876     bne(flag, cont);
2877     // We have acquired the monitor, check some invariants.
2878     addi(/*monitor=*/temp, temp, -ObjectMonitor::owner_offset_in_bytes());
2879     // Invariant 1: _recursions should be 0.
2880     //assert(ObjectMonitor::recursions_size_in_bytes() == 8, "unexpected size");
2881     asm_assert_mem8_is_zero(ObjectMonitor::recursions_offset_in_bytes(), temp,
2882                             "monitor->_recursions should be 0", -1);
2883     // Invariant 2: OwnerIsThread shouldn't be 0.
2884     //assert(ObjectMonitor::OwnerIsThread_size_in_bytes() == 4, "unexpected size");
2885     //asm_assert_mem4_isnot_zero(ObjectMonitor::OwnerIsThread_offset_in_bytes(), temp,
2886     //                           "monitor->OwnerIsThread shouldn't be 0", -1);
2887 #   endif
2888 
2889 #if INCLUDE_RTM_OPT
2890     } // use_rtm()
2891 #endif
2892   }
2893 
2894   bind(cont);
2895   // flag == EQ indicates success
2896   // flag == NE indicates failure
2897 }
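
// Editorial summary of the fast-lock flow emitted above (not code):
//   1. optional biased-locking fast path,
//   2. optional RTM stack lock (thin lock speculated inside a transaction),
//   3. mark & monitor_value set -> inflated path (RTM or CAS on monitor->owner),
//   4. otherwise CAS the box address into the object's mark word (thin lock);
//      on CAS failure, check whether this is a recursive lock of our own frame.
// flag == EQ indicates success, NE indicates failure (slow path needed).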
2898 
2899 void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register oop, Register box,
2900                                                  Register temp, Register displaced_header, Register current_header,
2901                                                  bool try_bias, bool use_rtm) {
2902   assert_different_registers(oop, box, temp, displaced_header, current_header);
2903   assert(flag != CCR0, "bad condition register");
2904   Label cont;
2905   Label object_has_monitor;
2906 
2907   // Always do locking in runtime.
2908   if (EmitSync & 0x01) {
2909     cmpdi(flag, oop, 0); // Oop can't be 0 here => always false.
2910     return;
2911   }
2912 
2913   if (try_bias) {
2914     biased_locking_exit(flag, oop, current_header, cont);
2915   }
2916 
2917 #if INCLUDE_RTM_OPT
2918   if (UseRTMForStackLocks && use_rtm) {
2919     assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
2920     Label L_regular_unlock;
2921     ld(current_header, oopDesc::mark_offset_in_bytes(), oop);         // fetch markword
2922     andi(R0, current_header, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
2923     cmpwi(flag, R0, markOopDesc::unlocked_value);                     // bits = 001 unlocked
2924     bne(flag, L_regular_unlock);                                      // else RegularLock
2925     tend_();                                                          // otherwise end...
2926     b(cont);                                                          // ... and we're done
2927     bind(L_regular_unlock);
2928   }
2929 #endif
2930 
2931   // Find the lock address and load the displaced header from the stack.
2932   ld(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2933 
2934   // If the displaced header is 0, we have a recursive unlock.
2935   cmpdi(flag, displaced_header, 0);
2936   beq(flag, cont);
2937 
2938   // Handle existing monitor.
2939   if ((EmitSync & 0x02) == 0) {
2940     // The object has an existing monitor iff (mark & monitor_value) != 0.
2941     RTM_OPT_ONLY( if (!(UseRTMForStackLocks && use_rtm)) ) // skip load if already done
2942     ld(current_header, oopDesc::mark_offset_in_bytes(), oop);
2943     andi_(R0, current_header, markOopDesc::monitor_value);
2944     bne(CCR0, object_has_monitor);
2945   }
2946 
2947   // Check if it is still a lightweight lock: this is true if we see
2948   // the stack address of the basicLock in the markOop of the object.
2949   // Cmpxchg sets flag to cmpd(current_header, box).
2950   cmpxchgd(/*flag=*/flag,
2951            /*current_value=*/current_header,
2952            /*compare_value=*/box,
2953            /*exchange_value=*/displaced_header,
2954            /*where=*/oop,
2955            MacroAssembler::MemBarRel,
2956            MacroAssembler::cmpxchgx_hint_release_lock(),
2957            noreg,
2958            &cont);
2959 
2960   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2961 
2962   // Handle existing monitor.
2963   if ((EmitSync & 0x02) == 0) {
2964     b(cont);
2965 
2966     bind(object_has_monitor);
2967     addi(current_header, current_header, -markOopDesc::monitor_value); // monitor
2968     ld(temp,             ObjectMonitor::owner_offset_in_bytes(), current_header);
2969 
2970     // It's inflated.
2971 #if INCLUDE_RTM_OPT
2972     if (use_rtm) {
2973       Label L_regular_inflated_unlock;
2974       // Clean monitor_value bit to get valid pointer
2975       cmpdi(flag, temp, 0);
2976       bne(flag, L_regular_inflated_unlock);
2977       tend_();
2978       b(cont);
2979       bind(L_regular_inflated_unlock);
2980     }
2981 #endif
2982 
2983     ld(displaced_header, ObjectMonitor::recursions_offset_in_bytes(), current_header);
2984     xorr(temp, R16_thread, temp);      // Will be 0 if we are the owner.
2985     orr(temp, temp, displaced_header); // Will be 0 if there are 0 recursions.
2986     cmpdi(flag, temp, 0);
2987     bne(flag, cont);
2988 
2989     ld(temp,             ObjectMonitor::EntryList_offset_in_bytes(), current_header);
2990     ld(displaced_header, ObjectMonitor::cxq_offset_in_bytes(), current_header);
2991     orr(temp, temp, displaced_header); // Will be 0 if both are 0.
2992     cmpdi(flag, temp, 0);
2993     bne(flag, cont);
2994     release();
2995     std(temp, ObjectMonitor::owner_offset_in_bytes(), current_header);
2996   }
2997 
2998   bind(cont);
2999   // flag == EQ indicates success
3000   // flag == NE indicates failure
3001 }
3002 
3003 // Write the serialization page so the VM thread can do a pseudo-remote membar.
3004 // We use the current thread pointer to calculate a thread-specific
3005 // offset to write to within the page. This minimizes bus traffic
3006 // due to cache line collisions.
3007 void MacroAssembler::serialize_memory(Register thread, Register tmp1, Register tmp2) {
3008   srdi(tmp2, thread, os::get_serialize_page_shift_count());
3009 
3010   int mask = os::vm_page_size() - sizeof(int);
3011   if (Assembler::is_simm(mask, 16)) {
3012     andi(tmp2, tmp2, mask);
3013   } else {
3014     lis(tmp1, (int)((signed short) (mask >> 16)));
3015     ori(tmp1, tmp1, mask & 0x0000ffff);
3016     andr(tmp2, tmp2, tmp1);
3017   }
3018 
3019   load_const(tmp1, (long) os::get_memory_serialize_page());
3020   release();
3021   stwx(R0, tmp1, tmp2);
3022 }
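
// The store above lands, in effect, at the following address (editorial C-style
// sketch; 'shift' stands for os::get_serialize_page_shift_count()):
//
//   serialize_page + (((uintptr_t)thread >> shift) & (page_size - sizeof(int)))
//
// so different threads write different int-aligned words of the same page, and the
// VM thread can later protect that page to force the pseudo-remote memory barrier.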
3023 
3024 void MacroAssembler::safepoint_poll(Label& slow_path, Register temp_reg) {
3025   if (SafepointMechanism::uses_thread_local_poll()) {
3026     ld(temp_reg, in_bytes(Thread::polling_page_offset()), R16_thread);
3027     // Armed page has poll_bit set.
3028     andi_(temp_reg, temp_reg, SafepointMechanism::poll_bit());
3029   } else {
3030     lwz(temp_reg, (RegisterOrConstant)(intptr_t)SafepointSynchronize::address_of_state());
3031     cmpwi(CCR0, temp_reg, SafepointSynchronize::_not_synchronized);
3032   }
3033   bne(CCR0, slow_path);
3034 }
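
// Editorial sketch of the poll above (pseudocode only):
//
//   if (thread-local polling)  must_stop = (thread->polling_page & poll_bit) != 0;
//   else                       must_stop = (SafepointSynchronize::_state != _not_synchronized);
//   if (must_stop) goto slow_path;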
3035 
3036 
3037 // GC barrier helper macros
3038 
3039 // Write the card table byte if needed.
3040 void MacroAssembler::card_write_barrier_post(Register Rstore_addr, Register Rnew_val, Register Rtmp) {
3041   CardTableModRefBS* bs =
3042     barrier_set_cast<CardTableModRefBS>(Universe::heap()->barrier_set());
3043   assert(bs->kind() == BarrierSet::CardTableForRS ||
3044          bs->kind() == BarrierSet::CardTableExtension, "wrong barrier");
3045 #ifdef ASSERT
3046   cmpdi(CCR0, Rnew_val, 0);
3047   asm_assert_ne("null oop not allowed", 0x321);
3048 #endif
3049   card_table_write(bs->byte_map_base, Rtmp, Rstore_addr);
3050 }
3051 
3052 // Write the card table byte.
3053 void MacroAssembler::card_table_write(jbyte* byte_map_base, Register Rtmp, Register Robj) {
3054   assert_different_registers(Robj, Rtmp, R0);
3055   load_const_optimized(Rtmp, (address)byte_map_base, R0);
3056   srdi(Robj, Robj, CardTableModRefBS::card_shift);
3057   li(R0, 0); // dirty
3058   if (UseConcMarkSweepGC) membar(Assembler::StoreStore);
3059   stbx(R0, Rtmp, Robj);
3060 }
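
// In C-like terms the store above amounts to (editorial sketch):
//
//   byte_map_base[(uintptr_t)obj >> card_shift] = 0;   // 0 == dirty
//
// with an additional StoreStore barrier in front of the store when CMS is used.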
3061 
3062 // Kills R31 if value is a volatile register.
3063 void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2, bool needs_frame) {
3064   Label done;
3065   cmpdi(CCR0, value, 0);
3066   beq(CCR0, done);         // Use NULL as-is.
3067 
3068   clrrdi(tmp1, value, JNIHandles::weak_tag_size);
3069 #if INCLUDE_ALL_GCS
3070   if (UseG1GC) { andi_(tmp2, value, JNIHandles::weak_tag_mask); }
3071 #endif
3072   ld(value, 0, tmp1);      // Resolve (untagged) jobject.
3073 
3074 #if INCLUDE_ALL_GCS
3075   if (UseG1GC) {
3076     Label not_weak;
3077     beq(CCR0, not_weak);   // Test for jweak tag.
3078     verify_oop(value);
3079     g1_write_barrier_pre(noreg, // obj
3080                          noreg, // offset
3081                          value, // pre_val
3082                          tmp1, tmp2, needs_frame);
3083     bind(not_weak);
3084   }
3085 #endif // INCLUDE_ALL_GCS
3086   verify_oop(value);
3087   bind(done);
3088 }
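
// Editorial pseudocode of the resolution above:
//
//   if (handle == NULL) return NULL;
//   value = *(oop*)(handle & ~weak_tag_mask);            // strip the (possible) jweak tag
//   if (G1 in use && (handle & weak_tag_mask) != 0)
//     g1_write_barrier_pre(value);                       // keep a weak referent alive (SATB)
//   return value;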
3089 
3090 #if INCLUDE_ALL_GCS
3091 // General G1 pre-barrier generator.
3092 // Goal: record the previous value if it is not null.
3093 void MacroAssembler::g1_write_barrier_pre(Register Robj, RegisterOrConstant offset, Register Rpre_val,
3094                                           Register Rtmp1, Register Rtmp2, bool needs_frame) {
3095   Label runtime, filtered;
3096 
3097   // Is marking active?
3098   if (in_bytes(SATBMarkQueue::byte_width_of_active()) == 4) {
3099     lwz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_active()), R16_thread);
3100   } else {
3101     guarantee(in_bytes(SATBMarkQueue::byte_width_of_active()) == 1, "Assumption");
3102     lbz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_active()), R16_thread);
3103   }
3104   cmpdi(CCR0, Rtmp1, 0);
3105   beq(CCR0, filtered);
3106 
3107   // Do we need to load the previous value?
3108   if (Robj != noreg) {
3109     // Load the previous value...
3110     if (UseCompressedOops) {
3111       lwz(Rpre_val, offset, Robj);
3112     } else {
3113       ld(Rpre_val, offset, Robj);
3114     }
3115     // Previous value has been loaded into Rpre_val.
3116   }
3117   assert(Rpre_val != noreg, "must have a real register");
3118 
3119   // Is the previous value null?
3120   cmpdi(CCR0, Rpre_val, 0);
3121   beq(CCR0, filtered);
3122 
3123   if (Robj != noreg && UseCompressedOops) {
3124     decode_heap_oop_not_null(Rpre_val);
3125   }
3126 
3127   // Not filtered, so the previous value must be recorded. First try to store it
3128   // in the thread-local SATB buffer (the common case). If that buffer is full,
3129   // i.e. its index is 0, call into the runtime (SharedRuntime::g1_wb_pre) to
3130   // enqueue the value.
3131 
3132   // Can we store original value in the thread's buffer?
3133   // Is index == 0?
3134   // (The index field is typed as size_t.)
3135   const Register Rbuffer = Rtmp1, Rindex = Rtmp2;
3136 
3137   ld(Rindex, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_index()), R16_thread);
3138   cmpdi(CCR0, Rindex, 0);
3139   beq(CCR0, runtime); // If index == 0, goto runtime.
3140   ld(Rbuffer, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_buf()), R16_thread);
3141 
3142   addi(Rindex, Rindex, -wordSize); // Decrement index.
3143   std(Rindex, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_index()), R16_thread);
3144 
3145   // Record the previous value.
3146   stdx(Rpre_val, Rbuffer, Rindex);
3147   b(filtered);
3148 
3149   bind(runtime);
3150 
3151   // May need to preserve LR. Also needed if current frame is not compatible with C calling convention.
3152   if (needs_frame) {
3153     save_LR_CR(Rtmp1);
3154     push_frame_reg_args(0, Rtmp2);
3155   }
3156 
3157   if (Rpre_val->is_volatile() && Robj == noreg) mr(R31, Rpre_val); // Save pre_val across C call if it was preloaded.
3158   call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), Rpre_val, R16_thread);
3159   if (Rpre_val->is_volatile() && Robj == noreg) mr(Rpre_val, R31); // restore
3160 
3161   if (needs_frame) {
3162     pop_frame();
3163     restore_LR_CR(Rtmp1);
3164   }
3165 
3166   bind(filtered);
3167 }
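
// Editorial pseudocode of the SATB pre-barrier above (field names descriptive only):
//
//   if (!thread->satb_queue.active) return;
//   if (an object/offset was passed) pre_val = *(obj + offset);  // else pre_val preloaded
//   if (pre_val == NULL) return;
//   if (thread->satb_queue.index == 0) {       // buffer full
//     runtime call: SharedRuntime::g1_wb_pre(pre_val, thread);
//   } else {
//     index -= wordSize;  buffer[index] = pre_val;
//   }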
3168 
3169 // General G1 post-barrier generator
3170 // Store cross-region card.
3171 void MacroAssembler::g1_write_barrier_post(Register Rstore_addr, Register Rnew_val, Register Rtmp1, Register Rtmp2, Register Rtmp3, Label *filtered_ext) {
3172   Label runtime, filtered_int;
3173   Label& filtered = (filtered_ext != NULL) ? *filtered_ext : filtered_int;
3174   assert_different_registers(Rstore_addr, Rnew_val, Rtmp1, Rtmp2);
3175 
3176   G1SATBCardTableLoggingModRefBS* bs =
3177     barrier_set_cast<G1SATBCardTableLoggingModRefBS>(Universe::heap()->barrier_set());
3178 
3179   // Does store cross heap regions?
3180   if (G1RSBarrierRegionFilter) {
3181     xorr(Rtmp1, Rstore_addr, Rnew_val);
3182     srdi_(Rtmp1, Rtmp1, HeapRegion::LogOfHRGrainBytes);
3183     beq(CCR0, filtered);
3184   }
3185 
3186   // Crosses regions, storing NULL?
3187 #ifdef ASSERT
3188   cmpdi(CCR0, Rnew_val, 0);
3189   asm_assert_ne("null oop not allowed (G1)", 0x322); // Checked by caller on PPC64, so following branch is obsolete:
3190   //beq(CCR0, filtered);
3191 #endif
3192 
3193   // Storing a region-crossing, non-NULL oop; is the card already dirty?
3194   assert(sizeof(*bs->byte_map_base) == sizeof(jbyte), "adjust this code");
3195   const Register Rcard_addr = Rtmp1;
3196   Register Rbase = Rtmp2;
3197   load_const_optimized(Rbase, (address)bs->byte_map_base, /*temp*/ Rtmp3);
3198 
3199   srdi(Rcard_addr, Rstore_addr, CardTableModRefBS::card_shift);
3200 
3201   // Get the address of the card.
3202   lbzx(/*card value*/ Rtmp3, Rbase, Rcard_addr);
3203   cmpwi(CCR0, Rtmp3, (int)G1SATBCardTableModRefBS::g1_young_card_val());
3204   beq(CCR0, filtered);
3205 
3206   membar(Assembler::StoreLoad);
3207   lbzx(/*card value*/ Rtmp3, Rbase, Rcard_addr);  // Reload after membar.
3208   cmpwi(CCR0, Rtmp3 /* card value */, CardTableModRefBS::dirty_card_val());
3209   beq(CCR0, filtered);
3210 
3211   // Storing a region crossing, non-NULL oop, card is clean.
3212   // Dirty card and log.
3213   li(Rtmp3, CardTableModRefBS::dirty_card_val());
3214   //release(); // G1: oops are allowed to get visible after dirty marking.
3215   stbx(Rtmp3, Rbase, Rcard_addr);
3216 
3217   add(Rcard_addr, Rbase, Rcard_addr); // This is the address which needs to get enqueued.
3218   Rbase = noreg; // end of lifetime
3219 
3220   const Register Rqueue_index = Rtmp2,
3221                  Rqueue_buf   = Rtmp3;
3222   ld(Rqueue_index, in_bytes(JavaThread::dirty_card_queue_offset() + DirtyCardQueue::byte_offset_of_index()), R16_thread);
3223   cmpdi(CCR0, Rqueue_index, 0);
3224   beq(CCR0, runtime); // index == 0 then jump to runtime
3225   ld(Rqueue_buf, in_bytes(JavaThread::dirty_card_queue_offset() + DirtyCardQueue::byte_offset_of_buf()), R16_thread);
3226 
3227   addi(Rqueue_index, Rqueue_index, -wordSize); // decrement index
3228   std(Rqueue_index, in_bytes(JavaThread::dirty_card_queue_offset() + DirtyCardQueue::byte_offset_of_index()), R16_thread);
3229 
3230   stdx(Rcard_addr, Rqueue_buf, Rqueue_index); // store card
3231   b(filtered);
3232 
3233   bind(runtime);
3234 
3235   // Save the live input values.
3236   call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), Rcard_addr, R16_thread);
3237 
3238   bind(filtered_int);
3239 }
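
// Editorial pseudocode of the G1 post-barrier above (region filter applied only
// when G1RSBarrierRegionFilter is set):
//
//   if (((store_addr ^ new_val) >> LogOfHRGrainBytes) == 0) return;  // same region
//   card = &byte_map_base[store_addr >> card_shift];
//   if (*card == g1_young_card_val) return;
//   StoreLoad barrier;
//   if (*card == dirty_card_val) return;
//   *card = dirty_card_val;
//   enqueue card on the thread's dirty card queue, or call
//   SharedRuntime::g1_wb_post(card, thread) if the queue buffer is full.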
3240 #endif // INCLUDE_ALL_GCS
3241 
3242 // Values for last_Java_pc and last_Java_sp must comply with the rules
3243 // in frame_ppc.hpp.
3244 void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) {
3245   // Always set last_Java_pc and flags first because once last_Java_sp
3246   // is visible, has_last_Java_frame is true and users will look at the
3247   // rest of the fields. (Note: flags should always be zero before we
3248   // get here, so they don't need to be set.)
3249 
3250   // Verify that last_Java_pc was zeroed on return to Java
3251   asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread,
3252                           "last_Java_pc not zeroed before leaving Java", 0x200);
3253 
3254   // When returning from a call out of Java mode, the frame anchor's
3255   // last_Java_pc will always be set to NULL. It is set here so that,
3256   // if we are doing a call to native code (not the VM), we capture the
3257   // known pc and don't have to rely on the native call having a
3258   // standard frame linkage where we can find the pc.
3259   if (last_Java_pc != noreg)
3260     std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
3261 
3262   // Set last_Java_sp last.
3263   std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
3264 }
3265 
3266 void MacroAssembler::reset_last_Java_frame(void) {
3267   asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()),
3268                              R16_thread, "SP was not set, still zero", 0x202);
3269 
3270   BLOCK_COMMENT("reset_last_Java_frame {");
3271   li(R0, 0);
3272 
3273   // _last_Java_sp = 0
3274   std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
3275 
3276   // _last_Java_pc = 0
3277   std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
3278   BLOCK_COMMENT("} reset_last_Java_frame");
3279 }
3280 
3281 void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1) {
3282   assert_different_registers(sp, tmp1);
3283 
3284   // sp points to a TOP_IJAVA_FRAME, retrieve frame's PC via
3285   // TOP_IJAVA_FRAME_ABI.
3286   // FIXME: assert that we really have a TOP_IJAVA_FRAME here!
3287   address entry = pc();
3288   load_const_optimized(tmp1, entry);
3289 
3290   set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1);
3291 }
3292 
3293 void MacroAssembler::get_vm_result(Register oop_result) {
3294   // Read:
3295   //   R16_thread
3296   //   R16_thread->in_bytes(JavaThread::vm_result_offset())
3297   //
3298   // Updated:
3299   //   oop_result
3300   //   R16_thread->in_bytes(JavaThread::vm_result_offset())
3301 
3302   verify_thread();
3303 
3304   ld(oop_result, in_bytes(JavaThread::vm_result_offset()), R16_thread);
3305   li(R0, 0);
3306   std(R0, in_bytes(JavaThread::vm_result_offset()), R16_thread);
3307 
3308   verify_oop(oop_result);
3309 }
3310 
3311 void MacroAssembler::get_vm_result_2(Register metadata_result) {
3312   // Read:
3313   //   R16_thread
3314   //   R16_thread->in_bytes(JavaThread::vm_result_2_offset())
3315   //
3316   // Updated:
3317   //   metadata_result
3318   //   R16_thread->in_bytes(JavaThread::vm_result_2_offset())
3319 
3320   ld(metadata_result, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
3321   li(R0, 0);
3322   std(R0, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
3323 }
3324 
3325 Register MacroAssembler::encode_klass_not_null(Register dst, Register src) {
3326   Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided.
3327   if (Universe::narrow_klass_base() != 0) {
3328     // Use dst as temp if it is free.
3329     sub_const_optimized(dst, current, Universe::narrow_klass_base(), R0);
3330     current = dst;
3331   }
3332   if (Universe::narrow_klass_shift() != 0) {
3333     srdi(dst, current, Universe::narrow_klass_shift());
3334     current = dst;
3335   }
3336   return current;
3337 }
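
// Conceptually (editorial note): narrow_klass = (klass - narrow_klass_base) >> narrow_klass_shift,
// where the subtraction and/or the shift are omitted when base or shift is zero.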
3338 
3339 void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) {
3340   if (UseCompressedClassPointers) {
3341     Register compressedKlass = encode_klass_not_null(ck, klass);
3342     stw(compressedKlass, oopDesc::klass_offset_in_bytes(), dst_oop);
3343   } else {
3344     std(klass, oopDesc::klass_offset_in_bytes(), dst_oop);
3345   }
3346 }
3347 
3348 void MacroAssembler::store_klass_gap(Register dst_oop, Register val) {
3349   if (UseCompressedClassPointers) {
3350     if (val == noreg) {
3351       val = R0;
3352       li(val, 0);
3353     }
3354     stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop); // klass gap if compressed
3355   }
3356 }
3357 
3358 int MacroAssembler::instr_size_for_decode_klass_not_null() {
3359   if (!UseCompressedClassPointers) return 0;
3360   int num_instrs = 1;  // shift or move
3361   if (Universe::narrow_klass_base() != 0) num_instrs = 7;  // shift + load const + add
3362   return num_instrs * BytesPerInstWord;
3363 }
3364 
3365 void MacroAssembler::decode_klass_not_null(Register dst, Register src) {
3366   assert(dst != R0, "Dst reg may not be R0, as R0 is used here.");
3367   if (src == noreg) src = dst;
3368   Register shifted_src = src;
3369   if (Universe::narrow_klass_shift() != 0 ||
3370       (Universe::narrow_klass_base() == 0 && src != dst)) {  // Move required.
3371     shifted_src = dst;
3372     sldi(shifted_src, src, Universe::narrow_klass_shift());
3373   }
3374   if (Universe::narrow_klass_base() != 0) {
3375     add_const_optimized(dst, shifted_src, Universe::narrow_klass_base(), R0);
3376   }
3377 }
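
// Conceptually the inverse of encode_klass_not_null (editorial note):
//   klass = narrow_klass_base + ((uintptr_t)narrow_klass << narrow_klass_shift).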
3378 
3379 void MacroAssembler::load_klass(Register dst, Register src) {
3380   if (UseCompressedClassPointers) {
3381     lwz(dst, oopDesc::klass_offset_in_bytes(), src);
3382     // Attention: no null check here!
3383     decode_klass_not_null(dst, dst);
3384   } else {
3385     ld(dst, oopDesc::klass_offset_in_bytes(), src);
3386   }
3387 }
3388 
3389 // ((OopHandle)result).resolve();
3390 void MacroAssembler::resolve_oop_handle(Register result) {
3391   // OopHandle::resolve is an indirection.
3392   ld(result, 0, result);
3393 }
3394 
3395 void MacroAssembler::load_mirror_from_const_method(Register mirror, Register const_method) {
3396   ld(mirror, in_bytes(ConstMethod::constants_offset()), const_method);
3397   ld(mirror, ConstantPool::pool_holder_offset_in_bytes(), mirror);
3398   ld(mirror, in_bytes(Klass::java_mirror_offset()), mirror);
3399   resolve_oop_handle(mirror);
3400 }
3401 
3402 // Clear Array
3403 // For very short arrays. tmp == R0 is allowed.
3404 void MacroAssembler::clear_memory_unrolled(Register base_ptr, int cnt_dwords, Register tmp, int offset) {
3405   if (cnt_dwords > 0) { li(tmp, 0); }
3406   for (int i = 0; i < cnt_dwords; ++i) { std(tmp, offset + i * 8, base_ptr); }
3407 }
3408 
3409 // Version for constant short array length. Kills base_ptr. tmp == R0 is allowed.
3410 void MacroAssembler::clear_memory_constlen(Register base_ptr, int cnt_dwords, Register tmp) {
3411   if (cnt_dwords < 8) {
3412     clear_memory_unrolled(base_ptr, cnt_dwords, tmp);
3413     return;
3414   }
3415 
3416   Label loop;
3417   const long loopcnt   = cnt_dwords >> 1,
3418              remainder = cnt_dwords & 1;
3419 
3420   li(tmp, loopcnt);
3421   mtctr(tmp);
3422   li(tmp, 0);
3423   bind(loop);
3424     std(tmp, 0, base_ptr);
3425     std(tmp, 8, base_ptr);
3426     addi(base_ptr, base_ptr, 16);
3427     bdnz(loop);
3428   if (remainder) { std(tmp, 0, base_ptr); }
3429 }
3430 
3431 // Kills both input registers. tmp == R0 is allowed.
3432 void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp, long const_cnt) {
3433   // Procedure for large arrays (uses data cache block zero instruction).
3434     Label startloop, fast, fastloop, small_rest, restloop, done;
3435     const int cl_size         = VM_Version::L1_data_cache_line_size(),
3436               cl_dwords       = cl_size >> 3,
3437               cl_dw_addr_bits = exact_log2(cl_dwords),
3438               dcbz_min        = 1,  // Min count of dcbz executions, needs to be >0.
3439               min_cnt         = ((dcbz_min + 1) << cl_dw_addr_bits) - 1;
3440 
3441   if (const_cnt >= 0) {
3442     // Constant case.
3443     if (const_cnt < min_cnt) {
3444       clear_memory_constlen(base_ptr, const_cnt, tmp);
3445       return;
3446     }
3447     load_const_optimized(cnt_dwords, const_cnt, tmp);
3448   } else {
3449     // cnt_dwords already loaded in register. Need to check size.
3450     cmpdi(CCR1, cnt_dwords, min_cnt); // Big enough? (ensure >= dcbz_min lines included).
3451     blt(CCR1, small_rest);
3452   }
3453     rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits); // Extract dword offset within first cache line.
3454     beq(CCR0, fast);                                  // Already 128byte aligned.
3455 
3456     subfic(tmp, tmp, cl_dwords);
3457     mtctr(tmp);                        // Set ctr to hit 128byte boundary (0<ctr<cl_dwords).
3458     subf(cnt_dwords, tmp, cnt_dwords); // rest.
3459     li(tmp, 0);
3460 
3461   bind(startloop);                     // Clear at the beginning to reach 128byte boundary.
3462     std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
3463     addi(base_ptr, base_ptr, 8);
3464     bdnz(startloop);
3465 
3466   bind(fast);                                  // Clear 128byte blocks.
3467     srdi(tmp, cnt_dwords, cl_dw_addr_bits);    // Loop count for 128byte loop (>0).
3468     andi(cnt_dwords, cnt_dwords, cl_dwords-1); // Rest in dwords.
3469     mtctr(tmp);                                // Load counter.
3470 
3471   bind(fastloop);
3472     dcbz(base_ptr);                    // Clear 128byte aligned block.
3473     addi(base_ptr, base_ptr, cl_size);
3474     bdnz(fastloop);
3475 
3476   bind(small_rest);
3477     cmpdi(CCR0, cnt_dwords, 0);        // size 0?
3478     beq(CCR0, done);                   // rest == 0
3479     li(tmp, 0);
3480     mtctr(cnt_dwords);                 // Load counter.
3481 
3482   bind(restloop);                      // Clear rest.
3483     std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
3484     addi(base_ptr, base_ptr, 8);
3485     bdnz(restloop);
3486 
3487   bind(done);
3488 }
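
// Editorial outline of the large clear above (pseudocode):
//
//   store zero dwords until base_ptr is aligned to a cache line (cl_size bytes);
//   while (a full cache line remains) { dcbz(base_ptr); base_ptr += cl_size; }
//   store the remaining dwords individually.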
3489 
3490 /////////////////////////////////////////// String intrinsics ////////////////////////////////////////////
3491 
3492 #ifdef COMPILER2
3493 // Intrinsics for CompactStrings
3494 
3495 // Compress char[] to byte[] by compressing 16 bytes at once.
3496 void MacroAssembler::string_compress_16(Register src, Register dst, Register cnt,
3497                                         Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5,
3498                                         Label& Lfailure) {
3499 
3500   const Register tmp0 = R0;
3501   assert_different_registers(src, dst, cnt, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5);
3502   Label Lloop, Lslow;
3503 
3504   // Check if cnt >= 8 (= 16 bytes)
3505   lis(tmp1, 0xFF);                // tmp1 = 0x00FF00FF00FF00FF
3506   srwi_(tmp2, cnt, 3);
3507   beq(CCR0, Lslow);
3508   ori(tmp1, tmp1, 0xFF);
3509   rldimi(tmp1, tmp1, 32, 0);
3510   mtctr(tmp2);
3511 
3512   // 2x unrolled loop
3513   bind(Lloop);
3514   ld(tmp2, 0, src);               // _0_1_2_3 (Big Endian)
3515   ld(tmp4, 8, src);               // _4_5_6_7
3516 
3517   orr(tmp0, tmp2, tmp4);
3518   rldicl(tmp3, tmp2, 6*8, 64-24); // _____1_2
3519   rldimi(tmp2, tmp2, 2*8, 2*8);   // _0_2_3_3
3520   rldicl(tmp5, tmp4, 6*8, 64-24); // _____5_6
3521   rldimi(tmp4, tmp4, 2*8, 2*8);   // _4_6_7_7
3522 
3523   andc_(tmp0, tmp0, tmp1);
3524   bne(CCR0, Lfailure);            // Not latin1.
3525   addi(src, src, 16);
3526 
3527   rlwimi(tmp3, tmp2, 0*8, 24, 31);// _____1_3
3528   srdi(tmp2, tmp2, 3*8);          // ____0_2_
3529   rlwimi(tmp5, tmp4, 0*8, 24, 31);// _____5_7
3530   srdi(tmp4, tmp4, 3*8);          // ____4_6_
3531 
3532   orr(tmp2, tmp2, tmp3);          // ____0123
3533   orr(tmp4, tmp4, tmp5);          // ____4567
3534 
3535   stw(tmp2, 0, dst);
3536   stw(tmp4, 4, dst);
3537   addi(dst, dst, 8);
3538   bdnz(Lloop);
3539 
3540   bind(Lslow);                    // Fallback to slow version
3541 }
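
// Editorial view of one fast-loop iteration above: 8 big-endian UTF-16 chars are
// loaded as two 64-bit words; ANDing their OR with ~0x00FF00FF00FF00FF detects any
// non-zero high byte (not latin1 -> branch to Lfailure); the 8 low bytes are then
// gathered with rotate/insert operations and stored as two 32-bit words.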
3542 
3543 // Compress char[] to byte[]. cnt must be positive int.
3544 void MacroAssembler::string_compress(Register src, Register dst, Register cnt, Register tmp, Label& Lfailure) {
3545   Label Lloop;
3546   mtctr(cnt);
3547 
3548   bind(Lloop);
3549   lhz(tmp, 0, src);
3550   cmplwi(CCR0, tmp, 0xff);
3551   bgt(CCR0, Lfailure);            // Not latin1.
3552   addi(src, src, 2);
3553   stb(tmp, 0, dst);
3554   addi(dst, dst, 1);
3555   bdnz(Lloop);
3556 }
3557 
3558 // Inflate byte[] to char[] by inflating 16 bytes at once.
3559 void MacroAssembler::string_inflate_16(Register src, Register dst, Register cnt,
3560                                        Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) {
3561   const Register tmp0 = R0;
3562   assert_different_registers(src, dst, cnt, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5);
3563   Label Lloop, Lslow;
3564 
3565   // Check if cnt >= 8
3566   srwi_(tmp2, cnt, 3);
3567   beq(CCR0, Lslow);
3568   lis(tmp1, 0xFF);                // tmp1 = 0x00FF00FF
3569   ori(tmp1, tmp1, 0xFF);
3570   mtctr(tmp2);
3571 
3572   // 2x unrolled loop
3573   bind(Lloop);
3574   lwz(tmp2, 0, src);              // ____0123 (Big Endian)
3575   lwz(tmp4, 4, src);              // ____4567
3576   addi(src, src, 8);
3577 
3578   rldicl(tmp3, tmp2, 7*8, 64-8);  // _______2
3579   rlwimi(tmp2, tmp2, 3*8, 16, 23);// ____0113
3580   rldicl(tmp5, tmp4, 7*8, 64-8);  // _______6
3581   rlwimi(tmp4, tmp4, 3*8, 16, 23);// ____4557
3582 
3583   andc(tmp0, tmp2, tmp1);         // ____0_1_
3584   rlwimi(tmp2, tmp3, 2*8, 0, 23); // _____2_3
3585   andc(tmp3, tmp4, tmp1);         // ____4_5_
3586   rlwimi(tmp4, tmp5, 2*8, 0, 23); // _____6_7
3587 
3588   rldimi(tmp2, tmp0, 3*8, 0*8);   // _0_1_2_3
3589   rldimi(tmp4, tmp3, 3*8, 0*8);   // _4_5_6_7
3590 
3591   std(tmp2, 0, dst);
3592   std(tmp4, 8, dst);
3593   addi(dst, dst, 16);
3594   bdnz(Lloop);
3595 
3596   bind(Lslow);                    // Fallback to slow version
3597 }
3598 
3599 // Inflate byte[] to char[]. cnt must be positive int.
3600 void MacroAssembler::string_inflate(Register src, Register dst, Register cnt, Register tmp) {
3601   Label Lloop;
3602   mtctr(cnt);
3603 
3604   bind(Lloop);
3605   lbz(tmp, 0, src);
3606   addi(src, src, 1);
3607   sth(tmp, 0, dst);
3608   addi(dst, dst, 2);
3609   bdnz(Lloop);
3610 }
3611 
3612 void MacroAssembler::string_compare(Register str1, Register str2,
3613                                     Register cnt1, Register cnt2,
3614                                     Register tmp1, Register result, int ae) {
3615   const Register tmp0 = R0,
3616                  diff = tmp1;
3617 
3618   assert_different_registers(str1, str2, cnt1, cnt2, tmp0, tmp1, result);
3619   Label Ldone, Lslow, Lloop, Lreturn_diff;
3620 
3621   // Note: Making use of the fact that compareTo(a, b) == -compareTo(b, a),
3622   // we interchange str1 and str2 in the UL case and negate the result.
3623   // This way, str1 is always latin1 encoded, except for the UU case.
3624   // In addition, the counts need to be zero-extended (their sign bit is 0 anyway).
3625 
3626   if (ae == StrIntrinsicNode::UU) {
3627     srwi(cnt1, cnt1, 1);
3628   } else {
3629     clrldi(cnt1, cnt1, 32);
3630   }
3631 
3632   if (ae != StrIntrinsicNode::LL) {
3633     srwi(cnt2, cnt2, 1);
3634   } else {
3635     clrldi(cnt2, cnt2, 32);
3636   }
3637 
3638   // See if the lengths are different, and calculate min in cnt1.
3639   // Save diff in case we need it for a tie-breaker.
3640   subf_(diff, cnt2, cnt1); // diff = cnt1 - cnt2
3641   // if (diff > 0) { cnt1 = cnt2; }
3642   if (VM_Version::has_isel()) {
3643     isel(cnt1, CCR0, Assembler::greater, /*invert*/ false, cnt2);
3644   } else {
3645     Label Lskip;
3646     blt(CCR0, Lskip);
3647     mr(cnt1, cnt2);
3648     bind(Lskip);
3649   }
3650 
3651   // Rename registers
3652   Register chr1 = result;
3653   Register chr2 = tmp0;
3654 
3655   // Compare multiple characters in fast loop (only implemented for same encoding).
3656   int stride1 = 8, stride2 = 8;
3657   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3658     int log2_chars_per_iter = (ae == StrIntrinsicNode::LL) ? 3 : 2;
3659     Label Lfastloop, Lskipfast;
3660 
3661     srwi_(tmp0, cnt1, log2_chars_per_iter);
3662     beq(CCR0, Lskipfast);
3663     rldicl(cnt2, cnt1, 0, 64 - log2_chars_per_iter); // Remaining characters.
3664     li(cnt1, 1 << log2_chars_per_iter); // Initialize for failure case: Rescan characters from current iteration.
3665     mtctr(tmp0);
3666 
3667     bind(Lfastloop);
3668     ld(chr1, 0, str1);
3669     ld(chr2, 0, str2);
3670     cmpd(CCR0, chr1, chr2);
3671     bne(CCR0, Lslow);
3672     addi(str1, str1, stride1);
3673     addi(str2, str2, stride2);
3674     bdnz(Lfastloop);
3675     mr(cnt1, cnt2); // Remaining characters.
3676     bind(Lskipfast);
3677   }
3678 
3679   // Loop which searches the first difference character by character.
3680   cmpwi(CCR0, cnt1, 0);
3681   beq(CCR0, Lreturn_diff);
3682   bind(Lslow);
3683   mtctr(cnt1);
3684 
3685   switch (ae) {
3686     case StrIntrinsicNode::LL: stride1 = 1; stride2 = 1; break;
3687     case StrIntrinsicNode::UL: // fallthru (see comment above)
3688     case StrIntrinsicNode::LU: stride1 = 1; stride2 = 2; break;
3689     case StrIntrinsicNode::UU: stride1 = 2; stride2 = 2; break;
3690     default: ShouldNotReachHere(); break;
3691   }
3692 
3693   bind(Lloop);
3694   if (stride1 == 1) { lbz(chr1, 0, str1); } else { lhz(chr1, 0, str1); }
3695   if (stride2 == 1) { lbz(chr2, 0, str2); } else { lhz(chr2, 0, str2); }
3696   subf_(result, chr2, chr1); // result = chr1 - chr2
3697   bne(CCR0, Ldone);
3698   addi(str1, str1, stride1);
3699   addi(str2, str2, stride2);
3700   bdnz(Lloop);
3701 
3702   // If strings are equal up to min length, return the length difference.
3703   bind(Lreturn_diff);
3704   mr(result, diff);
3705 
3706   // Otherwise, return the difference between the first mismatched chars.
3707   bind(Ldone);
3708   if (ae == StrIntrinsicNode::UL) {
3709     neg(result, result); // Negate result (see note above).
3710   }
3711 }
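
// Editorial outline of the comparison above (pseudocode):
//
//   minlen = min(cnt1, cnt2);  diff = cnt1 - cnt2;
//   compare 8 bytes per iteration while both encodings match (LL/UU),
//   otherwise character by character up to minlen;
//   return chr1 - chr2 at the first mismatch, else return diff;
//   for UL the operands were swapped by the caller, so the result is negated at the end.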
3712 
3713 void MacroAssembler::array_equals(bool is_array_equ, Register ary1, Register ary2,
3714                                   Register limit, Register tmp1, Register result, bool is_byte) {
3715   const Register tmp0 = R0;
3716   assert_different_registers(ary1, ary2, limit, tmp0, tmp1, result);
3717   Label Ldone, Lskiploop, Lloop, Lfastloop, Lskipfast;
3718   bool limit_needs_shift = false;
3719 
3720   if (is_array_equ) {
3721     const int length_offset = arrayOopDesc::length_offset_in_bytes();
3722     const int base_offset   = arrayOopDesc::base_offset_in_bytes(is_byte ? T_BYTE : T_CHAR);
3723 
3724     // Return true if the same array.
3725     cmpd(CCR0, ary1, ary2);
3726     beq(CCR0, Lskiploop);
3727 
3728     // Return false if one of them is NULL.
3729     cmpdi(CCR0, ary1, 0);
3730     cmpdi(CCR1, ary2, 0);
3731     li(result, 0);
3732     cror(CCR0, Assembler::equal, CCR1, Assembler::equal);
3733     beq(CCR0, Ldone);
3734 
3735     // Load the lengths of arrays.
3736     lwz(limit, length_offset, ary1);
3737     lwz(tmp0, length_offset, ary2);
3738 
3739     // Return false if the two arrays are not equal length.
3740     cmpw(CCR0, limit, tmp0);
3741     bne(CCR0, Ldone);
3742 
3743     // Load array addresses.
3744     addi(ary1, ary1, base_offset);
3745     addi(ary2, ary2, base_offset);
3746   } else {
3747     limit_needs_shift = !is_byte;
3748     li(result, 0); // Assume not equal.
3749   }
3750 
3751   // Rename registers
3752   Register chr1 = tmp0;
3753   Register chr2 = tmp1;
3754 
3755   // Compare 8 bytes per iteration in fast loop.
3756   const int log2_chars_per_iter = is_byte ? 3 : 2;
3757 
3758   srwi_(tmp0, limit, log2_chars_per_iter + (limit_needs_shift ? 1 : 0));
3759   beq(CCR0, Lskipfast);
3760   mtctr(tmp0);
3761 
3762   bind(Lfastloop);
3763   ld(chr1, 0, ary1);
3764   ld(chr2, 0, ary2);
3765   addi(ary1, ary1, 8);
3766   addi(ary2, ary2, 8);
3767   cmpd(CCR0, chr1, chr2);
3768   bne(CCR0, Ldone);
3769   bdnz(Lfastloop);
3770 
3771   bind(Lskipfast);
3772   rldicl_(limit, limit, limit_needs_shift ? 64 - 1 : 0, 64 - log2_chars_per_iter); // Remaining characters.
3773   beq(CCR0, Lskiploop);
3774   mtctr(limit);
3775 
3776   // Character by character.
3777   bind(Lloop);
3778   if (is_byte) {
3779     lbz(chr1, 0, ary1);
3780     lbz(chr2, 0, ary2);
3781     addi(ary1, ary1, 1);
3782     addi(ary2, ary2, 1);
3783   } else {
3784     lhz(chr1, 0, ary1);
3785     lhz(chr2, 0, ary2);
3786     addi(ary1, ary1, 2);
3787     addi(ary2, ary2, 2);
3788   }
3789   cmpw(CCR0, chr1, chr2);
3790   bne(CCR0, Ldone);
3791   bdnz(Lloop);
3792 
3793   bind(Lskiploop);
3794   li(result, 1); // All characters are equal.
3795   bind(Ldone);
3796 }
3797 
3798 void MacroAssembler::string_indexof(Register result, Register haystack, Register haycnt,
3799                                     Register needle, ciTypeArray* needle_values, Register needlecnt, int needlecntval,
3800                                     Register tmp1, Register tmp2, Register tmp3, Register tmp4, int ae) {
3801 
3802   // Ensure 0<needlecnt<=haycnt in ideal graph as prerequisite!
3803   Label L_TooShort, L_Found, L_NotFound, L_End;
3804   Register last_addr = haycnt, // Kill haycnt at the beginning.
3805            addr      = tmp1,
3806            n_start   = tmp2,
3807            ch1       = tmp3,
3808            ch2       = R0;
3809 
3810   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
3811   const int h_csize = (ae == StrIntrinsicNode::LL) ? 1 : 2;
3812   const int n_csize = (ae == StrIntrinsicNode::UU) ? 2 : 1;
3813 
3814   // **************************************************************************************************
3815   // Prepare for main loop: optimized for needle count >=2, bail out otherwise.
3816   // **************************************************************************************************
3817 
3818   // Compute last haystack addr to use if no match gets found.
3819   clrldi(haycnt, haycnt, 32);         // Ensure positive int is valid as 64 bit value.
3820   addi(addr, haystack, -h_csize);     // Accesses use pre-increment.
3821   if (needlecntval == 0) { // variable needlecnt
3822    cmpwi(CCR6, needlecnt, 2);
3823    clrldi(needlecnt, needlecnt, 32);  // Ensure positive int is valid as 64 bit value.
3824    blt(CCR6, L_TooShort);             // Variable needlecnt: handle short needle separately.
3825   }
3826 
3827   if (n_csize == 2) { lwz(n_start, 0, needle); } else { lhz(n_start, 0, needle); } // Load first 2 characters of needle.
3828 
3829   if (needlecntval == 0) { // variable needlecnt
3830    subf(ch1, needlecnt, haycnt);      // Last character index to compare is haycnt-needlecnt.
3831    addi(needlecnt, needlecnt, -2);    // Rest of needle.
3832   } else { // constant needlecnt
3833   guarantee(needlecntval != 1, "IndexOf with single-character needle must be handled separately");
3834   assert((needlecntval & 0x7fff) == needlecntval, "wrong immediate");
3835    addi(ch1, haycnt, -needlecntval);  // Last character index to compare is haycnt-needlecnt.
3836    if (needlecntval > 3) { li(needlecnt, needlecntval - 2); } // Rest of needle.
3837   }
3838 
3839   if (h_csize == 2) { slwi(ch1, ch1, 1); } // Scale to number of bytes.
3840 
3841   if (ae == StrIntrinsicNode::UL) {
3842    srwi(tmp4, n_start, 1*8);          // ___0
3843    rlwimi(n_start, tmp4, 2*8, 0, 23); // _0_1
3844   }
3845 
3846   add(last_addr, haystack, ch1);      // Point to last address to compare (haystack+2*(haycnt-needlecnt)).
3847 
3848   // Main Loop (now we have at least 2 characters).
3849   Label L_OuterLoop, L_InnerLoop, L_FinalCheck, L_Comp1, L_Comp2;
3850   bind(L_OuterLoop); // Search for 1st 2 characters.
3851   Register addr_diff = tmp4;
3852    subf(addr_diff, addr, last_addr);  // Difference between already checked address and last address to check.
3853    addi(addr, addr, h_csize);         // This is the new address we want to use for comparing.
3854    srdi_(ch2, addr_diff, h_csize);
3855    beq(CCR0, L_FinalCheck);           // 2 characters left?
3856    mtctr(ch2);                        // num of characters / 2
3857   bind(L_InnerLoop);                  // Main work horse (2x unrolled search loop)
3858    if (h_csize == 2) {                // Load 2 characters of haystack (ignore alignment).
3859     lwz(ch1, 0, addr);
3860     lwz(ch2, 2, addr);
3861    } else {
3862     lhz(ch1, 0, addr);
3863     lhz(ch2, 1, addr);
3864    }
3865    cmpw(CCR0, ch1, n_start);          // Compare 2 characters (1 would be sufficient but try to reduce branches to CompLoop).
3866    cmpw(CCR1, ch2, n_start);
3867    beq(CCR0, L_Comp1);                // Did we find the needle start?
3868    beq(CCR1, L_Comp2);
3869    addi(addr, addr, 2 * h_csize);
3870    bdnz(L_InnerLoop);
3871   bind(L_FinalCheck);
3872    andi_(addr_diff, addr_diff, h_csize); // Remaining characters not covered by InnerLoop: (num of characters) & 1.
3873    beq(CCR0, L_NotFound);
3874    if (h_csize == 2) { lwz(ch1, 0, addr); } else { lhz(ch1, 0, addr); } // One position left at which we have to compare.
3875    cmpw(CCR1, ch1, n_start);
3876    beq(CCR1, L_Comp1);
3877   bind(L_NotFound);
3878    li(result, -1);                    // not found
3879    b(L_End);
3880 
3881    // **************************************************************************************************
3882    // Special Case: unfortunately, the variable needle case can be called with needlecnt<2
3883    // **************************************************************************************************
3884   if (needlecntval == 0) {           // We have to handle these cases separately.
3885   Label L_OneCharLoop;
3886   bind(L_TooShort);
3887    mtctr(haycnt);
3888    if (n_csize == 2) { lhz(n_start, 0, needle); } else { lbz(n_start, 0, needle); } // First character of needle
3889   bind(L_OneCharLoop);
3890    if (h_csize == 2) { lhzu(ch1, 2, addr); } else { lbzu(ch1, 1, addr); }
3891    cmpw(CCR1, ch1, n_start);
3892    beq(CCR1, L_Found);               // Did we find the one character needle?
3893    bdnz(L_OneCharLoop);
3894    li(result, -1);                   // Not found.
3895    b(L_End);
3896   }
3897 
3898   // **************************************************************************************************
3899   // Regular Case Part II: compare rest of needle (first 2 characters have been compared already)
3900   // **************************************************************************************************
3901 
3902   // Compare the rest
3903   bind(L_Comp2);
3904    addi(addr, addr, h_csize);        // First comparison has failed, 2nd one hit.
3905   bind(L_Comp1);                     // Addr points to possible needle start.
3906   if (needlecntval != 2) {           // Const needlecnt==2?
3907    if (needlecntval != 3) {
3908     if (needlecntval == 0) { beq(CCR6, L_Found); } // Variable needlecnt==2?
3909     Register n_ind = tmp4,
3910              h_ind = n_ind;
3911     li(n_ind, 2 * n_csize);          // First 2 characters are already compared, use index 2.
3912     mtctr(needlecnt);                // Decremented by 2, still > 0.
3913    Label L_CompLoop;
3914    bind(L_CompLoop);
3915     if (ae == StrIntrinsicNode::UL) {
3916       h_ind = ch1;
3917       sldi(h_ind, n_ind, 1);
3918     }
3919     if (n_csize == 2) { lhzx(ch2, needle, n_ind); } else { lbzx(ch2, needle, n_ind); }
3920     if (h_csize == 2) { lhzx(ch1, addr, h_ind); } else { lbzx(ch1, addr, h_ind); }
3921     cmpw(CCR1, ch1, ch2);
3922     bne(CCR1, L_OuterLoop);
3923     addi(n_ind, n_ind, n_csize);
3924     bdnz(L_CompLoop);
3925    } else { // No loop required if there's only one needle character left.
3926     if (n_csize == 2) { lhz(ch2, 2 * 2, needle); } else { lbz(ch2, 2 * 1, needle); }
3927     if (h_csize == 2) { lhz(ch1, 2 * 2, addr); } else { lbz(ch1, 2 * 1, addr); }
3928     cmpw(CCR1, ch1, ch2);
3929     bne(CCR1, L_OuterLoop);
3930    }
3931   }
3932   // Return index ...
3933   bind(L_Found);
3934    subf(result, haystack, addr);     // relative to haystack, ...
3935    if (h_csize == 2) { srdi(result, result, 1); } // in characters.
3936   bind(L_End);
3937 } // string_indexof
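
// Editorial outline of the search above (pseudocode):
//
//   load the first two needle characters;
//   scan the haystack two positions per iteration looking for that pair;
//   on a candidate hit, compare the remaining needle characters;
//   return the match index (in characters), or -1 when the last feasible
//   start position has been passed.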
3938 
3939 void MacroAssembler::string_indexof_char(Register result, Register haystack, Register haycnt,
3940                                          Register needle, jchar needleChar, Register tmp1, Register tmp2, bool is_byte) {
3941   assert_different_registers(haystack, haycnt, needle, tmp1, tmp2);
3942 
3943   Label L_InnerLoop, L_FinalCheck, L_Found1, L_Found2, L_NotFound, L_End;
3944   Register addr = tmp1,
3945            ch1 = tmp2,
3946            ch2 = R0;
3947 
3948   const int h_csize = is_byte ? 1 : 2;
3949 
3950 //4:
3951    srwi_(tmp2, haycnt, 1);   // Shift right by exact_log2(UNROLL_FACTOR).
3952    mr(addr, haystack);
3953    beq(CCR0, L_FinalCheck);
3954    mtctr(tmp2);              // Move to count register.
3955 //8:
3956   bind(L_InnerLoop);         // Main work horse (2x unrolled search loop).
3957    if (!is_byte) {
3958     lhz(ch1, 0, addr);
3959     lhz(ch2, 2, addr);
3960    } else {
3961     lbz(ch1, 0, addr);
3962     lbz(ch2, 1, addr);
3963    }
3964    (needle != R0) ? cmpw(CCR0, ch1, needle) : cmplwi(CCR0, ch1, (unsigned int)needleChar);
3965    (needle != R0) ? cmpw(CCR1, ch2, needle) : cmplwi(CCR1, ch2, (unsigned int)needleChar);
3966    beq(CCR0, L_Found1);      // Did we find the needle?
3967    beq(CCR1, L_Found2);
3968    addi(addr, addr, 2 * h_csize);
3969    bdnz(L_InnerLoop);
3970 //16:
3971   bind(L_FinalCheck);
3972    andi_(R0, haycnt, 1);
3973    beq(CCR0, L_NotFound);
3974    if (!is_byte) { lhz(ch1, 0, addr); } else { lbz(ch1, 0, addr); } // One position left at which we have to compare.
3975    (needle != R0) ? cmpw(CCR1, ch1, needle) : cmplwi(CCR1, ch1, (unsigned int)needleChar);
3976    beq(CCR1, L_Found1);
3977 //21:
3978   bind(L_NotFound);
3979    li(result, -1);           // Not found.
3980    b(L_End);
3981 
3982   bind(L_Found2);
3983    addi(addr, addr, h_csize);
3984 //24:
3985   bind(L_Found1);            // Return index ...
3986    subf(result, haystack, addr); // relative to haystack, ...
3987    if (!is_byte) { srdi(result, result, 1); } // in characters.
3988   bind(L_End);
3989 } // string_indexof_char
3990 
3991 
3992 void MacroAssembler::has_negatives(Register src, Register cnt, Register result,
3993                                    Register tmp1, Register tmp2) {
3994   const Register tmp0 = R0;
3995   assert_different_registers(src, result, cnt, tmp0, tmp1, tmp2);
3996   Label Lfastloop, Lslow, Lloop, Lnoneg, Ldone;
3997 
3998   // Check if cnt >= 8 (= 16 bytes)
3999   lis(tmp1, (int)(short)0x8080);  // tmp1 = 0x8080808080808080
4000   srwi_(tmp2, cnt, 4);
4001   li(result, 1);                  // Assume there's a negative byte.
4002   beq(CCR0, Lslow);
4003   ori(tmp1, tmp1, 0x8080);
4004   rldimi(tmp1, tmp1, 32, 0);
4005   mtctr(tmp2);
4006 
4007   // 2x unrolled loop
4008   bind(Lfastloop);
4009   ld(tmp2, 0, src);
4010   ld(tmp0, 8, src);
4011 
4012   orr(tmp0, tmp2, tmp0);
4013 
4014   and_(tmp0, tmp0, tmp1);
4015   bne(CCR0, Ldone);               // Found negative byte.
4016   addi(src, src, 16);
4017 
4018   bdnz(Lfastloop);
4019 
4020   bind(Lslow);                    // Fallback to slow version
4021   rldicl_(tmp0, cnt, 0, 64-4);
4022   beq(CCR0, Lnoneg);
4023   mtctr(tmp0);
4024   bind(Lloop);
4025   lbz(tmp0, 0, src);
4026   addi(src, src, 1);
4027   andi_(tmp0, tmp0, 0x80);
4028   bne(CCR0, Ldone);               // Found negative byte.
4029   bdnz(Lloop);
4030   bind(Lnoneg);
4031   li(result, 0);
4032 
4033   bind(Ldone);
4034 }
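
// Illustrative only (not emitted code): a C sketch of the check above (helper name
// hypothetical, assumes <stdint.h>/<string.h>). A byte is negative iff its 0x80 bit
// is set, so or-ing 16 bytes together and masking with 0x8080808080808080 detects
// a negative byte per unrolled iteration.
//
//   static int has_negatives_c(const int8_t* src, size_t cnt) {
//     size_t i = 0;
//     for (; cnt >= 16; cnt -= 16, i += 16) {            // fast loop: 16 bytes at a time
//       uint64_t a, b;
//       memcpy(&a, src + i, 8);
//       memcpy(&b, src + i + 8, 8);
//       if ((a | b) & UINT64_C(0x8080808080808080)) return 1;
//     }
//     for (; cnt > 0; cnt--, i++) {                      // slow loop: one byte at a time
//       if (src[i] & 0x80) return 1;
//     }
//     return 0;
//   }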
4035 
4036 #endif // COMPILER2
4037 
4038 // Helpers for Intrinsic Emitters
4039 //
4040 // Revert the byte order of a 32bit value in a register
4041 //   src: 0x44556677
4042 //   dst: 0x77665544
4043 // Three steps to obtain the result:
4044 //  1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word
4045 //     into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared.
4046 //     This value initializes dst.
4047 //  2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost
4048 //     byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go.
4049 //     This value is mask inserted into dst with a [0..23] mask of 1s.
4050 //  3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position.
4051 //     This value is mask inserted into dst with a [8..15] mask of 1s.
4052 void MacroAssembler::load_reverse_32(Register dst, Register src) {
4053   assert_different_registers(dst, src);
4054 
4055   rldicl(dst, src, (4+1)*8, 56);       // Rotate byte 4 into position 7 (rightmost), clear all to the left.
4056   rlwimi(dst, src,     3*8,  0, 23);   // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone.
4057   rlwimi(dst, src,     1*8,  8, 15);   // Insert byte 6 into position 5, leave the rest alone.
4058 }
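
// Illustrative only (not emitted code): a plain C sketch of the byte reversal
// performed by the three rotate-and-insert steps above. The 32-bit value is
// assumed to live in the low word of the 64-bit register; the helper name is hypothetical.
//
//   static uint32_t load_reverse_32_c(uint32_t src) {   // 0x44556677 -> 0x77665544
//     return ((src & 0x000000ffu) << 24) |
//            ((src & 0x0000ff00u) <<  8) |
//            ((src & 0x00ff0000u) >>  8) |
//            ( src                >> 24);
//   }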
4059 
4060 // Calculate the column addresses of the crc32 lookup table into distinct registers.
4061 // This loop-invariant calculation is moved out of the loop body, reducing the loop
4062 // body size from 20 to 16 instructions.
4063 // Returns the offset that was used to calculate the address of column tc3.
4064 // Due to register shortage, setting tc3 may overwrite table. With the return offset
4065 // at hand, the original table address can be easily reconstructed.
4066 int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) {
4067 
4068 #ifdef VM_LITTLE_ENDIAN
4069   // This is what we implement (the DOLIT4 part):
4070   // =========================================================================
4071   // #define DOLIT4 c ^= *buf4++; \
4072   //         c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^ \
4073   //             crc_table[1][(c >> 16) & 0xff] ^ crc_table[0][c >> 24]
4074   // #define DOLIT32 DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4
4075   // =========================================================================
4076   const int ix0 = 3*(4*CRC32_COLUMN_SIZE);
4077   const int ix1 = 2*(4*CRC32_COLUMN_SIZE);
4078   const int ix2 = 1*(4*CRC32_COLUMN_SIZE);
4079   const int ix3 = 0*(4*CRC32_COLUMN_SIZE);
4080 #else
4081   // This is what we implement (the DOBIG4 part):
4082   // =========================================================================
4083   // #define DOBIG4 c ^= *++buf4; \
4084   //         c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \
4085   //             crc_table[6][(c >> 16) & 0xff] ^ crc_table[7][c >> 24]
4086   // #define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4
4087   // =========================================================================
4088   const int ix0 = 4*(4*CRC32_COLUMN_SIZE);
4089   const int ix1 = 5*(4*CRC32_COLUMN_SIZE);
4090   const int ix2 = 6*(4*CRC32_COLUMN_SIZE);
4091   const int ix3 = 7*(4*CRC32_COLUMN_SIZE);
4092 #endif
4093   assert_different_registers(table, tc0, tc1, tc2);
4094   assert(table == tc3, "must be!");
4095 
4096   addi(tc0, table, ix0);
4097   addi(tc1, table, ix1);
4098   addi(tc2, table, ix2);
4099   if (ix3 != 0) addi(tc3, table, ix3);
4100 
4101   return ix3;
4102 }
4103 
4104 /**
4105  * uint32_t crc;
4106  * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
4107  */
4108 void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) {
4109   assert_different_registers(crc, table, tmp);
4110   assert_different_registers(val, table);
4111 
4112   if (crc == val) {                   // Must rotate first to use the unmodified value.
4113     rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
4114                                       // As we use a word (4-byte) instruction, we have to adapt the mask bit positions.
4115     srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
4116   } else {
4117     srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
4118     rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
4119   }
4120   lwzx(tmp, table, tmp);
4121   xorr(crc, crc, tmp);
4122 }
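
// Illustrative only (not emitted code): the scalar equivalent of the fold above
// (helper name hypothetical). The rlwinm shifts the byte index left by 2 because
// the lookup table holds 4-byte entries.
//
//   static uint32_t fold_byte_crc32_c(uint32_t crc, uint32_t val, const uint32_t table[256]) {
//     return table[val & 0xff] ^ (crc >> 8);
//   }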
4123 
4124 /**
4125  * uint32_t crc;
4126  * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
4127  */
4128 void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
4129   fold_byte_crc32(crc, crc, table, tmp);
4130 }
4131 
4132 /**
4133  * Emits code to update CRC-32 with a byte value according to constants in table.
4134  *
4135  * @param [in,out]crc   Register containing the crc.
4136  * @param [in]val       Register containing the byte to fold into the CRC.
4137  * @param [in]table     Register containing the table of crc constants.
4138  *
4139  * uint32_t crc;
4140  * val = crc_table[(val ^ crc) & 0xFF];
4141  * crc = val ^ (crc >> 8);
4142  */
4143 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
4144   BLOCK_COMMENT("update_byte_crc32:");
4145   xorr(val, val, crc);
4146   fold_byte_crc32(crc, val, table, val);
4147 }
4148 
4149 /**
4150  * @param crc   register containing existing CRC (32-bit)
4151  * @param buf   register pointing to input byte buffer (byte*)
4152  * @param len   register containing number of bytes
4153  * @param table register pointing to CRC table
4154  */
4155 void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table,
4156                                            Register data, bool loopAlignment) {
4157   assert_different_registers(crc, buf, len, table, data);
4158 
4159   Label L_mainLoop, L_done;
4160   const int mainLoop_stepping  = 1;
4161   const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4;
4162 
4163   // Process all bytes in a single-byte loop.
4164   clrldi_(len, len, 32);                         // Enforce 32 bit. Anything to do?
4165   beq(CCR0, L_done);
4166 
4167   mtctr(len);
4168   align(mainLoop_alignment);
4169   BIND(L_mainLoop);
4170     lbz(data, 0, buf);                           // Byte from buffer, zero-extended.
4171     addi(buf, buf, mainLoop_stepping);           // Advance buffer position.
4172     update_byte_crc32(crc, data, table);
4173     bdnz(L_mainLoop);                            // Iterate.
4174 
4175   bind(L_done);
4176 }
4177 
4178 /**
4179  * Emits code to update CRC-32 with a 4-byte value according to constants in table
4180  * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c
4181  */
4182 // A note on the lookup table address(es):
4183 // The lookup table consists of two sets of four columns each.
4184 // The columns {0..3} are used for little-endian machines.
4185 // The columns {4..7} are used for big-endian machines.
4186 // To save the effort of adding the column offset to the table address each time
4187 // a table element is looked up, it is possible to pass the pre-calculated
4188 // column addresses.
4189 // Uses R9..R12 as work registers. Must be saved/restored by caller, if necessary.
4190 void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc,
4191                                         Register t0,  Register t1,  Register t2,  Register t3,
4192                                         Register tc0, Register tc1, Register tc2, Register tc3) {
4193   assert_different_registers(crc, t3);
4194 
4195   // XOR crc with next four bytes of buffer.
4196   lwz(t3, bufDisp, buf);
4197   if (bufInc != 0) {
4198     addi(buf, buf, bufInc);
4199   }
4200   xorr(t3, t3, crc);
4201 
4202   // Chop crc into 4 single-byte pieces, shifted left 2 bits, to form the table indices.
4203   rlwinm(t0, t3,  2,         24-2, 31-2);  // ((t3 >>  0) & 0xff) << 2
4204   rlwinm(t1, t3,  32+(2- 8), 24-2, 31-2);  // ((t3 >>  8) & 0xff) << 2
4205   rlwinm(t2, t3,  32+(2-16), 24-2, 31-2);  // ((t3 >> 16) & 0xff) << 2
4206   rlwinm(t3, t3,  32+(2-24), 24-2, 31-2);  // ((t3 >> 24) & 0xff) << 2
4207 
4208   // Use the pre-calculated column addresses.
4209   // Load pre-calculated table values.
4210   lwzx(t0, tc0, t0);
4211   lwzx(t1, tc1, t1);
4212   lwzx(t2, tc2, t2);
4213   lwzx(t3, tc3, t3);
4214 
4215   // Calculate new crc from table values.
4216   xorr(t0,  t0, t1);
4217   xorr(t2,  t2, t3);
4218   xorr(crc, t0, t2);  // Now crc contains the final checksum value.
4219 }
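
// Illustrative only (not emitted code): the scalar slicing-by-4 step performed above
// (helper name hypothetical). c0..c3 stand for the pre-calculated column base
// addresses passed in tc0..tc3.
//
//   static uint32_t update_1word_crc32_c(uint32_t crc, uint32_t word,
//                                        const uint32_t* c0, const uint32_t* c1,
//                                        const uint32_t* c2, const uint32_t* c3) {
//     uint32_t c = crc ^ word;                 // XOR in the next four buffer bytes.
//     return c0[c & 0xff] ^ c1[(c >> 8) & 0xff] ^ c2[(c >> 16) & 0xff] ^ c3[c >> 24];
//   }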
4220 
4221 /**
4222  * @param crc   register containing existing CRC (32-bit)
4223  * @param buf   register pointing to input byte buffer (byte*)
4224  * @param len   register containing number of bytes
4225  * @param table register pointing to CRC table
4226  *
4227  * Uses R9..R12 as work registers. Must be saved/restored by caller!
4228  */
4229 void MacroAssembler::kernel_crc32_2word(Register crc, Register buf, Register len, Register table,
4230                                         Register t0,  Register t1,  Register t2,  Register t3,
4231                                         Register tc0, Register tc1, Register tc2, Register tc3,
4232                                         bool invertCRC) {
4233   assert_different_registers(crc, buf, len, table);
4234 
4235   Label L_mainLoop, L_tail;
4236   Register  tmp  = t0;
4237   Register  data = t0;
4238   Register  tmp2 = t1;
4239   const int mainLoop_stepping  = 8;
4240   const int tailLoop_stepping  = 1;
4241   const int log_stepping       = exact_log2(mainLoop_stepping);
4242   const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
4243   const int complexThreshold   = 2*mainLoop_stepping;
4244 
4245   // Don't test for len <= 0 here. This pathological case should not occur anyway.
4246   // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles
4247   // for all well-behaved cases. The situation itself is detected and handled correctly
4248   // within update_byteLoop_crc32.
4249   assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
4250 
4251   BLOCK_COMMENT("kernel_crc32_2word {");
4252 
4253   if (invertCRC) {
4254     nand(crc, crc, crc);                      // 1s complement of crc
4255   }
4256 
4257   // Check for short (<mainLoop_stepping) buffer.
4258   cmpdi(CCR0, len, complexThreshold);
4259   blt(CCR0, L_tail);
4260 
4261   // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
4262   // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
4263   {
4264     // Align buf addr to mainLoop_stepping boundary.
4265     neg(tmp2, buf);                           // Calculate # preLoop iterations for alignment.
4266     rldicl(tmp2, tmp2, 0, 64-log_stepping);   // tmp2 &= mainLoop_stepping-1, i.e. keep only the low log_stepping bits (bits 61..63 here).
4267 
4268     if (complexThreshold > mainLoop_stepping) {
4269       sub(len, len, tmp2);                       // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4270     } else {
4271       sub(tmp, len, tmp2);                       // Remaining bytes for main loop.
4272       cmpdi(CCR0, tmp, mainLoop_stepping);
4273       blt(CCR0, L_tail);                         // For less than one mainloop_stepping left, do only tail processing
4274       mr(len, tmp);                              // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4275     }
4276     update_byteLoop_crc32(crc, buf, tmp2, table, data, false);
4277   }
4278 
4279   srdi(tmp2, len, log_stepping);                 // #iterations for mainLoop
4280   andi(len, len, mainLoop_stepping-1);           // remaining bytes for tailLoop
4281   mtctr(tmp2);
4282 
4283 #ifdef VM_LITTLE_ENDIAN
4284   Register crc_rv = crc;
4285 #else
4286   Register crc_rv = tmp;                         // Load_reverse needs separate registers to work on.
4287                                                  // Occupies tmp, but frees up crc.
4288   load_reverse_32(crc_rv, crc);                  // Revert byte order because we are dealing with big-endian data.
4289   tmp = crc;
4290 #endif
4291 
4292   int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
4293 
4294   align(mainLoop_alignment);                     // Octoword-aligned loop address. Shows 2% improvement.
4295   BIND(L_mainLoop);
4296     update_1word_crc32(crc_rv, buf, table, 0, 0, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
4297     update_1word_crc32(crc_rv, buf, table, 4, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
4298     bdnz(L_mainLoop);
4299 
4300 #ifndef VM_LITTLE_ENDIAN
4301   load_reverse_32(crc, crc_rv);                  // Revert byte order because we are dealing with big-endian data.
4302   tmp = crc_rv;                                  // Tmp uses its original register again.
4303 #endif
4304 
4305   // Restore original table address for tailLoop.
4306   if (reconstructTableOffset != 0) {
4307     addi(table, table, -reconstructTableOffset);
4308   }
4309 
4310   // Process last few (<complexThreshold) bytes of buffer.
4311   BIND(L_tail);
4312   update_byteLoop_crc32(crc, buf, len, table, data, false);
4313 
4314   if (invertCRC) {
4315     nand(crc, crc, crc);                      // 1s complement of crc
4316   }
4317   BLOCK_COMMENT("} kernel_crc32_2word");
4318 }
4319 
4320 /**
4321  * @param crc   register containing existing CRC (32-bit)
4322  * @param buf   register pointing to input byte buffer (byte*)
4323  * @param len   register containing number of bytes
4324  * @param table register pointing to CRC table
4325  *
4326  * Uses R9..R12 as work registers. Must be saved/restored by caller!
4327  */
4328 void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table,
4329                                         Register t0,  Register t1,  Register t2,  Register t3,
4330                                         Register tc0, Register tc1, Register tc2, Register tc3,
4331                                         bool invertCRC) {
4332   assert_different_registers(crc, buf, len, table);
4333 
4334   Label L_mainLoop, L_tail;
4335   Register  tmp          = t0;
4336   Register  data         = t0;
4337   Register  tmp2         = t1;
4338   const int mainLoop_stepping  = 4;
4339   const int tailLoop_stepping  = 1;
4340   const int log_stepping       = exact_log2(mainLoop_stepping);
4341   const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
4342   const int complexThreshold   = 2*mainLoop_stepping;
4343 
4344   // Don't test for len <= 0 here. This pathological case should not occur anyway.
4345   // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles
4346   // for all well-behaved cases. The situation itself is detected and handled correctly
4347   // within update_byteLoop_crc32.
4348   assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
4349 
4350   BLOCK_COMMENT("kernel_crc32_1word {");
4351 
4352   if (invertCRC) {
4353     nand(crc, crc, crc);                      // 1s complement of crc
4354   }
4355 
4356   // Check for short (<mainLoop_stepping) buffer.
4357   cmpdi(CCR0, len, complexThreshold);
4358   blt(CCR0, L_tail);
4359 
4360   // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
4361   // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
4362   {
4363     // Align buf addr to mainLoop_stepping boundary.
4364     neg(tmp2, buf);                              // Calculate # preLoop iterations for alignment.
4365     rldicl(tmp2, tmp2, 0, 64-log_stepping);      // tmp2 &= mainLoop_stepping-1, i.e. keep only the low log_stepping bits (bits 62..63 here).
4366 
4367     if (complexThreshold > mainLoop_stepping) {
4368       sub(len, len, tmp2);                       // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4369     } else {
4370       sub(tmp, len, tmp2);                       // Remaining bytes for main loop.
4371       cmpdi(CCR0, tmp, mainLoop_stepping);
4372       blt(CCR0, L_tail);                         // For less than one mainloop_stepping left, do only tail processing
4373       mr(len, tmp);                              // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4374     }
4375     update_byteLoop_crc32(crc, buf, tmp2, table, data, false);
4376   }
4377 
4378   srdi(tmp2, len, log_stepping);                 // #iterations for mainLoop
4379   andi(len, len, mainLoop_stepping-1);           // remaining bytes for tailLoop
4380   mtctr(tmp2);
4381 
4382 #ifdef VM_LITTLE_ENDIAN
4383   Register crc_rv = crc;
4384 #else
4385   Register crc_rv = tmp;                         // Load_reverse needs separate registers to work on.
4386                                                  // Occupies tmp, but frees up crc.
4387   load_reverse_32(crc_rv, crc);                  // Revert byte order because we are dealing with big-endian data.
4388   tmp = crc;
4389 #endif
4390 
4391   int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
4392 
4393   align(mainLoop_alignment);                     // Octoword-aligned loop address. Shows 2% improvement.
4394   BIND(L_mainLoop);
4395     update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
4396     bdnz(L_mainLoop);
4397 
4398 #ifndef VM_LITTLE_ENDIAN
4399   load_reverse_32(crc, crc_rv);                  // Revert byte order because we are dealing with big-endian data.
4400   tmp = crc_rv;                                  // Tmp uses its original register again.
4401 #endif
4402 
4403   // Restore original table address for tailLoop.
4404   if (reconstructTableOffset != 0) {
4405     addi(table, table, -reconstructTableOffset);
4406   }
4407 
4408   // Process last few (<complexThreshold) bytes of buffer.
4409   BIND(L_tail);
4410   update_byteLoop_crc32(crc, buf, len, table, data, false);
4411 
4412   if (invertCRC) {
4413     nand(crc, crc, crc);                      // 1s complement of crc
4414   }
4415   BLOCK_COMMENT("} kernel_crc32_1word");
4416 }
4417 
4418 /**
4419  * @param crc   register containing existing CRC (32-bit)
4420  * @param buf   register pointing to input byte buffer (byte*)
4421  * @param len   register containing number of bytes
4422  * @param table register pointing to CRC table
4423  *
4424  * Uses R7_ARG5, R8_ARG6 as work registers.
4425  */
4426 void MacroAssembler::kernel_crc32_1byte(Register crc, Register buf, Register len, Register table,
4427                                         Register t0,  Register t1,  Register t2,  Register t3,
4428                                         bool invertCRC) {
4429   assert_different_registers(crc, buf, len, table);
4430 
4431   Register  data = t0;                   // Holds the current byte to be folded into crc.
4432 
4433   BLOCK_COMMENT("kernel_crc32_1byte {");
4434 
4435   if (invertCRC) {
4436     nand(crc, crc, crc);                      // 1s complement of crc
4437   }
4438 
4439   // Process all bytes in a single-byte loop.
4440   update_byteLoop_crc32(crc, buf, len, table, data, true);
4441 
4442   if (invertCRC) {
4443     nand(crc, crc, crc);                      // 1s complement of crc
4444   }
4445   BLOCK_COMMENT("} kernel_crc32_1byte");
4446 }
4447 
4448 /**
4449  * @param crc             register containing existing CRC (32-bit)
4450  * @param buf             register pointing to input byte buffer (byte*)
4451  * @param len             register containing number of bytes
4452  * @param table           register pointing to CRC table
4453  * @param constants       register pointing to CRC table for 128-bit aligned memory
4454  * @param barretConstants register pointing to table for barrett reduction
4455  * @param t0              volatile register
4456  * @param t1              volatile register
4457  * @param t2              volatile register
4458  * @param t3              volatile register
4459  */
4460 void MacroAssembler::kernel_crc32_1word_vpmsumd(Register crc, Register buf, Register len, Register table,
4461                                                 Register constants,  Register barretConstants,
4462                                                 Register t0,  Register t1, Register t2, Register t3, Register t4,
4463                                                 bool invertCRC) {
4464   assert_different_registers(crc, buf, len, table);
4465 
4466   Label L_alignedHead, L_tail, L_alignTail, L_start, L_end;
4467 
4468   Register  prealign     = t0;
4469   Register  postalign    = t0;
4470 
4471   BLOCK_COMMENT("kernel_crc32_1word_vpmsumd {");
4472 
4473   // 1. Use kernel_crc32_1word for buffers shorter than 384 bytes.
4474   clrldi(len, len, 32);
4475   cmpdi(CCR0, len, 384);
4476   bge(CCR0, L_start);
4477 
4478     Register tc0 = t4;
4479     Register tc1 = constants;
4480     Register tc2 = barretConstants;
4481     kernel_crc32_1word(crc, buf, len, table, t0, t1, t2, t3, tc0, tc1, tc2, table, invertCRC);
4482     b(L_end);
4483 
4484   BIND(L_start);
4485 
4486     // 2. ~c
4487     if (invertCRC) {
4488       nand(crc, crc, crc);                      // 1s complement of crc
4489     }
4490 
4491     // 3. Calculate from 0 to the first 128-byte-aligned address.
4492     clrldi_(prealign, buf, 57);
4493     beq(CCR0, L_alignedHead);
4494 
4495     subfic(prealign, prealign, 128);
4496 
4497     subf(len, prealign, len);
4498     update_byteLoop_crc32(crc, buf, prealign, table, t2, false);
4499 
4500     // 4. Calculate from the first to the last 128-byte-aligned address.
4501     BIND(L_alignedHead);
4502 
4503     clrldi(postalign, len, 57);
4504     subf(len, postalign, len);
4505 
4506     // len is a multiple of 128 and at least 256 bytes here.
4507     kernel_crc32_1word_aligned(crc, buf, len, constants, barretConstants, t1, t2, t3);
4508 
4509     // 5. calculate remaining
4510     cmpdi(CCR0, postalign, 0);
4511     beq(CCR0, L_tail);
4512 
4513     update_byteLoop_crc32(crc, buf, postalign, table, t2, false);
4514 
4515     BIND(L_tail);
4516 
4517     // 6. ~c
4518     if (invertCRC) {
4519       nand(crc, crc, crc);                      // 1s complement of crc
4520     }
4521 
4522   BIND(L_end);
4523 
4524   BLOCK_COMMENT("} kernel_crc32_1word_vpmsumd");
4525 }
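
// Illustrative only (not emitted code): the overall structure of the routine above
// in pseudo C. 'bytewise' stands for update_byteLoop_crc32 and 'vector_kernel' for
// kernel_crc32_1word_aligned; both names are placeholders.
//
//   if (len < 384) return kernel_crc32_1word(...);       // 1. short input: scalar kernel
//   if (invertCRC) crc = ~crc;                           // 2.
//   head = (128 - (buf & 127)) & 127;                    // 3. bytes up to 128-byte alignment
//   crc  = bytewise(crc, buf, head);       buf += head;  len -= head;
//   tail = len & 127;  main = len - tail;
//   crc  = vector_kernel(crc, buf, main);  buf += main;  // 4. 128-byte-aligned middle part
//   crc  = bytewise(crc, buf, tail);                     // 5. remaining bytes
//   if (invertCRC) crc = ~crc;                           // 6.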
4526 
4527 /**
4528  * @param crc             register containing existing CRC (32-bit)
4529  * @param buf             register pointing to input byte buffer (byte*)
4530  * @param len             register containing number of bytes
4531  * @param constants       register pointing to CRC table for 128-bit aligned memory
4532  * @param barretConstants register pointing to table for barrett reduction
4533  * @param t0              volatile register
4534  * @param t1              volatile register
4535  * @param t2              volatile register
4536  */
4537 void MacroAssembler::kernel_crc32_1word_aligned(Register crc, Register buf, Register len,
4538     Register constants, Register barretConstants, Register t0, Register t1, Register t2) {
4539   Label L_mainLoop, L_tail, L_alignTail, L_barrett_reduction, L_end, L_first_warm_up_done, L_first_cool_down, L_second_cool_down, L_XOR, L_test;
4540   Label L_lv0, L_lv1, L_lv2, L_lv3, L_lv4, L_lv5, L_lv6, L_lv7, L_lv8, L_lv9, L_lv10, L_lv11, L_lv12, L_lv13, L_lv14, L_lv15;
4541   Label L_1, L_2, L_3, L_4;
4542 
4543   Register  rLoaded      = t0;
4544   Register  rTmp1        = t1;
4545   Register  rTmp2        = t2;
4546   Register  off16        = R22;
4547   Register  off32        = R23;
4548   Register  off48        = R24;
4549   Register  off64        = R25;
4550   Register  off80        = R26;
4551   Register  off96        = R27;
4552   Register  off112       = R28;
4553   Register  rIdx         = R29;
4554   Register  rMax         = R30;
4555   Register  constantsPos = R31;
4556 
4557   VectorRegister mask_32bit = VR24;
4558   VectorRegister mask_64bit = VR25;
4559   VectorRegister zeroes     = VR26;
4560   VectorRegister const1     = VR27;
4561   VectorRegister const2     = VR28;
4562 
4563   // Save non-volatile vector registers (frameless).
4564   Register offset = t1;   int offsetInt = 0;
4565   offsetInt -= 16; li(offset, -16);           stvx(VR20, offset, R1_SP);
4566   offsetInt -= 16; addi(offset, offset, -16); stvx(VR21, offset, R1_SP);
4567   offsetInt -= 16; addi(offset, offset, -16); stvx(VR22, offset, R1_SP);
4568   offsetInt -= 16; addi(offset, offset, -16); stvx(VR23, offset, R1_SP);
4569   offsetInt -= 16; addi(offset, offset, -16); stvx(VR24, offset, R1_SP);
4570   offsetInt -= 16; addi(offset, offset, -16); stvx(VR25, offset, R1_SP);
4571   offsetInt -= 16; addi(offset, offset, -16); stvx(VR26, offset, R1_SP);
4572   offsetInt -= 16; addi(offset, offset, -16); stvx(VR27, offset, R1_SP);
4573   offsetInt -= 16; addi(offset, offset, -16); stvx(VR28, offset, R1_SP);
4574   offsetInt -= 8; std(R22, offsetInt, R1_SP);
4575   offsetInt -= 8; std(R23, offsetInt, R1_SP);
4576   offsetInt -= 8; std(R24, offsetInt, R1_SP);
4577   offsetInt -= 8; std(R25, offsetInt, R1_SP);
4578   offsetInt -= 8; std(R26, offsetInt, R1_SP);
4579   offsetInt -= 8; std(R27, offsetInt, R1_SP);
4580   offsetInt -= 8; std(R28, offsetInt, R1_SP);
4581   offsetInt -= 8; std(R29, offsetInt, R1_SP);
4582   offsetInt -= 8; std(R30, offsetInt, R1_SP);
4583   offsetInt -= 8; std(R31, offsetInt, R1_SP);
4584 
4585   // Set constants
4586   li(off16, 16);
4587   li(off32, 32);
4588   li(off48, 48);
4589   li(off64, 64);
4590   li(off80, 80);
4591   li(off96, 96);
4592   li(off112, 112);
4593 
4594   clrldi(crc, crc, 32);
4595 
4596   vxor(zeroes, zeroes, zeroes);
4597   vspltisw(VR0, -1);
4598 
4599   vsldoi(mask_32bit, zeroes, VR0, 4);
4600   vsldoi(mask_64bit, zeroes, VR0, 8);
4601 
4602   // Get the initial value into v8
4603   vxor(VR8, VR8, VR8);
4604   mtvrd(VR8, crc);
4605   vsldoi(VR8, zeroes, VR8, 8); // shift into bottom 32 bits
4606 
4607   li (rLoaded, 0);
4608 
4609   rldicr(rIdx, len, 0, 56);
4610 
4611   {
4612     BIND(L_1);
4613     // Checksum in blocks of MAX_SIZE (32768)
4614     lis(rMax, 0);
4615     ori(rMax, rMax, 32768);
4616     mr(rTmp2, rMax);
4617     cmpd(CCR0, rIdx, rMax);
4618     bgt(CCR0, L_2);
4619     mr(rMax, rIdx);
4620 
4621     BIND(L_2);
4622     subf(rIdx, rMax, rIdx);
4623 
4624     // our main loop does 128 bytes at a time
4625     srdi(rMax, rMax, 7);
4626 
4627     /*
4628      * Work out the offset into the constants table to start at. Each
4629      * constant is 16 bytes, and it is used against 128 bytes of input
4630      * data - 128 / 16 = 8
4631      */
4632     sldi(rTmp1, rMax, 4);
4633     srdi(rTmp2, rTmp2, 3);
4634     subf(rTmp1, rTmp1, rTmp2);
4635 
4636     // We reduce our final 128 bytes in a separate step
4637     addi(rMax, rMax, -1);
4638     mtctr(rMax);
4639 
4640     // Find the start of our constants
4641     add(constantsPos, constants, rTmp1);
4642 
4643     // Zero VR0-VR7, which will contain our checksums.
4644     vxor(VR0, VR0, VR0);
4645     vxor(VR1, VR1, VR1);
4646     vxor(VR2, VR2, VR2);
4647     vxor(VR3, VR3, VR3);
4648     vxor(VR4, VR4, VR4);
4649     vxor(VR5, VR5, VR5);
4650     vxor(VR6, VR6, VR6);
4651     vxor(VR7, VR7, VR7);
4652 
4653     lvx(const1, constantsPos);
4654 
4655     /*
4656      * If we are looping back to consume more data we use the values
4657      * already in VR16-VR23.
4658      */
4659     cmpdi(CCR0, rLoaded, 1);
4660     beq(CCR0, L_3);
4661     {
4662 
4663       // First warm up pass
4664       lvx(VR16, buf);
4665       lvx(VR17, off16, buf);
4666       lvx(VR18, off32, buf);
4667       lvx(VR19, off48, buf);
4668       lvx(VR20, off64, buf);
4669       lvx(VR21, off80, buf);
4670       lvx(VR22, off96, buf);
4671       lvx(VR23, off112, buf);
4672       addi(buf, buf, 8*16);
4673 
4674       // xor in initial value
4675       vxor(VR16, VR16, VR8);
4676     }
4677 
4678     BIND(L_3);
4679     bdz(L_first_warm_up_done);
4680 
4681     addi(constantsPos, constantsPos, 16);
4682     lvx(const2, constantsPos);
4683 
4684     // Second warm up pass
4685     vpmsumd(VR8, VR16, const1);
4686     lvx(VR16, buf);
4687 
4688     vpmsumd(VR9, VR17, const1);
4689     lvx(VR17, off16, buf);
4690 
4691     vpmsumd(VR10, VR18, const1);
4692     lvx(VR18, off32, buf);
4693 
4694     vpmsumd(VR11, VR19, const1);
4695     lvx(VR19, off48, buf);
4696 
4697     vpmsumd(VR12, VR20, const1);
4698     lvx(VR20, off64, buf);
4699 
4700     vpmsumd(VR13, VR21, const1);
4701     lvx(VR21, off80, buf);
4702 
4703     vpmsumd(VR14, VR22, const1);
4704     lvx(VR22, off96, buf);
4705 
4706     vpmsumd(VR15, VR23, const1);
4707     lvx(VR23, off112, buf);
4708 
4709     addi(buf, buf, 8 * 16);
4710 
4711     bdz(L_first_cool_down);
4712 
4713     /*
4714      * main loop. We modulo schedule it such that it takes three iterations
4715      * to complete - first iteration load, second iteration vpmsum, third
4716      * iteration xor.
4717      */
4718     {
4719       BIND(L_4);
4720       lvx(const1, constantsPos); addi(constantsPos, constantsPos, 16);
4721 
4722       vxor(VR0, VR0, VR8);
4723       vpmsumd(VR8, VR16, const2);
4724       lvx(VR16, buf);
4725 
4726       vxor(VR1, VR1, VR9);
4727       vpmsumd(VR9, VR17, const2);
4728       lvx(VR17, off16, buf);
4729 
4730       vxor(VR2, VR2, VR10);
4731       vpmsumd(VR10, VR18, const2);
4732       lvx(VR18, off32, buf);
4733 
4734       vxor(VR3, VR3, VR11);
4735       vpmsumd(VR11, VR19, const2);
4736       lvx(VR19, off48, buf);
4737       lvx(const2, constantsPos);
4738 
4739       vxor(VR4, VR4, VR12);
4740       vpmsumd(VR12, VR20, const1);
4741       lvx(VR20, off64, buf);
4742 
4743       vxor(VR5, VR5, VR13);
4744       vpmsumd(VR13, VR21, const1);
4745       lvx(VR21, off80, buf);
4746 
4747       vxor(VR6, VR6, VR14);
4748       vpmsumd(VR14, VR22, const1);
4749       lvx(VR22, off96, buf);
4750 
4751       vxor(VR7, VR7, VR15);
4752       vpmsumd(VR15, VR23, const1);
4753       lvx(VR23, off112, buf);
4754 
4755       addi(buf, buf, 8 * 16);
4756 
4757       bdnz(L_4);
4758     }
4759 
4760     BIND(L_first_cool_down);
4761 
4762     // First cool down pass
4763     lvx(const1, constantsPos);
4764     addi(constantsPos, constantsPos, 16);
4765 
4766     vxor(VR0, VR0, VR8);
4767     vpmsumd(VR8, VR16, const1);
4768 
4769     vxor(VR1, VR1, VR9);
4770     vpmsumd(VR9, VR17, const1);
4771 
4772     vxor(VR2, VR2, VR10);
4773     vpmsumd(VR10, VR18, const1);
4774 
4775     vxor(VR3, VR3, VR11);
4776     vpmsumd(VR11, VR19, const1);
4777 
4778     vxor(VR4, VR4, VR12);
4779     vpmsumd(VR12, VR20, const1);
4780 
4781     vxor(VR5, VR5, VR13);
4782     vpmsumd(VR13, VR21, const1);
4783 
4784     vxor(VR6, VR6, VR14);
4785     vpmsumd(VR14, VR22, const1);
4786 
4787     vxor(VR7, VR7, VR15);
4788     vpmsumd(VR15, VR23, const1);
4789 
4790     BIND(L_second_cool_down);
4791     // Second cool down pass
4792     vxor(VR0, VR0, VR8);
4793     vxor(VR1, VR1, VR9);
4794     vxor(VR2, VR2, VR10);
4795     vxor(VR3, VR3, VR11);
4796     vxor(VR4, VR4, VR12);
4797     vxor(VR5, VR5, VR13);
4798     vxor(VR6, VR6, VR14);
4799     vxor(VR7, VR7, VR15);
4800 
4801     /*
4802      * vpmsumd produces a 96 bit result in the least significant bits
4803      * of the register. Since we are bit reflected we have to shift it
4804      * left 32 bits so it occupies the least significant bits in the
4805      * bit reflected domain.
4806      */
4807     vsldoi(VR0, VR0, zeroes, 4);
4808     vsldoi(VR1, VR1, zeroes, 4);
4809     vsldoi(VR2, VR2, zeroes, 4);
4810     vsldoi(VR3, VR3, zeroes, 4);
4811     vsldoi(VR4, VR4, zeroes, 4);
4812     vsldoi(VR5, VR5, zeroes, 4);
4813     vsldoi(VR6, VR6, zeroes, 4);
4814     vsldoi(VR7, VR7, zeroes, 4);
4815 
4816     // xor with last 1024 bits
4817     lvx(VR8, buf);
4818     lvx(VR9, off16, buf);
4819     lvx(VR10, off32, buf);
4820     lvx(VR11, off48, buf);
4821     lvx(VR12, off64, buf);
4822     lvx(VR13, off80, buf);
4823     lvx(VR14, off96, buf);
4824     lvx(VR15, off112, buf);
4825     addi(buf, buf, 8 * 16);
4826 
4827     vxor(VR16, VR0, VR8);
4828     vxor(VR17, VR1, VR9);
4829     vxor(VR18, VR2, VR10);
4830     vxor(VR19, VR3, VR11);
4831     vxor(VR20, VR4, VR12);
4832     vxor(VR21, VR5, VR13);
4833     vxor(VR22, VR6, VR14);
4834     vxor(VR23, VR7, VR15);
4835 
4836     li(rLoaded, 1);
4837     cmpdi(CCR0, rIdx, 0);
4838     addi(rIdx, rIdx, 128);
4839     bne(CCR0, L_1);
4840   }
4841 
4842   // Work out how many bytes we have left
4843   andi_(len, len, 127);
4844 
4845   // Calculate where in the constant table we need to start
4846   subfic(rTmp1, len, 128);
4847   add(constantsPos, constantsPos, rTmp1);
4848 
4849   // How many 16 byte chunks are in the tail
4850   srdi(rIdx, len, 4);
4851   mtctr(rIdx);
4852 
4853   /*
4854    * Reduce the previously calculated 1024 bits to 64 bits, shifting
4855    * 32 bits to include the trailing 32 bits of zeros
4856    */
4857   lvx(VR0, constantsPos);
4858   lvx(VR1, off16, constantsPos);
4859   lvx(VR2, off32, constantsPos);
4860   lvx(VR3, off48, constantsPos);
4861   lvx(VR4, off64, constantsPos);
4862   lvx(VR5, off80, constantsPos);
4863   lvx(VR6, off96, constantsPos);
4864   lvx(VR7, off112, constantsPos);
4865   addi(constantsPos, constantsPos, 8 * 16);
4866 
4867   vpmsumw(VR0, VR16, VR0);
4868   vpmsumw(VR1, VR17, VR1);
4869   vpmsumw(VR2, VR18, VR2);
4870   vpmsumw(VR3, VR19, VR3);
4871   vpmsumw(VR4, VR20, VR4);
4872   vpmsumw(VR5, VR21, VR5);
4873   vpmsumw(VR6, VR22, VR6);
4874   vpmsumw(VR7, VR23, VR7);
4875 
4876   // Now reduce the tail (0 - 112 bytes)
4877   cmpdi(CCR0, rIdx, 0);
4878   beq(CCR0, L_XOR);
4879 
4880   lvx(VR16, buf); addi(buf, buf, 16);
4881   lvx(VR17, constantsPos);
4882   vpmsumw(VR16, VR16, VR17);
4883   vxor(VR0, VR0, VR16);
4884   bdz(L_XOR);                     // CTR holds the number of remaining 16-byte chunks.
4885 
4886   lvx(VR16, buf); addi(buf, buf, 16);
4887   lvx(VR17, off16, constantsPos);
4888   vpmsumw(VR16, VR16, VR17);
4889   vxor(VR0, VR0, VR16);
4890   bdz(L_XOR);
4891 
4892   lvx(VR16, buf); addi(buf, buf, 16);
4893   lvx(VR17, off32, constantsPos);
4894   vpmsumw(VR16, VR16, VR17);
4895   vxor(VR0, VR0, VR16);
4896   bdz(L_XOR);
4897 
4898   lvx(VR16, buf); addi(buf, buf, 16);
4899   lvx(VR17, off48, constantsPos);
4900   vpmsumw(VR16, VR16, VR17);
4901   vxor(VR0, VR0, VR16);
4902   bdz(L_XOR);
4903 
4904   lvx(VR16, buf); addi(buf, buf, 16);
4905   lvx(VR17, off64, constantsPos);
4906   vpmsumw(VR16, VR16, VR17);
4907   vxor(VR0, VR0, VR16);
4908   bdz(L_XOR);
4909 
4910   lvx(VR16, buf); addi(buf, buf, 16);
4911   lvx(VR17, off80, constantsPos);
4912   vpmsumw(VR16, VR16, VR17);
4913   vxor(VR0, VR0, VR16);
4914   bdz(L_XOR);
4915 
4916   lvx(VR16, buf); addi(buf, buf, 16);
4917   lvx(VR17, off96, constantsPos);
4918   vpmsumw(VR16, VR16, VR17);
4919   vxor(VR0, VR0, VR16);
4920 
4921   // Now xor all the parallel chunks together
4922   BIND(L_XOR);
4923   vxor(VR0, VR0, VR1);
4924   vxor(VR2, VR2, VR3);
4925   vxor(VR4, VR4, VR5);
4926   vxor(VR6, VR6, VR7);
4927 
4928   vxor(VR0, VR0, VR2);
4929   vxor(VR4, VR4, VR6);
4930 
4931   vxor(VR0, VR0, VR4);
4932 
4933   b(L_barrett_reduction);
4934 
4935   BIND(L_first_warm_up_done);
4936   lvx(const1, constantsPos);
4937   addi(constantsPos, constantsPos, 16);
4938   vpmsumd(VR8,  VR16, const1);
4939   vpmsumd(VR9,  VR17, const1);
4940   vpmsumd(VR10, VR18, const1);
4941   vpmsumd(VR11, VR19, const1);
4942   vpmsumd(VR12, VR20, const1);
4943   vpmsumd(VR13, VR21, const1);
4944   vpmsumd(VR14, VR22, const1);
4945   vpmsumd(VR15, VR23, const1);
4946   b(L_second_cool_down);
4947 
4948   BIND(L_barrett_reduction);
4949 
4950   lvx(const1, barretConstants);
4951   addi(barretConstants, barretConstants, 16);
4952   lvx(const2, barretConstants);
4953 
4954   vsldoi(VR1, VR0, VR0, 8);
4955   vxor(VR0, VR0, VR1);    // xor two 64 bit results together
4956 
4957   // shift left one bit
4958   vspltisb(VR1, 1);
4959   vsl(VR0, VR0, VR1);
4960 
4961   vand(VR0, VR0, mask_64bit);
4962 
4963   /*
4964    * The reflected version of Barrett reduction. Instead of bit
4965    * reflecting our data (which is expensive to do), we bit reflect our
4966    * constants and our algorithm, which means the intermediate data in
4967    * our vector registers goes from 0-63 instead of 63-0. We can reflect
4968    * the algorithm because we don't carry in mod 2 arithmetic.
4969    */
4970   vand(VR1, VR0, mask_32bit);  // bottom 32 bits of a
4971   vpmsumd(VR1, VR1, const1);   // ma
4972   vand(VR1, VR1, mask_32bit);  // bottom 32bits of ma
4973   vpmsumd(VR1, VR1, const2);   // qn
4974   vxor(VR0, VR0, VR1);         // a - qn, subtraction is xor in GF(2)
4975 
4976   /*
4977    * Since we are bit reflected, the result (ie the low 32 bits) is in
4978    * the high 32 bits. We just need to shift it left 4 bytes
4979    * V0 [ 0 1 X 3 ]
4980    * V0 [ 0 X 2 3 ]
4981    */
4982   vsldoi(VR0, VR0, zeroes, 4);    // shift result into top 64 bits of VR0
4983 
4984   // Get it into r3
4985   mfvrd(crc, VR0);
4986 
4987   BIND(L_end);
4988 
4989   offsetInt = 0;
4990   // Restore non-volatile Vector registers (frameless).
4991   offsetInt -= 16; li(offset, -16);           lvx(VR20, offset, R1_SP);
4992   offsetInt -= 16; addi(offset, offset, -16); lvx(VR21, offset, R1_SP);
4993   offsetInt -= 16; addi(offset, offset, -16); lvx(VR22, offset, R1_SP);
4994   offsetInt -= 16; addi(offset, offset, -16); lvx(VR23, offset, R1_SP);
4995   offsetInt -= 16; addi(offset, offset, -16); lvx(VR24, offset, R1_SP);
4996   offsetInt -= 16; addi(offset, offset, -16); lvx(VR25, offset, R1_SP);
4997   offsetInt -= 16; addi(offset, offset, -16); lvx(VR26, offset, R1_SP);
4998   offsetInt -= 16; addi(offset, offset, -16); lvx(VR27, offset, R1_SP);
4999   offsetInt -= 16; addi(offset, offset, -16); lvx(VR28, offset, R1_SP);
5000   offsetInt -= 8;  ld(R22, offsetInt, R1_SP);
5001   offsetInt -= 8;  ld(R23, offsetInt, R1_SP);
5002   offsetInt -= 8;  ld(R24, offsetInt, R1_SP);
5003   offsetInt -= 8;  ld(R25, offsetInt, R1_SP);
5004   offsetInt -= 8;  ld(R26, offsetInt, R1_SP);
5005   offsetInt -= 8;  ld(R27, offsetInt, R1_SP);
5006   offsetInt -= 8;  ld(R28, offsetInt, R1_SP);
5007   offsetInt -= 8;  ld(R29, offsetInt, R1_SP);
5008   offsetInt -= 8;  ld(R30, offsetInt, R1_SP);
5009   offsetInt -= 8;  ld(R31, offsetInt, R1_SP);
5010 }
5011 
5012 void MacroAssembler::kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp, bool invertCRC) {
5013   assert_different_registers(crc, buf, /* len,  not used!! */ table, tmp);
5014 
5015   BLOCK_COMMENT("kernel_crc32_singleByte:");
5016   if (invertCRC) {
5017     nand(crc, crc, crc);                // 1s complement of crc
5018   }
5019 
5020   lbz(tmp, 0, buf);                     // Byte from buffer, zero-extended.
5021   update_byte_crc32(crc, tmp, table);
5022 
5023   if (invertCRC) {
5024     nand(crc, crc, crc);                // 1s complement of crc
5025   }
5026 }
5027 
5028 void MacroAssembler::kernel_crc32_singleByteReg(Register crc, Register val, Register table, bool invertCRC) {
5029   assert_different_registers(crc, val, table);
5030 
5031   BLOCK_COMMENT("kernel_crc32_singleByteReg:");
5032   if (invertCRC) {
5033     nand(crc, crc, crc);                // 1s complement of crc
5034   }
5035 
5036   update_byte_crc32(crc, val, table);
5037 
5038   if (invertCRC) {
5039     nand(crc, crc, crc);                // 1s complement of crc
5040   }
5041 }
5042 
5043 // dest_lo += src1 + src2
5044 // dest_hi += carry1 + carry2
5045 void MacroAssembler::add2_with_carry(Register dest_hi,
5046                                      Register dest_lo,
5047                                      Register src1, Register src2) {
5048   li(R0, 0);
5049   addc(dest_lo, dest_lo, src1);
5050   adde(dest_hi, dest_hi, R0);
5051   addc(dest_lo, dest_lo, src2);
5052   adde(dest_hi, dest_hi, R0);
5053 }
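
// Illustrative only (not emitted code): assuming a compiler that provides
// unsigned __int128, the four instructions above are equivalent to
//
//   unsigned __int128 d = ((unsigned __int128)dest_hi << 64) | dest_lo;
//   d += src1;                          // first addc/adde pair
//   d += src2;                          // second addc/adde pair
//   dest_lo = (uint64_t)d;
//   dest_hi = (uint64_t)(d >> 64);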
5054 
5055 // Multiply 64 bit by 64 bit first loop.
5056 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart,
5057                                            Register x_xstart,
5058                                            Register y, Register y_idx,
5059                                            Register z,
5060                                            Register carry,
5061                                            Register product_high, Register product,
5062                                            Register idx, Register kdx,
5063                                            Register tmp) {
5064   //  jlong carry, x[], y[], z[];
5065   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
5066   //    huge_128 product = y[idx] * x[xstart] + carry;
5067   //    z[kdx] = (jlong)product;
5068   //    carry  = (jlong)(product >>> 64);
5069   //  }
5070   //  z[xstart] = carry;
5071 
5072   Label L_first_loop, L_first_loop_exit;
5073   Label L_one_x, L_one_y, L_multiply;
5074 
5075   addic_(xstart, xstart, -1);
5076   blt(CCR0, L_one_x);   // Special case: length of x is 1.
5077 
5078   // Load next two integers of x.
5079   sldi(tmp, xstart, LogBytesPerInt);
5080   ldx(x_xstart, x, tmp);
5081 #ifdef VM_LITTLE_ENDIAN
5082   rldicl(x_xstart, x_xstart, 32, 0);
5083 #endif
5084 
5085   align(32, 16);
5086   bind(L_first_loop);
5087 
5088   cmpdi(CCR0, idx, 1);
5089   blt(CCR0, L_first_loop_exit);
5090   addi(idx, idx, -2);
5091   beq(CCR0, L_one_y);
5092 
5093   // Load next two integers of y.
5094   sldi(tmp, idx, LogBytesPerInt);
5095   ldx(y_idx, y, tmp);
5096 #ifdef VM_LITTLE_ENDIAN
5097   rldicl(y_idx, y_idx, 32, 0);
5098 #endif
5099 
5100 
5101   bind(L_multiply);
5102   multiply64(product_high, product, x_xstart, y_idx);
5103 
5104   li(tmp, 0);
5105   addc(product, product, carry);         // Add carry to result.
5106   adde(product_high, product_high, tmp); // Add carry of the last addition.
5107   addi(kdx, kdx, -2);
5108 
5109   // Store result.
5110 #ifdef VM_LITTLE_ENDIAN
5111   rldicl(product, product, 32, 0);
5112 #endif
5113   sldi(tmp, kdx, LogBytesPerInt);
5114   stdx(product, z, tmp);
5115   mr_if_needed(carry, product_high);
5116   b(L_first_loop);
5117 
5118 
5119   bind(L_one_y); // Load one 32 bit portion of y as (0,value).
5120 
5121   lwz(y_idx, 0, y);
5122   b(L_multiply);
5123 
5124 
5125   bind(L_one_x); // Load one 32 bit portion of x as (0,value).
5126 
5127   lwz(x_xstart, 0, x);
5128   b(L_first_loop);
5129 
5130   bind(L_first_loop_exit);
5131 }
5132 
5133 // Multiply 64 bit by 64 bit and add 128 bit.
5134 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y,
5135                                             Register z, Register yz_idx,
5136                                             Register idx, Register carry,
5137                                             Register product_high, Register product,
5138                                             Register tmp, int offset) {
5139 
5140   //  huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
5141   //  z[kdx] = (jlong)product;
5142 
5143   sldi(tmp, idx, LogBytesPerInt);
5144   if (offset) {
5145     addi(tmp, tmp, offset);
5146   }
5147   ldx(yz_idx, y, tmp);
5148 #ifdef VM_LITTLE_ENDIAN
5149   rldicl(yz_idx, yz_idx, 32, 0);
5150 #endif
5151 
5152   multiply64(product_high, product, x_xstart, yz_idx);
5153   ldx(yz_idx, z, tmp);
5154 #ifdef VM_LITTLE_ENDIAN
5155   rldicl(yz_idx, yz_idx, 32, 0);
5156 #endif
5157 
5158   add2_with_carry(product_high, product, carry, yz_idx);
5159 
5160   sldi(tmp, idx, LogBytesPerInt);
5161   if (offset) {
5162     addi(tmp, tmp, offset);
5163   }
5164 #ifdef VM_LITTLE_ENDIAN
5165   rldicl(product, product, 32, 0);
5166 #endif
5167   stdx(product, z, tmp);
5168 }
5169 
5170 // Multiply 128 bit by 128 bit. Unrolled inner loop.
5171 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart,
5172                                              Register y, Register z,
5173                                              Register yz_idx, Register idx, Register carry,
5174                                              Register product_high, Register product,
5175                                              Register carry2, Register tmp) {
5176 
5177   //  jlong carry, x[], y[], z[];
5178   //  int kdx = ystart+1;
5179   //  for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
5180   //    huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
5181   //    z[kdx+idx+1] = (jlong)product;
5182   //    jlong carry2 = (jlong)(product >>> 64);
5183   //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
5184   //    z[kdx+idx] = (jlong)product;
5185   //    carry = (jlong)(product >>> 64);
5186   //  }
5187   //  idx += 2;
5188   //  if (idx > 0) {
5189   //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
5190   //    z[kdx+idx] = (jlong)product;
5191   //    carry = (jlong)(product >>> 64);
5192   //  }
5193 
5194   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
5195   const Register jdx = R0;
5196 
5197   // Scale the index.
5198   srdi_(jdx, idx, 2);
5199   beq(CCR0, L_third_loop_exit);
5200   mtctr(jdx);
5201 
5202   align(32, 16);
5203   bind(L_third_loop);
5204 
5205   addi(idx, idx, -4);
5206 
5207   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8);
5208   mr_if_needed(carry2, product_high);
5209 
5210   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0);
5211   mr_if_needed(carry, product_high);
5212   bdnz(L_third_loop);
5213 
5214   bind(L_third_loop_exit);  // Handle any left-over operand parts.
5215 
5216   andi_(idx, idx, 0x3);
5217   beq(CCR0, L_post_third_loop_done);
5218 
5219   Label L_check_1;
5220 
5221   addic_(idx, idx, -2);
5222   blt(CCR0, L_check_1);
5223 
5224   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0);
5225   mr_if_needed(carry, product_high);
5226 
5227   bind(L_check_1);
5228 
5229   addi(idx, idx, 0x2);
5230   andi_(idx, idx, 0x1);
5231   addic_(idx, idx, -1);
5232   blt(CCR0, L_post_third_loop_done);
5233 
5234   sldi(tmp, idx, LogBytesPerInt);
5235   lwzx(yz_idx, y, tmp);
5236   multiply64(product_high, product, x_xstart, yz_idx);
5237   lwzx(yz_idx, z, tmp);
5238 
5239   add2_with_carry(product_high, product, yz_idx, carry);
5240 
5241   sldi(tmp, idx, LogBytesPerInt);
5242   stwx(product, z, tmp);
5243   srdi(product, product, 32);
5244 
5245   sldi(product_high, product_high, 32);
5246   orr(product, product, product_high);
5247   mr_if_needed(carry, product);
5248 
5249   bind(L_post_third_loop_done);
5250 }   // multiply_128_x_128_loop
5251 
5252 void MacroAssembler::muladd(Register out, Register in,
5253                             Register offset, Register len, Register k,
5254                             Register tmp1, Register tmp2, Register carry) {
5255 
5256   // Labels
5257   Label LOOP, SKIP;
5258 
5259   // Make sure length is positive.
5260   cmpdi  (CCR0,    len,     0);
5261 
5262   // Prepare variables
5263   subi   (offset,  offset,  4);
5264   li     (carry,   0);
5265   ble    (CCR0,    SKIP);
5266 
5267   mtctr  (len);
5268   subi   (len,     len,     1    );
5269   sldi   (len,     len,     2    );
5270 
5271   // Main loop
5272   bind(LOOP);
5273   lwzx   (tmp1,    len,     in   );
5274   lwzx   (tmp2,    offset,  out  );
5275   mulld  (tmp1,    tmp1,    k    );
5276   add    (tmp2,    carry,   tmp2 );
5277   add    (tmp2,    tmp1,    tmp2 );
5278   stwx   (tmp2,    offset,  out  );
5279   srdi   (carry,   tmp2,    32   );
5280   subi   (offset,  offset,  4    );
5281   subi   (len,     len,     4    );
5282   bdnz   (LOOP);
5283   bind(SKIP);
5284 }
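
// Illustrative only (not emitted code): a C sketch of the per-limb semantics of the
// loop above. 'out_pos' stands for the out[] word addressed by the byte offset in
// 'offset'; in[] is assumed to hold its least significant 32-bit limb at index len-1
// (as in java.math.BigInteger). Names are hypothetical.
//
//   static uint32_t muladd_c(uint32_t* out_pos, const uint32_t* in, int len, uint32_t k) {
//     uint64_t carry = 0;
//     for (int j = len - 1; j >= 0; j--) {
//       uint64_t sum = (uint64_t)in[j] * k + *out_pos + carry;
//       *out_pos-- = (uint32_t)sum;     // store the low 32 bits
//       carry = sum >> 32;              // keep the high 32 bits as carry
//     }
//     return (uint32_t)carry;           // the assembly leaves this in the 'carry' register
//   }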
5285 
5286 void MacroAssembler::multiply_to_len(Register x, Register xlen,
5287                                      Register y, Register ylen,
5288                                      Register z, Register zlen,
5289                                      Register tmp1, Register tmp2,
5290                                      Register tmp3, Register tmp4,
5291                                      Register tmp5, Register tmp6,
5292                                      Register tmp7, Register tmp8,
5293                                      Register tmp9, Register tmp10,
5294                                      Register tmp11, Register tmp12,
5295                                      Register tmp13) {
5296 
5297   ShortBranchVerifier sbv(this);
5298 
5299   assert_different_registers(x, xlen, y, ylen, z, zlen,
5300                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
5301   assert_different_registers(x, xlen, y, ylen, z, zlen,
5302                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp7);
5303   assert_different_registers(x, xlen, y, ylen, z, zlen,
5304                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp8);
5305 
5306   const Register idx = tmp1;
5307   const Register kdx = tmp2;
5308   const Register xstart = tmp3;
5309 
5310   const Register y_idx = tmp4;
5311   const Register carry = tmp5;
5312   const Register product = tmp6;
5313   const Register product_high = tmp7;
5314   const Register x_xstart = tmp8;
5315   const Register tmp = tmp9;
5316 
5317   // First Loop.
5318   //
5319   //  final static long LONG_MASK = 0xffffffffL;
5320   //  int xstart = xlen - 1;
5321   //  int ystart = ylen - 1;
5322   //  long carry = 0;
5323   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
5324   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
5325   //    z[kdx] = (int)product;
5326   //    carry = product >>> 32;
5327   //  }
5328   //  z[xstart] = (int)carry;
5329 
5330   mr_if_needed(idx, ylen);        // idx = ylen
5331   mr_if_needed(kdx, zlen);        // kdx = xlen + ylen
5332   li(carry, 0);                   // carry = 0
5333 
5334   Label L_done;
5335 
5336   addic_(xstart, xlen, -1);
5337   blt(CCR0, L_done);
5338 
5339   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z,
5340                         carry, product_high, product, idx, kdx, tmp);
5341 
5342   Label L_second_loop;
5343 
5344   cmpdi(CCR0, kdx, 0);
5345   beq(CCR0, L_second_loop);
5346 
5347   Label L_carry;
5348 
5349   addic_(kdx, kdx, -1);
5350   beq(CCR0, L_carry);
5351 
5352   // Store lower 32 bits of carry.
5353   sldi(tmp, kdx, LogBytesPerInt);
5354   stwx(carry, z, tmp);
5355   srdi(carry, carry, 32);
5356   addi(kdx, kdx, -1);
5357 
5358 
5359   bind(L_carry);
5360 
5361   // Store upper 32 bits of carry.
5362   sldi(tmp, kdx, LogBytesPerInt);
5363   stwx(carry, z, tmp);
5364 
5365   // Second and third (nested) loops.
5366   //
5367   //  for (int i = xstart-1; i >= 0; i--) { // Second loop
5368   //    carry = 0;
5369   //    for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
5370   //      long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
5371   //                     (z[k] & LONG_MASK) + carry;
5372   //      z[k] = (int)product;
5373   //      carry = product >>> 32;
5374   //    }
5375   //    z[i] = (int)carry;
5376   //  }
5377   //
5378   //  i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx
5379 
5380   bind(L_second_loop);
5381 
5382   li(carry, 0);                   // carry = 0;
5383 
5384   addic_(xstart, xstart, -1);     // i = xstart-1;
5385   blt(CCR0, L_done);
5386 
5387   Register zsave = tmp10;
5388 
5389   mr(zsave, z);
5390 
5391 
5392   Label L_last_x;
5393 
5394   sldi(tmp, xstart, LogBytesPerInt);
5395   add(z, z, tmp);                 // z = z + k - j
5396   addi(z, z, 4);
5397   addic_(xstart, xstart, -1);     // i = xstart-1;
5398   blt(CCR0, L_last_x);
5399 
5400   sldi(tmp, xstart, LogBytesPerInt);
5401   ldx(x_xstart, x, tmp);
5402 #ifdef VM_LITTLE_ENDIAN
5403   rldicl(x_xstart, x_xstart, 32, 0);
5404 #endif
5405 
5406 
5407   Label L_third_loop_prologue;
5408 
5409   bind(L_third_loop_prologue);
5410 
5411   Register xsave = tmp11;
5412   Register xlensave = tmp12;
5413   Register ylensave = tmp13;
5414 
5415   mr(xsave, x);
5416   mr(xlensave, xstart);
5417   mr(ylensave, ylen);
5418 
5419 
5420   multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen,
5421                           carry, product_high, product, x, tmp);
5422 
5423   mr(z, zsave);
5424   mr(x, xsave);
5425   mr(xlen, xlensave);   // This is the decrement of the loop counter!
5426   mr(ylen, ylensave);
5427 
5428   addi(tmp3, xlen, 1);
5429   sldi(tmp, tmp3, LogBytesPerInt);
5430   stwx(carry, z, tmp);
5431   addic_(tmp3, tmp3, -1);
5432   blt(CCR0, L_done);
5433 
5434   srdi(carry, carry, 32);
5435   sldi(tmp, tmp3, LogBytesPerInt);
5436   stwx(carry, z, tmp);
5437   b(L_second_loop);
5438 
5439   // Next infrequent code is moved outside loops.
5440   bind(L_last_x);
5441 
5442   lwz(x_xstart, 0, x);
5443   b(L_third_loop_prologue);
5444 
5445   bind(L_done);
5446 }   // multiply_to_len
5447 
5448 void MacroAssembler::asm_assert(bool check_equal, const char *msg, int id) {
5449 #ifdef ASSERT
5450   Label ok;
5451   if (check_equal) {
5452     beq(CCR0, ok);
5453   } else {
5454     bne(CCR0, ok);
5455   }
5456   stop(msg, id);
5457   bind(ok);
5458 #endif
5459 }
5460 
5461 void MacroAssembler::asm_assert_mems_zero(bool check_equal, int size, int mem_offset,
5462                                           Register mem_base, const char* msg, int id) {
5463 #ifdef ASSERT
5464   switch (size) {
5465     case 4:
5466       lwz(R0, mem_offset, mem_base);
5467       cmpwi(CCR0, R0, 0);
5468       break;
5469     case 8:
5470       ld(R0, mem_offset, mem_base);
5471       cmpdi(CCR0, R0, 0);
5472       break;
5473     default:
5474       ShouldNotReachHere();
5475   }
5476   asm_assert(check_equal, msg, id);
5477 #endif // ASSERT
5478 }
5479 
5480 void MacroAssembler::verify_thread() {
5481   if (VerifyThread) {
5482     unimplemented("'VerifyThread' currently not implemented on PPC");
5483   }
5484 }
5485 
5486 // READ: oop. KILL: R0. May also clobber volatile floating point registers.
5487 void MacroAssembler::verify_oop(Register oop, const char* msg) {
5488   if (!VerifyOops) {
5489     return;
5490   }
5491 
5492   address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
5493   const Register tmp = R11; // Will be preserved.
5494   const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
5495   save_volatile_gprs(R1_SP, -nbytes_save); // except R0
5496 
5497   mr_if_needed(R4_ARG2, oop);
5498   save_LR_CR(tmp); // save in old frame
5499   push_frame_reg_args(nbytes_save, tmp);
5500   // load FunctionDescriptor** / entry_address *
5501   load_const_optimized(tmp, fd, R0);
5502   // load FunctionDescriptor* / entry_address
5503   ld(tmp, 0, tmp);
5504   load_const_optimized(R3_ARG1, (address)msg, R0);
5505   // Call destination for its side effect.
5506   call_c(tmp);
5507 
5508   pop_frame();
5509   restore_LR_CR(tmp);
5510   restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
5511 }
5512 
5513 void MacroAssembler::verify_oop_addr(RegisterOrConstant offs, Register base, const char* msg) {
5514   if (!VerifyOops) {
5515     return;
5516   }
5517 
5518   address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
5519   const Register tmp = R11; // Will be preserved.
5520   const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
5521   save_volatile_gprs(R1_SP, -nbytes_save); // except R0
5522 
5523   ld(R4_ARG2, offs, base);
5524   save_LR_CR(tmp); // save in old frame
5525   push_frame_reg_args(nbytes_save, tmp);
5526   // load FunctionDescriptor** / entry_address *
5527   load_const_optimized(tmp, fd, R0);
5528   // load FunctionDescriptor* / entry_address
5529   ld(tmp, 0, tmp);
5530   load_const_optimized(R3_ARG1, (address)msg, R0);
5531   // Call destination for its side effect.
5532   call_c(tmp);
5533 
5534   pop_frame();
5535   restore_LR_CR(tmp);
5536   restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
5537 }
5538 
5539 const char* stop_types[] = {
5540   "stop",
5541   "untested",
5542   "unimplemented",
5543   "shouldnotreachhere"
5544 };
5545 
5546 static void stop_on_request(int tp, const char* msg) {
5547   tty->print("PPC assembly code requires stop: (%s) %s\n", stop_types[tp%/*stop_end*/4], msg);
5548   guarantee(false, "PPC assembly code requires stop: %s", msg);
5549 }
5550 
5551 // Call a C-function that prints output.
5552 void MacroAssembler::stop(int type, const char* msg, int id) {
5553 #ifndef PRODUCT
5554   block_comment(err_msg("stop: %s %s {", stop_types[type%stop_end], msg));
5555 #else
5556   block_comment("stop {");
5557 #endif
5558 
5559   // setup arguments
5560   load_const_optimized(R3_ARG1, type);
5561   load_const_optimized(R4_ARG2, (void *)msg, /*tmp=*/R0);
5562   call_VM_leaf(CAST_FROM_FN_PTR(address, stop_on_request), R3_ARG1, R4_ARG2);
5563   illtrap();
5564   emit_int32(id);
5565   block_comment("} stop;");
5566 }
5567 
5568 #ifndef PRODUCT
5569 // Write pattern 0x0101010101010101 in memory region [low-before, high+after].
5570 // Val, addr are temp registers.
5571 // If low == addr, addr is killed.
5572 // High is preserved.
5573 void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) {
5574   if (!ZapMemory) return;
5575 
5576   assert_different_registers(low, val);
5577 
5578   BLOCK_COMMENT("zap memory region {");
5579   load_const_optimized(val, 0x0101010101010101);
5580   int size = before + after;
5581   if (low == high && size < 5 && size > 0) {
5582     int offset = -before*BytesPerWord;
5583     for (int i = 0; i < size; ++i) {
5584       std(val, offset, low);
5585       offset += (1*BytesPerWord);
5586     }
5587   } else {
5588     addi(addr, low, -before*BytesPerWord);
5589     assert_different_registers(high, val);
5590     if (after) addi(high, high, after * BytesPerWord);
5591     Label loop;
5592     bind(loop);
5593     std(val, 0, addr);
5594     addi(addr, addr, 8);
5595     cmpd(CCR6, addr, high);
5596     ble(CCR6, loop);
5597     if (after) addi(high, high, -after * BytesPerWord);  // Correct back to old value.
5598   }
5599   BLOCK_COMMENT("} zap memory region");
5600 }
5601 
5602 #endif // !PRODUCT
5603 
5604 SkipIfEqualZero::SkipIfEqualZero(MacroAssembler* masm, Register temp, const bool* flag_addr) : _masm(masm), _label() {
5605   int simm16_offset = masm->load_const_optimized(temp, (address)flag_addr, R0, true);
5606   assert(sizeof(bool) == 1, "PowerPC ABI");
5607   masm->lbz(temp, simm16_offset, temp);
5608   masm->cmpwi(CCR0, temp, 0);
5609   masm->beq(CCR0, _label);
5610 }
5611 
5612 SkipIfEqualZero::~SkipIfEqualZero() {
5613   _masm->bind(_label);
5614 }
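
// Typical use (hypothetical sketch): instructions emitted between construction and
// destruction are skipped at run time whenever *flag_addr is false.
//
//   {
//     SkipIfEqualZero skip_if_off(masm, Rtemp, &SomeBoolFlag); // branches over the block if the flag is 0
//     // ... code that should only run when the flag is set ...
//   } // ~SkipIfEqualZero binds the skip target here.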