/*
 * Copyright (c) 1997, 2017, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2017, SAP SE. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "compiler/disassembler.hpp"
#include "gc/shared/cardTableModRefBS.hpp"
#include "gc/shared/collectedHeap.inline.hpp"
#include "interpreter/interpreter.hpp"
#include "memory/resourceArea.hpp"
#include "nativeInst_ppc.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/biasedLocking.hpp"
#include "runtime/icache.hpp"
#include "runtime/interfaceSupport.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/os.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/macros.hpp"
#if INCLUDE_ALL_GCS
#include "gc/g1/g1CollectedHeap.inline.hpp"
#include "gc/g1/g1SATBCardTableModRefBS.hpp"
#include "gc/g1/heapRegion.hpp"
#endif // INCLUDE_ALL_GCS
#ifdef COMPILER2
#include "opto/intrinsicnode.hpp"
#endif

#ifdef PRODUCT
#define BLOCK_COMMENT(str) // nothing
#else
#define BLOCK_COMMENT(str) block_comment(str)
#endif
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

#ifdef ASSERT
// On RISC, there's no benefit to verifying instruction boundaries.
bool AbstractAssembler::pd_check_instruction_mark() { return false; }
#endif

void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) {
  assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range");
  if (Assembler::is_simm(si31, 16)) {
    ld(d, si31, a);
    if (emit_filler_nop) nop();
  } else {
    const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31);
    const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31);
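    // The offset doesn't fit into a signed 16-bit displacement: materialize the
    // high half with addis, then fold the low half into the ld's displacement.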
    addis(d, a, hi);
    ld(d, lo, d);
  }
}

void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) {
  assert_different_registers(d, a);
  ld_largeoffset_unchecked(d, si31, a, emit_filler_nop);
}

void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base,
                                      size_t size_in_bytes, bool is_signed) {
  switch (size_in_bytes) {
  case  8:              ld(dst, offs, base);                         break;
  case  4:  is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break;
  case  2:  is_signed ? lha(dst, offs, base) : lhz(dst, offs, base); break;
  case  1:  lbz(dst, offs, base); if (is_signed) extsb(dst, dst);    break; // lba doesn't exist :(
  default:  ShouldNotReachHere();
  }
}

void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base,
                                       size_t size_in_bytes) {
  switch (size_in_bytes) {
  case  8:  std(dst, offs, base); break;
  case  4:  stw(dst, offs, base); break;
  case  2:  sth(dst, offs, base); break;
  case  1:  stb(dst, offs, base); break;
  default:  ShouldNotReachHere();
  }
}

void MacroAssembler::align(int modulus, int max, int rem) {
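  // Insert nops until offset() % modulus == rem, but only if at most 'max' bytes of padding are required.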
  int padding = (rem + modulus - (offset() % modulus)) % modulus;
  if (padding > max) return;
  for (int c = (padding >> 2); c > 0; --c) { nop(); }
}

// Issue instructions that calculate given TOC from global TOC.
void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16,
                                                       bool add_relocation, bool emit_dummy_addr) {
  int offset = -1;
  if (emit_dummy_addr) {
    offset = -128; // dummy address
  } else if (addr != (address)(intptr_t)-1) {
    offset = MacroAssembler::offset_to_global_toc(addr);
  }

  if (hi16) {
    addis(dst, R29_TOC, MacroAssembler::largeoffset_si16_si16_hi(offset));
  }
  if (lo16) {
    if (add_relocation) {
      // Relocate at the addi to avoid confusion with a load from the method's TOC.
      relocate(internal_word_Relocation::spec(addr));
    }
    addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset));
  }
}

address MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) {
  const int offset = MacroAssembler::offset_to_global_toc(addr);

  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the addi,
  // and the addi reads and writes the same register dst.
  const int dst = inv_rt_field(inst2);
  assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");

  // Now, find the preceding addis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  while (inst1_addr >= bound) {
    inst1 = *(int *) inst1_addr;
    if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
      // Stop, found the addis which writes dst.
      break;
    }
    inst1_addr -= BytesPerInstWord;
  }

  assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
  set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset));
  set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset));
  return inst1_addr;
}

address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) {
  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the addi,
  // and the addi reads and writes the same register dst.
  const int dst = inv_rt_field(inst2);
  assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");

  // Now, find the preceding addis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  while (inst1_addr >= bound) {
    inst1 = *(int *) inst1_addr;
    if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
      // stop, found the addis which writes dst
      break;
    }
    inst1_addr -= BytesPerInstWord;
  }

  assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");

  int offset = (get_imm(inst1_addr, 0) << 16) + get_imm(inst2_addr, 0);
  // -1 is a special case
  if (offset == -1) {
    return (address)(intptr_t)-1;
  } else {
    return global_toc() + offset;
  }
}

#ifdef _LP64
// Patch compressed oops or klass constants.
// Assembler sequence is
// 1) compressed oops:
//    lis  rx = const.hi
//    ori rx = rx | const.lo
// 2) compressed klass:
//    lis  rx = const.hi
//    clrldi rx = rx & 0xFFFFffff // clearMS32b, optional
//    ori rx = rx | const.lo
// The clrldi, if present, is simply skipped over when patching.
address MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) {
  assert(UseCompressedOops, "Should only patch compressed oops");

  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the ori,
  // and the ori reads and writes the same register dst.
  const int dst = inv_rta_field(inst2);
  assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
  // Now, find the preceding lis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  bool inst1_found = false;
  while (inst1_addr >= bound) {
    inst1 = *(int *)inst1_addr;
    if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
    inst1_addr -= BytesPerInstWord;
  }
  assert(inst1_found, "inst is not lis");

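  // Split the 32-bit narrow constant into the 16-bit halves expected by the lis/ori pair.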
  int xc = (data >> 16) & 0xffff;
  int xd = (data >>  0) & 0xffff;

  set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo
  set_imm((int *)inst2_addr,        (xd)); // unsigned int
  return inst1_addr;
}

// Get compressed oop or klass constant.
narrowOop MacroAssembler::get_narrow_oop(address a, address bound) {
  assert(UseCompressedOops, "Should only patch compressed oops");

  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the ori,
  // and the ori reads and writes the same register dst.
  const int dst = inv_rta_field(inst2);
  assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
  // Now, find the preceding lis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  bool inst1_found = false;

  while (inst1_addr >= bound) {
    inst1 = *(int *) inst1_addr;
    if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break;}
    inst1_addr -= BytesPerInstWord;
  }
  assert(inst1_found, "inst is not lis");

  uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff));
  uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16);

  return (int) (xl | xh);
}
#endif // _LP64

// Returns true if successful.
bool MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a,
                                                Register toc, bool fixed_size) {
  int toc_offset = 0;
  // Use RelocationHolder::none for the constant pool entry, otherwise
  // we will end up with a failing NativeCall::verify(x) where x is
  // the address of the constant pool entry.
  // FIXME: We should insert relocation information for oops at the constant
  // pool entries instead of inserting it at the loads; patching of a constant
  // pool entry should be less expensive.
  address const_address = address_constant((address)a.value(), RelocationHolder::none);
  if (const_address == NULL) { return false; } // allocation failure
  // Relocate at the pc of the load.
  relocate(a.rspec());
  toc_offset = (int)(const_address - code()->consts()->start());
  ld_largeoffset_unchecked(dst, toc_offset, toc, fixed_size);
  return true;
}

bool MacroAssembler::is_load_const_from_method_toc_at(address a) {
  const address inst1_addr = a;
  const int inst1 = *(int *)inst1_addr;

  // The relocation points to the ld or the addis.
  return (is_ld(inst1)) ||
         (is_addis(inst1) && inv_ra_field(inst1) != 0);
}

int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) {
  assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc");

  const address inst1_addr = a;
  const int inst1 = *(int *)inst1_addr;

  if (is_ld(inst1)) {
    return inv_d1_field(inst1);
  } else if (is_addis(inst1)) {
    const int dst = inv_rt_field(inst1);

    // Now, find the succeeding ld which reads and writes to dst.
    address inst2_addr = inst1_addr + BytesPerInstWord;
    int inst2 = 0;
    while (true) {
      inst2 = *(int *) inst2_addr;
      if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) {
        // Stop, found the ld which reads and writes dst.
        break;
      }
      inst2_addr += BytesPerInstWord;
    }
    return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2);
  }
  ShouldNotReachHere();
  return 0;
}

// Get the constant from a `load_const' sequence.
long MacroAssembler::get_const(address a) {
  assert(is_load_const_at(a), "not a load of a constant");
  const int *p = (const int*) a;
  unsigned long x = (((unsigned long) (get_imm(a,0) & 0xffff)) << 48);
  if (is_ori(*(p+1))) {
    x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 32);
    x |= (((unsigned long) (get_imm(a,3) & 0xffff)) << 16);
    x |= (((unsigned long) (get_imm(a,4) & 0xffff)));
  } else if (is_lis(*(p+1))) {
    x |= (((unsigned long) (get_imm(a,2) & 0xffff)) << 32);
    x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 16);
    x |= (((unsigned long) (get_imm(a,3) & 0xffff)));
  } else {
    ShouldNotReachHere();
    return (long) 0;
  }
  return (long) x;
}

// Patch the 64 bit constant of a `load_const' sequence. This is a
// low-level procedure; it neither flushes the instruction cache nor
// is it MT-safe.
void MacroAssembler::patch_const(address a, long x) {
  assert(is_load_const_at(a), "not a load of a constant");
  int *p = (int*) a;
  if (is_ori(*(p+1))) {
    set_imm(0 + p, (x >> 48) & 0xffff);
    set_imm(1 + p, (x >> 32) & 0xffff);
    set_imm(3 + p, (x >> 16) & 0xffff);
    set_imm(4 + p, x & 0xffff);
  } else if (is_lis(*(p+1))) {
    set_imm(0 + p, (x >> 48) & 0xffff);
    set_imm(2 + p, (x >> 32) & 0xffff);
    set_imm(1 + p, (x >> 16) & 0xffff);
    set_imm(3 + p, x & 0xffff);
  } else {
    ShouldNotReachHere();
  }
}

AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) {
  assert(oop_recorder() != NULL, "this assembler needs a Recorder");
  int index = oop_recorder()->allocate_metadata_index(obj);
  RelocationHolder rspec = metadata_Relocation::spec(index);
  return AddressLiteral((address)obj, rspec);
}

AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) {
  assert(oop_recorder() != NULL, "this assembler needs a Recorder");
  int index = oop_recorder()->find_index(obj);
  RelocationHolder rspec = metadata_Relocation::spec(index);
  return AddressLiteral((address)obj, rspec);
}

AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) {
  assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int oop_index = oop_recorder()->allocate_oop_index(obj);
  return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
}

AddressLiteral MacroAssembler::constant_oop_address(jobject obj) {
  assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int oop_index = oop_recorder()->find_index(obj);
  return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
}

RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
                                                      Register tmp, int offset) {
  intptr_t value = *delayed_value_addr;
  if (value != 0) {
    return RegisterOrConstant(value + offset);
  }

  // Load indirectly to solve generation ordering problem.
  // static address, no relocation
  int simm16_offset = load_const_optimized(tmp, delayed_value_addr, noreg, true);
  ld(tmp, simm16_offset, tmp); // must be aligned ((xa & 3) == 0)

  if (offset != 0) {
    addi(tmp, tmp, offset);
  }

  return RegisterOrConstant(tmp);
}

#ifndef PRODUCT
void MacroAssembler::pd_print_patched_instruction(address branch) {
  Unimplemented(); // TODO: PPC port
}
#endif // ndef PRODUCT

// Conditional far branch for destinations encodable in 24+2 bits.
void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) {

  // If requested by flag optimize, relocate the bc_far as a
  // runtime_call and prepare for optimizing it when the code gets
  // relocated.
  if (optimize == bc_far_optimize_on_relocate) {
    relocate(relocInfo::runtime_call_type);
  }

  // variant 2:
  //
  //    b!cxx SKIP
  //    bxx   DEST
  //  SKIP:
  //

  const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
                                                opposite_bcond(inv_boint_bcond(boint)));

  // We emit two branches.
  // First, a conditional branch which jumps around the far branch.
  const address not_taken_pc = pc() + 2 * BytesPerInstWord;
  const address bc_pc        = pc();
  bc(opposite_boint, biint, not_taken_pc);

  const int bc_instr = *(int*)bc_pc;
  assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition");
  assert(opposite_boint == inv_bo_field(bc_instr), "postcondition");
  assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))),
                                     opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))),
         "postcondition");
  assert(biint == inv_bi_field(bc_instr), "postcondition");

  // Second, an unconditional far branch which jumps to dest.
  // Note: target(dest) remembers the current pc (see CodeSection::target)
  //       and returns the current pc if the label is not bound yet; when
  //       the label gets bound, the unconditional far branch will be patched.
  const address target_pc = target(dest);
  const address b_pc  = pc();
  b(target_pc);

  assert(not_taken_pc == pc(),                     "postcondition");
  assert(dest.is_bound() || target_pc == b_pc, "postcondition");
}

// 1 or 2 instructions
void MacroAssembler::bc_far_optimized(int boint, int biint, Label& dest) {
  if (dest.is_bound() && is_within_range_of_bcxx(target(dest), pc())) {
    bc(boint, biint, dest);
  } else {
    bc_far(boint, biint, dest, MacroAssembler::bc_far_optimize_on_relocate);
  }
}

bool MacroAssembler::is_bc_far_at(address instruction_addr) {
  return is_bc_far_variant1_at(instruction_addr) ||
         is_bc_far_variant2_at(instruction_addr) ||
         is_bc_far_variant3_at(instruction_addr);
}

address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) {
  if (is_bc_far_variant1_at(instruction_addr)) {
    const address instruction_1_addr = instruction_addr;
    const int instruction_1 = *(int*)instruction_1_addr;
    return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr);
  } else if (is_bc_far_variant2_at(instruction_addr)) {
    const address instruction_2_addr = instruction_addr + 4;
    return bxx_destination(instruction_2_addr);
  } else if (is_bc_far_variant3_at(instruction_addr)) {
    return instruction_addr + 8;
  }
  // variant 4 ???
  ShouldNotReachHere();
  return NULL;
}

void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) {

  if (is_bc_far_variant3_at(instruction_addr)) {
    // variant 3, far cond branch to the next instruction, already patched to nops:
    //
    //    nop
    //    endgroup
    //  SKIP/DEST:
    //
    return;
  }

  // first, extract boint and biint from the current branch
  int boint = 0;
  int biint = 0;

  ResourceMark rm;
  const int code_size = 2 * BytesPerInstWord;
  CodeBuffer buf(instruction_addr, code_size);
  MacroAssembler masm(&buf);
  if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) {
    // Far branch to next instruction: Optimize it by patching nops (produce variant 3).
    masm.nop();
    masm.endgroup();
  } else {
    if (is_bc_far_variant1_at(instruction_addr)) {
      // variant 1, the 1st instruction contains the destination address:
      //
      //    bcxx  DEST
      //    nop
      //
      const int instruction_1 = *(int*)(instruction_addr);
      boint = inv_bo_field(instruction_1);
      biint = inv_bi_field(instruction_1);
    } else if (is_bc_far_variant2_at(instruction_addr)) {
      // variant 2, the 2nd instruction contains the destination address:
      //
      //    b!cxx SKIP
      //    bxx   DEST
      //  SKIP:
      //
      const int instruction_1 = *(int*)(instruction_addr);
      boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))),
          opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1))));
      biint = inv_bi_field(instruction_1);
    } else {
      // variant 4???
      ShouldNotReachHere();
    }

    // second, set the new branch destination and optimize the code
    if (dest != instruction_addr + 4 && // the bc_far is still unbound!
        masm.is_within_range_of_bcxx(dest, instruction_addr)) {
      // variant 1:
      //
      //    bcxx  DEST
      //    nop
      //
      masm.bc(boint, biint, dest);
      masm.nop();
    } else {
      // variant 2:
      //
      //    b!cxx SKIP
      //    bxx   DEST
      //  SKIP:
      //
      const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
                                                    opposite_bcond(inv_boint_bcond(boint)));
      const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord;
      masm.bc(opposite_boint, biint, not_taken_pc);
      masm.b(dest);
    }
  }
  ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
}

// Emit a NOT mt-safe patchable 64 bit absolute call/jump.
void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) {
  // get current pc
  uint64_t start_pc = (uint64_t) pc();

  const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last
  const address pc_of_b  = (address) (start_pc + (0*BytesPerInstWord)); // b is first

  // relocate here
  if (rt != relocInfo::none) {
    relocate(rt);
  }

  if ( ReoptimizeCallSequences &&
       (( link && is_within_range_of_b(dest, pc_of_bl)) ||
        (!link && is_within_range_of_b(dest, pc_of_b)))) {
    // variant 2:
    // Emit an optimized, pc-relative call/jump.

    if (link) {
      // some padding
      nop();
      nop();
      nop();
      nop();
      nop();
      nop();

      // do the call
      assert(pc() == pc_of_bl, "just checking");
      bl(dest, relocInfo::none);
    } else {
      // do the jump
      assert(pc() == pc_of_b, "just checking");
      b(dest, relocInfo::none);

      // some padding
      nop();
      nop();
      nop();
      nop();
      nop();
      nop();
    }

    // Assert that we can identify the emitted call/jump.
    assert(is_bxx64_patchable_variant2_at((address)start_pc, link),
           "can't identify emitted call");
  } else {
    // variant 1:
    mr(R0, R11);  // spill R11 -> R0.

    // Load the destination address into CTR,
    // calculate destination relative to global toc.
    calculate_address_from_global_toc(R11, dest, true, true, false);

    mtctr(R11);
    mr(R11, R0);  // spill R11 <- R0.
    nop();

    // do the call/jump
    if (link) {
      bctrl();
    } else {
      bctr();
    }
    // Assert that we can identify the emitted call/jump.
    assert(is_bxx64_patchable_variant1b_at((address)start_pc, link),
           "can't identify emitted call");
  }

  // Assert that we can identify the emitted call/jump.
  assert(is_bxx64_patchable_at((address)start_pc, link),
         "can't identify emitted call");
  assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest,
         "wrong encoding of dest address");
}

// Identify a bxx64_patchable instruction.
bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) {
  return is_bxx64_patchable_variant1b_at(instruction_addr, link)
    //|| is_bxx64_patchable_variant1_at(instruction_addr, link)
      || is_bxx64_patchable_variant2_at(instruction_addr, link);
}

// Does the bxx64_patchable instruction use a pc-relative encoding of
// the call destination?
bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) {
  // variant 2 is pc-relative
  return is_bxx64_patchable_variant2_at(instruction_addr, link);
}

// Identify variant 1.
bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) {
  unsigned int* instr = (unsigned int*) instruction_addr;
  return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
    && is_mtctr(instr[5]) // mtctr
    && is_load_const_at(instruction_addr);
}

// Identify variant 1b: load destination relative to global toc.
bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) {
  unsigned int* instr = (unsigned int*) instruction_addr;
  return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
    && is_mtctr(instr[3]) // mtctr
    && is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr);
}

// Identify variant 2.
bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) {
  unsigned int* instr = (unsigned int*) instruction_addr;
  if (link) {
    return is_bl (instr[6])  // bl dest is last
      && is_nop(instr[0])  // nop
      && is_nop(instr[1])  // nop
      && is_nop(instr[2])  // nop
      && is_nop(instr[3])  // nop
      && is_nop(instr[4])  // nop
      && is_nop(instr[5]); // nop
  } else {
    return is_b  (instr[0])  // b  dest is first
      && is_nop(instr[1])  // nop
      && is_nop(instr[2])  // nop
      && is_nop(instr[3])  // nop
      && is_nop(instr[4])  // nop
      && is_nop(instr[5])  // nop
      && is_nop(instr[6]); // nop
  }
}

// Set dest address of a bxx64_patchable instruction.
void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) {
  ResourceMark rm;
  int code_size = MacroAssembler::bxx64_patchable_size;
  CodeBuffer buf(instruction_addr, code_size);
  MacroAssembler masm(&buf);
  masm.bxx64_patchable(dest, relocInfo::none, link);
  ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
}

// Get dest address of a bxx64_patchable instruction.
address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) {
  if (is_bxx64_patchable_variant1_at(instruction_addr, link)) {
    return (address) (unsigned long) get_const(instruction_addr);
  } else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) {
    unsigned int* instr = (unsigned int*) instruction_addr;
    if (link) {
      const int instr_idx = 6; // bl is last
      int branchoffset = branch_destination(instr[instr_idx], 0);
      return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
    } else {
      const int instr_idx = 0; // b is first
      int branchoffset = branch_destination(instr[instr_idx], 0);
      return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
    }
  // Load dest relative to global toc.
  } else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) {
    return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord,
                                                               instruction_addr);
  } else {
    ShouldNotReachHere();
    return NULL;
  }
}

// Uses ordering which corresponds to ABI:
//    _savegpr0_14:  std  r14,-144(r1)
//    _savegpr0_15:  std  r15,-136(r1)
//    _savegpr0_16:  std  r16,-128(r1)
void MacroAssembler::save_nonvolatile_gprs(Register dst, int offset) {
  std(R14, offset, dst);   offset += 8;
  std(R15, offset, dst);   offset += 8;
  std(R16, offset, dst);   offset += 8;
  std(R17, offset, dst);   offset += 8;
  std(R18, offset, dst);   offset += 8;
  std(R19, offset, dst);   offset += 8;
  std(R20, offset, dst);   offset += 8;
  std(R21, offset, dst);   offset += 8;
  std(R22, offset, dst);   offset += 8;
  std(R23, offset, dst);   offset += 8;
  std(R24, offset, dst);   offset += 8;
  std(R25, offset, dst);   offset += 8;
  std(R26, offset, dst);   offset += 8;
  std(R27, offset, dst);   offset += 8;
  std(R28, offset, dst);   offset += 8;
  std(R29, offset, dst);   offset += 8;
  std(R30, offset, dst);   offset += 8;
  std(R31, offset, dst);   offset += 8;

  stfd(F14, offset, dst);   offset += 8;
  stfd(F15, offset, dst);   offset += 8;
  stfd(F16, offset, dst);   offset += 8;
  stfd(F17, offset, dst);   offset += 8;
  stfd(F18, offset, dst);   offset += 8;
  stfd(F19, offset, dst);   offset += 8;
  stfd(F20, offset, dst);   offset += 8;
  stfd(F21, offset, dst);   offset += 8;
  stfd(F22, offset, dst);   offset += 8;
  stfd(F23, offset, dst);   offset += 8;
  stfd(F24, offset, dst);   offset += 8;
  stfd(F25, offset, dst);   offset += 8;
  stfd(F26, offset, dst);   offset += 8;
  stfd(F27, offset, dst);   offset += 8;
  stfd(F28, offset, dst);   offset += 8;
  stfd(F29, offset, dst);   offset += 8;
  stfd(F30, offset, dst);   offset += 8;
  stfd(F31, offset, dst);
}

// Uses ordering which corresponds to ABI:
//    _restgpr0_14:  ld   r14,-144(r1)
//    _restgpr0_15:  ld   r15,-136(r1)
//    _restgpr0_16:  ld   r16,-128(r1)
void MacroAssembler::restore_nonvolatile_gprs(Register src, int offset) {
  ld(R14, offset, src);   offset += 8;
  ld(R15, offset, src);   offset += 8;
  ld(R16, offset, src);   offset += 8;
  ld(R17, offset, src);   offset += 8;
  ld(R18, offset, src);   offset += 8;
  ld(R19, offset, src);   offset += 8;
  ld(R20, offset, src);   offset += 8;
  ld(R21, offset, src);   offset += 8;
  ld(R22, offset, src);   offset += 8;
  ld(R23, offset, src);   offset += 8;
  ld(R24, offset, src);   offset += 8;
  ld(R25, offset, src);   offset += 8;
  ld(R26, offset, src);   offset += 8;
  ld(R27, offset, src);   offset += 8;
  ld(R28, offset, src);   offset += 8;
  ld(R29, offset, src);   offset += 8;
  ld(R30, offset, src);   offset += 8;
  ld(R31, offset, src);   offset += 8;

  // FP registers
  lfd(F14, offset, src);   offset += 8;
  lfd(F15, offset, src);   offset += 8;
  lfd(F16, offset, src);   offset += 8;
  lfd(F17, offset, src);   offset += 8;
  lfd(F18, offset, src);   offset += 8;
  lfd(F19, offset, src);   offset += 8;
  lfd(F20, offset, src);   offset += 8;
  lfd(F21, offset, src);   offset += 8;
  lfd(F22, offset, src);   offset += 8;
  lfd(F23, offset, src);   offset += 8;
  lfd(F24, offset, src);   offset += 8;
  lfd(F25, offset, src);   offset += 8;
  lfd(F26, offset, src);   offset += 8;
  lfd(F27, offset, src);   offset += 8;
  lfd(F28, offset, src);   offset += 8;
  lfd(F29, offset, src);   offset += 8;
  lfd(F30, offset, src);   offset += 8;
  lfd(F31, offset, src);
}

// For verify_oops.
void MacroAssembler::save_volatile_gprs(Register dst, int offset) {
  std(R2,  offset, dst);   offset += 8;
  std(R3,  offset, dst);   offset += 8;
  std(R4,  offset, dst);   offset += 8;
  std(R5,  offset, dst);   offset += 8;
  std(R6,  offset, dst);   offset += 8;
  std(R7,  offset, dst);   offset += 8;
  std(R8,  offset, dst);   offset += 8;
  std(R9,  offset, dst);   offset += 8;
  std(R10, offset, dst);   offset += 8;
  std(R11, offset, dst);   offset += 8;
  std(R12, offset, dst);   offset += 8;

  stfd(F0, offset, dst);   offset += 8;
  stfd(F1, offset, dst);   offset += 8;
  stfd(F2, offset, dst);   offset += 8;
  stfd(F3, offset, dst);   offset += 8;
  stfd(F4, offset, dst);   offset += 8;
  stfd(F5, offset, dst);   offset += 8;
  stfd(F6, offset, dst);   offset += 8;
  stfd(F7, offset, dst);   offset += 8;
  stfd(F8, offset, dst);   offset += 8;
  stfd(F9, offset, dst);   offset += 8;
  stfd(F10, offset, dst);  offset += 8;
  stfd(F11, offset, dst);  offset += 8;
  stfd(F12, offset, dst);  offset += 8;
  stfd(F13, offset, dst);
}

// For verify_oops.
void MacroAssembler::restore_volatile_gprs(Register src, int offset) {
  ld(R2,  offset, src);   offset += 8;
  ld(R3,  offset, src);   offset += 8;
  ld(R4,  offset, src);   offset += 8;
  ld(R5,  offset, src);   offset += 8;
  ld(R6,  offset, src);   offset += 8;
  ld(R7,  offset, src);   offset += 8;
  ld(R8,  offset, src);   offset += 8;
  ld(R9,  offset, src);   offset += 8;
  ld(R10, offset, src);   offset += 8;
  ld(R11, offset, src);   offset += 8;
  ld(R12, offset, src);   offset += 8;

  lfd(F0, offset, src);   offset += 8;
  lfd(F1, offset, src);   offset += 8;
  lfd(F2, offset, src);   offset += 8;
  lfd(F3, offset, src);   offset += 8;
  lfd(F4, offset, src);   offset += 8;
  lfd(F5, offset, src);   offset += 8;
  lfd(F6, offset, src);   offset += 8;
  lfd(F7, offset, src);   offset += 8;
  lfd(F8, offset, src);   offset += 8;
  lfd(F9, offset, src);   offset += 8;
  lfd(F10, offset, src);  offset += 8;
  lfd(F11, offset, src);  offset += 8;
  lfd(F12, offset, src);  offset += 8;
  lfd(F13, offset, src);
}

void MacroAssembler::save_LR_CR(Register tmp) {
  mfcr(tmp);
  std(tmp, _abi(cr), R1_SP);
  mflr(tmp);
  std(tmp, _abi(lr), R1_SP);
  // Tmp must contain lr on exit! (see return_addr and prolog in ppc64.ad)
}

void MacroAssembler::restore_LR_CR(Register tmp) {
  assert(tmp != R1_SP, "must be distinct");
  ld(tmp, _abi(lr), R1_SP);
  mtlr(tmp);
  ld(tmp, _abi(cr), R1_SP);
  mtcr(tmp);
}

address MacroAssembler::get_PC_trash_LR(Register result) {
  Label L;
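  // Branch-and-link to the immediately following instruction so LR receives the current pc,
  // then read it back via mflr (clobbers LR).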
  bl(L);
  bind(L);
  address lr_pc = pc();
  mflr(result);
  return lr_pc;
}

void MacroAssembler::resize_frame(Register offset, Register tmp) {
#ifdef ASSERT
  assert_different_registers(offset, tmp, R1_SP);
  andi_(tmp, offset, frame::alignment_in_bytes-1);
  asm_assert_eq("resize_frame: unaligned", 0x204);
#endif

  // tmp <- *(SP)
  ld(tmp, _abi(callers_sp), R1_SP);
  // addr <- SP + offset;
  // *(addr) <- tmp;
  // SP <- addr
  stdux(tmp, R1_SP, offset);
}

void MacroAssembler::resize_frame(int offset, Register tmp) {
  assert(is_simm(offset, 16), "too big an offset");
  assert_different_registers(tmp, R1_SP);
  assert((offset & (frame::alignment_in_bytes-1))==0, "resize_frame: unaligned");
  // tmp <- *(SP)
  ld(tmp, _abi(callers_sp), R1_SP);
  // addr <- SP + offset;
  // *(addr) <- tmp;
  // SP <- addr
  stdu(tmp, offset, R1_SP);
}

void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) {
  // (addr == tmp1) || (addr == tmp2) is allowed here!
  assert(tmp1 != tmp2, "must be distinct");

  // compute offset w.r.t. current stack pointer
  // tmp_1 <- addr - SP (!)
  subf(tmp1, R1_SP, addr);

  // atomically update SP keeping back link.
  resize_frame(tmp1/* offset */, tmp2/* tmp */);
}

void MacroAssembler::push_frame(Register bytes, Register tmp) {
#ifdef ASSERT
  assert(bytes != R0, "r0 not allowed here");
  andi_(R0, bytes, frame::alignment_in_bytes-1);
  asm_assert_eq("push_frame(Reg, Reg): unaligned", 0x203);
#endif
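  // Negate the size, then let stdux store the caller's SP at the new SP while
  // atomically updating SP (preserves the back link).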
  neg(tmp, bytes);
  stdux(R1_SP, R1_SP, tmp);
}

// Push a frame of size `bytes'.
void MacroAssembler::push_frame(unsigned int bytes, Register tmp) {
  long offset = align_addr(bytes, frame::alignment_in_bytes);
  if (is_simm(-offset, 16)) {
    stdu(R1_SP, -offset, R1_SP);
  } else {
    load_const_optimized(tmp, -offset);
    stdux(R1_SP, R1_SP, tmp);
  }
}

// Push a frame of size `bytes' plus abi_reg_args on top.
void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) {
  push_frame(bytes + frame::abi_reg_args_size, tmp);
}

// Set up a new C frame with a spill area for non-volatile GPRs and
// additional space for local variables.
void MacroAssembler::push_frame_reg_args_nonvolatiles(unsigned int bytes,
                                                      Register tmp) {
  push_frame(bytes + frame::abi_reg_args_size + frame::spill_nonvolatiles_size, tmp);
}

// Pop current C frame.
void MacroAssembler::pop_frame() {
  ld(R1_SP, _abi(callers_sp), R1_SP);
}

#if defined(ABI_ELFv2)
address MacroAssembler::branch_to(Register r_function_entry, bool and_link) {
  // TODO(asmundak): make sure the caller uses R12 as function descriptor
  // most of the time.
  if (R12 != r_function_entry) {
    mr(R12, r_function_entry);
  }
  mtctr(R12);
  // Do a call or a branch.
  if (and_link) {
    bctrl();
  } else {
    bctr();
  }
  _last_calls_return_pc = pc();

  return _last_calls_return_pc;
}

// Call a C function via a function descriptor and use full C
// calling conventions. Updates and returns _last_calls_return_pc.
address MacroAssembler::call_c(Register r_function_entry) {
  return branch_to(r_function_entry, /*and_link=*/true);
}

// For tail calls: only branch, don't link, so callee returns to caller of this function.
address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) {
  return branch_to(r_function_entry, /*and_link=*/false);
}

address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) {
  load_const(R12, function_entry, R0);
  return branch_to(R12, /*and_link=*/true);
}

#else
// Generic version of a call to C function via a function descriptor
// with variable support for C calling conventions (TOC, ENV, etc.).
// Updates and returns _last_calls_return_pc.
address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call,
                                  bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) {
  // we emit standard ptrgl glue code here
  assert((function_descriptor != R0), "function_descriptor cannot be R0");

  // retrieve necessary entries from the function descriptor
  ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor);
  mtctr(R0);

  if (load_toc_of_callee) {
    ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor);
  }
  if (load_env_of_callee) {
    ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor);
  } else if (load_toc_of_callee) {
    li(R11, 0);
  }

  // do a call or a branch
  if (and_link) {
    bctrl();
  } else {
    bctr();
  }
  _last_calls_return_pc = pc();

  return _last_calls_return_pc;
}

// Call a C function via a function descriptor and use full C calling
// conventions.
// We don't use the TOC in generated code, so there is no need to save
// and restore its value.
address MacroAssembler::call_c(Register fd) {
  return branch_to(fd, /*and_link=*/true,
                       /*save toc=*/false,
                       /*restore toc=*/false,
                       /*load toc=*/true,
                       /*load env=*/true);
}

address MacroAssembler::call_c_and_return_to_caller(Register fd) {
  return branch_to(fd, /*and_link=*/false,
                       /*save toc=*/false,
                       /*restore toc=*/false,
                       /*load toc=*/true,
                       /*load env=*/true);
}

address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) {
  if (rt != relocInfo::none) {
    // this call needs to be relocatable
    if (!ReoptimizeCallSequences
        || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
        || fd == NULL   // support code-size estimation
        || !fd->is_friend_function()
        || fd->entry() == NULL) {
      // it's not a friend function as defined by class FunctionDescriptor,
      // so do a full call-c here.
      load_const(R11, (address)fd, R0);

      bool has_env = (fd != NULL && fd->env() != NULL);
      return branch_to(R11, /*and_link=*/true,
                            /*save toc=*/false,
                            /*restore toc=*/false,
                            /*load toc=*/true,
                            /*load env=*/has_env);
    } else {
      // It's a friend function. Load the entry point and don't care about
      // toc and env. Use an optimizable call instruction, but ensure the
      // same code-size as in the case of a non-friend function.
      nop();
      nop();
      nop();
      bl64_patchable(fd->entry(), rt);
      _last_calls_return_pc = pc();
      return _last_calls_return_pc;
    }
  } else {
    // This call does not need to be relocatable, do more aggressive
    // optimizations.
    if (!ReoptimizeCallSequences
      || !fd->is_friend_function()) {
      // It's not a friend function as defined by class FunctionDescriptor,
      // so do a full call-c here.
      load_const(R11, (address)fd, R0);
      return branch_to(R11, /*and_link=*/true,
                            /*save toc=*/false,
                            /*restore toc=*/false,
                            /*load toc=*/true,
                            /*load env=*/true);
    } else {
      // it's a friend function, load the entry point and don't care about
      // toc and env.
      address dest = fd->entry();
      if (is_within_range_of_b(dest, pc())) {
        bl(dest);
      } else {
        bl64_patchable(dest, rt);
      }
      _last_calls_return_pc = pc();
      return _last_calls_return_pc;
    }
  }
}

// Call a C function.  All constants needed reside in TOC.
//
// Read the address to call from the TOC.
// Read env from TOC, if fd specifies an env.
// Read new TOC from TOC.
address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd,
                                         relocInfo::relocType rt, Register toc) {
  if (!ReoptimizeCallSequences
    || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
    || !fd->is_friend_function()) {
    // It's not a friend function as defined by class FunctionDescriptor,
    // so do a full call-c here.
    assert(fd->entry() != NULL, "function must be linked");

    AddressLiteral fd_entry(fd->entry());
    bool success = load_const_from_method_toc(R11, fd_entry, toc, /*fixed_size*/ true);
    mtctr(R11);
    if (fd->env() == NULL) {
      li(R11, 0);
      nop();
    } else {
      AddressLiteral fd_env(fd->env());
      success = success && load_const_from_method_toc(R11, fd_env, toc, /*fixed_size*/ true);
    }
    AddressLiteral fd_toc(fd->toc());
    // Set R2_TOC (load from toc)
    success = success && load_const_from_method_toc(R2_TOC, fd_toc, toc, /*fixed_size*/ true);
    bctrl();
    _last_calls_return_pc = pc();
    if (!success) { return NULL; }
  } else {
    // It's a friend function, load the entry point and don't care about
    // toc and env. Use an optimizable call instruction, but ensure the
    // same code-size as in the case of a non-friend function.
    nop();
    bl64_patchable(fd->entry(), rt);
    _last_calls_return_pc = pc();
  }
  return _last_calls_return_pc;
}
#endif // ABI_ELFv2

void MacroAssembler::call_VM_base(Register oop_result,
                                  Register last_java_sp,
                                  address  entry_point,
                                  bool     check_exceptions) {
  BLOCK_COMMENT("call_VM {");
  // Determine last_java_sp register.
  if (!last_java_sp->is_valid()) {
    last_java_sp = R1_SP;
  }
  set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1);

  // ARG1 must hold thread address.
  mr(R3_ARG1, R16_thread);
#if defined(ABI_ELFv2)
  address return_pc = call_c(entry_point, relocInfo::none);
#else
  address return_pc = call_c((FunctionDescriptor*)entry_point, relocInfo::none);
#endif

  reset_last_Java_frame();

  // Check for pending exceptions.
  if (check_exceptions) {
    // We don't check for exceptions here.
    ShouldNotReachHere();
  }

  // Get oop result if there is one and reset the value in the thread.
  if (oop_result->is_valid()) {
    get_vm_result(oop_result);
  }

  _last_calls_return_pc = return_pc;
  BLOCK_COMMENT("} call_VM");
}

void MacroAssembler::call_VM_leaf_base(address entry_point) {
  BLOCK_COMMENT("call_VM_leaf {");
#if defined(ABI_ELFv2)
  call_c(entry_point, relocInfo::none);
#else
  call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, entry_point), relocInfo::none);
#endif
  BLOCK_COMMENT("} call_VM_leaf");
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions) {
  call_VM_base(oop_result, noreg, entry_point, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread.
  mr_if_needed(R4_ARG2, arg_1);
  call_VM(oop_result, entry_point, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread
  mr_if_needed(R4_ARG2, arg_1);
  assert(arg_2 != R4_ARG2, "smashed argument");
  mr_if_needed(R5_ARG3, arg_2);
  call_VM(oop_result, entry_point, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread
  mr_if_needed(R4_ARG2, arg_1);
  assert(arg_2 != R4_ARG2, "smashed argument");
  mr_if_needed(R5_ARG3, arg_2);
  mr_if_needed(R6_ARG4, arg_3);
  call_VM(oop_result, entry_point, check_exceptions);
}

void MacroAssembler::call_VM_leaf(address entry_point) {
  call_VM_leaf_base(entry_point);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) {
  mr_if_needed(R3_ARG1, arg_1);
  call_VM_leaf(entry_point);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) {
  mr_if_needed(R3_ARG1, arg_1);
  assert(arg_2 != R3_ARG1, "smashed argument");
  mr_if_needed(R4_ARG2, arg_2);
  call_VM_leaf(entry_point);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) {
  mr_if_needed(R3_ARG1, arg_1);
  assert(arg_2 != R3_ARG1, "smashed argument");
  mr_if_needed(R4_ARG2, arg_2);
  assert(arg_3 != R3_ARG1 && arg_3 != R4_ARG2, "smashed argument");
  mr_if_needed(R5_ARG3, arg_3);
  call_VM_leaf(entry_point);
}

// Check whether instruction is a read access to the polling page
// which was emitted by load_from_polling_page(..).
bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext,
                                               address* polling_address_ptr) {
  if (!is_ld(instruction))
    return false; // It's not a ld. Fail.

  int rt = inv_rt_field(instruction);
  int ra = inv_ra_field(instruction);
  int ds = inv_ds_field(instruction);
  if (!(ds == 0 && ra != 0 && rt == 0)) {
    return false; // It's not a ld(r0, X, ra). Fail.
  }

  if (!ucontext) {
    // Set polling address.
    if (polling_address_ptr != NULL) {
      *polling_address_ptr = NULL;
    }
    return true; // No ucontext given. Can't check value of ra. Assume true.
  }

#ifdef LINUX
  // Ucontext given. Check that register ra contains the address of
  // the safepoint polling page.
  ucontext_t* uc = (ucontext_t*) ucontext;
  // Set polling address.
  address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds;
  if (polling_address_ptr != NULL) {
    *polling_address_ptr = addr;
  }
  return os::is_poll_address(addr);
#else
  // Not on Linux, ucontext must be NULL.
  ShouldNotReachHere();
  return false;
#endif
}

bool MacroAssembler::is_memory_serialization(int instruction, JavaThread* thread, void* ucontext) {
#ifdef LINUX
  ucontext_t* uc = (ucontext_t*) ucontext;

  if (is_stwx(instruction) || is_stwux(instruction)) {
    int ra = inv_ra_field(instruction);
    int rb = inv_rb_field(instruction);

    // look up content of ra and rb in ucontext
    address ra_val=(address)uc->uc_mcontext.regs->gpr[ra];
    long rb_val=(long)uc->uc_mcontext.regs->gpr[rb];
    return os::is_memory_serialize_page(thread, ra_val+rb_val);
  } else if (is_stw(instruction) || is_stwu(instruction)) {
    int ra = inv_ra_field(instruction);
    int d1 = inv_d1_field(instruction);

    // look up content of ra in ucontext
    address ra_val=(address)uc->uc_mcontext.regs->gpr[ra];
    return os::is_memory_serialize_page(thread, ra_val+d1);
  } else {
    return false;
  }
#else
  // workaround not needed on !LINUX :-)
  ShouldNotCallThis();
  return false;
#endif
}

void MacroAssembler::bang_stack_with_offset(int offset) {
  // When increasing the stack, the old stack pointer will be written
  // to the new top of stack according to the PPC64 abi.
  // Therefore, stack banging is not necessary when increasing
  // the stack by <= os::vm_page_size() bytes.
  // When increasing the stack by a larger amount, this method is
  // called repeatedly to bang the intermediate pages.

  // Stack grows down, caller passes positive offset.
  assert(offset > 0, "must bang with positive offset");

  long stdoffset = -offset;

  if (is_simm(stdoffset, 16)) {
    // Signed 16 bit offset, a simple std is ok.
    if (UseLoadInstructionsForStackBangingPPC64) {
      ld(R0, (int)(signed short)stdoffset, R1_SP);
    } else {
      std(R0,(int)(signed short)stdoffset, R1_SP);
    }
  } else if (is_simm(stdoffset, 31)) {
    const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset);
    const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset);

    Register tmp = R11;
    addis(tmp, R1_SP, hi);
    if (UseLoadInstructionsForStackBangingPPC64) {
      ld(R0,  lo, tmp);
    } else {
      std(R0, lo, tmp);
    }
  } else {
    ShouldNotReachHere();
  }
}

// If instruction is a stack bang of the form
//    std    R0,    x(Ry),       (see bang_stack_with_offset())
//    stdu   R1_SP, x(R1_SP),    (see push_frame(), resize_frame())
// or stdux  R1_SP, Rx, R1_SP    (see push_frame(), resize_frame())
// return the banged address. Otherwise, return 0.
address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) {
#ifdef LINUX
  ucontext_t* uc = (ucontext_t*) ucontext;
  int rs = inv_rs_field(instruction);
  int ra = inv_ra_field(instruction);
  if (   (is_ld(instruction)   && rs == 0 &&  UseLoadInstructionsForStackBangingPPC64)
      || (is_std(instruction)  && rs == 0 && !UseLoadInstructionsForStackBangingPPC64)
      || (is_stdu(instruction) && rs == 1)) {
    int ds = inv_ds_field(instruction);
    // return banged address
    return ds+(address)uc->uc_mcontext.regs->gpr[ra];
  } else if (is_stdux(instruction) && rs == 1) {
    int rb = inv_rb_field(instruction);
    address sp = (address)uc->uc_mcontext.regs->gpr[1];
    long rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
    return ra != 1 || rb_val >= 0 ? NULL         // not a stack bang
                                  : sp + rb_val; // banged address
  }
  return NULL; // not a stack bang
#else
  // workaround not needed on !LINUX :-)
  ShouldNotCallThis();
  return NULL;
#endif
}

void MacroAssembler::reserved_stack_check(Register return_pc) {
  // Test if reserved zone needs to be enabled.
  Label no_reserved_zone_enabling;

  ld_ptr(R0, JavaThread::reserved_stack_activation_offset(), R16_thread);
  cmpld(CCR0, R1_SP, R0);
  blt_predict_taken(CCR0, no_reserved_zone_enabling);

  // Enable reserved zone again, throw stack overflow exception.
  push_frame_reg_args(0, R0);
  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), R16_thread);
  pop_frame();
  mtlr(return_pc);
  load_const_optimized(R0, StubRoutines::throw_delayed_StackOverflowError_entry());
  mtctr(R0);
  bctr();

  should_not_reach_here();

  bind(no_reserved_zone_enabling);
}

void MacroAssembler::getandsetd(Register dest_current_value, Register exchange_value, Register addr_base,
                                bool cmpxchgx_hint) {
  Label retry;
  bind(retry);
1429   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1430   stdcx_(exchange_value, addr_base);
1431   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1432     bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1433   } else {
1434     bne(                  CCR0, retry); // StXcx_ sets CCR0.
1435   }
1436 }
1437 
1438 void MacroAssembler::getandaddd(Register dest_current_value, Register inc_value, Register addr_base,
1439                                 Register tmp, bool cmpxchgx_hint) {
1440   Label retry;
1441   bind(retry);
1442   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1443   add(tmp, dest_current_value, inc_value);
1444   stdcx_(tmp, addr_base);
1445   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1446     bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1447   } else {
1448     bne(                  CCR0, retry); // StXcx_ sets CCR0.
1449   }
1450 }
1451 
1452 // Word/sub-word atomic helper functions
1453 
1454 // Temps and addr_base are killed if size < 4 and processor does not support respective instructions.
1455 // Only signed types are supported with size < 4.
1456 // Atomic add always kills tmp1.
1457 void MacroAssembler::atomic_get_and_modify_generic(Register dest_current_value, Register exchange_value,
1458                                                    Register addr_base, Register tmp1, Register tmp2, Register tmp3,
1459                                                    bool cmpxchgx_hint, bool is_add, int size) {
1460   // Sub-word instructions are available since Power 8.
1461   // For older processors, instruction_type != size holds, and we
1462   // emulate the sub-word instructions by constructing a 4-byte value
1463   // that leaves the other bytes unchanged.
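       // E.g., a 1-byte operation on a pre-Power-8 processor is carried out as
       // lwarx/stwcx_ on the aligned 4-byte word containing that byte, with the
       // bytes outside the operand merged back unchanged (see the xor below).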
1464   const int instruction_type = VM_Version::has_lqarx() ? size : 4;
1465 
1466   Label retry;
1467   Register shift_amount = noreg,
1468            val32 = dest_current_value,
1469            modval = is_add ? tmp1 : exchange_value;
1470 
1471   if (instruction_type != size) {
1472     assert_different_registers(tmp1, tmp2, tmp3, dest_current_value, exchange_value, addr_base);
1473     modval = tmp1;
1474     shift_amount = tmp2;
1475     val32 = tmp3;
1476     // Need some preparation: compute shift amount, align address. Note: shorts must be 2-byte aligned.
1477 #ifdef VM_LITTLE_ENDIAN
1478     rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
1479     clrrdi(addr_base, addr_base, 2);
1480 #else
1481     xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
1482     clrrdi(addr_base, addr_base, 2);
1483     rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
1484 #endif
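         // Worked example (illustrative): for a byte whose address has
         // (addr & 3) == 1, shift_amount becomes 8 on little-endian (the byte
         // occupies bits 8..15 of the loaded word) and 16 on big-endian.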
1485   }
1486 
1487   // atomic emulation loop
1488   bind(retry);
1489 
1490   switch (instruction_type) {
1491     case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
1492     case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
1493     case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
1494     default: ShouldNotReachHere();
1495   }
1496 
1497   if (instruction_type != size) {
1498     srw(dest_current_value, val32, shift_amount);
1499   }
1500 
1501   if (is_add) { add(modval, dest_current_value, exchange_value); }
1502 
1503   if (instruction_type != size) {
1504     // Transform exchange value such that the replacement can be done by one xor instruction.
1505     xorr(modval, dest_current_value, is_add ? modval : exchange_value);
1506     clrldi(modval, modval, (size == 1) ? 56 : 48);
1507     slw(modval, modval, shift_amount);
1508     xorr(modval, val32, modval);
1509   }
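       // The merge above relies on the identity
       //   val32 ^ (((old ^ new) & field_mask) << shift) == val32 with only the
       // addressed sub-word replaced by new: xor-ing a field with (old ^ new)
       // turns old into new, and all other bit positions are xor-ed with zero.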
1510 
1511   switch (instruction_type) {
1512     case 4: stwcx_(modval, addr_base); break;
1513     case 2: sthcx_(modval, addr_base); break;
1514     case 1: stbcx_(modval, addr_base); break;
1515     default: ShouldNotReachHere();
1516   }
1517 
1518   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1519     bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1520   } else {
1521     bne(                  CCR0, retry); // StXcx_ sets CCR0.
1522   }
1523 
1524   // l?arx zero-extends, but Java wants byte/short values sign-extended.
1525   if (size == 1) {
1526     extsb(dest_current_value, dest_current_value);
1527   } else if (size == 2) {
1528     extsh(dest_current_value, dest_current_value);
1529   }
1530 }
1531 
1532 // Temps, addr_base and exchange_value are killed if size < 4 and processor does not support respective instructions.
1533 // Only signed types are supported with size < 4.
1534 void MacroAssembler::cmpxchg_loop_body(ConditionRegister flag, Register dest_current_value,
1535                                        Register compare_value, Register exchange_value,
1536                                        Register addr_base, Register tmp1, Register tmp2,
1537                                        Label &retry, Label &failed, bool cmpxchgx_hint, int size) {
1538   // Sub-word instructions are available since Power 8.
1539   // For older processors, instruction_type != size holds, and we
1540   // emulate the sub-word instructions by constructing a 4-byte value
1541   // that leaves the other bytes unchanged.
1542   const int instruction_type = VM_Version::has_lqarx() ? size : 4;
1543 
1544   Register shift_amount = noreg,
1545            val32 = dest_current_value,
1546            modval = exchange_value;
1547 
1548   if (instruction_type != size) {
1549     assert_different_registers(tmp1, tmp2, dest_current_value, compare_value, exchange_value, addr_base);
1550     shift_amount = tmp1;
1551     val32 = tmp2;
1552     modval = tmp2;
1553     // Need some preparation: compute shift amount, align address. Note: shorts must be 2-byte aligned.
1554 #ifdef VM_LITTLE_ENDIAN
1555     rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
1556     clrrdi(addr_base, addr_base, 2);
1557 #else
1558     xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
1559     clrrdi(addr_base, addr_base, 2);
1560     rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
1561 #endif
1562     // Transform exchange value such that the replacement can be done by one xor instruction.
1563     xorr(exchange_value, compare_value, exchange_value);
1564     clrldi(exchange_value, exchange_value, (size == 1) ? 56 : 48);
1565     slw(exchange_value, exchange_value, shift_amount);
1566   }
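       // exchange_value now holds ((compare_value ^ exchange_value) & field_mask)
       // shifted into position, so a single xor with the loaded word (done after
       // the compare succeeds below) swaps the new sub-word in exactly.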
1567 
1568   // atomic emulation loop
1569   bind(retry);
1570 
1571   switch (instruction_type) {
1572     case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
1573     case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
1574     case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
1575     default: ShouldNotReachHere();
1576   }
1577 
1578   if (instruction_type != size) {
1579     srw(dest_current_value, val32, shift_amount);
1580   }
1581   if (size == 1) {
1582     extsb(dest_current_value, dest_current_value);
1583   } else if (size == 2) {
1584     extsh(dest_current_value, dest_current_value);
1585   }
1586 
1587   cmpw(flag, dest_current_value, compare_value);
1588   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1589     bne_predict_not_taken(flag, failed);
1590   } else {
1591     bne(                  flag, failed);
1592   }
1593   // branch to failed => (flag == ne), (dest_current_value != compare_value)
1594   // fall through    => (flag == eq), (dest_current_value == compare_value)
1595 
1596   if (instruction_type != size) {
1597     xorr(modval, val32, exchange_value);
1598   }
1599 
1600   switch (instruction_type) {
1601     case 4: stwcx_(modval, addr_base); break;
1602     case 2: sthcx_(modval, addr_base); break;
1603     case 1: stbcx_(modval, addr_base); break;
1604     default: ShouldNotReachHere();
1605   }
1606 }
1607 
1608 // CmpxchgX sets condition register to cmpX(current, compare).
1609 void MacroAssembler::cmpxchg_generic(ConditionRegister flag, Register dest_current_value,
1610                                      Register compare_value, Register exchange_value,
1611                                      Register addr_base, Register tmp1, Register tmp2,
1612                                      int semantics, bool cmpxchgx_hint,
1613                                      Register int_flag_success, bool contention_hint, bool weak, int size) {
1614   Label retry;
1615   Label failed;
1616   Label done;
1617 
1618   // Save one branch if result is returned via register and
1619   // result register is different from the other ones.
1620   bool use_result_reg    = (int_flag_success != noreg);
1621   bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value &&
1622                             int_flag_success != exchange_value && int_flag_success != addr_base &&
1623                             int_flag_success != tmp1 && int_flag_success != tmp2);
1624   assert(!weak || flag == CCR0, "weak only supported with CCR0");
1625   assert(size == 1 || size == 2 || size == 4, "unsupported");
1626 
1627   if (use_result_reg && preset_result_reg) {
1628     li(int_flag_success, 0); // preset (assume cas failed)
1629   }
1630 
1631   // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1632   if (contention_hint) { // Don't try to reserve if cmp fails.
1633     switch (size) {
1634       case 1: lbz(dest_current_value, 0, addr_base); extsb(dest_current_value, dest_current_value); break;
1635       case 2: lha(dest_current_value, 0, addr_base); break;
1636       case 4: lwz(dest_current_value, 0, addr_base); break;
1637       default: ShouldNotReachHere();
1638     }
1639     cmpw(flag, dest_current_value, compare_value);
1640     bne(flag, failed);
1641   }
1642 
1643   // release/fence semantics
1644   if (semantics & MemBarRel) {
1645     release();
1646   }
1647 
1648   cmpxchg_loop_body(flag, dest_current_value, compare_value, exchange_value, addr_base, tmp1, tmp2,
1649                     retry, failed, cmpxchgx_hint, size);
1650   if (!weak || use_result_reg) {
1651     if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1652       bne_predict_not_taken(CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
1653     } else {
1654       bne(                  CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
1655     }
1656   }
1657   // fall through    => (flag == eq), (dest_current_value == compare_value), (swapped)
1658 
1659   // Result in register (must do this at the end because int_flag_success can be the
1660   // same register as one above).
1661   if (use_result_reg) {
1662     li(int_flag_success, 1);
1663   }
1664 
1665   if (semantics & MemBarFenceAfter) {
1666     fence();
1667   } else if (semantics & MemBarAcq) {
1668     isync();
1669   }
1670 
1671   if (use_result_reg && !preset_result_reg) {
1672     b(done);
1673   }
1674 
1675   bind(failed);
1676   if (use_result_reg && !preset_result_reg) {
1677     li(int_flag_success, 0);
1678   }
1679 
1680   bind(done);
1681   // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1682   // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1683 }
1684 
1685 // Performs an atomic compare-exchange:
1686 //   if (compare_value == *addr_base)
1687 //     *addr_base = exchange_value
1688 //     int_flag_success = 1;
1689 //   else
1690 //     int_flag_success = 0;
1691 //
1692 // ConditionRegister flag       = cmp(compare_value, *addr_base)
1693 // Register dest_current_value  = *addr_base
1694 // Register compare_value       Used to compare with value in memory
1695 // Register exchange_value      Written to memory if compare_value == *addr_base
1696 // Register addr_base           The memory location to compareXChange
1697 // Register int_flag_success    Set to 1 if exchange_value was written to *addr_base
1698 //
1699 // To avoid the costly compare-exchange, the value can be tested beforehand.
1700 // Several special cases exist to avoid generating unnecessary code.
1701 //
1702 void MacroAssembler::cmpxchgd(ConditionRegister flag,
1703                               Register dest_current_value, RegisterOrConstant compare_value, Register exchange_value,
1704                               Register addr_base, int semantics, bool cmpxchgx_hint,
1705                               Register int_flag_success, Label* failed_ext, bool contention_hint, bool weak) {
1706   Label retry;
1707   Label failed_int;
1708   Label& failed = (failed_ext != NULL) ? *failed_ext : failed_int;
1709   Label done;
1710 
1711   // Save one branch if result is returned via register and result register is different from the other ones.
1712   bool use_result_reg    = (int_flag_success!=noreg);
1713   bool preset_result_reg = (int_flag_success!=dest_current_value && int_flag_success!=compare_value.register_or_noreg() &&
1714                             int_flag_success!=exchange_value && int_flag_success!=addr_base);
1715   assert(!weak || flag == CCR0, "weak only supported with CCR0");
1716   assert(int_flag_success == noreg || failed_ext == NULL, "cannot have both");
1717 
1718   if (use_result_reg && preset_result_reg) {
1719     li(int_flag_success, 0); // preset (assume cas failed)
1720   }
1721 
1722   // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1723   if (contention_hint) { // Don't try to reserve if cmp fails.
1724     ld(dest_current_value, 0, addr_base);
1725     cmpd(flag, compare_value, dest_current_value);
1726     bne(flag, failed);
1727   }
1728 
1729   // release/fence semantics
1730   if (semantics & MemBarRel) {
1731     release();
1732   }
1733 
1734   // atomic emulation loop
1735   bind(retry);
1736 
1737   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1738   cmpd(flag, compare_value, dest_current_value);
1739   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1740     bne_predict_not_taken(flag, failed);
1741   } else {
1742     bne(                  flag, failed);
1743   }
1744 
1745   stdcx_(exchange_value, addr_base);
1746   if (!weak || use_result_reg || failed_ext) {
1747     if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1748       bne_predict_not_taken(CCR0, weak ? failed : retry); // stXcx_ sets CCR0
1749     } else {
1750       bne(                  CCR0, weak ? failed : retry); // stXcx_ sets CCR0
1751     }
1752   }
1753 
1754   // result in register (must do this at the end because int_flag_success can be the same register as one above)
1755   if (use_result_reg) {
1756     li(int_flag_success, 1);
1757   }
1758 
1759   if (semantics & MemBarFenceAfter) {
1760     fence();
1761   } else if (semantics & MemBarAcq) {
1762     isync();
1763   }
1764 
1765   if (use_result_reg && !preset_result_reg) {
1766     b(done);
1767   }
1768 
1769   bind(failed_int);
1770   if (use_result_reg && !preset_result_reg) {
1771     li(int_flag_success, 0);
1772   }
1773 
1774   bind(done);
1775   // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1776   // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1777 }
1778 
1779 // Look up the method for a megamorphic invokeinterface call.
1780 // The target method is determined by <intf_klass, itable_index>.
1781 // The receiver klass is in recv_klass.
1782 // On success, the result will be in method_result, and execution falls through.
1783 // On failure, execution transfers to the given label.
1784 void MacroAssembler::lookup_interface_method(Register recv_klass,
1785                                              Register intf_klass,
1786                                              RegisterOrConstant itable_index,
1787                                              Register method_result,
1788                                              Register scan_temp,
1789                                              Register sethi_temp,
1790                                              Label& L_no_such_interface) {
1791   assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);
1792   assert(itable_index.is_constant() || itable_index.as_register() == method_result,
1793          "caller must use same register for non-constant itable index as for method");
1794 
1795   // Compute start of first itableOffsetEntry (which is at the end of the vtable).
1796   int vtable_base = in_bytes(Klass::vtable_start_offset());
1797   int itentry_off = itableMethodEntry::method_offset_in_bytes();
1798   int logMEsize   = exact_log2(itableMethodEntry::size() * wordSize);
1799   int scan_step   = itableOffsetEntry::size() * wordSize;
1800   int log_vte_size= exact_log2(vtableEntry::size_in_bytes());
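       // Rough layout assumed here: the embedded vtable starts at
       // Klass::vtable_start_offset() and the itable follows it, i.e.
       //   [ Klass | vtable (vtable_length entries) | itableOffsetEntry ... | itableMethodEntry ... ]
       // so the first itableOffsetEntry sits at vtable_base + (length << log_vte_size).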
1801 
1802   lwz(scan_temp, in_bytes(Klass::vtable_length_offset()), recv_klass);
1803   // %%% We should store the aligned, prescaled offset in the klassoop.
1804   // Then the next several instructions would fold away.
1805 
1806   sldi(scan_temp, scan_temp, log_vte_size);
1807   addi(scan_temp, scan_temp, vtable_base);
1808   add(scan_temp, recv_klass, scan_temp);
1809 
1810   // Adjust recv_klass by scaled itable_index, so we can free itable_index.
1811   if (itable_index.is_register()) {
1812     Register itable_offset = itable_index.as_register();
1813     sldi(itable_offset, itable_offset, logMEsize);
1814     if (itentry_off) addi(itable_offset, itable_offset, itentry_off);
1815     add(recv_klass, itable_offset, recv_klass);
1816   } else {
1817     long itable_offset = (long)itable_index.as_constant();
1818     load_const_optimized(sethi_temp, (itable_offset<<logMEsize)+itentry_off); // static address, no relocation
1819     add(recv_klass, sethi_temp, recv_klass);
1820   }
1821 
1822   // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
1823   //   if (scan->interface() == intf) {
1824   //     result = (klass + scan->offset() + itable_index);
1825   //   }
1826   // }
1827   Label search, found_method;
1828 
1829   for (int peel = 1; peel >= 0; peel--) {
1830     // %%%% Could load both offset and interface in one ldx, if they were
1831     // in the opposite order. This would save a load.
1832     ld(method_result, itableOffsetEntry::interface_offset_in_bytes(), scan_temp);
1833 
1834     // Check that this entry is non-null. A null entry means that
1835     // the receiver class doesn't implement the interface, and wasn't the
1836     // same as when the caller was compiled.
1837     cmpd(CCR0, method_result, intf_klass);
1838 
1839     if (peel) {
1840       beq(CCR0, found_method);
1841     } else {
1842       bne(CCR0, search);
1843       // (invert the test to fall through to found_method...)
1844     }
1845 
1846     if (!peel) break;
1847 
1848     bind(search);
1849 
1850     cmpdi(CCR0, method_result, 0);
1851     beq(CCR0, L_no_such_interface);
1852     addi(scan_temp, scan_temp, scan_step);
1853   }
1854 
1855   bind(found_method);
1856 
1857   // Got a hit.
1858   int ito_offset = itableOffsetEntry::offset_offset_in_bytes();
1859   lwz(scan_temp, ito_offset, scan_temp);
1860   ldx(method_result, scan_temp, recv_klass);
1861 }
1862 
1863 // virtual method calling
1864 void MacroAssembler::lookup_virtual_method(Register recv_klass,
1865                                            RegisterOrConstant vtable_index,
1866                                            Register method_result) {
1867 
1868   assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg());
1869 
1870   const int base = in_bytes(Klass::vtable_start_offset());
1871   assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
1872 
1873   if (vtable_index.is_register()) {
1874     sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord);
1875     add(recv_klass, vtable_index.as_register(), recv_klass);
1876   } else {
1877     addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord);
1878   }
1879   ld(R19_method, base + vtableEntry::method_offset_in_bytes(), recv_klass);
1880 }
1881 
1882 /////////////////////////////////////////// subtype checking ////////////////////////////////////////////
1883 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
1884                                                    Register super_klass,
1885                                                    Register temp1_reg,
1886                                                    Register temp2_reg,
1887                                                    Label* L_success,
1888                                                    Label* L_failure,
1889                                                    Label* L_slow_path,
1890                                                    RegisterOrConstant super_check_offset) {
1891 
1892   const Register check_cache_offset = temp1_reg;
1893   const Register cached_super       = temp2_reg;
1894 
1895   assert_different_registers(sub_klass, super_klass, check_cache_offset, cached_super);
1896 
1897   int sco_offset = in_bytes(Klass::super_check_offset_offset());
1898   int sc_offset  = in_bytes(Klass::secondary_super_cache_offset());
1899 
1900   bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
1901   bool need_slow_path = (must_load_sco || super_check_offset.constant_or_zero() == sco_offset);
1902 
1903   Label L_fallthrough;
1904   int label_nulls = 0;
1905   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
1906   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
1907   if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
1908   assert(label_nulls <= 1 ||
1909          (L_slow_path == &L_fallthrough && label_nulls <= 2 && !need_slow_path),
1910          "at most one NULL in the batch, usually");
1911 
1912   // If the pointers are equal, we are done (e.g., String[] elements).
1913   // This self-check enables sharing of secondary supertype arrays among
1914   // non-primary types such as array-of-interface. Otherwise, each such
1915   // type would need its own customized SSA.
1916   // We move this check to the front of the fast path because many
1917   // type checks are in fact trivially successful in this manner,
1918   // so we get a nicely predicted branch right at the start of the check.
1919   cmpd(CCR0, sub_klass, super_klass);
1920   beq(CCR0, *L_success);
1921 
1922   // Check the supertype display:
1923   if (must_load_sco) {
1924     // The super check offset is always positive...
1925     lwz(check_cache_offset, sco_offset, super_klass);
1926     super_check_offset = RegisterOrConstant(check_cache_offset);
1927     // super_check_offset is register.
1928     assert_different_registers(sub_klass, super_klass, cached_super, super_check_offset.as_register());
1929   }
1930   // The loaded value is the offset from KlassOopDesc.
1931 
1932   ld(cached_super, super_check_offset, sub_klass);
1933   cmpd(CCR0, cached_super, super_klass);
1934 
1935   // This check has worked decisively for primary supers.
1936   // Secondary supers are sought in the super_cache ('super_cache_addr').
1937   // (Secondary supers are interfaces and very deeply nested subtypes.)
1938   // This works in the same check above because of a tricky aliasing
1939   // between the super_cache and the primary super display elements.
1940   // (The 'super_check_addr' can address either, as the case requires.)
1941   // Note that the cache is updated below if it does not help us find
1942   // what we need immediately.
1943   // So if it was a primary super, we can just fail immediately.
1944   // Otherwise, it's the slow path for us (no success at this point).
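       // Summarized as a sketch of the branches below:
       //   equal                            -> L_success
       //   not equal, offset == sc_offset   -> L_slow_path (scan secondary supers)
       //   not equal, offset != sc_offset   -> L_failure   (primary super mismatch is decisive)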
1945 
1946 #define FINAL_JUMP(label) if (&(label) != &L_fallthrough) { b(label); }
1947 
1948   if (super_check_offset.is_register()) {
1949     beq(CCR0, *L_success);
1950     cmpwi(CCR0, super_check_offset.as_register(), sc_offset);
1951     if (L_failure == &L_fallthrough) {
1952       beq(CCR0, *L_slow_path);
1953     } else {
1954       bne(CCR0, *L_failure);
1955       FINAL_JUMP(*L_slow_path);
1956     }
1957   } else {
1958     if (super_check_offset.as_constant() == sc_offset) {
1959       // Need a slow path; fast failure is impossible.
1960       if (L_slow_path == &L_fallthrough) {
1961         beq(CCR0, *L_success);
1962       } else {
1963         bne(CCR0, *L_slow_path);
1964         FINAL_JUMP(*L_success);
1965       }
1966     } else {
1967       // No slow path; it's a fast decision.
1968       if (L_failure == &L_fallthrough) {
1969         beq(CCR0, *L_success);
1970       } else {
1971         bne(CCR0, *L_failure);
1972         FINAL_JUMP(*L_success);
1973       }
1974     }
1975   }
1976 
1977   bind(L_fallthrough);
1978 #undef FINAL_JUMP
1979 }
1980 
1981 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
1982                                                    Register super_klass,
1983                                                    Register temp1_reg,
1984                                                    Register temp2_reg,
1985                                                    Label* L_success,
1986                                                    Register result_reg) {
1987   const Register array_ptr = temp1_reg; // current value from cache array
1988   const Register temp      = temp2_reg;
1989 
1990   assert_different_registers(sub_klass, super_klass, array_ptr, temp);
1991 
1992   int source_offset = in_bytes(Klass::secondary_supers_offset());
1993   int target_offset = in_bytes(Klass::secondary_super_cache_offset());
1994 
1995   int length_offset = Array<Klass*>::length_offset_in_bytes();
1996   int base_offset   = Array<Klass*>::base_offset_in_bytes();
1997 
1998   Label hit, loop, failure, fallthru;
1999 
2000   ld(array_ptr, source_offset, sub_klass);
2001 
2002   // TODO: PPC port: assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated.");
2003   lwz(temp, length_offset, array_ptr);
2004   cmpwi(CCR0, temp, 0);
2005   beq(CCR0, result_reg!=noreg ? failure : fallthru); // length 0
2006 
2007   mtctr(temp); // load ctr
2008 
2009   bind(loop);
2010   // Oops in the table are no longer compressed.
2011   ld(temp, base_offset, array_ptr);
2012   cmpd(CCR0, temp, super_klass);
2013   beq(CCR0, hit);
2014   addi(array_ptr, array_ptr, BytesPerWord);
2015   bdnz(loop);
2016 
2017   bind(failure);
2018   if (result_reg!=noreg) li(result_reg, 1); // load non-zero result (indicates a miss)
2019   b(fallthru);
2020 
2021   bind(hit);
2022   std(super_klass, target_offset, sub_klass); // save result to cache
2023   if (result_reg != noreg) { li(result_reg, 0); } // load zero result (indicates a hit)
2024   if (L_success != NULL) { b(*L_success); }
2025   else if (result_reg == noreg) { blr(); } // return with CR0.eq if neither label nor result reg provided
2026 
2027   bind(fallthru);
2028 }
2029 
2030 // Try fast path, then go to slow one if not successful
2031 void MacroAssembler::check_klass_subtype(Register sub_klass,
2032                          Register super_klass,
2033                          Register temp1_reg,
2034                          Register temp2_reg,
2035                          Label& L_success) {
2036   Label L_failure;
2037   check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success, &L_failure);
2038   check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success);
2039   bind(L_failure); // Fallthru if not successful.
2040 }
2041 
2042 void MacroAssembler::check_method_handle_type(Register mtype_reg, Register mh_reg,
2043                                               Register temp_reg,
2044                                               Label& wrong_method_type) {
2045   assert_different_registers(mtype_reg, mh_reg, temp_reg);
2046   // Compare method type against that of the receiver.
2047   load_heap_oop_not_null(temp_reg, delayed_value(java_lang_invoke_MethodHandle::type_offset_in_bytes, temp_reg), mh_reg);
2048   cmpd(CCR0, temp_reg, mtype_reg);
2049   bne(CCR0, wrong_method_type);
2050 }
2051 
2052 RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot,
2053                                                    Register temp_reg,
2054                                                    int extra_slot_offset) {
2055   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
2056   int stackElementSize = Interpreter::stackElementSize;
2057   int offset = extra_slot_offset * stackElementSize;
2058   if (arg_slot.is_constant()) {
2059     offset += arg_slot.as_constant() * stackElementSize;
2060     return offset;
2061   } else {
2062     assert(temp_reg != noreg, "must specify");
2063     sldi(temp_reg, arg_slot.as_register(), exact_log2(stackElementSize));
2064     if (offset != 0)
2065       addi(temp_reg, temp_reg, offset);
2066     return temp_reg;
2067   }
2068 }
2069 
2070 // Supports temp2_reg = R0.
2071 void MacroAssembler::biased_locking_enter(ConditionRegister cr_reg, Register obj_reg,
2072                                           Register mark_reg, Register temp_reg,
2073                                           Register temp2_reg, Label& done, Label* slow_case) {
2074   assert(UseBiasedLocking, "why call this otherwise?");
2075 
2076 #ifdef ASSERT
2077   assert_different_registers(obj_reg, mark_reg, temp_reg, temp2_reg);
2078 #endif
2079 
2080   Label cas_label;
2081 
2082   // Branch to done if fast path fails and no slow_case provided.
2083   Label *slow_case_int = (slow_case != NULL) ? slow_case : &done;
2084 
2085   // Biased locking
2086   // See whether the lock is currently biased toward our thread and
2087   // whether the epoch is still valid
2088   // Note that the runtime guarantees sufficient alignment of JavaThread
2089   // pointers to allow age to be placed into low bits
2090   assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits,
2091          "biased locking makes assumptions about bit layout");
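       // Layout of a 64-bit biased mark word (cf. markOop.hpp), assumed below:
       //   [ JavaThread*:54 | epoch:2 | unused:1 | age:4 | biased_lock:1 | lock:2 ]
       // The three low bits (biased_lock | lock) are what gets masked and
       // compared against biased_lock_pattern next.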
2092 
2093   if (PrintBiasedLockingStatistics) {
2094     load_const(temp2_reg, (address) BiasedLocking::total_entry_count_addr(), temp_reg);
2095     lwzx(temp_reg, temp2_reg);
2096     addi(temp_reg, temp_reg, 1);
2097     stwx(temp_reg, temp2_reg);
2098   }
2099 
2100   andi(temp_reg, mark_reg, markOopDesc::biased_lock_mask_in_place);
2101   cmpwi(cr_reg, temp_reg, markOopDesc::biased_lock_pattern);
2102   bne(cr_reg, cas_label);
2103 
2104   load_klass(temp_reg, obj_reg);
2105 
2106   load_const_optimized(temp2_reg, ~((int) markOopDesc::age_mask_in_place));
2107   ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
2108   orr(temp_reg, R16_thread, temp_reg);
2109   xorr(temp_reg, mark_reg, temp_reg);
2110   andr(temp_reg, temp_reg, temp2_reg);
2111   cmpdi(cr_reg, temp_reg, 0);
2112   if (PrintBiasedLockingStatistics) {
2113     Label l;
2114     bne(cr_reg, l);
2115     load_const(temp2_reg, (address) BiasedLocking::biased_lock_entry_count_addr());
2116     lwzx(mark_reg, temp2_reg);
2117     addi(mark_reg, mark_reg, 1);
2118     stwx(mark_reg, temp2_reg);
2119     // restore mark_reg
2120     ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg);
2121     bind(l);
2122   }
2123   beq(cr_reg, done);
2124 
2125   Label try_revoke_bias;
2126   Label try_rebias;
2127 
2128   // At this point we know that the header has the bias pattern and
2129   // that we are not the bias owner in the current epoch. We need to
2130   // figure out more details about the state of the header in order to
2131   // know what operations can be legally performed on the object's
2132   // header.
2133 
2134   // If the low three bits in the xor result aren't clear, that means
2135   // the prototype header is no longer biased and we have to revoke
2136   // the bias on this object.
2137   andi(temp2_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
2138   cmpwi(cr_reg, temp2_reg, 0);
2139   bne(cr_reg, try_revoke_bias);
2140 
2141   // Biasing is still enabled for this data type. See whether the
2142   // epoch of the current bias is still valid, meaning that the epoch
2143   // bits of the mark word are equal to the epoch bits of the
2144   // prototype header. (Note that the prototype header's epoch bits
2145   // only change at a safepoint.) If not, attempt to rebias the object
2146   // toward the current thread. Note that we must be absolutely sure
2147   // that the current epoch is invalid in order to do this because
2148   // otherwise the manipulations it performs on the mark word are
2149   // illegal.
2150 
2151   int shift_amount = 64 - markOopDesc::epoch_shift;
2152   // rotate epoch bits to right (little) end and set other bits to 0
2153   // [ big part | epoch | little part ] -> [ 0..0 | epoch ]
2154   rldicl_(temp2_reg, temp_reg, shift_amount, 64 - markOopDesc::epoch_bits);
2155   // branch if epoch bits are != 0, i.e. they differ, because the epoch has been incremented
2156   bne(CCR0, try_rebias);
2157 
2158   // The epoch of the current bias is still valid but we know nothing
2159   // about the owner; it might be set or it might be clear. Try to
2160   // acquire the bias of the object using an atomic operation. If this
2161   // fails we will go in to the runtime to revoke the object's bias.
2162   // Note that we first construct the presumed unbiased header so we
2163   // don't accidentally blow away another thread's valid bias.
2164   andi(mark_reg, mark_reg, (markOopDesc::biased_lock_mask_in_place |
2165                                 markOopDesc::age_mask_in_place |
2166                                 markOopDesc::epoch_mask_in_place));
2167   orr(temp_reg, R16_thread, mark_reg);
2168 
2169   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2170 
2171   // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg).
2172   cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
2173            /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
2174            /*where=*/obj_reg,
2175            MacroAssembler::MemBarAcq,
2176            MacroAssembler::cmpxchgx_hint_acquire_lock(),
2177            noreg, slow_case_int); // bail out if failed
2178 
2179   // If the biasing toward our thread failed, this means that
2180   // another thread succeeded in biasing it toward itself and we
2181   // need to revoke that bias. The revocation will occur in the
2182   // interpreter runtime in the slow case.
2183   if (PrintBiasedLockingStatistics) {
2184     load_const(temp2_reg, (address) BiasedLocking::anonymously_biased_lock_entry_count_addr(), temp_reg);
2185     lwzx(temp_reg, temp2_reg);
2186     addi(temp_reg, temp_reg, 1);
2187     stwx(temp_reg, temp2_reg);
2188   }
2189   b(done);
2190 
2191   bind(try_rebias);
2192   // At this point we know the epoch has expired, meaning that the
2193   // current "bias owner", if any, is actually invalid. Under these
2194   // circumstances _only_, we are allowed to use the current header's
2195   // value as the comparison value when doing the cas to acquire the
2196   // bias in the current epoch. In other words, we allow transfer of
2197   // the bias from one thread to another directly in this situation.
2198   load_klass(temp_reg, obj_reg);
2199   andi(temp2_reg, mark_reg, markOopDesc::age_mask_in_place);
2200   orr(temp2_reg, R16_thread, temp2_reg);
2201   ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
2202   orr(temp_reg, temp2_reg, temp_reg);
2203 
2204   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2205 
2206   cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
2207                  /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
2208                  /*where=*/obj_reg,
2209                  MacroAssembler::MemBarAcq,
2210                  MacroAssembler::cmpxchgx_hint_acquire_lock(),
2211                  noreg, slow_case_int); // bail out if failed
2212 
2213   // If the biasing toward our thread failed, this means that
2214   // another thread succeeded in biasing it toward itself and we
2215   // need to revoke that bias. The revocation will occur in the
2216   // interpreter runtime in the slow case.
2217   if (PrintBiasedLockingStatistics) {
2218     load_const(temp2_reg, (address) BiasedLocking::rebiased_lock_entry_count_addr(), temp_reg);
2219     lwzx(temp_reg, temp2_reg);
2220     addi(temp_reg, temp_reg, 1);
2221     stwx(temp_reg, temp2_reg);
2222   }
2223   b(done);
2224 
2225   bind(try_revoke_bias);
2226   // The prototype mark in the klass doesn't have the bias bit set any
2227   // more, indicating that objects of this data type are not supposed
2228   // to be biased any more. We are going to try to reset the mark of
2229   // this object to the prototype value and fall through to the
2230   // CAS-based locking scheme. Note that if our CAS fails, it means
2231   // that another thread raced us for the privilege of revoking the
2232   // bias of this particular object, so it's okay to continue in the
2233   // normal locking code.
2234   load_klass(temp_reg, obj_reg);
2235   ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
2236   andi(temp2_reg, mark_reg, markOopDesc::age_mask_in_place);
2237   orr(temp_reg, temp_reg, temp2_reg);
2238 
2239   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2240 
2241   // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg).
2242   cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
2243                  /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
2244                  /*where=*/obj_reg,
2245                  MacroAssembler::MemBarAcq,
2246                  MacroAssembler::cmpxchgx_hint_acquire_lock());
2247 
2248   // reload markOop in mark_reg before continuing with lightweight locking
2249   ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg);
2250 
2251   // Fall through to the normal CAS-based lock, because no matter what
2252   // the result of the above CAS, some thread must have succeeded in
2253   // removing the bias bit from the object's header.
2254   if (PrintBiasedLockingStatistics) {
2255     Label l;
2256     bne(cr_reg, l);
2257     load_const(temp2_reg, (address) BiasedLocking::revoked_lock_entry_count_addr(), temp_reg);
2258     lwzx(temp_reg, temp2_reg);
2259     addi(temp_reg, temp_reg, 1);
2260     stwx(temp_reg, temp2_reg);
2261     bind(l);
2262   }
2263 
2264   bind(cas_label);
2265 }
2266 
2267 void MacroAssembler::biased_locking_exit (ConditionRegister cr_reg, Register mark_addr, Register temp_reg, Label& done) {
2268   // Check for biased locking unlock case, which is a no-op
2269   // Note: we do not have to check the thread ID for two reasons.
2270   // First, the interpreter checks for IllegalMonitorStateException at
2271   // a higher level. Second, if the bias was revoked while we held the
2272   // lock, the object could not be rebiased toward another thread, so
2273   // the bias bit would be clear.
2274 
2275   ld(temp_reg, 0, mark_addr);
2276   andi(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
2277 
2278   cmpwi(cr_reg, temp_reg, markOopDesc::biased_lock_pattern);
2279   beq(cr_reg, done);
2280 }
2281 
2282 // allocation (for C1)
2283 void MacroAssembler::eden_allocate(
2284   Register obj,                      // result: pointer to object after successful allocation
2285   Register var_size_in_bytes,        // object size in bytes if unknown at compile time; invalid otherwise
2286   int      con_size_in_bytes,        // object size in bytes if   known at compile time
2287   Register t1,                       // temp register
2288   Register t2,                       // temp register
2289   Label&   slow_case                 // continuation point if fast allocation fails
2290 ) {
2291   b(slow_case);
2292 }
2293 
2294 void MacroAssembler::tlab_allocate(
2295   Register obj,                      // result: pointer to object after successful allocation
2296   Register var_size_in_bytes,        // object size in bytes if unknown at compile time; invalid otherwise
2297   int      con_size_in_bytes,        // object size in bytes if   known at compile time
2298   Register t1,                       // temp register
2299   Label&   slow_case                 // continuation point if fast allocation fails
2300 ) {
2301   // make sure arguments make sense
2302   assert_different_registers(obj, var_size_in_bytes, t1);
2303   assert(0 <= con_size_in_bytes && is_simm13(con_size_in_bytes), "illegal object size");
2304   assert((con_size_in_bytes & MinObjAlignmentInBytesMask) == 0, "object size is not multiple of alignment");
2305 
2306   const Register new_top = t1;
2307   //verify_tlab(); not implemented
2308 
2309   ld(obj, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
2310   ld(R0, in_bytes(JavaThread::tlab_end_offset()), R16_thread);
2311   if (var_size_in_bytes == noreg) {
2312     addi(new_top, obj, con_size_in_bytes);
2313   } else {
2314     add(new_top, obj, var_size_in_bytes);
2315   }
2316   cmpld(CCR0, new_top, R0);
2317   bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_case);
2318 
2319 #ifdef ASSERT
2320   // make sure new free pointer is properly aligned
2321   {
2322     Label L;
2323     andi_(R0, new_top, MinObjAlignmentInBytesMask);
2324     beq(CCR0, L);
2325     stop("updated TLAB free is not properly aligned", 0x934);
2326     bind(L);
2327   }
2328 #endif // ASSERT
2329 
2330   // update the tlab top pointer
2331   std(new_top, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
2332   //verify_tlab(); not implemented
2333 }
2334 void MacroAssembler::tlab_refill(Label& retry_tlab, Label& try_eden, Label& slow_case) {
2335   unimplemented("tlab_refill");
2336 }
2337 void MacroAssembler::incr_allocated_bytes(RegisterOrConstant size_in_bytes, Register t1, Register t2) {
2338   unimplemented("incr_allocated_bytes");
2339 }
2340 
2341 address MacroAssembler::emit_trampoline_stub(int destination_toc_offset,
2342                                              int insts_call_instruction_offset, Register Rtoc) {
2343   // Start the stub.
2344   address stub = start_a_stub(64);
2345   if (stub == NULL) { return NULL; } // CodeCache full: bail out
2346 
2347   // Create a trampoline stub relocation which relates this trampoline stub
2348   // with the call instruction at insts_call_instruction_offset in the
2349   // instructions code-section.
2350   relocate(trampoline_stub_Relocation::spec(code()->insts()->start() + insts_call_instruction_offset));
2351   const int stub_start_offset = offset();
2352 
2353   // For java_to_interp stubs we use R11_scratch1 as scratch register
2354   // and in call trampoline stubs we use R12_scratch2. This way we
2355   // can distinguish them (see is_NativeCallTrampolineStub_at()).
2356   Register reg_scratch = R12_scratch2;
2357 
2358   // Now, create the trampoline stub's code:
2359   // - load the TOC
2360   // - load the call target from the constant pool
2361   // - call
2362   if (Rtoc == noreg) {
2363     calculate_address_from_global_toc(reg_scratch, method_toc());
2364     Rtoc = reg_scratch;
2365   }
2366 
2367   ld_largeoffset_unchecked(reg_scratch, destination_toc_offset, Rtoc, false);
2368   mtctr(reg_scratch);
2369   bctr();
2370 
2371   const address stub_start_addr = addr_at(stub_start_offset);
2372 
2373   // Assert that the encoded destination_toc_offset can be identified and that it is correct.
2374   assert(destination_toc_offset == NativeCallTrampolineStub_at(stub_start_addr)->destination_toc_offset(),
2375          "encoded offset into the constant pool must match");
2376   // Trampoline_stub_size should be good.
2377   assert((uint)(offset() - stub_start_offset) <= trampoline_stub_size, "should be good size");
2378   assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");
2379 
2380   // End the stub.
2381   end_a_stub();
2382   return stub;
2383 }
2384 
2385 // TM on PPC64.
2386 void MacroAssembler::atomic_inc_ptr(Register addr, Register result, int simm16) {
2387   Label retry;
2388   bind(retry);
2389   ldarx(result, addr, /*hint*/ false);
2390   addi(result, result, simm16);
2391   stdcx_(result, addr);
2392   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
2393     bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
2394   } else {
2395     bne(                  CCR0, retry); // stXcx_ sets CCR0
2396   }
2397 }
2398 
2399 void MacroAssembler::atomic_ori_int(Register addr, Register result, int uimm16) {
2400   Label retry;
2401   bind(retry);
2402   lwarx(result, addr, /*hint*/ false);
2403   ori(result, result, uimm16);
2404   stwcx_(result, addr);
2405   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
2406     bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
2407   } else {
2408     bne(                  CCR0, retry); // stXcx_ sets CCR0
2409   }
2410 }
2411 
2412 #if INCLUDE_RTM_OPT
2413 
2414 // Update rtm_counters based on abort status
2415 // input: abort_status
2416 //        rtm_counters (RTMLockingCounters*)
2417 void MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters_Reg) {
2418   // Mapping to keep PreciseRTMLockingStatistics similar to x86.
2419   // x86 ppc (! means inverted, ? means not the same)
2420   //  0   31  Set if abort caused by XABORT instruction.
2421   //  1  ! 7  If set, the transaction may succeed on a retry. This bit is always clear if bit 0 is set.
2422   //  2   13  Set if another logical processor conflicted with a memory address that was part of the transaction that aborted.
2423   //  3   10  Set if an internal buffer overflowed.
2424   //  4  ?12  Set if a debug breakpoint was hit.
2425   //  5  ?32  Set if an abort occurred during execution of a nested transaction.
2426   const  int tm_failure_bit[] = {Assembler::tm_tabort, // Note: Seems like signal handler sets this, too.
2427                                  Assembler::tm_failure_persistent, // inverted: transient
2428                                  Assembler::tm_trans_cf,
2429                                  Assembler::tm_footprint_of,
2430                                  Assembler::tm_non_trans_cf,
2431                                  Assembler::tm_suspended};
2432   const bool tm_failure_inv[] = {false, true, false, false, false, false};
2433   assert(sizeof(tm_failure_bit)/sizeof(int) == RTMLockingCounters::ABORT_STATUS_LIMIT, "adapt mapping!");
2434 
2435   const Register addr_Reg = R0;
2436   // Keep track of the offset from where rtm_counters_Reg originally pointed.
2437   int counters_offs = RTMLockingCounters::abort_count_offset();
2438   addi(addr_Reg, rtm_counters_Reg, counters_offs);
2439   const Register temp_Reg = rtm_counters_Reg;
2440 
2441   //atomic_inc_ptr(addr_Reg, temp_Reg); We don't increment atomically
2442   ldx(temp_Reg, addr_Reg);
2443   addi(temp_Reg, temp_Reg, 1);
2444   stdx(temp_Reg, addr_Reg);
2445 
2446   if (PrintPreciseRTMLockingStatistics) {
2447     int counters_offs_delta = RTMLockingCounters::abortX_count_offset() - counters_offs;
2448 
2449     //mftexasr(abort_status); done by caller
2450     for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
2451       counters_offs += counters_offs_delta;
2452       li(temp_Reg, counters_offs_delta); // can't use addi with R0
2453       add(addr_Reg, addr_Reg, temp_Reg); // point to next counter
2454       counters_offs_delta = sizeof(uintx);
2455 
2456       Label check_abort;
2457       rldicr_(temp_Reg, abort_status, tm_failure_bit[i], 0);
2458       if (tm_failure_inv[i]) {
2459         bne(CCR0, check_abort);
2460       } else {
2461         beq(CCR0, check_abort);
2462       }
2463       //atomic_inc_ptr(addr_Reg, temp_Reg); We don't increment atomically
2464       ldx(temp_Reg, addr_Reg);
2465       addi(temp_Reg, temp_Reg, 1);
2466       stdx(temp_Reg, addr_Reg);
2467       bind(check_abort);
2468     }
2469   }
2470   li(temp_Reg, -counters_offs); // can't use addi with R0
2471   add(rtm_counters_Reg, addr_Reg, temp_Reg); // restore
2472 }
2473 
2474 // Branch if ((random & (count-1)) != 0); count must be a power of 2.
2475 // tmp and CR0 are killed
2476 void MacroAssembler::branch_on_random_using_tb(Register tmp, int count, Label& brLabel) {
2477   mftb(tmp);
2478   andi_(tmp, tmp, count-1);
2479   bne(CCR0, brLabel);
2480 }
2481 
2482 // Perform abort ratio calculation, set no_rtm bit if high ratio.
2483 // input:  rtm_counters_Reg (RTMLockingCounters* address) - KILLED
2484 void MacroAssembler::rtm_abort_ratio_calculation(Register rtm_counters_Reg,
2485                                                  RTMLockingCounters* rtm_counters,
2486                                                  Metadata* method_data) {
2487   Label L_done, L_check_always_rtm1, L_check_always_rtm2;
2488 
2489   if (RTMLockingCalculationDelay > 0) {
2490     // Delay calculation.
2491     ld(rtm_counters_Reg, (RegisterOrConstant)(intptr_t)RTMLockingCounters::rtm_calculation_flag_addr());
2492     cmpdi(CCR0, rtm_counters_Reg, 0);
2493     beq(CCR0, L_done);
2494     load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload
2495   }
2496   // Abort ratio calculation only if abort_count > RTMAbortThreshold.
2497   //   Aborted transactions = abort_count * 100
2498   //   All transactions = total_count *  RTMTotalCountIncrRate
2499   //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
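       // E.g., with example flag values RTMTotalCountIncrRate = 64 and
       // RTMAbortRatio = 50, the no_rtm bit gets set once
       //   abort_count * 100 >= (total_count * 64) * 50,
       // i.e. once aborts reach half of the estimated total transaction count.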
2500   ld(R0, RTMLockingCounters::abort_count_offset(), rtm_counters_Reg);
2501   if (is_simm(RTMAbortThreshold, 16)) {   // cmpdi can handle 16bit immediate only.
2502     cmpdi(CCR0, R0, RTMAbortThreshold);
2503     blt(CCR0, L_check_always_rtm2);  // reload of rtm_counters_Reg not necessary
2504   } else {
2505     load_const_optimized(rtm_counters_Reg, RTMAbortThreshold);
2506     cmpd(CCR0, R0, rtm_counters_Reg);
2507     blt(CCR0, L_check_always_rtm1);  // reload of rtm_counters_Reg required
2508   }
2509   mulli(R0, R0, 100);
2510 
2511   const Register tmpReg = rtm_counters_Reg;
2512   ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg);
2513   mulli(tmpReg, tmpReg, RTMTotalCountIncrRate); // allowable range: int16
2514   mulli(tmpReg, tmpReg, RTMAbortRatio);         // allowable range: int16
2515   cmpd(CCR0, R0, tmpReg);
2516   blt(CCR0, L_check_always_rtm1); // jump to reload
2517   if (method_data != NULL) {
2518     // Set rtm_state to "no rtm" in MDO.
2519     // Not using a metadata relocation. Method and Class Loader are kept alive anyway.
2520     // (See nmethod::metadata_do and CodeBuffer::finalize_oop_references.)
2521     load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg);
2522     atomic_ori_int(R0, tmpReg, NoRTM);
2523   }
2524   b(L_done);
2525 
2526   bind(L_check_always_rtm1);
2527   load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload
2528   bind(L_check_always_rtm2);
2529   ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg);
2530   int64_t thresholdValue = RTMLockingThreshold / RTMTotalCountIncrRate;
2531   if (is_simm(thresholdValue, 16)) {   // cmpdi can handle 16bit immediate only.
2532     cmpdi(CCR0, tmpReg, thresholdValue);
2533   } else {
2534     load_const_optimized(R0, thresholdValue);
2535     cmpd(CCR0, tmpReg, R0);
2536   }
2537   blt(CCR0, L_done);
2538   if (method_data != NULL) {
2539     // Set rtm_state to "always rtm" in MDO.
2540     // Not using a metadata relocation. See above.
2541     load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg);
2542     atomic_ori_int(R0, tmpReg, UseRTM);
2543   }
2544   bind(L_done);
2545 }
2546 
2547 // Update counters and perform abort ratio calculation.
2548 // input: abort_status_Reg
2549 void MacroAssembler::rtm_profiling(Register abort_status_Reg, Register temp_Reg,
2550                                    RTMLockingCounters* rtm_counters,
2551                                    Metadata* method_data,
2552                                    bool profile_rtm) {
2553 
2554   assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2555   // Update rtm counters based on state at abort.
2556   // Reads abort_status_Reg, updates flags.
2557   assert_different_registers(abort_status_Reg, temp_Reg);
2558   load_const_optimized(temp_Reg, (address)rtm_counters, R0);
2559   rtm_counters_update(abort_status_Reg, temp_Reg);
2560   if (profile_rtm) {
2561     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2562     rtm_abort_ratio_calculation(temp_Reg, rtm_counters, method_data);
2563   }
2564 }
2565 
2566 // Retry on abort if abort's status indicates non-persistent failure.
2567 // inputs: retry_count_Reg
2568 //       : abort_status_Reg
2569 // output: retry_count_Reg decremented by 1
2570 void MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg,
2571                                              Label& retryLabel, Label* checkRetry) {
2572   Label doneRetry;
2573   rldicr_(R0, abort_status_Reg, tm_failure_persistent, 0);
2574   bne(CCR0, doneRetry);
2575   if (checkRetry) { bind(*checkRetry); }
2576   addic_(retry_count_Reg, retry_count_Reg, -1);
2577   blt(CCR0, doneRetry);
2578   smt_yield(); // Can't use wait(). No permission (SIGILL).
2579   b(retryLabel);
2580   bind(doneRetry);
2581 }
2582 
2583 // Spin and retry if lock is busy.
2584 // inputs: owner_addr_Reg (monitor address)
2585 //       : retry_count_Reg
2586 // output: retry_count_Reg decremented by 1
2587 // CTR is killed
2588 void MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register owner_addr_Reg, Label& retryLabel) {
2589   Label SpinLoop, doneRetry;
2590   addic_(retry_count_Reg, retry_count_Reg, -1);
2591   blt(CCR0, doneRetry);
2592 
2593   if (RTMSpinLoopCount > 1) {
2594     li(R0, RTMSpinLoopCount);
2595     mtctr(R0);
2596   }
2597 
2598   bind(SpinLoop);
2599   smt_yield(); // Can't use waitrsv(). No permission (SIGILL).
2600 
2601   if (RTMSpinLoopCount > 1) {
2602     bdz(retryLabel);
2603     ld(R0, 0, owner_addr_Reg);
2604     cmpdi(CCR0, R0, 0);
2605     bne(CCR0, SpinLoop);
2606   }
2607 
2608   b(retryLabel);
2609 
2610   bind(doneRetry);
2611 }
2612 
2613 // Use RTM for normal stack locks.
2614 // Input: objReg (object to lock)
2615 void MacroAssembler::rtm_stack_locking(ConditionRegister flag,
2616                                        Register obj, Register mark_word, Register tmp,
2617                                        Register retry_on_abort_count_Reg,
2618                                        RTMLockingCounters* stack_rtm_counters,
2619                                        Metadata* method_data, bool profile_rtm,
2620                                        Label& DONE_LABEL, Label& IsInflated) {
2621   assert(UseRTMForStackLocks, "why call this otherwise?");
2622   assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
2623   Label L_rtm_retry, L_decrement_retry, L_on_abort;
2624 
2625   if (RTMRetryCount > 0) {
2626     load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
2627     bind(L_rtm_retry);
2628   }
2629   andi_(R0, mark_word, markOopDesc::monitor_value);  // inflated vs stack-locked|neutral|biased
2630   bne(CCR0, IsInflated);
2631 
2632   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2633     Label L_noincrement;
2634     if (RTMTotalCountIncrRate > 1) {
2635       branch_on_random_using_tb(tmp, RTMTotalCountIncrRate, L_noincrement);
2636     }
2637     assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
2638     load_const_optimized(tmp, (address)stack_rtm_counters->total_count_addr(), R0);
2639     //atomic_inc_ptr(tmp, /*temp, will be reloaded*/mark_word); We don't increment atomically
2640     ldx(mark_word, tmp);
2641     addi(mark_word, mark_word, 1);
2642     stdx(mark_word, tmp);
2643     bind(L_noincrement);
2644   }
2645   tbegin_();
2646   beq(CCR0, L_on_abort);
2647   ld(mark_word, oopDesc::mark_offset_in_bytes(), obj);         // Reload in transaction, conflicts need to be tracked.
2648   andi(R0, mark_word, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
2649   cmpwi(flag, R0, markOopDesc::unlocked_value);                // bits = 001 unlocked
2650   beq(flag, DONE_LABEL);                                       // all done if unlocked
2651 
2652   if (UseRTMXendForLockBusy) {
2653     tend_();
2654     b(L_decrement_retry);
2655   } else {
2656     tabort_();
2657   }
2658   bind(L_on_abort);
2659   const Register abort_status_Reg = tmp;
2660   mftexasr(abort_status_Reg);
2661   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2662     rtm_profiling(abort_status_Reg, /*temp*/mark_word, stack_rtm_counters, method_data, profile_rtm);
2663   }
2664   ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // reload
2665   if (RTMRetryCount > 0) {
2666     // Retry on lock abort if abort status is not permanent.
2667     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry, &L_decrement_retry);
2668   } else {
2669     bind(L_decrement_retry);
2670   }
2671 }
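
// Conceptual outline of the transactional stack-lock fast path above (hedged,
// C-like pseudocode; DONE and IsInflated correspond to the label arguments):
//
//   retry:
//     if (mark_word & monitor_value) goto IsInflated;      // already inflated, caller handles it
//     if (profiling) total_count++;                        // sampled, deliberately non-atomic
//     if (!tbegin()) goto on_abort;                        // transaction did not run / aborted
//     if ((obj->mark() & biased_lock_mask) == unlocked_value) goto DONE;  // appears unlocked: elided
//     UseRTMXendForLockBusy ? tend() : tabort();           // object is locked: leave transaction
//   on_abort:
//     abort_status = TEXASR;
//     if (profiling) rtm_profiling(abort_status);
//     mark_word = obj->mark();                             // reload
//     retry on a non-persistent abort while retries remain, else fall through.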
2672 
2673 // Use RTM for inflating locks
2674 // inputs: obj       (object to lock)
2675 //         mark_word (current header - KILLED)
2676 //         boxReg    (on-stack box address (displaced header location) - KILLED)
2677 void MacroAssembler::rtm_inflated_locking(ConditionRegister flag,
2678                                           Register obj, Register mark_word, Register boxReg,
2679                                           Register retry_on_busy_count_Reg, Register retry_on_abort_count_Reg,
2680                                           RTMLockingCounters* rtm_counters,
2681                                           Metadata* method_data, bool profile_rtm,
2682                                           Label& DONE_LABEL) {
2683   assert(UseRTMLocking, "why call this otherwise?");
2684   Label L_rtm_retry, L_decrement_retry, L_on_abort;
2685   // Clean monitor_value bit to get valid pointer.
2686   int owner_offset = ObjectMonitor::owner_offset_in_bytes() - markOopDesc::monitor_value;
2687 
2688   // Store non-null, using boxReg instead of (intptr_t)markOopDesc::unused_mark().
2689   std(boxReg, BasicLock::displaced_header_offset_in_bytes(), boxReg);
2690   const Register tmpReg = boxReg;
2691   const Register owner_addr_Reg = mark_word;
2692   addi(owner_addr_Reg, mark_word, owner_offset);
2693 
2694   if (RTMRetryCount > 0) {
2695     load_const_optimized(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy.
2696     load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort.
2697     bind(L_rtm_retry);
2698   }
2699   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2700     Label L_noincrement;
2701     if (RTMTotalCountIncrRate > 1) {
2702       branch_on_random_using_tb(R0, RTMTotalCountIncrRate, L_noincrement);
2703     }
2704     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2705     load_const(R0, (address)rtm_counters->total_count_addr(), tmpReg);
2706     //atomic_inc_ptr(R0, tmpReg); We don't increment atomically
2707     ldx(tmpReg, R0);
2708     addi(tmpReg, tmpReg, 1);
2709     stdx(tmpReg, R0);
2710     bind(L_noincrement);
2711   }
2712   tbegin_();
2713   beq(CCR0, L_on_abort);
2714   // We don't reload mark word. Will only be reset at safepoint.
2715   ld(R0, 0, owner_addr_Reg); // Load in transaction, conflicts need to be tracked.
2716   cmpdi(flag, R0, 0);
2717   beq(flag, DONE_LABEL);
2718 
2719   if (UseRTMXendForLockBusy) {
2720     tend_();
2721     b(L_decrement_retry);
2722   } else {
2723     tabort_();
2724   }
2725   bind(L_on_abort);
2726   const Register abort_status_Reg = tmpReg;
2727   mftexasr(abort_status_Reg);
2728   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2729     rtm_profiling(abort_status_Reg, /*temp*/ owner_addr_Reg, rtm_counters, method_data, profile_rtm);
2730     // Restore owner_addr_Reg
2731     ld(mark_word, oopDesc::mark_offset_in_bytes(), obj);
2732 #ifdef ASSERT
2733     andi_(R0, mark_word, markOopDesc::monitor_value);
2734     asm_assert_ne("must be inflated", 0xa754); // Deflating only allowed at safepoint.
2735 #endif
2736     addi(owner_addr_Reg, mark_word, owner_offset);
2737   }
2738   if (RTMRetryCount > 0) {
2739     // Retry on lock abort if abort status is not permanent.
2740     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
2741   }
2742 
2743   // Appears unlocked - try to swing _owner from null to non-null.
2744   cmpxchgd(flag, /*current val*/ R0, (intptr_t)0, /*new val*/ R16_thread, owner_addr_Reg,
2745            MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2746            MacroAssembler::cmpxchgx_hint_acquire_lock(), noreg, &L_decrement_retry, true);
2747 
2748   if (RTMRetryCount > 0) {
2749     // success done else retry
2750     b(DONE_LABEL);
2751     bind(L_decrement_retry);
2752     // Spin and retry if lock is busy.
2753     rtm_retry_lock_on_busy(retry_on_busy_count_Reg, owner_addr_Reg, L_rtm_retry);
2754   } else {
2755     bind(L_decrement_retry);
2756   }
2757 }
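
// Conceptual outline of the transactional inflated-lock path above (hedged sketch):
//
//   owner_addr = mark_word - monitor_value + owner_offset;   // &monitor->_owner
//   box->displaced_header = box;                             // any non-null value will do
//   retry:
//     if (profiling) total_count++;                          // sampled, deliberately non-atomic
//     if (!tbegin()) goto on_abort;
//     if (*owner_addr == NULL) goto DONE;                    // lock elided inside the transaction
//     UseRTMXendForLockBusy ? tend() : tabort();             // owner present: leave transaction
//   on_abort:
//     profile, then retry on a non-persistent abort while retries remain;
//     otherwise CAS *owner_addr from NULL to the current thread, and on a busy
//     owner spin (rtm_retry_lock_on_busy) before retrying the transaction.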
2758 
2759 #endif //  INCLUDE_RTM_OPT
2760 
2761 // "The box" is the space on the stack where we copy the object mark.
2762 void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register oop, Register box,
2763                                                Register temp, Register displaced_header, Register current_header,
2764                                                bool try_bias,
2765                                                RTMLockingCounters* rtm_counters,
2766                                                RTMLockingCounters* stack_rtm_counters,
2767                                                Metadata* method_data,
2768                                                bool use_rtm, bool profile_rtm) {
2769   assert_different_registers(oop, box, temp, displaced_header, current_header);
2770   assert(flag != CCR0, "bad condition register");
2771   Label cont;
2772   Label object_has_monitor;
2773   Label cas_failed;
2774 
2775   // Load markOop from object into displaced_header.
2776   ld(displaced_header, oopDesc::mark_offset_in_bytes(), oop);
2777 
2778 
2779   // Always do locking in runtime.
2780   if (EmitSync & 0x01) {
2781     cmpdi(flag, oop, 0); // Oop can't be 0 here => always false.
2782     return;
2783   }
2784 
2785   if (try_bias) {
2786     biased_locking_enter(flag, oop, displaced_header, temp, current_header, cont);
2787   }
2788 
2789 #if INCLUDE_RTM_OPT
2790   if (UseRTMForStackLocks && use_rtm) {
2791     rtm_stack_locking(flag, oop, displaced_header, temp, /*temp*/ current_header,
2792                       stack_rtm_counters, method_data, profile_rtm,
2793                       cont, object_has_monitor);
2794   }
2795 #endif // INCLUDE_RTM_OPT
2796 
2797   // Handle existing monitor.
2798   if ((EmitSync & 0x02) == 0) {
2799     // The object has an existing monitor iff (mark & monitor_value) != 0.
2800     andi_(temp, displaced_header, markOopDesc::monitor_value);
2801     bne(CCR0, object_has_monitor);
2802   }
2803 
2804   // Set displaced_header to be (markOop of object | UNLOCK_VALUE).
2805   ori(displaced_header, displaced_header, markOopDesc::unlocked_value);
2806 
2807   // Load Compare Value application register.
2808 
2809   // Initialize the box. (Must happen before we update the object mark!)
2810   std(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2811 
2812   // Must fence, otherwise, preceding store(s) may float below cmpxchg.
2813   // Compare the object markOop with displaced_header and, if equal, exchange the box address into the object markOop.
2814   cmpxchgd(/*flag=*/flag,
2815            /*current_value=*/current_header,
2816            /*compare_value=*/displaced_header,
2817            /*exchange_value=*/box,
2818            /*where=*/oop,
2819            MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2820            MacroAssembler::cmpxchgx_hint_acquire_lock(),
2821            noreg,
2822            &cas_failed,
2823            /*check without membar and ldarx first*/true);
2824   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2825 
2826   // If the compare-and-exchange succeeded, then we found an unlocked
2827   // object and we have now locked it.
2828   b(cont);
2829 
2830   bind(cas_failed);
2831   // We did not see an unlocked object so try the fast recursive case.
2832 
2833   // Check if the owner is self by comparing the value in the markOop of object
2834   // (current_header) with the stack pointer.
2835   sub(current_header, current_header, R1_SP);
2836   load_const_optimized(temp, ~(os::vm_page_size()-1) | markOopDesc::lock_mask_in_place);
2837 
2838   and_(R0/*==0?*/, current_header, temp);
2839   // If the result is 0, this is a recursive lock held by the current frame and we
2840   // can store 0 as the displaced header in the box, which marks it as recursive.
2841   mcrf(flag, CCR0);
2842   std(R0/*==0, perhaps*/, BasicLock::displaced_header_offset_in_bytes(), box);
2843 
2844   // Handle existing monitor.
2845   if ((EmitSync & 0x02) == 0) {
2846     b(cont);
2847 
2848     bind(object_has_monitor);
2849     // The object's monitor m is unlocked iff m->owner == NULL,
2850     // otherwise m->owner may contain a thread or a stack address.
2851 
2852 #if INCLUDE_RTM_OPT
2853     // Use the same RTM locking code in 32- and 64-bit VM.
2854     if (use_rtm) {
2855       rtm_inflated_locking(flag, oop, displaced_header, box, temp, /*temp*/ current_header,
2856                            rtm_counters, method_data, profile_rtm, cont);
2857     } else {
2858 #endif // INCLUDE_RTM_OPT
2859 
2860     // Try to CAS m->owner from NULL to current thread.
2861     addi(temp, displaced_header, ObjectMonitor::owner_offset_in_bytes()-markOopDesc::monitor_value);
2862     cmpxchgd(/*flag=*/flag,
2863              /*current_value=*/current_header,
2864              /*compare_value=*/(intptr_t)0,
2865              /*exchange_value=*/R16_thread,
2866              /*where=*/temp,
2867              MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2868              MacroAssembler::cmpxchgx_hint_acquire_lock());
2869 
2870     // Store a non-null value into the box.
2871     std(box, BasicLock::displaced_header_offset_in_bytes(), box);
2872 
2873 #   ifdef ASSERT
2874     bne(flag, cont);
2875     // We have acquired the monitor, check some invariants.
2876     addi(/*monitor=*/temp, temp, -ObjectMonitor::owner_offset_in_bytes());
2877     // Invariant 1: _recursions should be 0.
2878     //assert(ObjectMonitor::recursions_size_in_bytes() == 8, "unexpected size");
2879     asm_assert_mem8_is_zero(ObjectMonitor::recursions_offset_in_bytes(), temp,
2880                             "monitor->_recursions should be 0", -1);
2881     // Invariant 2: OwnerIsThread shouldn't be 0.
2882     //assert(ObjectMonitor::OwnerIsThread_size_in_bytes() == 4, "unexpected size");
2883     //asm_assert_mem4_isnot_zero(ObjectMonitor::OwnerIsThread_offset_in_bytes(), temp,
2884     //                           "monitor->OwnerIsThread shouldn't be 0", -1);
2885 #   endif
2886 
2887 #if INCLUDE_RTM_OPT
2888     } // use_rtm()
2889 #endif
2890   }
2891 
2892   bind(cont);
2893   // flag == EQ indicates success
2894   // flag == NE indicates failure
2895 }
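
// Summary of the fast-lock decision tree emitted above (conceptual, not the literal
// generated code; biased-locking and RTM paths omitted):
//
//   mark = obj->mark();
//   if (mark & monitor_value) {                               // already inflated
//     success = CAS(&monitor->_owner, NULL, current_thread);
//   } else {
//     box->displaced_header = mark | unlocked_value;
//     success = CAS(&obj->mark, mark | unlocked_value, box);  // stack-lock attempt
//     if (!success) {                                         // maybe a recursive lock?
//       success = ((mark - SP) & (~(page_size - 1) | lock_mask)) == 0;  // mark points into our stack?
//       box->displaced_header = 0;                            // recursion marker (only meaningful on success)
//     }
//   }
//   flag := success ? EQ : NE;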
2896 
2897 void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register oop, Register box,
2898                                                  Register temp, Register displaced_header, Register current_header,
2899                                                  bool try_bias, bool use_rtm) {
2900   assert_different_registers(oop, box, temp, displaced_header, current_header);
2901   assert(flag != CCR0, "bad condition register");
2902   Label cont;
2903   Label object_has_monitor;
2904 
2905   // Always do unlocking in runtime.
2906   if (EmitSync & 0x01) {
2907     cmpdi(flag, oop, 0); // Oop can't be 0 here => always false.
2908     return;
2909   }
2910 
2911   if (try_bias) {
2912     biased_locking_exit(flag, oop, current_header, cont);
2913   }
2914 
2915 #if INCLUDE_RTM_OPT
2916   if (UseRTMForStackLocks && use_rtm) {
2917     assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
2918     Label L_regular_unlock;
2919     ld(current_header, oopDesc::mark_offset_in_bytes(), oop);         // fetch markword
2920     andi(R0, current_header, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
2921     cmpwi(flag, R0, markOopDesc::unlocked_value);                     // bits = 001 unlocked
2922     bne(flag, L_regular_unlock);                                      // else RegularLock
2923     tend_();                                                          // otherwise end...
2924     b(cont);                                                          // ... and we're done
2925     bind(L_regular_unlock);
2926   }
2927 #endif
2928 
2929   // Find the lock address and load the displaced header from the stack.
2930   ld(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2931 
2932   // If the displaced header is 0, we have a recursive unlock.
2933   cmpdi(flag, displaced_header, 0);
2934   beq(flag, cont);
2935 
2936   // Handle existing monitor.
2937   if ((EmitSync & 0x02) == 0) {
2938     // The object has an existing monitor iff (mark & monitor_value) != 0.
2939     RTM_OPT_ONLY( if (!(UseRTMForStackLocks && use_rtm)) ) // skip load if already done
2940     ld(current_header, oopDesc::mark_offset_in_bytes(), oop);
2941     andi_(R0, current_header, markOopDesc::monitor_value);
2942     bne(CCR0, object_has_monitor);
2943   }
2944 
2945   // Check if it is still a light weight lock, this is true if we see
2946   // the stack address of the basicLock in the markOop of the object.
2947   // Cmpxchg sets flag to cmpd(current_header, box).
2948   cmpxchgd(/*flag=*/flag,
2949            /*current_value=*/current_header,
2950            /*compare_value=*/box,
2951            /*exchange_value=*/displaced_header,
2952            /*where=*/oop,
2953            MacroAssembler::MemBarRel,
2954            MacroAssembler::cmpxchgx_hint_release_lock(),
2955            noreg,
2956            &cont);
2957 
2958   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2959 
2960   // Handle existing monitor.
2961   if ((EmitSync & 0x02) == 0) {
2962     b(cont);
2963 
2964     bind(object_has_monitor);
2965     addi(current_header, current_header, -markOopDesc::monitor_value); // monitor
2966     ld(temp,             ObjectMonitor::owner_offset_in_bytes(), current_header);
2967 
2968     // It's inflated.
2969 #if INCLUDE_RTM_OPT
2970     if (use_rtm) {
2971       Label L_regular_inflated_unlock;
2972       // Clean monitor_value bit to get valid pointer
2973       cmpdi(flag, temp, 0);
2974       bne(flag, L_regular_inflated_unlock);
2975       tend_();
2976       b(cont);
2977       bind(L_regular_inflated_unlock);
2978     }
2979 #endif
2980 
2981     ld(displaced_header, ObjectMonitor::recursions_offset_in_bytes(), current_header);
2982     xorr(temp, R16_thread, temp);      // Will be 0 if we are the owner.
2983     orr(temp, temp, displaced_header); // Will be 0 if there are 0 recursions.
2984     cmpdi(flag, temp, 0);
2985     bne(flag, cont);
2986 
2987     ld(temp,             ObjectMonitor::EntryList_offset_in_bytes(), current_header);
2988     ld(displaced_header, ObjectMonitor::cxq_offset_in_bytes(), current_header);
2989     orr(temp, temp, displaced_header); // Will be 0 if both are 0.
2990     cmpdi(flag, temp, 0);
2991     bne(flag, cont);
2992     release();
2993     std(temp, ObjectMonitor::owner_offset_in_bytes(), current_header);
2994   }
2995 
2996   bind(cont);
2997   // flag == EQ indicates success
2998   // flag == NE indicates failure
2999 }
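
// Summary of the fast-unlock path above (conceptual; RTM and biased paths omitted):
//
//   if (box->displaced_header == 0) return EQ;                       // recursive stack unlock
//   mark = obj->mark();
//   if (!(mark & monitor_value)) {
//     return CAS(&obj->mark, box, box->displaced_header) ? EQ : NE;  // pop the stack lock
//   }
//   monitor = mark - monitor_value;
//   if (monitor->_owner != current_thread || monitor->_recursions != 0) return NE;
//   if (monitor->_EntryList != NULL || monitor->_cxq != NULL) return NE;
//   release(); monitor->_owner = NULL; return EQ;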
3000 
3001 // Write serialization page so VM thread can do a pseudo remote membar.
3002 // We use the current thread pointer to calculate a thread specific
3003 // offset to write to within the page. This minimizes bus traffic
3004 // due to cache line collision.
3005 void MacroAssembler::serialize_memory(Register thread, Register tmp1, Register tmp2) {
3006   srdi(tmp2, thread, os::get_serialize_page_shift_count());
3007 
3008   int mask = os::vm_page_size() - sizeof(int);
3009   if (Assembler::is_simm(mask, 16)) {
3010     andi(tmp2, tmp2, mask);
3011   } else {
3012     lis(tmp1, (int)((signed short) (mask >> 16)));
3013     ori(tmp1, tmp1, mask & 0x0000ffff);
3014     andr(tmp2, tmp2, tmp1);
3015   }
3016 
3017   load_const(tmp1, (long) os::get_memory_serialize_page());
3018   release();
3019   stwx(R0, tmp1, tmp2);
3020 }
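
// Conceptually (illustrative C; the helpers are the os:: functions used above):
//
//   int offset = ((uintptr_t)thread >> os::get_serialize_page_shift_count())
//                & (os::vm_page_size() - sizeof(int));
//   release();
//   *(volatile int*)(os::get_memory_serialize_page() + offset) = <whatever is in R0>;
//   // The stored value is irrelevant; only the page-protection trap taken by the
//   // VM thread when it protects the serialize page matters.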
3021 
3022 
3023 // GC barrier helper macros
3024 
3025 // Write the card table byte if needed.
3026 void MacroAssembler::card_write_barrier_post(Register Rstore_addr, Register Rnew_val, Register Rtmp) {
3027   CardTableModRefBS* bs =
3028     barrier_set_cast<CardTableModRefBS>(Universe::heap()->barrier_set());
3029   assert(bs->kind() == BarrierSet::CardTableForRS ||
3030          bs->kind() == BarrierSet::CardTableExtension, "wrong barrier");
3031 #ifdef ASSERT
3032   cmpdi(CCR0, Rnew_val, 0);
3033   asm_assert_ne("null oop not allowed", 0x321);
3034 #endif
3035   card_table_write(bs->byte_map_base, Rtmp, Rstore_addr);
3036 }
3037 
3038 // Write the card table byte.
3039 void MacroAssembler::card_table_write(jbyte* byte_map_base, Register Rtmp, Register Robj) {
3040   assert_different_registers(Robj, Rtmp, R0);
3041   load_const_optimized(Rtmp, (address)byte_map_base, R0);
3042   srdi(Robj, Robj, CardTableModRefBS::card_shift);
3043   li(R0, 0); // dirty
3044   if (UseConcMarkSweepGC) membar(Assembler::StoreStore);
3045   stbx(R0, Rtmp, Robj);
3046 }
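
// In C terms the two functions above amount to (conceptual sketch):
//   bs->byte_map_base[(uintptr_t)store_addr >> card_shift] = 0;   // 0 == dirty_card_val()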
3047 
3048 // Kills R31 if value is a volatile register.
3049 void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2, bool needs_frame) {
3050   Label done;
3051   cmpdi(CCR0, value, 0);
3052   beq(CCR0, done);         // Use NULL as-is.
3053 
3054   clrrdi(tmp1, value, JNIHandles::weak_tag_size);
3055 #if INCLUDE_ALL_GCS
3056   if (UseG1GC) { andi_(tmp2, value, JNIHandles::weak_tag_mask); }
3057 #endif
3058   ld(value, 0, tmp1);      // Resolve (untagged) jobject.
3059 
3060 #if INCLUDE_ALL_GCS
3061   if (UseG1GC) {
3062     Label not_weak;
3063     beq(CCR0, not_weak);   // Test for jweak tag.
3064     verify_oop(value);
3065     g1_write_barrier_pre(noreg, // obj
3066                          noreg, // offset
3067                          value, // pre_val
3068                          tmp1, tmp2, needs_frame);
3069     bind(not_weak);
3070   }
3071 #endif // INCLUDE_ALL_GCS
3072   verify_oop(value);
3073   bind(done);
3074 }
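
// Conceptual equivalent of resolve_jobject (hedged sketch):
//
//   if (handle == NULL) return NULL;
//   oop obj = *(oop*)(handle & ~weak_tag_mask);     // strip the jweak tag and dereference
//   if (UseG1GC && (handle & weak_tag_mask)) {
//     g1_write_barrier_pre(obj);                    // SATB: keep the weakly reachable referent alive
//   }
//   return obj;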
3075 
3076 #if INCLUDE_ALL_GCS
3077 // General G1 pre-barrier generator.
3078 // Goal: record the previous value if it is not null.
3079 void MacroAssembler::g1_write_barrier_pre(Register Robj, RegisterOrConstant offset, Register Rpre_val,
3080                                           Register Rtmp1, Register Rtmp2, bool needs_frame) {
3081   Label runtime, filtered;
3082 
3083   // Is marking active?
3084   if (in_bytes(SATBMarkQueue::byte_width_of_active()) == 4) {
3085     lwz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_active()), R16_thread);
3086   } else {
3087     guarantee(in_bytes(SATBMarkQueue::byte_width_of_active()) == 1, "Assumption");
3088     lbz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_active()), R16_thread);
3089   }
3090   cmpdi(CCR0, Rtmp1, 0);
3091   beq(CCR0, filtered);
3092 
3093   // Do we need to load the previous value?
3094   if (Robj != noreg) {
3095     // Load the previous value...
3096     if (UseCompressedOops) {
3097       lwz(Rpre_val, offset, Robj);
3098     } else {
3099       ld(Rpre_val, offset, Robj);
3100     }
3101     // Previous value has been loaded into Rpre_val.
3102   }
3103   assert(Rpre_val != noreg, "must have a real register");
3104 
3105   // Is the previous value null?
3106   cmpdi(CCR0, Rpre_val, 0);
3107   beq(CCR0, filtered);
3108 
3109   if (Robj != noreg && UseCompressedOops) {
3110     decode_heap_oop_not_null(Rpre_val);
3111   }
3112 
3113   // OK, it's not filtered, so we'll need to call enqueue.
3114   // If pre_val was passed in preloaded (Robj == noreg) and lives in a volatile
3115   // register, it is saved in R31 across the runtime call below and restored
3116   // afterwards.
3117 
3118   // Can we store original value in the thread's buffer?
3119   // Is index == 0?
3120   // (The index field is typed as size_t.)
3121   const Register Rbuffer = Rtmp1, Rindex = Rtmp2;
3122 
3123   ld(Rindex, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_index()), R16_thread);
3124   cmpdi(CCR0, Rindex, 0);
3125   beq(CCR0, runtime); // If index == 0, goto runtime.
3126   ld(Rbuffer, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_buf()), R16_thread);
3127 
3128   addi(Rindex, Rindex, -wordSize); // Decrement index.
3129   std(Rindex, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_index()), R16_thread);
3130 
3131   // Record the previous value.
3132   stdx(Rpre_val, Rbuffer, Rindex);
3133   b(filtered);
3134 
3135   bind(runtime);
3136 
3137   // May need to preserve LR. Also needed if current frame is not compatible with C calling convention.
3138   if (needs_frame) {
3139     save_LR_CR(Rtmp1);
3140     push_frame_reg_args(0, Rtmp2);
3141   }
3142 
3143   if (Rpre_val->is_volatile() && Robj == noreg) mr(R31, Rpre_val); // Save pre_val across C call if it was preloaded.
3144   call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), Rpre_val, R16_thread);
3145   if (Rpre_val->is_volatile() && Robj == noreg) mr(Rpre_val, R31); // restore
3146 
3147   if (needs_frame) {
3148     pop_frame();
3149     restore_LR_CR(Rtmp1);
3150   }
3151 
3152   bind(filtered);
3153 }
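
// Pseudocode for the SATB pre-barrier above (conceptual; queue fields correspond to
// the SATBMarkQueue offsets used in the code):
//
//   if (!thread->satb_queue._active) return;
//   if (obj != NULL) pre_val = *field;               // load previous value unless preloaded
//   if (pre_val == NULL) return;
//   if (thread->satb_queue._index == 0) {
//     SharedRuntime::g1_wb_pre(pre_val, thread);     // buffer full: runtime call
//   } else {
//     thread->satb_queue._index -= wordSize;
//     *(oop*)(thread->satb_queue._buf + thread->satb_queue._index) = pre_val;
//   }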
3154 
3155 // General G1 post-barrier generator
3156 // Store cross-region card.
3157 void MacroAssembler::g1_write_barrier_post(Register Rstore_addr, Register Rnew_val, Register Rtmp1, Register Rtmp2, Register Rtmp3, Label *filtered_ext) {
3158   Label runtime, filtered_int;
3159   Label& filtered = (filtered_ext != NULL) ? *filtered_ext : filtered_int;
3160   assert_different_registers(Rstore_addr, Rnew_val, Rtmp1, Rtmp2);
3161 
3162   G1SATBCardTableLoggingModRefBS* bs =
3163     barrier_set_cast<G1SATBCardTableLoggingModRefBS>(Universe::heap()->barrier_set());
3164 
3165   // Does store cross heap regions?
3166   if (G1RSBarrierRegionFilter) {
3167     xorr(Rtmp1, Rstore_addr, Rnew_val);
3168     srdi_(Rtmp1, Rtmp1, HeapRegion::LogOfHRGrainBytes);
3169     beq(CCR0, filtered);
3170   }
3171 
3172   // Crosses regions, storing NULL?
3173 #ifdef ASSERT
3174   cmpdi(CCR0, Rnew_val, 0);
3175   asm_assert_ne("null oop not allowed (G1)", 0x322); // Checked by caller on PPC64, so following branch is obsolete:
3176   //beq(CCR0, filtered);
3177 #endif
3178 
3179   // Storing region crossing non-NULL, is card already dirty?
3180   assert(sizeof(*bs->byte_map_base) == sizeof(jbyte), "adjust this code");
3181   const Register Rcard_addr = Rtmp1;
3182   Register Rbase = Rtmp2;
3183   load_const_optimized(Rbase, (address)bs->byte_map_base, /*temp*/ Rtmp3);
3184 
3185   srdi(Rcard_addr, Rstore_addr, CardTableModRefBS::card_shift);
3186 
3187   // Get the address of the card.
3188   lbzx(/*card value*/ Rtmp3, Rbase, Rcard_addr);
3189   cmpwi(CCR0, Rtmp3, (int)G1SATBCardTableModRefBS::g1_young_card_val());
3190   beq(CCR0, filtered);
3191 
3192   membar(Assembler::StoreLoad);
3193   lbzx(/*card value*/ Rtmp3, Rbase, Rcard_addr);  // Reload after membar.
3194   cmpwi(CCR0, Rtmp3 /* card value */, CardTableModRefBS::dirty_card_val());
3195   beq(CCR0, filtered);
3196 
3197   // Storing a region crossing, non-NULL oop, card is clean.
3198   // Dirty card and log.
3199   li(Rtmp3, CardTableModRefBS::dirty_card_val());
3200   //release(); // G1: oops are allowed to get visible after dirty marking.
3201   stbx(Rtmp3, Rbase, Rcard_addr);
3202 
3203   add(Rcard_addr, Rbase, Rcard_addr); // This is the address which needs to get enqueued.
3204   Rbase = noreg; // end of lifetime
3205 
3206   const Register Rqueue_index = Rtmp2,
3207                  Rqueue_buf   = Rtmp3;
3208   ld(Rqueue_index, in_bytes(JavaThread::dirty_card_queue_offset() + DirtyCardQueue::byte_offset_of_index()), R16_thread);
3209   cmpdi(CCR0, Rqueue_index, 0);
3210   beq(CCR0, runtime); // index == 0 then jump to runtime
3211   ld(Rqueue_buf, in_bytes(JavaThread::dirty_card_queue_offset() + DirtyCardQueue::byte_offset_of_buf()), R16_thread);
3212 
3213   addi(Rqueue_index, Rqueue_index, -wordSize); // decrement index
3214   std(Rqueue_index, in_bytes(JavaThread::dirty_card_queue_offset() + DirtyCardQueue::byte_offset_of_index()), R16_thread);
3215 
3216   stdx(Rcard_addr, Rqueue_buf, Rqueue_index); // store card
3217   b(filtered);
3218 
3219   bind(runtime);
3220 
3221   // Save the live input values.
3222   call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), Rcard_addr, R16_thread);
3223 
3224   bind(filtered_int);
3225 }
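
// Pseudocode for the G1 post-barrier above (conceptual):
//
//   if (((store_addr ^ new_val) >> LogOfHRGrainBytes) == 0) return;   // same region: nothing to do
//   card = byte_map_base + ((uintptr_t)store_addr >> card_shift);
//   if (*card == g1_young_card_val) return;
//   StoreLoad;
//   if (*card == dirty_card_val) return;                              // reload after the fence
//   *card = dirty_card_val;
//   enqueue the card address on the thread's dirty card queue, or call
//   SharedRuntime::g1_wb_post when the queue buffer is full (index == 0).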
3226 
3227 // Called from init_globals() after universe_init() and before interpreter_init()
3228 void g1_barrier_stubs_init() {
3229   // G1 barrier stubs currently only generated on SPARC
3230 }
3231 #endif // INCLUDE_ALL_GCS
3232 
3233 // Values for last_Java_pc and last_Java_sp must comply with the rules
3234 // in frame_ppc.hpp.
3235 void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) {
3236   // Always set last_Java_pc and flags first because once last_Java_sp
3237   // is visible, has_last_Java_frame is true and users will look at the
3238   // rest of the fields. (Note: flags should always be zero before we
3239   // get here, so it doesn't need to be set.)
3240 
3241   // Verify that last_Java_pc was zeroed on return to Java
3242   asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread,
3243                           "last_Java_pc not zeroed before leaving Java", 0x200);
3244 
3245   // When returning from calling out from Java mode the frame anchor's
3246   // last_Java_pc will always be set to NULL. It is set here so that
3247   // if we are doing a call to native (not VM) that we capture the
3248   // known pc and don't have to rely on the native call having a
3249   // standard frame linkage where we can find the pc.
3250   if (last_Java_pc != noreg)
3251     std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
3252 
3253   // Set last_Java_sp last.
3254   std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
3255 }
3256 
3257 void MacroAssembler::reset_last_Java_frame(void) {
3258   asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()),
3259                              R16_thread, "SP was not set, still zero", 0x202);
3260 
3261   BLOCK_COMMENT("reset_last_Java_frame {");
3262   li(R0, 0);
3263 
3264   // _last_Java_sp = 0
3265   std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
3266 
3267   // _last_Java_pc = 0
3268   std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
3269   BLOCK_COMMENT("} reset_last_Java_frame");
3270 }
3271 
3272 void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1) {
3273   assert_different_registers(sp, tmp1);
3274 
3275   // sp points to a TOP_IJAVA_FRAME, retrieve frame's PC via
3276   // TOP_IJAVA_FRAME_ABI.
3277   // FIXME: assert that we really have a TOP_IJAVA_FRAME here!
3278   address entry = pc();
3279   load_const_optimized(tmp1, entry);
3280 
3281   set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1);
3282 }
3283 
3284 void MacroAssembler::get_vm_result(Register oop_result) {
3285   // Read:
3286   //   R16_thread
3287   //   R16_thread->in_bytes(JavaThread::vm_result_offset())
3288   //
3289   // Updated:
3290   //   oop_result
3291   //   R16_thread->in_bytes(JavaThread::vm_result_offset())
3292 
3293   verify_thread();
3294 
3295   ld(oop_result, in_bytes(JavaThread::vm_result_offset()), R16_thread);
3296   li(R0, 0);
3297   std(R0, in_bytes(JavaThread::vm_result_offset()), R16_thread);
3298 
3299   verify_oop(oop_result);
3300 }
3301 
3302 void MacroAssembler::get_vm_result_2(Register metadata_result) {
3303   // Read:
3304   //   R16_thread
3305   //   R16_thread->in_bytes(JavaThread::vm_result_2_offset())
3306   //
3307   // Updated:
3308   //   metadata_result
3309   //   R16_thread->in_bytes(JavaThread::vm_result_2_offset())
3310 
3311   ld(metadata_result, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
3312   li(R0, 0);
3313   std(R0, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
3314 }
3315 
3316 Register MacroAssembler::encode_klass_not_null(Register dst, Register src) {
3317   Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided.
3318   if (Universe::narrow_klass_base() != 0) {
3319     // Use dst as temp if it is free.
3320     sub_const_optimized(dst, current, Universe::narrow_klass_base(), R0);
3321     current = dst;
3322   }
3323   if (Universe::narrow_klass_shift() != 0) {
3324     srdi(dst, current, Universe::narrow_klass_shift());
3325     current = dst;
3326   }
3327   return current;
3328 }
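
// I.e. conceptually: narrow_klass = (klass - narrow_klass_base) >> narrow_klass_shift.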
3329 
3330 void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) {
3331   if (UseCompressedClassPointers) {
3332     Register compressedKlass = encode_klass_not_null(ck, klass);
3333     stw(compressedKlass, oopDesc::klass_offset_in_bytes(), dst_oop);
3334   } else {
3335     std(klass, oopDesc::klass_offset_in_bytes(), dst_oop);
3336   }
3337 }
3338 
3339 void MacroAssembler::store_klass_gap(Register dst_oop, Register val) {
3340   if (UseCompressedClassPointers) {
3341     if (val == noreg) {
3342       val = R0;
3343       li(val, 0);
3344     }
3345     stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop); // klass gap if compressed
3346   }
3347 }
3348 
3349 int MacroAssembler::instr_size_for_decode_klass_not_null() {
3350   if (!UseCompressedClassPointers) return 0;
3351   int num_instrs = 1;  // shift or move
3352   if (Universe::narrow_klass_base() != 0) num_instrs = 7;  // shift + load const + add
3353   return num_instrs * BytesPerInstWord;
3354 }
3355 
3356 void MacroAssembler::decode_klass_not_null(Register dst, Register src) {
3357   assert(dst != R0, "Dst reg may not be R0, as R0 is used here.");
3358   if (src == noreg) src = dst;
3359   Register shifted_src = src;
3360   if (Universe::narrow_klass_shift() != 0 ||
3361       (Universe::narrow_klass_base() == 0 && src != dst)) {  // Move required.
3362     shifted_src = dst;
3363     sldi(shifted_src, src, Universe::narrow_klass_shift());
3364   }
3365   if (Universe::narrow_klass_base() != 0) {
3366     add_const_optimized(dst, shifted_src, Universe::narrow_klass_base(), R0);
3367   }
3368 }
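
// I.e. conceptually: klass = (narrow_klass << narrow_klass_shift) + narrow_klass_base.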
3369 
3370 void MacroAssembler::load_klass(Register dst, Register src) {
3371   if (UseCompressedClassPointers) {
3372     lwz(dst, oopDesc::klass_offset_in_bytes(), src);
3373     // Attention: no null check here!
3374     decode_klass_not_null(dst, dst);
3375   } else {
3376     ld(dst, oopDesc::klass_offset_in_bytes(), src);
3377   }
3378 }
3379 
3380 // ((OopHandle)result).resolve();
3381 void MacroAssembler::resolve_oop_handle(Register result) {
3382   // OopHandle::resolve is an indirection.
3383   ld(result, 0, result);
3384 }
3385 
3386 void MacroAssembler::load_mirror_from_const_method(Register mirror, Register const_method) {
3387   ld(mirror, in_bytes(ConstMethod::constants_offset()), const_method);
3388   ld(mirror, ConstantPool::pool_holder_offset_in_bytes(), mirror);
3389   ld(mirror, in_bytes(Klass::java_mirror_offset()), mirror);
3390   resolve_oop_handle(mirror);
3391 }
3392 
3393 // Clear Array
3394 // For very short arrays. tmp == R0 is allowed.
3395 void MacroAssembler::clear_memory_unrolled(Register base_ptr, int cnt_dwords, Register tmp, int offset) {
3396   if (cnt_dwords > 0) { li(tmp, 0); }
3397   for (int i = 0; i < cnt_dwords; ++i) { std(tmp, offset + i * 8, base_ptr); }
3398 }
3399 
3400 // Version for constant short array length. Kills base_ptr. tmp == R0 is allowed.
3401 void MacroAssembler::clear_memory_constlen(Register base_ptr, int cnt_dwords, Register tmp) {
3402   if (cnt_dwords < 8) {
3403     clear_memory_unrolled(base_ptr, cnt_dwords, tmp);
3404     return;
3405   }
3406 
3407   Label loop;
3408   const long loopcnt   = cnt_dwords >> 1,
3409              remainder = cnt_dwords & 1;
3410 
3411   li(tmp, loopcnt);
3412   mtctr(tmp);
3413   li(tmp, 0);
3414   bind(loop);
3415     std(tmp, 0, base_ptr);
3416     std(tmp, 8, base_ptr);
3417     addi(base_ptr, base_ptr, 16);
3418     bdnz(loop);
3419   if (remainder) { std(tmp, 0, base_ptr); }
3420 }
3421 
3422 // Kills both input registers. tmp == R0 is allowed.
3423 void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp, long const_cnt) {
3424   // Procedure for large arrays (uses data cache block zero instruction).
3425     Label startloop, fast, fastloop, small_rest, restloop, done;
3426     const int cl_size         = VM_Version::L1_data_cache_line_size(),
3427               cl_dwords       = cl_size >> 3,
3428               cl_dw_addr_bits = exact_log2(cl_dwords),
3429               dcbz_min        = 1,  // Min count of dcbz executions, needs to be >0.
3430               min_cnt         = ((dcbz_min + 1) << cl_dw_addr_bits) - 1;
3431 
3432   if (const_cnt >= 0) {
3433     // Constant case.
3434     if (const_cnt < min_cnt) {
3435       clear_memory_constlen(base_ptr, const_cnt, tmp);
3436       return;
3437     }
3438     load_const_optimized(cnt_dwords, const_cnt, tmp);
3439   } else {
3440     // cnt_dwords already loaded in register. Need to check size.
3441     cmpdi(CCR1, cnt_dwords, min_cnt); // Big enough? (ensure >= dcbz_min lines included).
3442     blt(CCR1, small_rest);
3443   }
3444     rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits); // Extract dword offset within first cache line.
3445     beq(CCR0, fast);                                  // Already 128byte aligned.
3446 
3447     subfic(tmp, tmp, cl_dwords);
3448     mtctr(tmp);                        // Set ctr to hit 128byte boundary (0<ctr<cl_dwords).
3449     subf(cnt_dwords, tmp, cnt_dwords); // rest.
3450     li(tmp, 0);
3451 
3452   bind(startloop);                     // Clear at the beginning to reach 128byte boundary.
3453     std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
3454     addi(base_ptr, base_ptr, 8);
3455     bdnz(startloop);
3456 
3457   bind(fast);                                  // Clear 128byte blocks.
3458     srdi(tmp, cnt_dwords, cl_dw_addr_bits);    // Loop count for 128byte loop (>0).
3459     andi(cnt_dwords, cnt_dwords, cl_dwords-1); // Rest in dwords.
3460     mtctr(tmp);                                // Load counter.
3461 
3462   bind(fastloop);
3463     dcbz(base_ptr);                    // Clear 128byte aligned block.
3464     addi(base_ptr, base_ptr, cl_size);
3465     bdnz(fastloop);
3466 
3467   bind(small_rest);
3468     cmpdi(CCR0, cnt_dwords, 0);        // size 0?
3469     beq(CCR0, done);                   // rest == 0
3470     li(tmp, 0);
3471     mtctr(cnt_dwords);                 // Load counter.
3472 
3473   bind(restloop);                      // Clear rest.
3474     std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
3475     addi(base_ptr, base_ptr, 8);
3476     bdnz(restloop);
3477 
3478   bind(done);
3479 }
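
// The three phases above, in outline (conceptual):
//   1. Clear doublewords one at a time until base_ptr is cache-line (cl_size) aligned.
//   2. Clear whole cache lines with dcbz (data cache block zero).
//   3. Clear the remaining (< cl_dwords) doublewords individually.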
3480 
3481 /////////////////////////////////////////// String intrinsics ////////////////////////////////////////////
3482 
3483 #ifdef COMPILER2
3484 // Intrinsics for CompactStrings
3485 
3486 // Compress char[] to byte[] by compressing 16 bytes at once.
3487 void MacroAssembler::string_compress_16(Register src, Register dst, Register cnt,
3488                                         Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5,
3489                                         Label& Lfailure) {
3490 
3491   const Register tmp0 = R0;
3492   assert_different_registers(src, dst, cnt, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5);
3493   Label Lloop, Lslow;
3494 
3495   // Check if cnt >= 8 (= 16 bytes)
3496   lis(tmp1, 0xFF);                // tmp1 = 0x00FF00FF00FF00FF
3497   srwi_(tmp2, cnt, 3);
3498   beq(CCR0, Lslow);
3499   ori(tmp1, tmp1, 0xFF);
3500   rldimi(tmp1, tmp1, 32, 0);
3501   mtctr(tmp2);
3502 
3503   // 2x unrolled loop
3504   bind(Lloop);
3505   ld(tmp2, 0, src);               // _0_1_2_3 (Big Endian)
3506   ld(tmp4, 8, src);               // _4_5_6_7
3507 
3508   orr(tmp0, tmp2, tmp4);
3509   rldicl(tmp3, tmp2, 6*8, 64-24); // _____1_2
3510   rldimi(tmp2, tmp2, 2*8, 2*8);   // _0_2_3_3
3511   rldicl(tmp5, tmp4, 6*8, 64-24); // _____5_6
3512   rldimi(tmp4, tmp4, 2*8, 2*8);   // _4_6_7_7
3513 
3514   andc_(tmp0, tmp0, tmp1);
3515   bne(CCR0, Lfailure);            // Not latin1.
3516   addi(src, src, 16);
3517 
3518   rlwimi(tmp3, tmp2, 0*8, 24, 31);// _____1_3
3519   srdi(tmp2, tmp2, 3*8);          // ____0_2_
3520   rlwimi(tmp5, tmp4, 0*8, 24, 31);// _____5_7
3521   srdi(tmp4, tmp4, 3*8);          // ____4_6_
3522 
3523   orr(tmp2, tmp2, tmp3);          // ____0123
3524   orr(tmp4, tmp4, tmp5);          // ____4567
3525 
3526   stw(tmp2, 0, dst);
3527   stw(tmp4, 4, dst);
3528   addi(dst, dst, 8);
3529   bdnz(Lloop);
3530 
3531   bind(Lslow);                    // Fallback to slow version
3532 }
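
// Scalar equivalent of one unrolled iteration above (conceptual; the real code packs
// the 8 chars with rotate-and-insert instructions instead of a byte loop):
//
//   if (any of src[0..7] > 0xFF) goto Lfailure;   // whole block checked before storing
//   for (int i = 0; i < 8; i++) dst[i] = (jbyte)src[i];
//   src += 8; dst += 8;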
3533 
3534 // Compress char[] to byte[]. cnt must be positive int.
3535 void MacroAssembler::string_compress(Register src, Register dst, Register cnt, Register tmp, Label& Lfailure) {
3536   Label Lloop;
3537   mtctr(cnt);
3538 
3539   bind(Lloop);
3540   lhz(tmp, 0, src);
3541   cmplwi(CCR0, tmp, 0xff);
3542   bgt(CCR0, Lfailure);            // Not latin1.
3543   addi(src, src, 2);
3544   stb(tmp, 0, dst);
3545   addi(dst, dst, 1);
3546   bdnz(Lloop);
3547 }
3548 
3549 // Inflate byte[] to char[] by inflating 16 bytes at once.
3550 void MacroAssembler::string_inflate_16(Register src, Register dst, Register cnt,
3551                                        Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) {
3552   const Register tmp0 = R0;
3553   assert_different_registers(src, dst, cnt, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5);
3554   Label Lloop, Lslow;
3555 
3556   // Check if cnt >= 8
3557   srwi_(tmp2, cnt, 3);
3558   beq(CCR0, Lslow);
3559   lis(tmp1, 0xFF);                // tmp1 = 0x00FF00FF
3560   ori(tmp1, tmp1, 0xFF);
3561   mtctr(tmp2);
3562 
3563   // 2x unrolled loop
3564   bind(Lloop);
3565   lwz(tmp2, 0, src);              // ____0123 (Big Endian)
3566   lwz(tmp4, 4, src);              // ____4567
3567   addi(src, src, 8);
3568 
3569   rldicl(tmp3, tmp2, 7*8, 64-8);  // _______2
3570   rlwimi(tmp2, tmp2, 3*8, 16, 23);// ____0113
3571   rldicl(tmp5, tmp4, 7*8, 64-8);  // _______6
3572   rlwimi(tmp4, tmp4, 3*8, 16, 23);// ____4557
3573 
3574   andc(tmp0, tmp2, tmp1);         // ____0_1_
3575   rlwimi(tmp2, tmp3, 2*8, 0, 23); // _____2_3
3576   andc(tmp3, tmp4, tmp1);         // ____4_5_
3577   rlwimi(tmp4, tmp5, 2*8, 0, 23); // _____6_7
3578 
3579   rldimi(tmp2, tmp0, 3*8, 0*8);   // _0_1_2_3
3580   rldimi(tmp4, tmp3, 3*8, 0*8);   // _4_5_6_7
3581 
3582   std(tmp2, 0, dst);
3583   std(tmp4, 8, dst);
3584   addi(dst, dst, 16);
3585   bdnz(Lloop);
3586 
3587   bind(Lslow);                    // Fallback to slow version
3588 }
3589 
3590 // Inflate byte[] to char[]. cnt must be positive int.
3591 void MacroAssembler::string_inflate(Register src, Register dst, Register cnt, Register tmp) {
3592   Label Lloop;
3593   mtctr(cnt);
3594 
3595   bind(Lloop);
3596   lbz(tmp, 0, src);
3597   addi(src, src, 1);
3598   sth(tmp, 0, dst);
3599   addi(dst, dst, 2);
3600   bdnz(Lloop);
3601 }
3602 
3603 void MacroAssembler::string_compare(Register str1, Register str2,
3604                                     Register cnt1, Register cnt2,
3605                                     Register tmp1, Register result, int ae) {
3606   const Register tmp0 = R0,
3607                  diff = tmp1;
3608 
3609   assert_different_registers(str1, str2, cnt1, cnt2, tmp0, tmp1, result);
3610   Label Ldone, Lslow, Lloop, Lreturn_diff;
3611 
3612   // Note: Making use of the fact that compareTo(a, b) == -compareTo(b, a)
3613   // we interchange str1 and str2 in the UL case and negate the result.
3614   // Like this, str1 is always latin1 encoded, except for the UU case.
3615   // In addition, the counts need to be zero-extended (the sign bit is 0, so sign extension would be equivalent).
3616 
3617   if (ae == StrIntrinsicNode::UU) {
3618     srwi(cnt1, cnt1, 1);
3619   } else {
3620     clrldi(cnt1, cnt1, 32);
3621   }
3622 
3623   if (ae != StrIntrinsicNode::LL) {
3624     srwi(cnt2, cnt2, 1);
3625   } else {
3626     clrldi(cnt2, cnt2, 32);
3627   }
3628 
3629   // See if the lengths are different, and calculate min in cnt1.
3630   // Save diff in case we need it for a tie-breaker.
3631   subf_(diff, cnt2, cnt1); // diff = cnt1 - cnt2
3632   // if (diff > 0) { cnt1 = cnt2; }
3633   if (VM_Version::has_isel()) {
3634     isel(cnt1, CCR0, Assembler::greater, /*invert*/ false, cnt2);
3635   } else {
3636     Label Lskip;
3637     blt(CCR0, Lskip);
3638     mr(cnt1, cnt2);
3639     bind(Lskip);
3640   }
3641 
3642   // Rename registers
3643   Register chr1 = result;
3644   Register chr2 = tmp0;
3645 
3646   // Compare multiple characters in fast loop (only implemented for same encoding).
3647   int stride1 = 8, stride2 = 8;
3648   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3649     int log2_chars_per_iter = (ae == StrIntrinsicNode::LL) ? 3 : 2;
3650     Label Lfastloop, Lskipfast;
3651 
3652     srwi_(tmp0, cnt1, log2_chars_per_iter);
3653     beq(CCR0, Lskipfast);
3654     rldicl(cnt2, cnt1, 0, 64 - log2_chars_per_iter); // Remaining characters.
3655     li(cnt1, 1 << log2_chars_per_iter); // Initialize for failure case: Rescan characters from current iteration.
3656     mtctr(tmp0);
3657 
3658     bind(Lfastloop);
3659     ld(chr1, 0, str1);
3660     ld(chr2, 0, str2);
3661     cmpd(CCR0, chr1, chr2);
3662     bne(CCR0, Lslow);
3663     addi(str1, str1, stride1);
3664     addi(str2, str2, stride2);
3665     bdnz(Lfastloop);
3666     mr(cnt1, cnt2); // Remaining characters.
3667     bind(Lskipfast);
3668   }
3669 
3670   // Loop which searches the first difference character by character.
3671   cmpwi(CCR0, cnt1, 0);
3672   beq(CCR0, Lreturn_diff);
3673   bind(Lslow);
3674   mtctr(cnt1);
3675 
3676   switch (ae) {
3677     case StrIntrinsicNode::LL: stride1 = 1; stride2 = 1; break;
3678     case StrIntrinsicNode::UL: // fallthru (see comment above)
3679     case StrIntrinsicNode::LU: stride1 = 1; stride2 = 2; break;
3680     case StrIntrinsicNode::UU: stride1 = 2; stride2 = 2; break;
3681     default: ShouldNotReachHere(); break;
3682   }
3683 
3684   bind(Lloop);
3685   if (stride1 == 1) { lbz(chr1, 0, str1); } else { lhz(chr1, 0, str1); }
3686   if (stride2 == 1) { lbz(chr2, 0, str2); } else { lhz(chr2, 0, str2); }
3687   subf_(result, chr2, chr1); // result = chr1 - chr2
3688   bne(CCR0, Ldone);
3689   addi(str1, str1, stride1);
3690   addi(str2, str2, stride2);
3691   bdnz(Lloop);
3692 
3693   // If strings are equal up to min length, return the length difference.
3694   bind(Lreturn_diff);
3695   mr(result, diff);
3696 
3697   // Otherwise, return the difference between the first mismatched chars.
3698   bind(Ldone);
3699   if (ae == StrIntrinsicNode::UL) {
3700     neg(result, result); // Negate result (see note above).
3701   }
3702 }
3703 
3704 void MacroAssembler::array_equals(bool is_array_equ, Register ary1, Register ary2,
3705                                   Register limit, Register tmp1, Register result, bool is_byte) {
3706   const Register tmp0 = R0;
3707   assert_different_registers(ary1, ary2, limit, tmp0, tmp1, result);
3708   Label Ldone, Lskiploop, Lloop, Lfastloop, Lskipfast;
3709   bool limit_needs_shift = false;
3710 
3711   if (is_array_equ) {
3712     const int length_offset = arrayOopDesc::length_offset_in_bytes();
3713     const int base_offset   = arrayOopDesc::base_offset_in_bytes(is_byte ? T_BYTE : T_CHAR);
3714 
3715     // Return true if the same array.
3716     cmpd(CCR0, ary1, ary2);
3717     beq(CCR0, Lskiploop);
3718 
3719     // Return false if one of them is NULL.
3720     cmpdi(CCR0, ary1, 0);
3721     cmpdi(CCR1, ary2, 0);
3722     li(result, 0);
3723     cror(CCR0, Assembler::equal, CCR1, Assembler::equal);
3724     beq(CCR0, Ldone);
3725 
3726     // Load the lengths of arrays.
3727     lwz(limit, length_offset, ary1);
3728     lwz(tmp0, length_offset, ary2);
3729 
3730     // Return false if the two arrays are not equal length.
3731     cmpw(CCR0, limit, tmp0);
3732     bne(CCR0, Ldone);
3733 
3734     // Load array addresses.
3735     addi(ary1, ary1, base_offset);
3736     addi(ary2, ary2, base_offset);
3737   } else {
3738     limit_needs_shift = !is_byte;
3739     li(result, 0); // Assume not equal.
3740   }
3741 
3742   // Rename registers
3743   Register chr1 = tmp0;
3744   Register chr2 = tmp1;
3745 
3746   // Compare 8 bytes per iteration in fast loop.
3747   const int log2_chars_per_iter = is_byte ? 3 : 2;
3748 
3749   srwi_(tmp0, limit, log2_chars_per_iter + (limit_needs_shift ? 1 : 0));
3750   beq(CCR0, Lskipfast);
3751   mtctr(tmp0);
3752 
3753   bind(Lfastloop);
3754   ld(chr1, 0, ary1);
3755   ld(chr2, 0, ary2);
3756   addi(ary1, ary1, 8);
3757   addi(ary2, ary2, 8);
3758   cmpd(CCR0, chr1, chr2);
3759   bne(CCR0, Ldone);
3760   bdnz(Lfastloop);
3761 
3762   bind(Lskipfast);
3763   rldicl_(limit, limit, limit_needs_shift ? 64 - 1 : 0, 64 - log2_chars_per_iter); // Remaining characters.
3764   beq(CCR0, Lskiploop);
3765   mtctr(limit);
3766 
3767   // Character by character.
3768   bind(Lloop);
3769   if (is_byte) {
3770     lbz(chr1, 0, ary1);
3771     lbz(chr2, 0, ary2);
3772     addi(ary1, ary1, 1);
3773     addi(ary2, ary2, 1);
3774   } else {
3775     lhz(chr1, 0, ary1);
3776     lhz(chr2, 0, ary2);
3777     addi(ary1, ary1, 2);
3778     addi(ary2, ary2, 2);
3779   }
3780   cmpw(CCR0, chr1, chr2);
3781   bne(CCR0, Ldone);
3782   bdnz(Lloop);
3783 
3784   bind(Lskiploop);
3785   li(result, 1); // All characters are equal.
3786   bind(Ldone);
3787 }
3788 
3789 void MacroAssembler::string_indexof(Register result, Register haystack, Register haycnt,
3790                                     Register needle, ciTypeArray* needle_values, Register needlecnt, int needlecntval,
3791                                     Register tmp1, Register tmp2, Register tmp3, Register tmp4, int ae) {
3792 
3793   // Ensure 0<needlecnt<=haycnt in ideal graph as prerequisite!
3794   Label L_TooShort, L_Found, L_NotFound, L_End;
3795   Register last_addr = haycnt, // Kill haycnt at the beginning.
3796   addr      = tmp1,
3797   n_start   = tmp2,
3798   ch1       = tmp3,
3799   ch2       = R0;
3800 
3801   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
3802   const int h_csize = (ae == StrIntrinsicNode::LL) ? 1 : 2;
3803   const int n_csize = (ae == StrIntrinsicNode::UU) ? 2 : 1;
3804 
3805   // **************************************************************************************************
3806   // Prepare for main loop: optimized for needle count >=2, bail out otherwise.
3807   // **************************************************************************************************
3808 
3809   // Compute last haystack addr to use if no match gets found.
3810   clrldi(haycnt, haycnt, 32);         // Ensure positive int is valid as 64 bit value.
3811   addi(addr, haystack, -h_csize);     // Accesses use pre-increment.
3812   if (needlecntval == 0) { // variable needlecnt
3813    cmpwi(CCR6, needlecnt, 2);
3814    clrldi(needlecnt, needlecnt, 32);  // Ensure positive int is valid as 64 bit value.
3815    blt(CCR6, L_TooShort);             // Variable needlecnt: handle short needle separately.
3816   }
3817 
3818   if (n_csize == 2) { lwz(n_start, 0, needle); } else { lhz(n_start, 0, needle); } // Load first 2 characters of needle.
3819 
3820   if (needlecntval == 0) { // variable needlecnt
3821    subf(ch1, needlecnt, haycnt);      // Last character index to compare is haycnt-needlecnt.
3822    addi(needlecnt, needlecnt, -2);    // Rest of needle.
3823   } else { // constant needlecnt
3824   guarantee(needlecntval != 1, "IndexOf with single-character needle must be handled separately");
3825   assert((needlecntval & 0x7fff) == needlecntval, "wrong immediate");
3826    addi(ch1, haycnt, -needlecntval);  // Last character index to compare is haycnt-needlecnt.
3827    if (needlecntval > 3) { li(needlecnt, needlecntval - 2); } // Rest of needle.
3828   }
3829 
3830   if (h_csize == 2) { slwi(ch1, ch1, 1); } // Scale to number of bytes.
3831 
3832   if (ae == StrIntrinsicNode::UL) {
3833    srwi(tmp4, n_start, 1*8);          // ___0
3834    rlwimi(n_start, tmp4, 2*8, 0, 23); // _0_1
3835   }
3836 
3837   add(last_addr, haystack, ch1);      // Point to last address to compare (haystack+2*(haycnt-needlecnt)).
3838 
3839   // Main Loop (now we have at least 2 characters).
3840   Label L_OuterLoop, L_InnerLoop, L_FinalCheck, L_Comp1, L_Comp2;
3841   bind(L_OuterLoop); // Search for 1st 2 characters.
3842   Register addr_diff = tmp4;
3843    subf(addr_diff, addr, last_addr);  // Difference between already checked address and last address to check.
3844    addi(addr, addr, h_csize);         // This is the new address we want to use for comparing.
3845    srdi_(ch2, addr_diff, h_csize);
3846    beq(CCR0, L_FinalCheck);           // 2 characters left?
3847    mtctr(ch2);                        // num of characters / 2
3848   bind(L_InnerLoop);                  // Main work horse (2x unrolled search loop)
3849    if (h_csize == 2) {                // Load 2 characters of haystack (ignore alignment).
3850     lwz(ch1, 0, addr);
3851     lwz(ch2, 2, addr);
3852    } else {
3853     lhz(ch1, 0, addr);
3854     lhz(ch2, 1, addr);
3855    }
3856    cmpw(CCR0, ch1, n_start);          // Compare 2 characters (1 would be sufficient but try to reduce branches to CompLoop).
3857    cmpw(CCR1, ch2, n_start);
3858    beq(CCR0, L_Comp1);                // Did we find the needle start?
3859    beq(CCR1, L_Comp2);
3860    addi(addr, addr, 2 * h_csize);
3861    bdnz(L_InnerLoop);
3862   bind(L_FinalCheck);
3863    andi_(addr_diff, addr_diff, h_csize); // Remaining characters not covered by InnerLoop: (num of characters) & 1.
3864    beq(CCR0, L_NotFound);
3865    if (h_csize == 2) { lwz(ch1, 0, addr); } else { lhz(ch1, 0, addr); } // One position left at which we have to compare.
3866    cmpw(CCR1, ch1, n_start);
3867    beq(CCR1, L_Comp1);
3868   bind(L_NotFound);
3869    li(result, -1);                    // not found
3870    b(L_End);
3871 
3872    // **************************************************************************************************
3873    // Special Case: unfortunately, the variable needle case can be called with needlecnt<2
3874    // **************************************************************************************************
3875   if (needlecntval == 0) {           // We have to handle these cases separately.
3876   Label L_OneCharLoop;
3877   bind(L_TooShort);
3878    mtctr(haycnt);
3879    if (n_csize == 2) { lhz(n_start, 0, needle); } else { lbz(n_start, 0, needle); } // First character of needle
3880   bind(L_OneCharLoop);
3881    if (h_csize == 2) { lhzu(ch1, 2, addr); } else { lbzu(ch1, 1, addr); }
3882    cmpw(CCR1, ch1, n_start);
3883    beq(CCR1, L_Found);               // Did we find the one character needle?
3884    bdnz(L_OneCharLoop);
3885    li(result, -1);                   // Not found.
3886    b(L_End);
3887   }
3888 
3889   // **************************************************************************************************
3890   // Regular Case Part II: compare rest of needle (first 2 characters have been compared already)
3891   // **************************************************************************************************
3892 
3893   // Compare the rest
3894   bind(L_Comp2);
3895    addi(addr, addr, h_csize);        // First comparison has failed, 2nd one hit.
3896   bind(L_Comp1);                     // Addr points to possible needle start.
3897   if (needlecntval != 2) {           // Const needlecnt==2?
3898    if (needlecntval != 3) {
3899     if (needlecntval == 0) { beq(CCR6, L_Found); } // Variable needlecnt==2?
3900     Register n_ind = tmp4,
3901              h_ind = n_ind;
3902     li(n_ind, 2 * n_csize);          // First 2 characters are already compared, use index 2.
3903     mtctr(needlecnt);                // Decremented by 2, still > 0.
3904    Label L_CompLoop;
3905    bind(L_CompLoop);
3906     if (ae == StrIntrinsicNode::UL) {
3907       h_ind = ch1;
3908       sldi(h_ind, n_ind, 1);
3909     }
3910     if (n_csize == 2) { lhzx(ch2, needle, n_ind); } else { lbzx(ch2, needle, n_ind); }
3911     if (h_csize == 2) { lhzx(ch1, addr, h_ind); } else { lbzx(ch1, addr, h_ind); }
3912     cmpw(CCR1, ch1, ch2);
3913     bne(CCR1, L_OuterLoop);
3914     addi(n_ind, n_ind, n_csize);
3915     bdnz(L_CompLoop);
3916    } else { // No loop required if there's only one needle character left.
3917     if (n_csize == 2) { lhz(ch2, 2 * 2, needle); } else { lbz(ch2, 2 * 1, needle); }
3918     if (h_csize == 2) { lhz(ch1, 2 * 2, addr); } else { lbz(ch1, 2 * 1, addr); }
3919     cmpw(CCR1, ch1, ch2);
3920     bne(CCR1, L_OuterLoop);
3921    }
3922   }
3923   // Return index ...
3924   bind(L_Found);
3925    subf(result, haystack, addr);     // relative to haystack, ...
3926    if (h_csize == 2) { srdi(result, result, 1); } // in characters.
3927   bind(L_End);
3928 } // string_indexof
3929 
3930 void MacroAssembler::string_indexof_char(Register result, Register haystack, Register haycnt,
3931                                          Register needle, jchar needleChar, Register tmp1, Register tmp2, bool is_byte) {
3932   assert_different_registers(haystack, haycnt, needle, tmp1, tmp2);
3933 
3934   Label L_InnerLoop, L_FinalCheck, L_Found1, L_Found2, L_NotFound, L_End;
3935   Register addr = tmp1,
3936            ch1 = tmp2,
3937            ch2 = R0;
3938 
3939   const int h_csize = is_byte ? 1 : 2;
3940 
3941 //4:
3942    srwi_(tmp2, haycnt, 1);   // Shift right by exact_log2(UNROLL_FACTOR).
3943    mr(addr, haystack);
3944    beq(CCR0, L_FinalCheck);
3945    mtctr(tmp2);              // Move to count register.
3946 //8:
3947   bind(L_InnerLoop);         // Main work horse (2x unrolled search loop).
3948    if (!is_byte) {
3949     lhz(ch1, 0, addr);
3950     lhz(ch2, 2, addr);
3951    } else {
3952     lbz(ch1, 0, addr);
3953     lbz(ch2, 1, addr);
3954    }
3955    (needle != R0) ? cmpw(CCR0, ch1, needle) : cmplwi(CCR0, ch1, (unsigned int)needleChar);
3956    (needle != R0) ? cmpw(CCR1, ch2, needle) : cmplwi(CCR1, ch2, (unsigned int)needleChar);
3957    beq(CCR0, L_Found1);      // Did we find the needle?
3958    beq(CCR1, L_Found2);
3959    addi(addr, addr, 2 * h_csize);
3960    bdnz(L_InnerLoop);
3961 //16:
3962   bind(L_FinalCheck);
3963    andi_(R0, haycnt, 1);
3964    beq(CCR0, L_NotFound);
3965    if (!is_byte) { lhz(ch1, 0, addr); } else { lbz(ch1, 0, addr); } // One position left at which we have to compare.
3966    (needle != R0) ? cmpw(CCR1, ch1, needle) : cmplwi(CCR1, ch1, (unsigned int)needleChar);
3967    beq(CCR1, L_Found1);
3968 //21:
3969   bind(L_NotFound);
3970    li(result, -1);           // Not found.
3971    b(L_End);
3972 
3973   bind(L_Found2);
3974    addi(addr, addr, h_csize);
3975 //24:
3976   bind(L_Found1);            // Return index ...
3977    subf(result, haystack, addr); // relative to haystack, ...
3978    if (!is_byte) { srdi(result, result, 1); } // in characters.
3979   bind(L_End);
3980 } // string_indexof_char
3981 
3982 
3983 void MacroAssembler::has_negatives(Register src, Register cnt, Register result,
3984                                    Register tmp1, Register tmp2) {
3985   const Register tmp0 = R0;
3986   assert_different_registers(src, result, cnt, tmp0, tmp1, tmp2);
3987   Label Lfastloop, Lslow, Lloop, Lnoneg, Ldone;
3988 
3989   // Check if cnt >= 16 (bytes)
3990   lis(tmp1, (int)(short)0x8080);  // tmp1 = 0x8080808080808080
3991   srwi_(tmp2, cnt, 4);
3992   li(result, 1);                  // Assume there's a negative byte.
3993   beq(CCR0, Lslow);
3994   ori(tmp1, tmp1, 0x8080);
3995   rldimi(tmp1, tmp1, 32, 0);
3996   mtctr(tmp2);
3997 
3998   // 2x unrolled loop
3999   bind(Lfastloop);
4000   ld(tmp2, 0, src);
4001   ld(tmp0, 8, src);
4002 
4003   orr(tmp0, tmp2, tmp0);
4004 
4005   and_(tmp0, tmp0, tmp1);
4006   bne(CCR0, Ldone);               // Found negative byte.
4007   addi(src, src, 16);
4008 
4009   bdnz(Lfastloop);
4010 
4011   bind(Lslow);                    // Fallback to slow version
4012   rldicl_(tmp0, cnt, 0, 64-4);
4013   beq(CCR0, Lnoneg);
4014   mtctr(tmp0);
4015   bind(Lloop);
4016   lbz(tmp0, 0, src);
4017   addi(src, src, 1);
4018   andi_(tmp0, tmp0, 0x80);
4019   bne(CCR0, Ldone);               // Found negative byte.
4020   bdnz(Lloop);
4021   bind(Lnoneg);
4022   li(result, 0);
4023 
4024   bind(Ldone);
4025 }
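
// For reference, a minimal C sketch of the algorithm above (illustrative only):
//   bool has_negatives(const int8_t* src, size_t cnt) {
//     const uint64_t mask = 0x8080808080808080ULL;   // sign bit of every byte
//     size_t i = 0;
//     for (; i + 16 <= cnt; i += 16) {               // fast path: 2 x 8 bytes per iteration
//       uint64_t a, b;
//       memcpy(&a, src + i,     8);
//       memcpy(&b, src + i + 8, 8);
//       if ((a | b) & mask) return true;             // some byte has bit 7 set
//     }
//     for (; i < cnt; i++) {                         // slow path: remaining 0..15 bytes
//       if (src[i] & 0x80) return true;
//     }
//     return false;
//   }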
4026 
4027 #endif // COMPILER2
4028 
4029 // Helpers for Intrinsic Emitters
4030 //
4031 // Reverse the byte order of a 32-bit value in a register
4032 //   src: 0x44556677
4033 //   dst: 0x77665544
4034 // Three steps to obtain the result:
4035 //  1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word
4036 //     into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared.
4037 //     This value initializes dst.
4038 //  2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost
4039 //     byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go.
4040 //     This value is mask inserted into dst with a [0..23] mask of 1s.
4041 //  3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position.
4042 //     This value is mask inserted into dst with a [8..15] mask of 1s.
4043 void MacroAssembler::load_reverse_32(Register dst, Register src) {
4044   assert_different_registers(dst, src);
4045 
4046   rldicl(dst, src, (4+1)*8, 56);       // Rotate byte 4 into position 7 (rightmost), clear all to the left.
4047   rlwimi(dst, src,     3*8,  0, 23);   // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone.
4048   rlwimi(dst, src,     1*8,  8, 15);   // Insert byte 6 into position 5, leave the rest alone.
4049 }
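
// The net effect is a plain 32-bit byte swap; as a C sketch (illustrative only):
//   uint32_t reverse32(uint32_t src) {
//     return ((src & 0x000000ff) << 24) |
//            ((src & 0x0000ff00) <<  8) |
//            ((src & 0x00ff0000) >>  8) |
//            ((src & 0xff000000) >> 24);    // e.g. 0x44556677 -> 0x77665544
//   }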
4050 
4051 // Calculate the column addresses of the crc32 lookup table into distinct registers.
4052 // This loop-invariant calculation is moved out of the loop body, reducing the loop
4053 // body size from 20 to 16 instructions.
4054 // Returns the offset that was used to calculate the address of column tc3.
4055 // Due to register shortage, setting tc3 may overwrite table. With the return offset
4056 // at hand, the original table address can be easily reconstructed.
4057 int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) {
4058 
4059 #ifdef VM_LITTLE_ENDIAN
4060   // This is what we implement (the DOLIT4 part):
4061   // =========================================================================
4062   // #define DOLIT4 c ^= *buf4++; \
4063   //         c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^ \
4064   //             crc_table[1][(c >> 16) & 0xff] ^ crc_table[0][c >> 24]
4065   // #define DOLIT32 DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4
4066   // =========================================================================
4067   const int ix0 = 3*(4*CRC32_COLUMN_SIZE);
4068   const int ix1 = 2*(4*CRC32_COLUMN_SIZE);
4069   const int ix2 = 1*(4*CRC32_COLUMN_SIZE);
4070   const int ix3 = 0*(4*CRC32_COLUMN_SIZE);
4071 #else
4072   // This is what we implement (the DOBIG4 part):
4073   // =========================================================================
4074   // #define DOBIG4 c ^= *++buf4; \
4075   //         c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \
4076   //             crc_table[6][(c >> 16) & 0xff] ^ crc_table[7][c >> 24]
4077   // #define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4
4078   // =========================================================================
4079   const int ix0 = 4*(4*CRC32_COLUMN_SIZE);
4080   const int ix1 = 5*(4*CRC32_COLUMN_SIZE);
4081   const int ix2 = 6*(4*CRC32_COLUMN_SIZE);
4082   const int ix3 = 7*(4*CRC32_COLUMN_SIZE);
4083 #endif
4084   assert_different_registers(table, tc0, tc1, tc2);
4085   assert(table == tc3, "must be!");
4086 
4087   addi(tc0, table, ix0);
4088   addi(tc1, table, ix1);
4089   addi(tc2, table, ix2);
4090   if (ix3 != 0) addi(tc3, table, ix3);
4091 
4092   return ix3;
4093 }
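
// Typical use, mirroring the CRC kernels below (a sketch): the returned offset
// lets the caller restore the table address that tc3 may have overwritten.
//   int off = crc32_table_columns(table, tc0, tc1, tc2, tc3);
//   ... main loop indexes off tc0..tc3 ...
//   if (off != 0) addi(table, table, -off);   // reconstruct original table address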
4094 
4095 /**
4096  * uint32_t crc;
4097  * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
4098  */
4099 void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) {
4100   assert_different_registers(crc, table, tmp);
4101   assert_different_registers(val, table);
4102 
4103   if (crc == val) {                   // Must rotate first to use the unmodified value.
4104     rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
4105                                       // As we use a word (4-byte) instruction, we have to adapt the mask bit positions.
4106     srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
4107   } else {
4108     srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
4109     rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
4110   }
4111   lwzx(tmp, table, tmp);
4112   xorr(crc, crc, tmp);
4113 }
4114 
4115 /**
4116  * uint32_t crc;
4117  * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
4118  */
4119 void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
4120   fold_byte_crc32(crc, crc, table, tmp);
4121 }
4122 
4123 /**
4124  * Emits code to update CRC-32 with a byte value according to constants in table.
4125  *
4126  * @param [in,out]crc   Register containing the crc.
4127  * @param [in]val       Register containing the byte to fold into the CRC.
4128  * @param [in]table     Register containing the table of crc constants.
4129  *
4130  * uint32_t crc;
4131  * val = crc_table[(val ^ crc) & 0xFF];
4132  * crc = val ^ (crc >> 8);
4133  */
4134 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
4135   BLOCK_COMMENT("update_byte_crc32:");
4136   xorr(val, val, crc);
4137   fold_byte_crc32(crc, val, table, val);
4138 }
4139 
4140 /**
4141  * @param crc   register containing existing CRC (32-bit)
4142  * @param buf   register pointing to input byte buffer (byte*)
4143  * @param len   register containing number of bytes
4144  * @param table register pointing to CRC table
4145  */
4146 void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table,
4147                                            Register data, bool loopAlignment) {
4148   assert_different_registers(crc, buf, len, table, data);
4149 
4150   Label L_mainLoop, L_done;
4151   const int mainLoop_stepping  = 1;
4152   const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4;
4153 
4154   // Process all bytes in a single-byte loop.
4155   clrldi_(len, len, 32);                         // Enforce 32 bit. Anything to do?
4156   beq(CCR0, L_done);
4157 
4158   mtctr(len);
4159   align(mainLoop_alignment);
4160   BIND(L_mainLoop);
4161     lbz(data, 0, buf);                           // Byte from buffer, zero-extended.
4162     addi(buf, buf, mainLoop_stepping);           // Advance buffer position.
4163     update_byte_crc32(crc, data, table);
4164     bdnz(L_mainLoop);                            // Iterate.
4165 
4166   bind(L_done);
4167 }
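
// For reference, the byte-wise loop emitted above corresponds to the standard
// table-driven CRC-32 update; a C sketch (crc is assumed to be pre-inverted by
// the caller where required; illustrative only):
//   uint32_t crc32_bytes(uint32_t crc, const uint8_t* buf, uint32_t len,
//                        const uint32_t table[256]) {
//     while (len--) {
//       crc = table[(crc ^ *buf++) & 0xff] ^ (crc >> 8);
//     }
//     return crc;
//   }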
4168 
4169 /**
4170  * Emits code to update CRC-32 with a 4-byte value according to constants in table
4171  * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c
4172  */
4173 // A note on the lookup table address(es):
4174 // The lookup table consists of two sets of four columns each.
4175 // The columns {0..3} are used for little-endian machines.
4176 // The columns {4..7} are used for big-endian machines.
4177 // To save the effort of adding the column offset to the table address each time
4178 // a table element is looked up, it is possible to pass the pre-calculated
4179 // column addresses.
4180 // Uses R9..R12 as work registers. Must be saved/restored by caller, if necessary.
4181 void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc,
4182                                         Register t0,  Register t1,  Register t2,  Register t3,
4183                                         Register tc0, Register tc1, Register tc2, Register tc3) {
4184   assert_different_registers(crc, t3);
4185 
4186   // XOR crc with next four bytes of buffer.
4187   lwz(t3, bufDisp, buf);
4188   if (bufInc != 0) {
4189     addi(buf, buf, bufInc);
4190   }
4191   xorr(t3, t3, crc);
4192 
4193   // Chop crc into 4 single-byte pieces, shifted left 2 bits, to form the table indices.
4194   rlwinm(t0, t3,  2,         24-2, 31-2);  // ((t3 >>  0) & 0xff) << 2
4195   rlwinm(t1, t3,  32+(2- 8), 24-2, 31-2);  // ((t3 >>  8) & 0xff) << 2
4196   rlwinm(t2, t3,  32+(2-16), 24-2, 31-2);  // ((t3 >> 16) & 0xff) << 2
4197   rlwinm(t3, t3,  32+(2-24), 24-2, 31-2);  // ((t3 >> 24) & 0xff) << 2
4198 
4199   // Use the pre-calculated column addresses.
4200   // Load pre-calculated table values.
4201   lwzx(t0, tc0, t0);
4202   lwzx(t1, tc1, t1);
4203   lwzx(t2, tc2, t2);
4204   lwzx(t3, tc3, t3);
4205 
4206   // Calculate new crc from table values.
4207   xorr(t0,  t0, t1);
4208   xorr(t2,  t2, t3);
4209   xorr(crc, t0, t2);  // Now crc contains the final checksum value.
4210 }
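
// For reference, one slicing-by-4 step as emitted above, written as a C sketch
// (little-endian column order; load_le32 is an assumed helper, not a function
// in this file):
//   uint32_t c = crc ^ load_le32(buf);        // XOR next 4 input bytes into crc
//   crc = crc_table[3][(c >>  0) & 0xff] ^
//         crc_table[2][(c >>  8) & 0xff] ^
//         crc_table[1][(c >> 16) & 0xff] ^
//         crc_table[0][(c >> 24) & 0xff];     // 4 table lookups form the new crc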
4211 
4212 /**
4213  * @param crc   register containing existing CRC (32-bit)
4214  * @param buf   register pointing to input byte buffer (byte*)
4215  * @param len   register containing number of bytes
4216  * @param table register pointing to CRC table
4217  *
4218  * Uses R9..R12 as work registers. Must be saved/restored by caller!
4219  */
4220 void MacroAssembler::kernel_crc32_2word(Register crc, Register buf, Register len, Register table,
4221                                         Register t0,  Register t1,  Register t2,  Register t3,
4222                                         Register tc0, Register tc1, Register tc2, Register tc3,
4223                                         bool invertCRC) {
4224   assert_different_registers(crc, buf, len, table);
4225 
4226   Label L_mainLoop, L_tail;
4227   Register  tmp  = t0;
4228   Register  data = t0;
4229   Register  tmp2 = t1;
4230   const int mainLoop_stepping  = 8;
4231   const int tailLoop_stepping  = 1;
4232   const int log_stepping       = exact_log2(mainLoop_stepping);
4233   const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
4234   const int complexThreshold   = 2*mainLoop_stepping;
4235 
4236   // Don't test for len <= 0 here. This pathological case should not occur anyway.
4237   // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles
4238   // for all well-behaved cases. The situation itself is detected and handled correctly
4239   // within update_byteLoop_crc32.
4240   assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
4241 
4242   BLOCK_COMMENT("kernel_crc32_2word {");
4243 
4244   if (invertCRC) {
4245     nand(crc, crc, crc);                      // 1s complement of crc
4246   }
4247 
4248   // Check for short (<mainLoop_stepping) buffer.
4249   cmpdi(CCR0, len, complexThreshold);
4250   blt(CCR0, L_tail);
4251 
4252   // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
4253   // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
4254   {
4255     // Align buf addr to mainLoop_stepping boundary.
4256     neg(tmp2, buf);                           // Calculate # preLoop iterations for alignment.
4257     rldicl(tmp2, tmp2, 0, 64-log_stepping);   // Keep only the low log_stepping bits: bytes needed to reach the next 8-byte boundary.
4258 
4259     if (complexThreshold > mainLoop_stepping) {
4260       sub(len, len, tmp2);                       // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4261     } else {
4262       sub(tmp, len, tmp2);                       // Remaining bytes for main loop.
4263       cmpdi(CCR0, tmp, mainLoop_stepping);
4264       blt(CCR0, L_tail);                         // For less than one mainloop_stepping left, do only tail processing
4265       mr(len, tmp);                              // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4266     }
4267     update_byteLoop_crc32(crc, buf, tmp2, table, data, false);
4268   }
4269 
4270   srdi(tmp2, len, log_stepping);                 // #iterations for mainLoop
4271   andi(len, len, mainLoop_stepping-1);           // remaining bytes for tailLoop
4272   mtctr(tmp2);
4273 
4274 #ifdef VM_LITTLE_ENDIAN
4275   Register crc_rv = crc;
4276 #else
4277   Register crc_rv = tmp;                         // load_reverse_32 needs separate registers to work on.
4278                                                  // Occupies tmp, but frees up crc.
4279   load_reverse_32(crc_rv, crc);                  // Reverse byte order because we are dealing with big-endian data.
4280   tmp = crc;
4281 #endif
4282 
4283   int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
4284 
4285   align(mainLoop_alignment);                     // Octoword-aligned loop address. Shows 2% improvement.
4286   BIND(L_mainLoop);
4287     update_1word_crc32(crc_rv, buf, table, 0, 0, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
4288     update_1word_crc32(crc_rv, buf, table, 4, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
4289     bdnz(L_mainLoop);
4290 
4291 #ifndef VM_LITTLE_ENDIAN
4292   load_reverse_32(crc, crc_rv);                  // Reverse byte order because we are dealing with big-endian data.
4293   tmp = crc_rv;                                  // tmp uses its original register again.
4294 #endif
4295 
4296   // Restore original table address for tailLoop.
4297   if (reconstructTableOffset != 0) {
4298     addi(table, table, -reconstructTableOffset);
4299   }
4300 
4301   // Process last few (<complexThreshold) bytes of buffer.
4302   BIND(L_tail);
4303   update_byteLoop_crc32(crc, buf, len, table, data, false);
4304 
4305   if (invertCRC) {
4306     nand(crc, crc, crc);                      // 1s complement of crc
4307   }
4308   BLOCK_COMMENT("} kernel_crc32_2word");
4309 }
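
// Overall structure of the kernel above, as a C-style sketch (STEP = 8 bytes
// here; helper names are illustrative; byte reversal for big-endian omitted):
//   if (invertCRC) crc = ~crc;
//   if (len >= 2 * STEP) {
//     size_t pre = (-(uintptr_t)buf) & (STEP - 1);   // bytes up to the next STEP boundary
//     crc = crc32_bytes(crc, buf, pre, table);       // pre-loop: align buf
//     buf += pre; len -= pre;
//     for (; len >= STEP; len -= STEP, buf += STEP)  // main loop
//       crc = two_slicing_by_4_steps(crc, buf);      // 2 x 4 bytes per iteration
//   }
//   crc = crc32_bytes(crc, buf, len, table);         // tail: remaining 0..STEP-1 bytes
//   if (invertCRC) crc = ~crc;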
4310 
4311 /**
4312  * @param crc   register containing existing CRC (32-bit)
4313  * @param buf   register pointing to input byte buffer (byte*)
4314  * @param len   register containing number of bytes
4315  * @param table register pointing to CRC table
4316  *
4317  * Uses R9..R12 as work registers. Must be saved/restored by caller!
4318  */
4319 void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table,
4320                                         Register t0,  Register t1,  Register t2,  Register t3,
4321                                         Register tc0, Register tc1, Register tc2, Register tc3,
4322                                         bool invertCRC) {
4323   assert_different_registers(crc, buf, len, table);
4324 
4325   Label L_mainLoop, L_tail;
4326   Register  tmp          = t0;
4327   Register  data         = t0;
4328   Register  tmp2         = t1;
4329   const int mainLoop_stepping  = 4;
4330   const int tailLoop_stepping  = 1;
4331   const int log_stepping       = exact_log2(mainLoop_stepping);
4332   const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
4333   const int complexThreshold   = 2*mainLoop_stepping;
4334 
4335   // Don't test for len <= 0 here. This pathological case should not occur anyway.
4336   // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles
4337   // for all well-behaved cases. The situation itself is detected and handled correctly
4338   // within update_byteLoop_crc32.
4339   assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
4340 
4341   BLOCK_COMMENT("kernel_crc32_1word {");
4342 
4343   if (invertCRC) {
4344     nand(crc, crc, crc);                      // 1s complement of crc
4345   }
4346 
4347   // Check for short (<mainLoop_stepping) buffer.
4348   cmpdi(CCR0, len, complexThreshold);
4349   blt(CCR0, L_tail);
4350 
4351   // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
4352   // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
4353   {
4354     // Align buf addr to mainLoop_stepping boundary.
4355     neg(tmp2, buf);                              // Calculate # preLoop iterations for alignment.
4356     rldicl(tmp2, tmp2, 0, 64-log_stepping);      // Keep only the low log_stepping bits: bytes needed to reach the next 4-byte boundary.
4357 
4358     if (complexThreshold > mainLoop_stepping) {
4359       sub(len, len, tmp2);                       // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4360     } else {
4361       sub(tmp, len, tmp2);                       // Remaining bytes for main loop.
4362       cmpdi(CCR0, tmp, mainLoop_stepping);
4363       blt(CCR0, L_tail);                         // For less than one mainloop_stepping left, do only tail processing
4364       mr(len, tmp);                              // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4365     }
4366     update_byteLoop_crc32(crc, buf, tmp2, table, data, false);
4367   }
4368 
4369   srdi(tmp2, len, log_stepping);                 // #iterations for mainLoop
4370   andi(len, len, mainLoop_stepping-1);           // remaining bytes for tailLoop
4371   mtctr(tmp2);
4372 
4373 #ifdef VM_LITTLE_ENDIAN
4374   Register crc_rv = crc;
4375 #else
4376   Register crc_rv = tmp;                         // load_reverse_32 needs separate registers to work on.
4377                                                  // Occupies tmp, but frees up crc.
4378   load_reverse_32(crc_rv, crc);                  // Reverse byte order because we are dealing with big-endian data.
4379   tmp = crc;
4380 #endif
4381 
4382   int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
4383 
4384   align(mainLoop_alignment);                     // Octoword-aligned loop address. Shows 2% improvement.
4385   BIND(L_mainLoop);
4386     update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
4387     bdnz(L_mainLoop);
4388 
4389 #ifndef VM_LITTLE_ENDIAN
4390   load_reverse_32(crc, crc_rv);                  // Reverse byte order because we are dealing with big-endian data.
4391   tmp = crc_rv;                                  // tmp uses its original register again.
4392 #endif
4393 
4394   // Restore original table address for tailLoop.
4395   if (reconstructTableOffset != 0) {
4396     addi(table, table, -reconstructTableOffset);
4397   }
4398 
4399   // Process last few (<complexThreshold) bytes of buffer.
4400   BIND(L_tail);
4401   update_byteLoop_crc32(crc, buf, len, table, data, false);
4402 
4403   if (invertCRC) {
4404     nand(crc, crc, crc);                      // 1s complement of crc
4405   }
4406   BLOCK_COMMENT("} kernel_crc32_1word");
4407 }
4408 
4409 /**
4410  * @param crc   register containing existing CRC (32-bit)
4411  * @param buf   register pointing to input byte buffer (byte*)
4412  * @param len   register containing number of bytes
4413  * @param table register pointing to CRC table
4414  *
4415  * Uses R7_ARG5, R8_ARG6 as work registers.
4416  */
4417 void MacroAssembler::kernel_crc32_1byte(Register crc, Register buf, Register len, Register table,
4418                                         Register t0,  Register t1,  Register t2,  Register t3,
4419                                         bool invertCRC) {
4420   assert_different_registers(crc, buf, len, table);
4421 
4422   Register  data = t0;                   // Holds the current byte to be folded into crc.
4423 
4424   BLOCK_COMMENT("kernel_crc32_1byte {");
4425 
4426   if (invertCRC) {
4427     nand(crc, crc, crc);                      // 1s complement of crc
4428   }
4429 
4430   // Process all bytes in a single-byte loop.
4431   update_byteLoop_crc32(crc, buf, len, table, data, true);
4432 
4433   if (invertCRC) {
4434     nand(crc, crc, crc);                      // 1s complement of crc
4435   }
4436   BLOCK_COMMENT("} kernel_crc32_1byte");
4437 }
4438 
4439 /**
4440  * @param crc             register containing existing CRC (32-bit)
4441  * @param buf             register pointing to input byte buffer (byte*)
4442  * @param len             register containing number of bytes
4443  * @param table           register pointing to CRC table
4444  * @param constants       register pointing to CRC table for 128-bit aligned memory
4445  * @param barretConstants register pointing to table for barrett reduction
4446  * @param t0              volatile register
4447  * @param t1              volatile register
4448  * @param t2              volatile register
4449  * @param t3              volatile register
4450  */
4451 void MacroAssembler::kernel_crc32_1word_vpmsumd(Register crc, Register buf, Register len, Register table,
4452                                                 Register constants,  Register barretConstants,
4453                                                 Register t0,  Register t1, Register t2, Register t3, Register t4,
4454                                                 bool invertCRC) {
4455   assert_different_registers(crc, buf, len, table);
4456 
4457   Label L_alignedHead, L_tail, L_alignTail, L_start, L_end;
4458 
4459   Register  prealign     = t0;
4460   Register  postalign    = t0;
4461 
4462   BLOCK_COMMENT("kernel_crc32_1word_vpmsumd {");
4463 
4464   // 1. Use kernel_crc32_1word for short buffers (len < 384).
4465   clrldi(len, len, 32);
4466   cmpdi(CCR0, len, 384);
4467   bge(CCR0, L_start);
4468 
4469     Register tc0 = t4;
4470     Register tc1 = constants;
4471     Register tc2 = barretConstants;
4472     kernel_crc32_1word(crc, buf, len, table, t0, t1, t2, t3, tc0, tc1, tc2, table, invertCRC);
4473     b(L_end);
4474 
4475   BIND(L_start);
4476 
4477     // 2. ~c
4478     if (invertCRC) {
4479       nand(crc, crc, crc);                      // 1s complement of crc
4480     }
4481 
4482     // 3. calculate from 0 to first 128bit-aligned address
4483     clrldi_(prealign, buf, 57);
4484     beq(CCR0, L_alignedHead);
4485 
4486     subfic(prealign, prealign, 128);
4487 
4488     subf(len, prealign, len);
4489     update_byteLoop_crc32(crc, buf, prealign, table, t2, false);
4490 
4491     // 4. calculate from first 128bit-aligned address to last 128bit-aligned address
4492     BIND(L_alignedHead);
4493 
4494     clrldi(postalign, len, 57);
4495     subf(len, postalign, len);
4496 
4497     // len must be more than 256 bits at this point.
4498     kernel_crc32_1word_aligned(crc, buf, len, constants, barretConstants, t1, t2, t3);
4499 
4500     // 5. calculate remaining
4501     cmpdi(CCR0, postalign, 0);
4502     beq(CCR0, L_tail);
4503 
4504     update_byteLoop_crc32(crc, buf, postalign, table, t2, false);
4505 
4506     BIND(L_tail);
4507 
4508     // 6. ~c
4509     if (invertCRC) {
4510       nand(crc, crc, crc);                      // 1s complement of crc
4511     }
4512 
4513   BIND(L_end);
4514 
4515   BLOCK_COMMENT("} kernel_crc32_1word_vpmsumd");
4516 }
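
// High-level flow of the vector kernel above, as a sketch (helper names are
// illustrative, not functions in this file):
//   if (len < 384) { crc = scalar_1word_kernel(crc, buf, len, table, invertCRC); goto done; } // 1.
//   if (invertCRC) crc = ~crc;                                    // 2.
//   pre  = bytes up to the next aligned address of buf;           // 3.
//   crc  = byte_loop(crc, buf, pre); buf += pre; len -= pre;
//   post = len & 127; len -= post;
//   crc  = vpmsumd_aligned_kernel(crc, buf, len, ...);            // 4. aligned middle part (multiple of 128 bytes)
//   crc  = byte_loop(crc, buf, post);                             // 5. remaining bytes
//   if (invertCRC) crc = ~crc;                                    // 6.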
4517 
4518 /**
4519  * @param crc             register containing existing CRC (32-bit)
4520  * @param buf             register pointing to input byte buffer (byte*)
4521  * @param len             register containing number of bytes
4522  * @param constants       register pointing to CRC table for 128-bit aligned memory
4523  * @param barretConstants register pointing to table for barrett reduction
4524  * @param t0              volatile register
4525  * @param t1              volatile register
4526  * @param t2              volatile register
4527  */
4528 void MacroAssembler::kernel_crc32_1word_aligned(Register crc, Register buf, Register len,
4529     Register constants, Register barretConstants, Register t0, Register t1, Register t2) {
4530   Label L_mainLoop, L_tail, L_alignTail, L_barrett_reduction, L_end, L_first_warm_up_done, L_first_cool_down, L_second_cool_down, L_XOR, L_test;
4531   Label L_lv0, L_lv1, L_lv2, L_lv3, L_lv4, L_lv5, L_lv6, L_lv7, L_lv8, L_lv9, L_lv10, L_lv11, L_lv12, L_lv13, L_lv14, L_lv15;
4532   Label L_1, L_2, L_3, L_4;
4533 
4534   Register  rLoaded      = t0;
4535   Register  rTmp1        = t1;
4536   Register  rTmp2        = t2;
4537   Register  off16        = R22;
4538   Register  off32        = R23;
4539   Register  off48        = R24;
4540   Register  off64        = R25;
4541   Register  off80        = R26;
4542   Register  off96        = R27;
4543   Register  off112       = R28;
4544   Register  rIdx         = R29;
4545   Register  rMax         = R30;
4546   Register  constantsPos = R31;
4547 
4548   VectorRegister mask_32bit = VR24;
4549   VectorRegister mask_64bit = VR25;
4550   VectorRegister zeroes     = VR26;
4551   VectorRegister const1     = VR27;
4552   VectorRegister const2     = VR28;
4553 
4554   // Save non-volatile vector registers (frameless).
4555   Register offset = t1;   int offsetInt = 0;
4556   offsetInt -= 16; li(offset, -16);           stvx(VR20, offset, R1_SP);
4557   offsetInt -= 16; addi(offset, offset, -16); stvx(VR21, offset, R1_SP);
4558   offsetInt -= 16; addi(offset, offset, -16); stvx(VR22, offset, R1_SP);
4559   offsetInt -= 16; addi(offset, offset, -16); stvx(VR23, offset, R1_SP);
4560   offsetInt -= 16; addi(offset, offset, -16); stvx(VR24, offset, R1_SP);
4561   offsetInt -= 16; addi(offset, offset, -16); stvx(VR25, offset, R1_SP);
4562   offsetInt -= 16; addi(offset, offset, -16); stvx(VR26, offset, R1_SP);
4563   offsetInt -= 16; addi(offset, offset, -16); stvx(VR27, offset, R1_SP);
4564   offsetInt -= 16; addi(offset, offset, -16); stvx(VR28, offset, R1_SP);
4565   offsetInt -= 8; std(R22, offsetInt, R1_SP);
4566   offsetInt -= 8; std(R23, offsetInt, R1_SP);
4567   offsetInt -= 8; std(R24, offsetInt, R1_SP);
4568   offsetInt -= 8; std(R25, offsetInt, R1_SP);
4569   offsetInt -= 8; std(R26, offsetInt, R1_SP);
4570   offsetInt -= 8; std(R27, offsetInt, R1_SP);
4571   offsetInt -= 8; std(R28, offsetInt, R1_SP);
4572   offsetInt -= 8; std(R29, offsetInt, R1_SP);
4573   offsetInt -= 8; std(R30, offsetInt, R1_SP);
4574   offsetInt -= 8; std(R31, offsetInt, R1_SP);
4575 
4576   // Set constants
4577   li(off16, 16);
4578   li(off32, 32);
4579   li(off48, 48);
4580   li(off64, 64);
4581   li(off80, 80);
4582   li(off96, 96);
4583   li(off112, 112);
4584 
4585   clrldi(crc, crc, 32);
4586 
4587   vxor(zeroes, zeroes, zeroes);
4588   vspltisw(VR0, -1);
4589 
4590   vsldoi(mask_32bit, zeroes, VR0, 4);
4591   vsldoi(mask_64bit, zeroes, VR0, 8);
4592 
4593   // Get the initial value into VR8
4594   vxor(VR8, VR8, VR8);
4595   mtvrd(VR8, crc);
4596   vsldoi(VR8, zeroes, VR8, 8); // shift into bottom 32 bits
4597 
4598   li (rLoaded, 0);
4599 
4600   rldicr(rIdx, len, 0, 56);
4601 
4602   {
4603     BIND(L_1);
4604     // Checksum in blocks of MAX_SIZE (32768)
4605     lis(rMax, 0);
4606     ori(rMax, rMax, 32768);
4607     mr(rTmp2, rMax);
4608     cmpd(CCR0, rIdx, rMax);
4609     bgt(CCR0, L_2);
4610     mr(rMax, rIdx);
4611 
4612     BIND(L_2);
4613     subf(rIdx, rMax, rIdx);
4614 
4615     // our main loop does 128 bytes at a time
4616     srdi(rMax, rMax, 7);
4617 
4618     /*
4619      * Work out the offset into the constants table to start at. Each
4620      * constant is 16 bytes, and it is used against 128 bytes of input
4621      * data - 128 / 16 = 8
4622      */
4623     sldi(rTmp1, rMax, 4);
4624     srdi(rTmp2, rTmp2, 3);
4625     subf(rTmp1, rTmp1, rTmp2);
4626 
4627     // We reduce our final 128 bytes in a separate step
4628     addi(rMax, rMax, -1);
4629     mtctr(rMax);
4630 
4631     // Find the start of our constants
4632     add(constantsPos, constants, rTmp1);
4633 
4634     // Zero VR0-VR7 which will contain our checksums
4635     vxor(VR0, VR0, VR0);
4636     vxor(VR1, VR1, VR1);
4637     vxor(VR2, VR2, VR2);
4638     vxor(VR3, VR3, VR3);
4639     vxor(VR4, VR4, VR4);
4640     vxor(VR5, VR5, VR5);
4641     vxor(VR6, VR6, VR6);
4642     vxor(VR7, VR7, VR7);
4643 
4644     lvx(const1, constantsPos);
4645 
4646     /*
4647      * If we are looping back to consume more data, we use the values
4648      * already in VR16-VR23.
4649      */
4650     cmpdi(CCR0, rLoaded, 1);
4651     beq(CCR0, L_3);
4652     {
4653 
4654       // First warm up pass
4655       lvx(VR16, buf);
4656       lvx(VR17, off16, buf);
4657       lvx(VR18, off32, buf);
4658       lvx(VR19, off48, buf);
4659       lvx(VR20, off64, buf);
4660       lvx(VR21, off80, buf);
4661       lvx(VR22, off96, buf);
4662       lvx(VR23, off112, buf);
4663       addi(buf, buf, 8*16);
4664 
4665       // xor in initial value
4666       vxor(VR16, VR16, VR8);
4667     }
4668 
4669     BIND(L_3);
4670     bdz(L_first_warm_up_done);
4671 
4672     addi(constantsPos, constantsPos, 16);
4673     lvx(const2, constantsPos);
4674 
4675     // Second warm up pass
4676     vpmsumd(VR8, VR16, const1);
4677     lvx(VR16, buf);
4678 
4679     vpmsumd(VR9, VR17, const1);
4680     lvx(VR17, off16, buf);
4681 
4682     vpmsumd(VR10, VR18, const1);
4683     lvx(VR18, off32, buf);
4684 
4685     vpmsumd(VR11, VR19, const1);
4686     lvx(VR19, off48, buf);
4687 
4688     vpmsumd(VR12, VR20, const1);
4689     lvx(VR20, off64, buf);
4690 
4691     vpmsumd(VR13, VR21, const1);
4692     lvx(VR21, off80, buf);
4693 
4694     vpmsumd(VR14, VR22, const1);
4695     lvx(VR22, off96, buf);
4696 
4697     vpmsumd(VR15, VR23, const1);
4698     lvx(VR23, off112, buf);
4699 
4700     addi(buf, buf, 8 * 16);
4701 
4702     bdz(L_first_cool_down);
4703 
4704     /*
4705      * main loop. We modulo schedule it such that it takes three iterations
4706      * to complete - first iteration load, second iteration vpmsum, third
4707      * iteration xor.
4708      */
4709     {
4710       BIND(L_4);
4711       lvx(const1, constantsPos); addi(constantsPos, constantsPos, 16);
4712 
4713       vxor(VR0, VR0, VR8);
4714       vpmsumd(VR8, VR16, const2);
4715       lvx(VR16, buf);
4716 
4717       vxor(VR1, VR1, VR9);
4718       vpmsumd(VR9, VR17, const2);
4719       lvx(VR17, off16, buf);
4720 
4721       vxor(VR2, VR2, VR10);
4722       vpmsumd(VR10, VR18, const2);
4723       lvx(VR18, off32, buf);
4724 
4725       vxor(VR3, VR3, VR11);
4726       vpmsumd(VR11, VR19, const2);
4727       lvx(VR19, off48, buf);
4728       lvx(const2, constantsPos);
4729 
4730       vxor(VR4, VR4, VR12);
4731       vpmsumd(VR12, VR20, const1);
4732       lvx(VR20, off64, buf);
4733 
4734       vxor(VR5, VR5, VR13);
4735       vpmsumd(VR13, VR21, const1);
4736       lvx(VR21, off80, buf);
4737 
4738       vxor(VR6, VR6, VR14);
4739       vpmsumd(VR14, VR22, const1);
4740       lvx(VR22, off96, buf);
4741 
4742       vxor(VR7, VR7, VR15);
4743       vpmsumd(VR15, VR23, const1);
4744       lvx(VR23, off112, buf);
4745 
4746       addi(buf, buf, 8 * 16);
4747 
4748       bdnz(L_4);
4749     }
4750 
4751     BIND(L_first_cool_down);
4752 
4753     // First cool down pass
4754     lvx(const1, constantsPos);
4755     addi(constantsPos, constantsPos, 16);
4756 
4757     vxor(VR0, VR0, VR8);
4758     vpmsumd(VR8, VR16, const1);
4759 
4760     vxor(VR1, VR1, VR9);
4761     vpmsumd(VR9, VR17, const1);
4762 
4763     vxor(VR2, VR2, VR10);
4764     vpmsumd(VR10, VR18, const1);
4765 
4766     vxor(VR3, VR3, VR11);
4767     vpmsumd(VR11, VR19, const1);
4768 
4769     vxor(VR4, VR4, VR12);
4770     vpmsumd(VR12, VR20, const1);
4771 
4772     vxor(VR5, VR5, VR13);
4773     vpmsumd(VR13, VR21, const1);
4774 
4775     vxor(VR6, VR6, VR14);
4776     vpmsumd(VR14, VR22, const1);
4777 
4778     vxor(VR7, VR7, VR15);
4779     vpmsumd(VR15, VR23, const1);
4780 
4781     BIND(L_second_cool_down);
4782     // Second cool down pass
4783     vxor(VR0, VR0, VR8);
4784     vxor(VR1, VR1, VR9);
4785     vxor(VR2, VR2, VR10);
4786     vxor(VR3, VR3, VR11);
4787     vxor(VR4, VR4, VR12);
4788     vxor(VR5, VR5, VR13);
4789     vxor(VR6, VR6, VR14);
4790     vxor(VR7, VR7, VR15);
4791 
4792     /*
4793      * vpmsumd produces a 96 bit result in the least significant bits
4794      * of the register. Since we are bit reflected we have to shift it
4795      * left 32 bits so it occupies the least significant bits in the
4796      * bit reflected domain.
4797      */
4798     vsldoi(VR0, VR0, zeroes, 4);
4799     vsldoi(VR1, VR1, zeroes, 4);
4800     vsldoi(VR2, VR2, zeroes, 4);
4801     vsldoi(VR3, VR3, zeroes, 4);
4802     vsldoi(VR4, VR4, zeroes, 4);
4803     vsldoi(VR5, VR5, zeroes, 4);
4804     vsldoi(VR6, VR6, zeroes, 4);
4805     vsldoi(VR7, VR7, zeroes, 4);
4806 
4807     // xor with last 1024 bits
4808     lvx(VR8, buf);
4809     lvx(VR9, off16, buf);
4810     lvx(VR10, off32, buf);
4811     lvx(VR11, off48, buf);
4812     lvx(VR12, off64, buf);
4813     lvx(VR13, off80, buf);
4814     lvx(VR14, off96, buf);
4815     lvx(VR15, off112, buf);
4816     addi(buf, buf, 8 * 16);
4817 
4818     vxor(VR16, VR0, VR8);
4819     vxor(VR17, VR1, VR9);
4820     vxor(VR18, VR2, VR10);
4821     vxor(VR19, VR3, VR11);
4822     vxor(VR20, VR4, VR12);
4823     vxor(VR21, VR5, VR13);
4824     vxor(VR22, VR6, VR14);
4825     vxor(VR23, VR7, VR15);
4826 
4827     li(rLoaded, 1);
4828     cmpdi(CCR0, rIdx, 0);
4829     addi(rIdx, rIdx, 128);
4830     bne(CCR0, L_1);
4831   }
4832 
4833   // Work out how many bytes we have left
4834   andi_(len, len, 127);
4835 
4836   // Calculate where in the constant table we need to start
4837   subfic(rTmp1, len, 128);
4838   add(constantsPos, constantsPos, rTmp1);
4839 
4840   // How many 16 byte chunks are in the tail
4841   srdi(rIdx, len, 4);
4842   mtctr(rIdx);
4843 
4844   /*
4845    * Reduce the previously calculated 1024 bits to 64 bits, shifting
4846    * 32 bits to include the trailing 32 bits of zeros
4847    */
4848   lvx(VR0, constantsPos);
4849   lvx(VR1, off16, constantsPos);
4850   lvx(VR2, off32, constantsPos);
4851   lvx(VR3, off48, constantsPos);
4852   lvx(VR4, off64, constantsPos);
4853   lvx(VR5, off80, constantsPos);
4854   lvx(VR6, off96, constantsPos);
4855   lvx(VR7, off112, constantsPos);
4856   addi(constantsPos, constantsPos, 8 * 16);
4857 
4858   vpmsumw(VR0, VR16, VR0);
4859   vpmsumw(VR1, VR17, VR1);
4860   vpmsumw(VR2, VR18, VR2);
4861   vpmsumw(VR3, VR19, VR3);
4862   vpmsumw(VR4, VR20, VR4);
4863   vpmsumw(VR5, VR21, VR5);
4864   vpmsumw(VR6, VR22, VR6);
4865   vpmsumw(VR7, VR23, VR7);
4866 
4867   // Now reduce the tail (0 - 112 bytes)
4868   cmpdi(CCR0, rIdx, 0);
4869   beq(CCR0, L_XOR);
4870 
4871   lvx(VR16, buf); addi(buf, buf, 16);
4872   lvx(VR17, constantsPos);
4873   vpmsumw(VR16, VR16, VR17);
4874   vxor(VR0, VR0, VR16);
4875   beq(CCR0, L_XOR);
4876 
4877   lvx(VR16, buf); addi(buf, buf, 16);
4878   lvx(VR17, off16, constantsPos);
4879   vpmsumw(VR16, VR16, VR17);
4880   vxor(VR0, VR0, VR16);
4881   beq(CCR0, L_XOR);
4882 
4883   lvx(VR16, buf); addi(buf, buf, 16);
4884   lvx(VR17, off32, constantsPos);
4885   vpmsumw(VR16, VR16, VR17);
4886   vxor(VR0, VR0, VR16);
4887   beq(CCR0, L_XOR);
4888 
4889   lvx(VR16, buf); addi(buf, buf, 16);
4890   lvx(VR17, off48, constantsPos);
4891   vpmsumw(VR16, VR16, VR17);
4892   vxor(VR0, VR0, VR16);
4893   beq(CCR0, L_XOR);
4894 
4895   lvx(VR16, buf); addi(buf, buf, 16);
4896   lvx(VR17, off64, constantsPos);
4897   vpmsumw(VR16, VR16, VR17);
4898   vxor(VR0, VR0, VR16);
4899   beq(CCR0, L_XOR);
4900 
4901   lvx(VR16, buf); addi(buf, buf, 16);
4902   lvx(VR17, off80, constantsPos);
4903   vpmsumw(VR16, VR16, VR17);
4904   vxor(VR0, VR0, VR16);
4905   beq(CCR0, L_XOR);
4906 
4907   lvx(VR16, buf); addi(buf, buf, 16);
4908   lvx(VR17, off96, constantsPos);
4909   vpmsumw(VR16, VR16, VR17);
4910   vxor(VR0, VR0, VR16);
4911 
4912   // Now xor all the parallel chunks together
4913   BIND(L_XOR);
4914   vxor(VR0, VR0, VR1);
4915   vxor(VR2, VR2, VR3);
4916   vxor(VR4, VR4, VR5);
4917   vxor(VR6, VR6, VR7);
4918 
4919   vxor(VR0, VR0, VR2);
4920   vxor(VR4, VR4, VR6);
4921 
4922   vxor(VR0, VR0, VR4);
4923 
4924   b(L_barrett_reduction);
4925 
4926   BIND(L_first_warm_up_done);
4927   lvx(const1, constantsPos);
4928   addi(constantsPos, constantsPos, 16);
4929   vpmsumd(VR8,  VR16, const1);
4930   vpmsumd(VR9,  VR17, const1);
4931   vpmsumd(VR10, VR18, const1);
4932   vpmsumd(VR11, VR19, const1);
4933   vpmsumd(VR12, VR20, const1);
4934   vpmsumd(VR13, VR21, const1);
4935   vpmsumd(VR14, VR22, const1);
4936   vpmsumd(VR15, VR23, const1);
4937   b(L_second_cool_down);
4938 
4939   BIND(L_barrett_reduction);
4940 
4941   lvx(const1, barretConstants);
4942   addi(barretConstants, barretConstants, 16);
4943   lvx(const2, barretConstants);
4944 
4945   vsldoi(VR1, VR0, VR0, 8);
4946   vxor(VR0, VR0, VR1);    // xor two 64 bit results together
4947 
4948   // shift left one bit
4949   vspltisb(VR1, 1);
4950   vsl(VR0, VR0, VR1);
4951 
4952   vand(VR0, VR0, mask_64bit);
4953 
4954   /*
4955    * The reflected version of Barrett reduction. Instead of bit
4956    * reflecting our data (which is expensive to do), we bit reflect our
4957    * constants and our algorithm, which means the intermediate data in
4958    * our vector registers goes from 0-63 instead of 63-0. We can reflect
4959    * the algorithm because we don't carry in mod 2 arithmetic.
4960    */
4961   vand(VR1, VR0, mask_32bit);  // bottom 32 bits of a
4962   vpmsumd(VR1, VR1, const1);   // ma
4963   vand(VR1, VR1, mask_32bit);  // bottom 32bits of ma
4964   vpmsumd(VR1, VR1, const2);   // qn
4965   vxor(VR0, VR0, VR1);         // a - qn, subtraction is xor in GF(2)
4966 
4967   /*
4968    * Since we are bit reflected, the result (i.e. the low 32 bits) is in
4969    * the high 32 bits. We just need to shift it left 4 bytes:
4970    * V0 [ 0 1 X 3 ]
4971    * V0 [ 0 X 2 3 ]
4972    */
4973   vsldoi(VR0, VR0, zeroes, 4);    // shift result into top 64 bits of VR0
4974 
4975   // Get it into r3
4976   mfvrd(crc, VR0);
4977 
4978   BIND(L_end);
4979 
4980   offsetInt = 0;
4981   // Restore non-volatile Vector registers (frameless).
4982   offsetInt -= 16; li(offset, -16);           lvx(VR20, offset, R1_SP);
4983   offsetInt -= 16; addi(offset, offset, -16); lvx(VR21, offset, R1_SP);
4984   offsetInt -= 16; addi(offset, offset, -16); lvx(VR22, offset, R1_SP);
4985   offsetInt -= 16; addi(offset, offset, -16); lvx(VR23, offset, R1_SP);
4986   offsetInt -= 16; addi(offset, offset, -16); lvx(VR24, offset, R1_SP);
4987   offsetInt -= 16; addi(offset, offset, -16); lvx(VR25, offset, R1_SP);
4988   offsetInt -= 16; addi(offset, offset, -16); lvx(VR26, offset, R1_SP);
4989   offsetInt -= 16; addi(offset, offset, -16); lvx(VR27, offset, R1_SP);
4990   offsetInt -= 16; addi(offset, offset, -16); lvx(VR28, offset, R1_SP);
4991   offsetInt -= 8;  ld(R22, offsetInt, R1_SP);
4992   offsetInt -= 8;  ld(R23, offsetInt, R1_SP);
4993   offsetInt -= 8;  ld(R24, offsetInt, R1_SP);
4994   offsetInt -= 8;  ld(R25, offsetInt, R1_SP);
4995   offsetInt -= 8;  ld(R26, offsetInt, R1_SP);
4996   offsetInt -= 8;  ld(R27, offsetInt, R1_SP);
4997   offsetInt -= 8;  ld(R28, offsetInt, R1_SP);
4998   offsetInt -= 8;  ld(R29, offsetInt, R1_SP);
4999   offsetInt -= 8;  ld(R30, offsetInt, R1_SP);
5000   offsetInt -= 8;  ld(R31, offsetInt, R1_SP);
5001 }
5002 
5003 void MacroAssembler::kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp, bool invertCRC) {
5004   assert_different_registers(crc, buf, /* len,  not used!! */ table, tmp);
5005 
5006   BLOCK_COMMENT("kernel_crc32_singleByte:");
5007   if (invertCRC) {
5008     nand(crc, crc, crc);                // 1s complement of crc
5009   }
5010 
5011   lbz(tmp, 0, buf);                     // Byte from buffer, zero-extended.
5012   update_byte_crc32(crc, tmp, table);
5013 
5014   if (invertCRC) {
5015     nand(crc, crc, crc);                // 1s complement of crc
5016   }
5017 }
5018 
5019 void MacroAssembler::kernel_crc32_singleByteReg(Register crc, Register val, Register table, bool invertCRC) {
5020   assert_different_registers(crc, val, table);
5021 
5022   BLOCK_COMMENT("kernel_crc32_singleByteReg:");
5023   if (invertCRC) {
5024     nand(crc, crc, crc);                // 1s complement of crc
5025   }
5026 
5027   update_byte_crc32(crc, val, table);
5028 
5029   if (invertCRC) {
5030     nand(crc, crc, crc);                // 1s complement of crc
5031   }
5032 }
5033 
5034 // dest_lo += src1 + src2
5035 // dest_hi += carry1 + carry2
5036 void MacroAssembler::add2_with_carry(Register dest_hi,
5037                                      Register dest_lo,
5038                                      Register src1, Register src2) {
5039   li(R0, 0);
5040   addc(dest_lo, dest_lo, src1);
5041   adde(dest_hi, dest_hi, R0);
5042   addc(dest_lo, dest_lo, src2);
5043   adde(dest_hi, dest_hi, R0);
5044 }
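
// Semantically this is a 128-bit accumulation; a C sketch using GCC's
// unsigned __int128 (illustrative only):
//   void add2_with_carry(uint64_t* dest_hi, uint64_t* dest_lo,
//                        uint64_t src1, uint64_t src2) {
//     unsigned __int128 sum = ((unsigned __int128)*dest_hi << 64) | *dest_lo;
//     sum += src1;
//     sum += src2;
//     *dest_lo = (uint64_t)sum;
//     *dest_hi = (uint64_t)(sum >> 64);
//   }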
5045 
5046 // Multiply 64 bit by 64 bit first loop.
5047 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart,
5048                                            Register x_xstart,
5049                                            Register y, Register y_idx,
5050                                            Register z,
5051                                            Register carry,
5052                                            Register product_high, Register product,
5053                                            Register idx, Register kdx,
5054                                            Register tmp) {
5055   //  jlong carry, x[], y[], z[];
5056   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
5057   //    huge_128 product = y[idx] * x[xstart] + carry;
5058   //    z[kdx] = (jlong)product;
5059   //    carry  = (jlong)(product >>> 64);
5060   //  }
5061   //  z[xstart] = carry;
5062 
5063   Label L_first_loop, L_first_loop_exit;
5064   Label L_one_x, L_one_y, L_multiply;
5065 
5066   addic_(xstart, xstart, -1);
5067   blt(CCR0, L_one_x);   // Special case: length of x is 1.
5068 
5069   // Load next two integers of x.
5070   sldi(tmp, xstart, LogBytesPerInt);
5071   ldx(x_xstart, x, tmp);
5072 #ifdef VM_LITTLE_ENDIAN
5073   rldicl(x_xstart, x_xstart, 32, 0);
5074 #endif
5075 
5076   align(32, 16);
5077   bind(L_first_loop);
5078 
5079   cmpdi(CCR0, idx, 1);
5080   blt(CCR0, L_first_loop_exit);
5081   addi(idx, idx, -2);
5082   beq(CCR0, L_one_y);
5083 
5084   // Load next two integers of y.
5085   sldi(tmp, idx, LogBytesPerInt);
5086   ldx(y_idx, y, tmp);
5087 #ifdef VM_LITTLE_ENDIAN
5088   rldicl(y_idx, y_idx, 32, 0);
5089 #endif
5090 
5091 
5092   bind(L_multiply);
5093   multiply64(product_high, product, x_xstart, y_idx);
5094 
5095   li(tmp, 0);
5096   addc(product, product, carry);         // Add carry to result.
5097   adde(product_high, product_high, tmp); // Add carry of the last addition.
5098   addi(kdx, kdx, -2);
5099 
5100   // Store result.
5101 #ifdef VM_LITTLE_ENDIAN
5102   rldicl(product, product, 32, 0);
5103 #endif
5104   sldi(tmp, kdx, LogBytesPerInt);
5105   stdx(product, z, tmp);
5106   mr_if_needed(carry, product_high);
5107   b(L_first_loop);
5108 
5109 
5110   bind(L_one_y); // Load one 32 bit portion of y as (0,value).
5111 
5112   lwz(y_idx, 0, y);
5113   b(L_multiply);
5114 
5115 
5116   bind(L_one_x); // Load one 32 bit portion of x as (0,value).
5117 
5118   lwz(x_xstart, 0, x);
5119   b(L_first_loop);
5120 
5121   bind(L_first_loop_exit);
5122 }
5123 
5124 // Multiply 64 bit by 64 bit and add 128 bit.
5125 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y,
5126                                             Register z, Register yz_idx,
5127                                             Register idx, Register carry,
5128                                             Register product_high, Register product,
5129                                             Register tmp, int offset) {
5130 
5131   //  huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
5132   //  z[kdx] = (jlong)product;
5133 
5134   sldi(tmp, idx, LogBytesPerInt);
5135   if (offset) {
5136     addi(tmp, tmp, offset);
5137   }
5138   ldx(yz_idx, y, tmp);
5139 #ifdef VM_LITTLE_ENDIAN
5140   rldicl(yz_idx, yz_idx, 32, 0);
5141 #endif
5142 
5143   multiply64(product_high, product, x_xstart, yz_idx);
5144   ldx(yz_idx, z, tmp);
5145 #ifdef VM_LITTLE_ENDIAN
5146   rldicl(yz_idx, yz_idx, 32, 0);
5147 #endif
5148 
5149   add2_with_carry(product_high, product, carry, yz_idx);
5150 
5151   sldi(tmp, idx, LogBytesPerInt);
5152   if (offset) {
5153     addi(tmp, tmp, offset);
5154   }
5155 #ifdef VM_LITTLE_ENDIAN
5156   rldicl(product, product, 32, 0);
5157 #endif
5158   stdx(product, z, tmp);
5159 }
5160 
5161 // Multiply 128 bit by 128 bit. Unrolled inner loop.
5162 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart,
5163                                              Register y, Register z,
5164                                              Register yz_idx, Register idx, Register carry,
5165                                              Register product_high, Register product,
5166                                              Register carry2, Register tmp) {
5167 
5168   //  jlong carry, x[], y[], z[];
5169   //  int kdx = ystart+1;
5170   //  for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
5171   //    huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
5172   //    z[kdx+idx+1] = (jlong)product;
5173   //    jlong carry2 = (jlong)(product >>> 64);
5174   //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
5175   //    z[kdx+idx] = (jlong)product;
5176   //    carry = (jlong)(product >>> 64);
5177   //  }
5178   //  idx += 2;
5179   //  if (idx > 0) {
5180   //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
5181   //    z[kdx+idx] = (jlong)product;
5182   //    carry = (jlong)(product >>> 64);
5183   //  }
5184 
5185   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
5186   const Register jdx = R0;
5187 
5188   // Scale the index.
5189   srdi_(jdx, idx, 2);
5190   beq(CCR0, L_third_loop_exit);
5191   mtctr(jdx);
5192 
5193   align(32, 16);
5194   bind(L_third_loop);
5195 
5196   addi(idx, idx, -4);
5197 
5198   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8);
5199   mr_if_needed(carry2, product_high);
5200 
5201   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0);
5202   mr_if_needed(carry, product_high);
5203   bdnz(L_third_loop);
5204 
5205   bind(L_third_loop_exit);  // Handle any left-over operand parts.
5206 
5207   andi_(idx, idx, 0x3);
5208   beq(CCR0, L_post_third_loop_done);
5209 
5210   Label L_check_1;
5211 
5212   addic_(idx, idx, -2);
5213   blt(CCR0, L_check_1);
5214 
5215   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0);
5216   mr_if_needed(carry, product_high);
5217 
5218   bind(L_check_1);
5219 
5220   addi(idx, idx, 0x2);
5221   andi_(idx, idx, 0x1);
5222   addic_(idx, idx, -1);
5223   blt(CCR0, L_post_third_loop_done);
5224 
5225   sldi(tmp, idx, LogBytesPerInt);
5226   lwzx(yz_idx, y, tmp);
5227   multiply64(product_high, product, x_xstart, yz_idx);
5228   lwzx(yz_idx, z, tmp);
5229 
5230   add2_with_carry(product_high, product, yz_idx, carry);
5231 
5232   sldi(tmp, idx, LogBytesPerInt);
5233   stwx(product, z, tmp);
5234   srdi(product, product, 32);
5235 
5236   sldi(product_high, product_high, 32);
5237   orr(product, product, product_high);
5238   mr_if_needed(carry, product);
5239 
5240   bind(L_post_third_loop_done);
5241 }   // multiply_128_x_128_loop
5242 
5243 void MacroAssembler::muladd(Register out, Register in,
5244                             Register offset, Register len, Register k,
5245                             Register tmp1, Register tmp2, Register carry) {
5246 
5247   // Labels
5248   Label LOOP, SKIP;
5249 
5250   // Make sure length is positive.
5251   cmpdi  (CCR0,    len,     0);
5252 
5253   // Prepare variables
5254   subi   (offset,  offset,  4);
5255   li     (carry,   0);
5256   ble    (CCR0,    SKIP);
5257 
5258   mtctr  (len);
5259   subi   (len,     len,     1    );
5260   sldi   (len,     len,     2    );
5261 
5262   // Main loop
5263   bind(LOOP);
5264   lwzx   (tmp1,    len,     in   );
5265   lwzx   (tmp2,    offset,  out  );
5266   mulld  (tmp1,    tmp1,    k    );
5267   add    (tmp2,    carry,   tmp2 );
5268   add    (tmp2,    tmp1,    tmp2 );
5269   stwx   (tmp2,    offset,  out  );
5270   srdi   (carry,   tmp2,    32   );
5271   subi   (offset,  offset,  4    );
5272   subi   (len,     len,     4    );
5273   bdnz   (LOOP);
5274   bind(SKIP);
5275 }
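
// A sketch of the operation implemented above (BigInteger-style mulAdd over
// 32-bit limbs, walking from the most significant index downwards; index
// handling is illustrative only):
//   uint64_t carry = 0;
//   for (int j = len - 1; j >= 0; j--) {
//     --offset;
//     uint64_t product = (uint64_t)in[j] * k + out[offset] + carry;
//     out[offset] = (uint32_t)product;
//     carry = product >> 32;
//   }
//   // the final 32-bit carry is left in the 'carry' register for the caller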
5276 
5277 void MacroAssembler::multiply_to_len(Register x, Register xlen,
5278                                      Register y, Register ylen,
5279                                      Register z, Register zlen,
5280                                      Register tmp1, Register tmp2,
5281                                      Register tmp3, Register tmp4,
5282                                      Register tmp5, Register tmp6,
5283                                      Register tmp7, Register tmp8,
5284                                      Register tmp9, Register tmp10,
5285                                      Register tmp11, Register tmp12,
5286                                      Register tmp13) {
5287 
5288   ShortBranchVerifier sbv(this);
5289 
5290   assert_different_registers(x, xlen, y, ylen, z, zlen,
5291                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
5292   assert_different_registers(x, xlen, y, ylen, z, zlen,
5293                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp7);
5294   assert_different_registers(x, xlen, y, ylen, z, zlen,
5295                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp8);
5296 
5297   const Register idx = tmp1;
5298   const Register kdx = tmp2;
5299   const Register xstart = tmp3;
5300 
5301   const Register y_idx = tmp4;
5302   const Register carry = tmp5;
5303   const Register product = tmp6;
5304   const Register product_high = tmp7;
5305   const Register x_xstart = tmp8;
5306   const Register tmp = tmp9;
5307 
5308   // First Loop.
5309   //
5310   //  final static long LONG_MASK = 0xffffffffL;
5311   //  int xstart = xlen - 1;
5312   //  int ystart = ylen - 1;
5313   //  long carry = 0;
5314   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
5315   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
5316   //    z[kdx] = (int)product;
5317   //    carry = product >>> 32;
5318   //  }
5319   //  z[xstart] = (int)carry;
5320 
5321   mr_if_needed(idx, ylen);        // idx = ylen
5322   mr_if_needed(kdx, zlen);        // kdx = xlen + ylen
5323   li(carry, 0);                   // carry = 0
5324 
5325   Label L_done;
5326 
5327   addic_(xstart, xlen, -1);
5328   blt(CCR0, L_done);
5329 
5330   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z,
5331                         carry, product_high, product, idx, kdx, tmp);
5332 
5333   Label L_second_loop;
5334 
5335   cmpdi(CCR0, kdx, 0);
5336   beq(CCR0, L_second_loop);
5337 
5338   Label L_carry;
5339 
5340   addic_(kdx, kdx, -1);
5341   beq(CCR0, L_carry);
5342 
5343   // Store lower 32 bits of carry.
5344   sldi(tmp, kdx, LogBytesPerInt);
5345   stwx(carry, z, tmp);
5346   srdi(carry, carry, 32);
5347   addi(kdx, kdx, -1);
5348 
5349 
5350   bind(L_carry);
5351 
5352   // Store upper 32 bits of carry.
5353   sldi(tmp, kdx, LogBytesPerInt);
5354   stwx(carry, z, tmp);
5355 
5356   // Second and third (nested) loops.
5357   //
5358   //  for (int i = xstart-1; i >= 0; i--) { // Second loop
5359   //    carry = 0;
5360   //    for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
5361   //      long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
5362   //                     (z[k] & LONG_MASK) + carry;
5363   //      z[k] = (int)product;
5364   //      carry = product >>> 32;
5365   //    }
5366   //    z[i] = (int)carry;
5367   //  }
5368   //
5369   //  i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx
5370 
5371   bind(L_second_loop);
5372 
5373   li(carry, 0);                   // carry = 0;
5374 
5375   addic_(xstart, xstart, -1);     // i = xstart-1;
5376   blt(CCR0, L_done);
5377 
5378   Register zsave = tmp10;
5379 
5380   mr(zsave, z);
5381 
5382 
5383   Label L_last_x;
5384 
5385   sldi(tmp, xstart, LogBytesPerInt);
5386   add(z, z, tmp);                 // z = z + k - j
5387   addi(z, z, 4);
5388   addic_(xstart, xstart, -1);     // i = xstart-1;
5389   blt(CCR0, L_last_x);
5390 
5391   sldi(tmp, xstart, LogBytesPerInt);
5392   ldx(x_xstart, x, tmp);
5393 #ifdef VM_LITTLE_ENDIAN
5394   rldicl(x_xstart, x_xstart, 32, 0);
5395 #endif
5396 
5397 
5398   Label L_third_loop_prologue;
5399 
5400   bind(L_third_loop_prologue);
5401 
5402   Register xsave = tmp11;
5403   Register xlensave = tmp12;
5404   Register ylensave = tmp13;
5405 
5406   mr(xsave, x);
5407   mr(xlensave, xstart);
5408   mr(ylensave, ylen);
5409 
5410 
5411   multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen,
5412                           carry, product_high, product, x, tmp);
5413 
5414   mr(z, zsave);
5415   mr(x, xsave);
5416   mr(xlen, xlensave);   // This is the decrement of the loop counter!
5417   mr(ylen, ylensave);
5418 
5419   addi(tmp3, xlen, 1);
5420   sldi(tmp, tmp3, LogBytesPerInt);
5421   stwx(carry, z, tmp);
5422   addic_(tmp3, tmp3, -1);
5423   blt(CCR0, L_done);
5424 
5425   srdi(carry, carry, 32);
5426   sldi(tmp, tmp3, LogBytesPerInt);
5427   stwx(carry, z, tmp);
5428   b(L_second_loop);
5429 
5430   // The following infrequently executed code has been moved outside the loops.
5431   bind(L_last_x);
5432 
5433   lwz(x_xstart, 0, x);
5434   b(L_third_loop_prologue);
5435 
5436   bind(L_done);
5437 }   // multiply_to_len
5438 
5439 void MacroAssembler::asm_assert(bool check_equal, const char *msg, int id) {
5440 #ifdef ASSERT
5441   Label ok;
5442   if (check_equal) {
5443     beq(CCR0, ok);
5444   } else {
5445     bne(CCR0, ok);
5446   }
5447   stop(msg, id);
5448   bind(ok);
5449 #endif
5450 }
5451 
5452 void MacroAssembler::asm_assert_mems_zero(bool check_equal, int size, int mem_offset,
5453                                           Register mem_base, const char* msg, int id) {
5454 #ifdef ASSERT
5455   switch (size) {
5456     case 4:
5457       lwz(R0, mem_offset, mem_base);
5458       cmpwi(CCR0, R0, 0);
5459       break;
5460     case 8:
5461       ld(R0, mem_offset, mem_base);
5462       cmpdi(CCR0, R0, 0);
5463       break;
5464     default:
5465       ShouldNotReachHere();
5466   }
5467   asm_assert(check_equal, msg, id);
5468 #endif // ASSERT
5469 }
5470 
5471 void MacroAssembler::verify_thread() {
5472   if (VerifyThread) {
5473     unimplemented("'VerifyThread' currently not implemented on PPC");
5474   }
5475 }
5476 
5477 // Reads oop. Kills R0 and possibly volatile float registers.
5478 void MacroAssembler::verify_oop(Register oop, const char* msg) {
5479   if (!VerifyOops) {
5480     return;
5481   }
5482 
5483   address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
5484   const Register tmp = R11; // Will be preserved.
5485   const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
5486   save_volatile_gprs(R1_SP, -nbytes_save); // except R0
5487 
5488   mr_if_needed(R4_ARG2, oop);
5489   save_LR_CR(tmp); // save in old frame
5490   push_frame_reg_args(nbytes_save, tmp);
5491   // load FunctionDescriptor** / entry_address *
5492   load_const_optimized(tmp, fd, R0);
5493   // load FunctionDescriptor* / entry_address
5494   ld(tmp, 0, tmp);
5495   load_const_optimized(R3_ARG1, (address)msg, R0);
5496   // Call destination for its side effect.
5497   call_c(tmp);
5498 
5499   pop_frame();
5500   restore_LR_CR(tmp);
5501   restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
5502 }
5503 
5504 void MacroAssembler::verify_oop_addr(RegisterOrConstant offs, Register base, const char* msg) {
5505   if (!VerifyOops) {
5506     return;
5507   }
5508 
5509   address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
5510   const Register tmp = R11; // Will be preserved.
5511   const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
5512   save_volatile_gprs(R1_SP, -nbytes_save); // except R0
5513 
5514   ld(R4_ARG2, offs, base);
5515   save_LR_CR(tmp); // save in old frame
5516   push_frame_reg_args(nbytes_save, tmp);
5517   // load FunctionDescriptor** / entry_address *
5518   load_const_optimized(tmp, fd, R0);
5519   // load FunctionDescriptor* / entry_address
5520   ld(tmp, 0, tmp);
5521   load_const_optimized(R3_ARG1, (address)msg, R0);
5522   // Call destination for its side effect.
5523   call_c(tmp);
5524 
5525   pop_frame();
5526   restore_LR_CR(tmp);
5527   restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
5528 }
5529 
5530 const char* stop_types[] = {
5531   "stop",
5532   "untested",
5533   "unimplemented",
5534   "shouldnotreachhere"
5535 };
5536 
5537 static void stop_on_request(int tp, const char* msg) {
5538   tty->print("PPC assembly code requires stop: (%s) %s\n", stop_types[tp%/*stop_end*/4], msg);
5539   guarantee(false, "PPC assembly code requires stop: %s", msg);
5540 }
5541 
5542 // Call a C-function that prints output.
5543 void MacroAssembler::stop(int type, const char* msg, int id) {
5544 #ifndef PRODUCT
5545   block_comment(err_msg("stop: %s %s {", stop_types[type%stop_end], msg));
5546 #else
5547   block_comment("stop {");
5548 #endif
5549 
5550   // setup arguments
5551   load_const_optimized(R3_ARG1, type);
5552   load_const_optimized(R4_ARG2, (void *)msg, /*tmp=*/R0);
5553   call_VM_leaf(CAST_FROM_FN_PTR(address, stop_on_request), R3_ARG1, R4_ARG2);
5554   illtrap();
5555   emit_int32(id);
5556   block_comment("} stop;");
5557 }
5558 
5559 #ifndef PRODUCT
5560 // Write pattern 0x0101010101010101 in memory region [low-before, high+after].
5561 // Val, addr are temp registers.
5562 // If low == addr, addr is killed.
5563 // High is preserved.
5564 void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) {
5565   if (!ZapMemory) return;
5566 
5567   assert_different_registers(low, val);
5568 
5569   BLOCK_COMMENT("zap memory region {");
5570   load_const_optimized(val, 0x0101010101010101);
5571   int size = before + after;
5572   if (low == high && size < 5 && size > 0) {
5573     int offset = -before*BytesPerWord;
5574     for (int i = 0; i < size; ++i) {
5575       std(val, offset, low);
5576       offset += (1*BytesPerWord);
5577     }
5578   } else {
5579     addi(addr, low, -before*BytesPerWord);
5580     assert_different_registers(high, val);
5581     if (after) addi(high, high, after * BytesPerWord);
5582     Label loop;
5583     bind(loop);
5584     std(val, 0, addr);
5585     addi(addr, addr, 8);
5586     cmpd(CCR6, addr, high);
5587     ble(CCR6, loop);
5588     if (after) addi(high, high, -after * BytesPerWord);  // Correct back to old value.
5589   }
5590   BLOCK_COMMENT("} zap memory region");
5591 }
5592 
5593 #endif // !PRODUCT
5594 
5595 SkipIfEqualZero::SkipIfEqualZero(MacroAssembler* masm, Register temp, const bool* flag_addr) : _masm(masm), _label() {
5596   int simm16_offset = masm->load_const_optimized(temp, (address)flag_addr, R0, true);
5597   assert(sizeof(bool) == 1, "PowerPC ABI");
5598   masm->lbz(temp, simm16_offset, temp);
5599   masm->cmpwi(CCR0, temp, 0);
5600   masm->beq(CCR0, _label);
5601 }
5602 
5603 SkipIfEqualZero::~SkipIfEqualZero() {
5604   _masm->bind(_label);
5605 }