1 /*
   2  * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2012, 2018, SAP SE. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.inline.hpp"
  28 #include "compiler/disassembler.hpp"
  29 #include "gc/shared/collectedHeap.inline.hpp"
  30 #include "gc/shared/barrierSet.hpp"
  31 #include "gc/shared/barrierSetAssembler.hpp"
  32 #include "interpreter/interpreter.hpp"
  33 #include "memory/resourceArea.hpp"
  34 #include "nativeInst_ppc.hpp"
  35 #include "prims/methodHandles.hpp"
  36 #include "runtime/biasedLocking.hpp"
  37 #include "runtime/icache.hpp"
  38 #include "runtime/interfaceSupport.inline.hpp"
  39 #include "runtime/objectMonitor.hpp"
  40 #include "runtime/os.hpp"
  41 #include "runtime/safepoint.hpp"
  42 #include "runtime/safepointMechanism.hpp"
  43 #include "runtime/sharedRuntime.hpp"
  44 #include "runtime/stubRoutines.hpp"
  45 #include "utilities/macros.hpp"
  46 #ifdef COMPILER2
  47 #include "opto/intrinsicnode.hpp"
  48 #endif
  49 
  50 #ifdef PRODUCT
  51 #define BLOCK_COMMENT(str) // nothing
  52 #else
  53 #define BLOCK_COMMENT(str) block_comment(str)
  54 #endif
  55 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  56 
  57 #ifdef ASSERT
  58 // On RISC, there's no benefit to verifying instruction boundaries.
  59 bool AbstractAssembler::pd_check_instruction_mark() { return false; }
  60 #endif
  61 
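// Load a doubleword from a + si31, where si31 is a non-negative 31-bit offset.
// If the offset fits into a signed 16-bit displacement, a single ld is emitted,
// optionally followed by a filler nop so the sequence has the same size as the
// two-instruction form; otherwise the offset is split into a high part for the
// addis and a sign-compensated low part used as the ld displacement.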
  62 void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) {
  63   assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range");
  64   if (Assembler::is_simm(si31, 16)) {
  65     ld(d, si31, a);
  66     if (emit_filler_nop) nop();
  67   } else {
  68     const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31);
  69     const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31);
  70     addis(d, a, hi);
  71     ld(d, lo, d);
  72   }
  73 }
  74 
  75 void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) {
  76   assert_different_registers(d, a);
  77   ld_largeoffset_unchecked(d, si31, a, emit_filler_nop);
  78 }
  79 
  80 void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base,
  81                                       size_t size_in_bytes, bool is_signed) {
  82   switch (size_in_bytes) {
  83   case  8:              ld(dst, offs, base);                         break;
  84   case  4:  is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break;
  85   case  2:  is_signed ? lha(dst, offs, base) : lhz(dst, offs, base); break;
  86   case  1:  lbz(dst, offs, base); if (is_signed) extsb(dst, dst);    break; // lba doesn't exist :(
  87   default:  ShouldNotReachHere();
  88   }
  89 }
  90 
  91 void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base,
  92                                        size_t size_in_bytes) {
  93   switch (size_in_bytes) {
  94   case  8:  std(dst, offs, base); break;
  95   case  4:  stw(dst, offs, base); break;
  96   case  2:  sth(dst, offs, base); break;
  97   case  1:  stb(dst, offs, base); break;
  98   default:  ShouldNotReachHere();
  99   }
 100 }
 101 
 102 void MacroAssembler::align(int modulus, int max, int rem) {
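  // Pad with nops until the current offset is congruent to rem modulo modulus,
  // but only if at most max bytes of padding are needed.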
 103   int padding = (rem + modulus - (offset() % modulus)) % modulus;
 104   if (padding > max) return;
 105   for (int c = (padding >> 2); c > 0; --c) { nop(); }
 106 }
 107 
// Issue instructions that calculate the given address from the global TOC.
 109 void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16,
 110                                                        bool add_relocation, bool emit_dummy_addr) {
 111   int offset = -1;
 112   if (emit_dummy_addr) {
 113     offset = -128; // dummy address
 114   } else if (addr != (address)(intptr_t)-1) {
 115     offset = MacroAssembler::offset_to_global_toc(addr);
 116   }
 117 
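  // Emit an addis/addi pair relative to the global TOC (R29). The high part is
  // sign-compensated (see largeoffset_si16_si16_hi/_lo) so that the addi's
  // signed 16-bit immediate reconstructs the full offset.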
 118   if (hi16) {
 119     addis(dst, R29_TOC, MacroAssembler::largeoffset_si16_si16_hi(offset));
 120   }
 121   if (lo16) {
 122     if (add_relocation) {
 123       // Relocate at the addi to avoid confusion with a load from the method's TOC.
 124       relocate(internal_word_Relocation::spec(addr));
 125     }
 126     addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset));
 127   }
 128 }
 129 
 130 address MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) {
 131   const int offset = MacroAssembler::offset_to_global_toc(addr);
 132 
 133   const address inst2_addr = a;
 134   const int inst2 = *(int *)inst2_addr;
 135 
 136   // The relocation points to the second instruction, the addi,
 137   // and the addi reads and writes the same register dst.
 138   const int dst = inv_rt_field(inst2);
 139   assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");
 140 
 141   // Now, find the preceding addis which writes to dst.
 142   int inst1 = 0;
 143   address inst1_addr = inst2_addr - BytesPerInstWord;
 144   while (inst1_addr >= bound) {
 145     inst1 = *(int *) inst1_addr;
 146     if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
 147       // Stop, found the addis which writes dst.
 148       break;
 149     }
 150     inst1_addr -= BytesPerInstWord;
 151   }
 152 
 153   assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
 154   set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset));
 155   set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset));
 156   return inst1_addr;
 157 }
 158 
 159 address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) {
 160   const address inst2_addr = a;
 161   const int inst2 = *(int *)inst2_addr;
 162 
 163   // The relocation points to the second instruction, the addi,
 164   // and the addi reads and writes the same register dst.
 165   const int dst = inv_rt_field(inst2);
 166   assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");
 167 
 168   // Now, find the preceding addis which writes to dst.
 169   int inst1 = 0;
 170   address inst1_addr = inst2_addr - BytesPerInstWord;
 171   while (inst1_addr >= bound) {
 172     inst1 = *(int *) inst1_addr;
 173     if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
 174       // stop, found the addis which writes dst
 175       break;
 176     }
 177     inst1_addr -= BytesPerInstWord;
 178   }
 179 
 180   assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
 181 
 182   int offset = (get_imm(inst1_addr, 0) << 16) + get_imm(inst2_addr, 0);
 183   // -1 is a special case
 184   if (offset == -1) {
 185     return (address)(intptr_t)-1;
 186   } else {
 187     return global_toc() + offset;
 188   }
 189 }
 190 
 191 #ifdef _LP64
 192 // Patch compressed oops or klass constants.
 193 // Assembler sequence is
 194 // 1) compressed oops:
 195 //    lis  rx = const.hi
 196 //    ori rx = rx | const.lo
 197 // 2) compressed klass:
 198 //    lis  rx = const.hi
 199 //    clrldi rx = rx & 0xFFFFffff // clearMS32b, optional
 200 //    ori rx = rx | const.lo
// The clrldi, if present, is skipped over when patching.
 202 address MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) {
 203   assert(UseCompressedOops, "Should only patch compressed oops");
 204 
 205   const address inst2_addr = a;
 206   const int inst2 = *(int *)inst2_addr;
 207 
 208   // The relocation points to the second instruction, the ori,
 209   // and the ori reads and writes the same register dst.
 210   const int dst = inv_rta_field(inst2);
 211   assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
  // Now, find the preceding lis which writes to dst.
 213   int inst1 = 0;
 214   address inst1_addr = inst2_addr - BytesPerInstWord;
 215   bool inst1_found = false;
 216   while (inst1_addr >= bound) {
 217     inst1 = *(int *)inst1_addr;
 218     if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
 219     inst1_addr -= BytesPerInstWord;
 220   }
 221   assert(inst1_found, "inst is not lis");
 222 
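  // Split the 32-bit narrow constant into halfwords: the lis gets the
  // (sign-extended) high halfword, the ori the low halfword as unsigned value.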
 223   int xc = (data >> 16) & 0xffff;
 224   int xd = (data >>  0) & 0xffff;
 225 
 226   set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo
 227   set_imm((int *)inst2_addr,        (xd)); // unsigned int
 228   return inst1_addr;
 229 }
 230 
 231 // Get compressed oop or klass constant.
 232 narrowOop MacroAssembler::get_narrow_oop(address a, address bound) {
 233   assert(UseCompressedOops, "Should only patch compressed oops");
 234 
 235   const address inst2_addr = a;
 236   const int inst2 = *(int *)inst2_addr;
 237 
 238   // The relocation points to the second instruction, the ori,
 239   // and the ori reads and writes the same register dst.
 240   const int dst = inv_rta_field(inst2);
 241   assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
 242   // Now, find the preceding lis which writes to dst.
 243   int inst1 = 0;
 244   address inst1_addr = inst2_addr - BytesPerInstWord;
 245   bool inst1_found = false;
 246 
 247   while (inst1_addr >= bound) {
 248     inst1 = *(int *) inst1_addr;
 249     if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break;}
 250     inst1_addr -= BytesPerInstWord;
 251   }
 252   assert(inst1_found, "inst is not lis");
 253 
 254   uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff));
 255   uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16);
 256 
 257   return (int) (xl | xh);
 258 }
 259 #endif // _LP64
 260 
 261 // Returns true if successful.
 262 bool MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a,
 263                                                 Register toc, bool fixed_size) {
 264   int toc_offset = 0;
 265   // Use RelocationHolder::none for the constant pool entry, otherwise
 266   // we will end up with a failing NativeCall::verify(x) where x is
 267   // the address of the constant pool entry.
 268   // FIXME: We should insert relocation information for oops at the constant
 269   // pool entries instead of inserting it at the loads; patching of a constant
 270   // pool entry should be less expensive.
 271   address const_address = address_constant((address)a.value(), RelocationHolder::none);
 272   if (const_address == NULL) { return false; } // allocation failure
 273   // Relocate at the pc of the load.
 274   relocate(a.rspec());
 275   toc_offset = (int)(const_address - code()->consts()->start());
 276   ld_largeoffset_unchecked(dst, toc_offset, toc, fixed_size);
 277   return true;
 278 }
 279 
 280 bool MacroAssembler::is_load_const_from_method_toc_at(address a) {
 281   const address inst1_addr = a;
 282   const int inst1 = *(int *)inst1_addr;
 283 
  // The relocation points to the ld or the addis.
  return (is_ld(inst1)) ||
         (is_addis(inst1) && inv_ra_field(inst1) != 0);
 287 }
 288 
 289 int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) {
 290   assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc");
 291 
 292   const address inst1_addr = a;
 293   const int inst1 = *(int *)inst1_addr;
 294 
 295   if (is_ld(inst1)) {
 296     return inv_d1_field(inst1);
 297   } else if (is_addis(inst1)) {
 298     const int dst = inv_rt_field(inst1);
 299 
 300     // Now, find the succeeding ld which reads and writes to dst.
 301     address inst2_addr = inst1_addr + BytesPerInstWord;
 302     int inst2 = 0;
 303     while (true) {
 304       inst2 = *(int *) inst2_addr;
 305       if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) {
 306         // Stop, found the ld which reads and writes dst.
 307         break;
 308       }
 309       inst2_addr += BytesPerInstWord;
 310     }
 311     return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2);
 312   }
 313   ShouldNotReachHere();
 314   return 0;
 315 }
 316 
 317 // Get the constant from a `load_const' sequence.
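// load_const emits either lis, ori, sldi, oris, ori (halfword immediates in
// instruction slots 0, 1, 3, 4) or, if a temp register was available,
// lis, lis, ori, ori, rldimi (halfwords in slots 0, 2, 1, 3, from most to
// least significant). Both encodings are decoded below.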
 318 long MacroAssembler::get_const(address a) {
 319   assert(is_load_const_at(a), "not a load of a constant");
 320   const int *p = (const int*) a;
 321   unsigned long x = (((unsigned long) (get_imm(a,0) & 0xffff)) << 48);
 322   if (is_ori(*(p+1))) {
 323     x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 32);
 324     x |= (((unsigned long) (get_imm(a,3) & 0xffff)) << 16);
 325     x |= (((unsigned long) (get_imm(a,4) & 0xffff)));
 326   } else if (is_lis(*(p+1))) {
 327     x |= (((unsigned long) (get_imm(a,2) & 0xffff)) << 32);
 328     x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 16);
 329     x |= (((unsigned long) (get_imm(a,3) & 0xffff)));
 330   } else {
 331     ShouldNotReachHere();
 332     return (long) 0;
 333   }
 334   return (long) x;
 335 }
 336 
// Patch the 64-bit constant of a `load_const' sequence. This is a low-level
// procedure; it neither flushes the instruction cache nor is it MT-safe.
 340 void MacroAssembler::patch_const(address a, long x) {
 341   assert(is_load_const_at(a), "not a load of a constant");
 342   int *p = (int*) a;
 343   if (is_ori(*(p+1))) {
 344     set_imm(0 + p, (x >> 48) & 0xffff);
 345     set_imm(1 + p, (x >> 32) & 0xffff);
 346     set_imm(3 + p, (x >> 16) & 0xffff);
 347     set_imm(4 + p, x & 0xffff);
 348   } else if (is_lis(*(p+1))) {
 349     set_imm(0 + p, (x >> 48) & 0xffff);
 350     set_imm(2 + p, (x >> 32) & 0xffff);
 351     set_imm(1 + p, (x >> 16) & 0xffff);
 352     set_imm(3 + p, x & 0xffff);
 353   } else {
 354     ShouldNotReachHere();
 355   }
 356 }
 357 
 358 AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) {
 359   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
 360   int index = oop_recorder()->allocate_metadata_index(obj);
 361   RelocationHolder rspec = metadata_Relocation::spec(index);
 362   return AddressLiteral((address)obj, rspec);
 363 }
 364 
 365 AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) {
 366   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
 367   int index = oop_recorder()->find_index(obj);
 368   RelocationHolder rspec = metadata_Relocation::spec(index);
 369   return AddressLiteral((address)obj, rspec);
 370 }
 371 
 372 AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) {
 373   assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
 374   int oop_index = oop_recorder()->allocate_oop_index(obj);
 375   return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
 376 }
 377 
 378 AddressLiteral MacroAssembler::constant_oop_address(jobject obj) {
 379   assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
 380   int oop_index = oop_recorder()->find_index(obj);
 381   return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
 382 }
 383 
 384 RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
 385                                                       Register tmp, int offset) {
 386   intptr_t value = *delayed_value_addr;
 387   if (value != 0) {
 388     return RegisterOrConstant(value + offset);
 389   }
 390 
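  // The delayed value is not known yet (still 0), so it cannot be embedded as
  // an immediate. Emit a load from its address and resolve it at run time.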
 391   // Load indirectly to solve generation ordering problem.
 392   // static address, no relocation
 393   int simm16_offset = load_const_optimized(tmp, delayed_value_addr, noreg, true);
 394   ld(tmp, simm16_offset, tmp); // must be aligned ((xa & 3) == 0)
 395 
 396   if (offset != 0) {
 397     addi(tmp, tmp, offset);
 398   }
 399 
 400   return RegisterOrConstant(tmp);
 401 }
 402 
 403 #ifndef PRODUCT
 404 void MacroAssembler::pd_print_patched_instruction(address branch) {
 405   Unimplemented(); // TODO: PPC port
 406 }
 407 #endif // ndef PRODUCT
 408 
 409 // Conditional far branch for destinations encodable in 24+2 bits.
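// A plain bc has only a 14+2 bit displacement. For far targets we invert the
// condition and branch around an unconditional b, which reaches 24+2 bits.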
 410 void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) {
 411 
 412   // If requested by flag optimize, relocate the bc_far as a
 413   // runtime_call and prepare for optimizing it when the code gets
 414   // relocated.
 415   if (optimize == bc_far_optimize_on_relocate) {
 416     relocate(relocInfo::runtime_call_type);
 417   }
 418 
 419   // variant 2:
 420   //
 421   //    b!cxx SKIP
 422   //    bxx   DEST
 423   //  SKIP:
 424   //
 425 
 426   const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
 427                                                 opposite_bcond(inv_boint_bcond(boint)));
 428 
 429   // We emit two branches.
 430   // First, a conditional branch which jumps around the far branch.
 431   const address not_taken_pc = pc() + 2 * BytesPerInstWord;
 432   const address bc_pc        = pc();
 433   bc(opposite_boint, biint, not_taken_pc);
 434 
 435   const int bc_instr = *(int*)bc_pc;
 436   assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition");
 437   assert(opposite_boint == inv_bo_field(bc_instr), "postcondition");
 438   assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))),
 439                                      opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))),
 440          "postcondition");
 441   assert(biint == inv_bi_field(bc_instr), "postcondition");
 442 
 443   // Second, an unconditional far branch which jumps to dest.
 444   // Note: target(dest) remembers the current pc (see CodeSection::target)
 445   //       and returns the current pc if the label is not bound yet; when
 446   //       the label gets bound, the unconditional far branch will be patched.
 447   const address target_pc = target(dest);
 448   const address b_pc  = pc();
 449   b(target_pc);
 450 
 451   assert(not_taken_pc == pc(),                     "postcondition");
 452   assert(dest.is_bound() || target_pc == b_pc, "postcondition");
 453 }
 454 
 455 // 1 or 2 instructions
 456 void MacroAssembler::bc_far_optimized(int boint, int biint, Label& dest) {
 457   if (dest.is_bound() && is_within_range_of_bcxx(target(dest), pc())) {
 458     bc(boint, biint, dest);
 459   } else {
 460     bc_far(boint, biint, dest, MacroAssembler::bc_far_optimize_on_relocate);
 461   }
 462 }
 463 
 464 bool MacroAssembler::is_bc_far_at(address instruction_addr) {
 465   return is_bc_far_variant1_at(instruction_addr) ||
 466          is_bc_far_variant2_at(instruction_addr) ||
 467          is_bc_far_variant3_at(instruction_addr);
 468 }
 469 
 470 address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) {
 471   if (is_bc_far_variant1_at(instruction_addr)) {
 472     const address instruction_1_addr = instruction_addr;
 473     const int instruction_1 = *(int*)instruction_1_addr;
 474     return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr);
 475   } else if (is_bc_far_variant2_at(instruction_addr)) {
 476     const address instruction_2_addr = instruction_addr + 4;
 477     return bxx_destination(instruction_2_addr);
 478   } else if (is_bc_far_variant3_at(instruction_addr)) {
 479     return instruction_addr + 8;
 480   }
 481   // variant 4 ???
 482   ShouldNotReachHere();
 483   return NULL;
 484 }

void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) {
 486 
 487   if (is_bc_far_variant3_at(instruction_addr)) {
 488     // variant 3, far cond branch to the next instruction, already patched to nops:
 489     //
 490     //    nop
 491     //    endgroup
 492     //  SKIP/DEST:
 493     //
 494     return;
 495   }
 496 
 497   // first, extract boint and biint from the current branch
 498   int boint = 0;
 499   int biint = 0;
 500 
 501   ResourceMark rm;
 502   const int code_size = 2 * BytesPerInstWord;
 503   CodeBuffer buf(instruction_addr, code_size);
 504   MacroAssembler masm(&buf);
 505   if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) {
 506     // Far branch to next instruction: Optimize it by patching nops (produce variant 3).
 507     masm.nop();
 508     masm.endgroup();
 509   } else {
 510     if (is_bc_far_variant1_at(instruction_addr)) {
 511       // variant 1, the 1st instruction contains the destination address:
 512       //
 513       //    bcxx  DEST
 514       //    nop
 515       //
 516       const int instruction_1 = *(int*)(instruction_addr);
 517       boint = inv_bo_field(instruction_1);
 518       biint = inv_bi_field(instruction_1);
 519     } else if (is_bc_far_variant2_at(instruction_addr)) {
 520       // variant 2, the 2nd instruction contains the destination address:
 521       //
 522       //    b!cxx SKIP
 523       //    bxx   DEST
 524       //  SKIP:
 525       //
 526       const int instruction_1 = *(int*)(instruction_addr);
 527       boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))),
 528           opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1))));
 529       biint = inv_bi_field(instruction_1);
 530     } else {
 531       // variant 4???
 532       ShouldNotReachHere();
 533     }
 534 
 535     // second, set the new branch destination and optimize the code
 536     if (dest != instruction_addr + 4 && // the bc_far is still unbound!
 537         masm.is_within_range_of_bcxx(dest, instruction_addr)) {
 538       // variant 1:
 539       //
 540       //    bcxx  DEST
 541       //    nop
 542       //
 543       masm.bc(boint, biint, dest);
 544       masm.nop();
 545     } else {
 546       // variant 2:
 547       //
 548       //    b!cxx SKIP
 549       //    bxx   DEST
 550       //  SKIP:
 551       //
 552       const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
 553                                                     opposite_bcond(inv_boint_bcond(boint)));
 554       const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord;
 555       masm.bc(opposite_boint, biint, not_taken_pc);
 556       masm.b(dest);
 557     }
 558   }
 559   ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
 560 }
 561 
// Emit a patchable 64-bit absolute call/jump. NOT MT-safe.
 563 void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) {
 564   // get current pc
 565   uint64_t start_pc = (uint64_t) pc();
 566 
 567   const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last
 568   const address pc_of_b  = (address) (start_pc + (0*BytesPerInstWord)); // b is first
 569 
 570   // relocate here
 571   if (rt != relocInfo::none) {
 572     relocate(rt);
 573   }
 574 
 575   if ( ReoptimizeCallSequences &&
 576        (( link && is_within_range_of_b(dest, pc_of_bl)) ||
 577         (!link && is_within_range_of_b(dest, pc_of_b)))) {
 578     // variant 2:
 579     // Emit an optimized, pc-relative call/jump.
 580 
 581     if (link) {
 582       // some padding
 583       nop();
 584       nop();
 585       nop();
 586       nop();
 587       nop();
 588       nop();
 589 
 590       // do the call
 591       assert(pc() == pc_of_bl, "just checking");
 592       bl(dest, relocInfo::none);
 593     } else {
 594       // do the jump
 595       assert(pc() == pc_of_b, "just checking");
 596       b(dest, relocInfo::none);
 597 
 598       // some padding
 599       nop();
 600       nop();
 601       nop();
 602       nop();
 603       nop();
 604       nop();
 605     }
 606 
 607     // Assert that we can identify the emitted call/jump.
 608     assert(is_bxx64_patchable_variant2_at((address)start_pc, link),
 609            "can't identify emitted call");
 610   } else {
 611     // variant 1:
 612     mr(R0, R11);  // spill R11 -> R0.
 613 
 614     // Load the destination address into CTR,
 615     // calculate destination relative to global toc.
 616     calculate_address_from_global_toc(R11, dest, true, true, false);
 617 
 618     mtctr(R11);
 619     mr(R11, R0);  // spill R11 <- R0.
 620     nop();
 621 
 622     // do the call/jump
 623     if (link) {
 624       bctrl();
    } else {
 626       bctr();
 627     }
 628     // Assert that we can identify the emitted call/jump.
 629     assert(is_bxx64_patchable_variant1b_at((address)start_pc, link),
 630            "can't identify emitted call");
 631   }
 632 
 633   // Assert that we can identify the emitted call/jump.
 634   assert(is_bxx64_patchable_at((address)start_pc, link),
 635          "can't identify emitted call");
 636   assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest,
 637          "wrong encoding of dest address");
 638 }
 639 
 640 // Identify a bxx64_patchable instruction.
 641 bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) {
 642   return is_bxx64_patchable_variant1b_at(instruction_addr, link)
 643     //|| is_bxx64_patchable_variant1_at(instruction_addr, link)
 644       || is_bxx64_patchable_variant2_at(instruction_addr, link);
 645 }
 646 
// Does the bxx64_patchable instruction use a pc-relative encoding of
// the call destination?
 649 bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) {
 650   // variant 2 is pc-relative
 651   return is_bxx64_patchable_variant2_at(instruction_addr, link);
 652 }
 653 
 654 // Identify variant 1.
 655 bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) {
 656   unsigned int* instr = (unsigned int*) instruction_addr;
  return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
      && is_mtctr(instr[5]) // mtctr
      && is_load_const_at(instruction_addr);
 660 }
 661 
 662 // Identify variant 1b: load destination relative to global toc.
 663 bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) {
 664   unsigned int* instr = (unsigned int*) instruction_addr;
 665   return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
 666     && is_mtctr(instr[3]) // mtctr
 667     && is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr);
 668 }
 669 
 670 // Identify variant 2.
 671 bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) {
 672   unsigned int* instr = (unsigned int*) instruction_addr;
 673   if (link) {
 674     return is_bl (instr[6])  // bl dest is last
 675       && is_nop(instr[0])  // nop
 676       && is_nop(instr[1])  // nop
 677       && is_nop(instr[2])  // nop
 678       && is_nop(instr[3])  // nop
 679       && is_nop(instr[4])  // nop
 680       && is_nop(instr[5]); // nop
 681   } else {
 682     return is_b  (instr[0])  // b  dest is first
 683       && is_nop(instr[1])  // nop
 684       && is_nop(instr[2])  // nop
 685       && is_nop(instr[3])  // nop
 686       && is_nop(instr[4])  // nop
 687       && is_nop(instr[5])  // nop
 688       && is_nop(instr[6]); // nop
 689   }
 690 }
 691 
 692 // Set dest address of a bxx64_patchable instruction.
 693 void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) {
 694   ResourceMark rm;
 695   int code_size = MacroAssembler::bxx64_patchable_size;
 696   CodeBuffer buf(instruction_addr, code_size);
 697   MacroAssembler masm(&buf);
 698   masm.bxx64_patchable(dest, relocInfo::none, link);
 699   ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
 700 }
 701 
 702 // Get dest address of a bxx64_patchable instruction.
 703 address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) {
 704   if (is_bxx64_patchable_variant1_at(instruction_addr, link)) {
 705     return (address) (unsigned long) get_const(instruction_addr);
 706   } else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) {
 707     unsigned int* instr = (unsigned int*) instruction_addr;
 708     if (link) {
 709       const int instr_idx = 6; // bl is last
 710       int branchoffset = branch_destination(instr[instr_idx], 0);
 711       return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
 712     } else {
 713       const int instr_idx = 0; // b is first
 714       int branchoffset = branch_destination(instr[instr_idx], 0);
 715       return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
 716     }
 717   // Load dest relative to global toc.
 718   } else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) {
 719     return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord,
 720                                                                instruction_addr);
 721   } else {
 722     ShouldNotReachHere();
 723     return NULL;
 724   }
 725 }
 726 
 727 // Uses ordering which corresponds to ABI:
 728 //    _savegpr0_14:  std  r14,-144(r1)
 729 //    _savegpr0_15:  std  r15,-136(r1)
 730 //    _savegpr0_16:  std  r16,-128(r1)
 731 void MacroAssembler::save_nonvolatile_gprs(Register dst, int offset) {
 732   std(R14, offset, dst);   offset += 8;
 733   std(R15, offset, dst);   offset += 8;
 734   std(R16, offset, dst);   offset += 8;
 735   std(R17, offset, dst);   offset += 8;
 736   std(R18, offset, dst);   offset += 8;
 737   std(R19, offset, dst);   offset += 8;
 738   std(R20, offset, dst);   offset += 8;
 739   std(R21, offset, dst);   offset += 8;
 740   std(R22, offset, dst);   offset += 8;
 741   std(R23, offset, dst);   offset += 8;
 742   std(R24, offset, dst);   offset += 8;
 743   std(R25, offset, dst);   offset += 8;
 744   std(R26, offset, dst);   offset += 8;
 745   std(R27, offset, dst);   offset += 8;
 746   std(R28, offset, dst);   offset += 8;
 747   std(R29, offset, dst);   offset += 8;
 748   std(R30, offset, dst);   offset += 8;
 749   std(R31, offset, dst);   offset += 8;
 750 
 751   stfd(F14, offset, dst);   offset += 8;
 752   stfd(F15, offset, dst);   offset += 8;
 753   stfd(F16, offset, dst);   offset += 8;
 754   stfd(F17, offset, dst);   offset += 8;
 755   stfd(F18, offset, dst);   offset += 8;
 756   stfd(F19, offset, dst);   offset += 8;
 757   stfd(F20, offset, dst);   offset += 8;
 758   stfd(F21, offset, dst);   offset += 8;
 759   stfd(F22, offset, dst);   offset += 8;
 760   stfd(F23, offset, dst);   offset += 8;
 761   stfd(F24, offset, dst);   offset += 8;
 762   stfd(F25, offset, dst);   offset += 8;
 763   stfd(F26, offset, dst);   offset += 8;
 764   stfd(F27, offset, dst);   offset += 8;
 765   stfd(F28, offset, dst);   offset += 8;
 766   stfd(F29, offset, dst);   offset += 8;
 767   stfd(F30, offset, dst);   offset += 8;
 768   stfd(F31, offset, dst);
 769 }
 770 
 771 // Uses ordering which corresponds to ABI:
 772 //    _restgpr0_14:  ld   r14,-144(r1)
 773 //    _restgpr0_15:  ld   r15,-136(r1)
 774 //    _restgpr0_16:  ld   r16,-128(r1)
 775 void MacroAssembler::restore_nonvolatile_gprs(Register src, int offset) {
 776   ld(R14, offset, src);   offset += 8;
 777   ld(R15, offset, src);   offset += 8;
 778   ld(R16, offset, src);   offset += 8;
 779   ld(R17, offset, src);   offset += 8;
 780   ld(R18, offset, src);   offset += 8;
 781   ld(R19, offset, src);   offset += 8;
 782   ld(R20, offset, src);   offset += 8;
 783   ld(R21, offset, src);   offset += 8;
 784   ld(R22, offset, src);   offset += 8;
 785   ld(R23, offset, src);   offset += 8;
 786   ld(R24, offset, src);   offset += 8;
 787   ld(R25, offset, src);   offset += 8;
 788   ld(R26, offset, src);   offset += 8;
 789   ld(R27, offset, src);   offset += 8;
 790   ld(R28, offset, src);   offset += 8;
 791   ld(R29, offset, src);   offset += 8;
 792   ld(R30, offset, src);   offset += 8;
 793   ld(R31, offset, src);   offset += 8;
 794 
 795   // FP registers
 796   lfd(F14, offset, src);   offset += 8;
 797   lfd(F15, offset, src);   offset += 8;
 798   lfd(F16, offset, src);   offset += 8;
 799   lfd(F17, offset, src);   offset += 8;
 800   lfd(F18, offset, src);   offset += 8;
 801   lfd(F19, offset, src);   offset += 8;
 802   lfd(F20, offset, src);   offset += 8;
 803   lfd(F21, offset, src);   offset += 8;
 804   lfd(F22, offset, src);   offset += 8;
 805   lfd(F23, offset, src);   offset += 8;
 806   lfd(F24, offset, src);   offset += 8;
 807   lfd(F25, offset, src);   offset += 8;
 808   lfd(F26, offset, src);   offset += 8;
 809   lfd(F27, offset, src);   offset += 8;
 810   lfd(F28, offset, src);   offset += 8;
 811   lfd(F29, offset, src);   offset += 8;
 812   lfd(F30, offset, src);   offset += 8;
 813   lfd(F31, offset, src);
 814 }
 815 
 816 // For verify_oops.
 817 void MacroAssembler::save_volatile_gprs(Register dst, int offset) {
 818   std(R2,  offset, dst);   offset += 8;
 819   std(R3,  offset, dst);   offset += 8;
 820   std(R4,  offset, dst);   offset += 8;
 821   std(R5,  offset, dst);   offset += 8;
 822   std(R6,  offset, dst);   offset += 8;
 823   std(R7,  offset, dst);   offset += 8;
 824   std(R8,  offset, dst);   offset += 8;
 825   std(R9,  offset, dst);   offset += 8;
 826   std(R10, offset, dst);   offset += 8;
 827   std(R11, offset, dst);   offset += 8;
 828   std(R12, offset, dst);   offset += 8;
 829 
 830   stfd(F0, offset, dst);   offset += 8;
 831   stfd(F1, offset, dst);   offset += 8;
 832   stfd(F2, offset, dst);   offset += 8;
 833   stfd(F3, offset, dst);   offset += 8;
 834   stfd(F4, offset, dst);   offset += 8;
 835   stfd(F5, offset, dst);   offset += 8;
 836   stfd(F6, offset, dst);   offset += 8;
 837   stfd(F7, offset, dst);   offset += 8;
 838   stfd(F8, offset, dst);   offset += 8;
 839   stfd(F9, offset, dst);   offset += 8;
 840   stfd(F10, offset, dst);  offset += 8;
 841   stfd(F11, offset, dst);  offset += 8;
 842   stfd(F12, offset, dst);  offset += 8;
 843   stfd(F13, offset, dst);
 844 }
 845 
 846 // For verify_oops.
 847 void MacroAssembler::restore_volatile_gprs(Register src, int offset) {
 848   ld(R2,  offset, src);   offset += 8;
 849   ld(R3,  offset, src);   offset += 8;
 850   ld(R4,  offset, src);   offset += 8;
 851   ld(R5,  offset, src);   offset += 8;
 852   ld(R6,  offset, src);   offset += 8;
 853   ld(R7,  offset, src);   offset += 8;
 854   ld(R8,  offset, src);   offset += 8;
 855   ld(R9,  offset, src);   offset += 8;
 856   ld(R10, offset, src);   offset += 8;
 857   ld(R11, offset, src);   offset += 8;
 858   ld(R12, offset, src);   offset += 8;
 859 
 860   lfd(F0, offset, src);   offset += 8;
 861   lfd(F1, offset, src);   offset += 8;
 862   lfd(F2, offset, src);   offset += 8;
 863   lfd(F3, offset, src);   offset += 8;
 864   lfd(F4, offset, src);   offset += 8;
 865   lfd(F5, offset, src);   offset += 8;
 866   lfd(F6, offset, src);   offset += 8;
 867   lfd(F7, offset, src);   offset += 8;
 868   lfd(F8, offset, src);   offset += 8;
 869   lfd(F9, offset, src);   offset += 8;
 870   lfd(F10, offset, src);  offset += 8;
 871   lfd(F11, offset, src);  offset += 8;
 872   lfd(F12, offset, src);  offset += 8;
 873   lfd(F13, offset, src);
 874 }
 875 
 876 void MacroAssembler::save_LR_CR(Register tmp) {
 877   mfcr(tmp);
 878   std(tmp, _abi(cr), R1_SP);
 879   mflr(tmp);
 880   std(tmp, _abi(lr), R1_SP);
 881   // Tmp must contain lr on exit! (see return_addr and prolog in ppc64.ad)
 882 }
 883 
 884 void MacroAssembler::restore_LR_CR(Register tmp) {
 885   assert(tmp != R1_SP, "must be distinct");
 886   ld(tmp, _abi(lr), R1_SP);
 887   mtlr(tmp);
 888   ld(tmp, _abi(cr), R1_SP);
 889   mtcr(tmp);
 890 }
 891 
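// Materialize the current PC: bl to the immediately following instruction puts
// that instruction's address into LR, which is then copied into result.
// Clobbers LR, hence the name.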
 892 address MacroAssembler::get_PC_trash_LR(Register result) {
 893   Label L;
 894   bl(L);
 895   bind(L);
 896   address lr_pc = pc();
 897   mflr(result);
 898   return lr_pc;
 899 }
 900 
 901 void MacroAssembler::resize_frame(Register offset, Register tmp) {
 902 #ifdef ASSERT
 903   assert_different_registers(offset, tmp, R1_SP);
 904   andi_(tmp, offset, frame::alignment_in_bytes-1);
 905   asm_assert_eq("resize_frame: unaligned", 0x204);
 906 #endif
 907 
 908   // tmp <- *(SP)
 909   ld(tmp, _abi(callers_sp), R1_SP);
 910   // addr <- SP + offset;
 911   // *(addr) <- tmp;
 912   // SP <- addr
 913   stdux(tmp, R1_SP, offset);
 914 }
 915 
 916 void MacroAssembler::resize_frame(int offset, Register tmp) {
 917   assert(is_simm(offset, 16), "too big an offset");
 918   assert_different_registers(tmp, R1_SP);
 919   assert((offset & (frame::alignment_in_bytes-1))==0, "resize_frame: unaligned");
 920   // tmp <- *(SP)
 921   ld(tmp, _abi(callers_sp), R1_SP);
 922   // addr <- SP + offset;
 923   // *(addr) <- tmp;
 924   // SP <- addr
 925   stdu(tmp, offset, R1_SP);
 926 }
 927 
 928 void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) {
 929   // (addr == tmp1) || (addr == tmp2) is allowed here!
 930   assert(tmp1 != tmp2, "must be distinct");
 931 
 932   // compute offset w.r.t. current stack pointer
 933   // tmp_1 <- addr - SP (!)
 934   subf(tmp1, R1_SP, addr);
 935 
 936   // atomically update SP keeping back link.
 937   resize_frame(tmp1/* offset */, tmp2/* tmp */);
 938 }
 939 
 940 void MacroAssembler::push_frame(Register bytes, Register tmp) {
 941 #ifdef ASSERT
 942   assert(bytes != R0, "r0 not allowed here");
 943   andi_(R0, bytes, frame::alignment_in_bytes-1);
 944   asm_assert_eq("push_frame(Reg, Reg): unaligned", 0x203);
 945 #endif
 946   neg(tmp, bytes);
 947   stdux(R1_SP, R1_SP, tmp);
 948 }
 949 
 950 // Push a frame of size `bytes'.
 951 void MacroAssembler::push_frame(unsigned int bytes, Register tmp) {
 952   long offset = align_addr(bytes, frame::alignment_in_bytes);
 953   if (is_simm(-offset, 16)) {
 954     stdu(R1_SP, -offset, R1_SP);
 955   } else {
 956     load_const_optimized(tmp, -offset);
 957     stdux(R1_SP, R1_SP, tmp);
 958   }
 959 }
 960 
 961 // Push a frame of size `bytes' plus abi_reg_args on top.
 962 void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) {
 963   push_frame(bytes + frame::abi_reg_args_size, tmp);
 964 }
 965 
// Set up a new C frame with a spill area for non-volatile GPRs and
// additional space for local variables.
 968 void MacroAssembler::push_frame_reg_args_nonvolatiles(unsigned int bytes,
 969                                                       Register tmp) {
 970   push_frame(bytes + frame::abi_reg_args_size + frame::spill_nonvolatiles_size, tmp);
 971 }
 972 
 973 // Pop current C frame.
 974 void MacroAssembler::pop_frame() {
 975   ld(R1_SP, _abi(callers_sp), R1_SP);
 976 }
 977 
 978 #if defined(ABI_ELFv2)
 979 address MacroAssembler::branch_to(Register r_function_entry, bool and_link) {
  // TODO(asmundak): make sure the caller uses R12 as function descriptor
  // most of the time.
 982   if (R12 != r_function_entry) {
 983     mr(R12, r_function_entry);
 984   }
 985   mtctr(R12);
 986   // Do a call or a branch.
 987   if (and_link) {
 988     bctrl();
 989   } else {
 990     bctr();
 991   }
 992   _last_calls_return_pc = pc();
 993 
 994   return _last_calls_return_pc;
 995 }
 996 
 997 // Call a C function via a function descriptor and use full C
 998 // calling conventions. Updates and returns _last_calls_return_pc.
 999 address MacroAssembler::call_c(Register r_function_entry) {
1000   return branch_to(r_function_entry, /*and_link=*/true);
1001 }
1002 
1003 // For tail calls: only branch, don't link, so callee returns to caller of this function.
1004 address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) {
1005   return branch_to(r_function_entry, /*and_link=*/false);
1006 }
1007 
1008 address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) {
1009   load_const(R12, function_entry, R0);
1010   return branch_to(R12,  /*and_link=*/true);
1011 }
1012 
1013 #else
1014 // Generic version of a call to C function via a function descriptor
1015 // with variable support for C calling conventions (TOC, ENV, etc.).
1016 // Updates and returns _last_calls_return_pc.
1017 address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call,
1018                                   bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) {
1019   // we emit standard ptrgl glue code here
1020   assert((function_descriptor != R0), "function_descriptor cannot be R0");
1021 
1022   // retrieve necessary entries from the function descriptor
1023   ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor);
1024   mtctr(R0);
1025 
1026   if (load_toc_of_callee) {
1027     ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor);
1028   }
1029   if (load_env_of_callee) {
1030     ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor);
1031   } else if (load_toc_of_callee) {
1032     li(R11, 0);
1033   }
1034 
1035   // do a call or a branch
1036   if (and_link) {
1037     bctrl();
1038   } else {
1039     bctr();
1040   }
1041   _last_calls_return_pc = pc();
1042 
1043   return _last_calls_return_pc;
1044 }
1045 
1046 // Call a C function via a function descriptor and use full C calling
1047 // conventions.
1048 // We don't use the TOC in generated code, so there is no need to save
1049 // and restore its value.
1050 address MacroAssembler::call_c(Register fd) {
1051   return branch_to(fd, /*and_link=*/true,
1052                        /*save toc=*/false,
1053                        /*restore toc=*/false,
1054                        /*load toc=*/true,
1055                        /*load env=*/true);
1056 }
1057 
1058 address MacroAssembler::call_c_and_return_to_caller(Register fd) {
1059   return branch_to(fd, /*and_link=*/false,
1060                        /*save toc=*/false,
1061                        /*restore toc=*/false,
1062                        /*load toc=*/true,
1063                        /*load env=*/true);
1064 }
1065 
1066 address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) {
1067   if (rt != relocInfo::none) {
1068     // this call needs to be relocatable
1069     if (!ReoptimizeCallSequences
1070         || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
1071         || fd == NULL   // support code-size estimation
1072         || !fd->is_friend_function()
1073         || fd->entry() == NULL) {
1074       // it's not a friend function as defined by class FunctionDescriptor,
1075       // so do a full call-c here.
1076       load_const(R11, (address)fd, R0);
1077 
1078       bool has_env = (fd != NULL && fd->env() != NULL);
1079       return branch_to(R11, /*and_link=*/true,
1080                             /*save toc=*/false,
1081                             /*restore toc=*/false,
1082                             /*load toc=*/true,
1083                             /*load env=*/has_env);
1084     } else {
1085       // It's a friend function. Load the entry point and don't care about
1086       // toc and env. Use an optimizable call instruction, but ensure the
1087       // same code-size as in the case of a non-friend function.
1088       nop();
1089       nop();
1090       nop();
1091       bl64_patchable(fd->entry(), rt);
1092       _last_calls_return_pc = pc();
1093       return _last_calls_return_pc;
1094     }
1095   } else {
1096     // This call does not need to be relocatable, do more aggressive
1097     // optimizations.
1098     if (!ReoptimizeCallSequences
1099       || !fd->is_friend_function()) {
1100       // It's not a friend function as defined by class FunctionDescriptor,
1101       // so do a full call-c here.
1102       load_const(R11, (address)fd, R0);
1103       return branch_to(R11, /*and_link=*/true,
1104                             /*save toc=*/false,
1105                             /*restore toc=*/false,
1106                             /*load toc=*/true,
1107                             /*load env=*/true);
1108     } else {
1109       // it's a friend function, load the entry point and don't care about
1110       // toc and env.
1111       address dest = fd->entry();
1112       if (is_within_range_of_b(dest, pc())) {
1113         bl(dest);
1114       } else {
1115         bl64_patchable(dest, rt);
1116       }
1117       _last_calls_return_pc = pc();
1118       return _last_calls_return_pc;
1119     }
1120   }
1121 }
1122 
1123 // Call a C function.  All constants needed reside in TOC.
1124 //
1125 // Read the address to call from the TOC.
1126 // Read env from TOC, if fd specifies an env.
1127 // Read new TOC from TOC.
1128 address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd,
1129                                          relocInfo::relocType rt, Register toc) {
1130   if (!ReoptimizeCallSequences
1131     || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
1132     || !fd->is_friend_function()) {
1133     // It's not a friend function as defined by class FunctionDescriptor,
1134     // so do a full call-c here.
1135     assert(fd->entry() != NULL, "function must be linked");
1136 
1137     AddressLiteral fd_entry(fd->entry());
1138     bool success = load_const_from_method_toc(R11, fd_entry, toc, /*fixed_size*/ true);
1139     mtctr(R11);
1140     if (fd->env() == NULL) {
1141       li(R11, 0);
1142       nop();
1143     } else {
1144       AddressLiteral fd_env(fd->env());
1145       success = success && load_const_from_method_toc(R11, fd_env, toc, /*fixed_size*/ true);
1146     }
1147     AddressLiteral fd_toc(fd->toc());
1148     // Set R2_TOC (load from toc)
1149     success = success && load_const_from_method_toc(R2_TOC, fd_toc, toc, /*fixed_size*/ true);
1150     bctrl();
1151     _last_calls_return_pc = pc();
1152     if (!success) { return NULL; }
1153   } else {
1154     // It's a friend function, load the entry point and don't care about
1155     // toc and env. Use an optimizable call instruction, but ensure the
1156     // same code-size as in the case of a non-friend function.
1157     nop();
1158     bl64_patchable(fd->entry(), rt);
1159     _last_calls_return_pc = pc();
1160   }
1161   return _last_calls_return_pc;
1162 }
1163 #endif // ABI_ELFv2
1164 
1165 void MacroAssembler::call_VM_base(Register oop_result,
1166                                   Register last_java_sp,
1167                                   address  entry_point,
1168                                   bool     check_exceptions) {
1169   BLOCK_COMMENT("call_VM {");
1170   // Determine last_java_sp register.
1171   if (!last_java_sp->is_valid()) {
1172     last_java_sp = R1_SP;
1173   }
1174   set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1);
1175 
1176   // ARG1 must hold thread address.
1177   mr(R3_ARG1, R16_thread);
1178 #if defined(ABI_ELFv2)
1179   address return_pc = call_c(entry_point, relocInfo::none);
1180 #else
1181   address return_pc = call_c((FunctionDescriptor*)entry_point, relocInfo::none);
1182 #endif
1183 
1184   reset_last_Java_frame();
1185 
1186   // Check for pending exceptions.
1187   if (check_exceptions) {
1188     // We don't check for exceptions here.
1189     ShouldNotReachHere();
1190   }
1191 
1192   // Get oop result if there is one and reset the value in the thread.
1193   if (oop_result->is_valid()) {
1194     get_vm_result(oop_result);
1195   }
1196 
1197   _last_calls_return_pc = return_pc;
1198   BLOCK_COMMENT("} call_VM");
1199 }
1200 
1201 void MacroAssembler::call_VM_leaf_base(address entry_point) {
1202   BLOCK_COMMENT("call_VM_leaf {");
1203 #if defined(ABI_ELFv2)
1204   call_c(entry_point, relocInfo::none);
1205 #else
1206   call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, entry_point), relocInfo::none);
1207 #endif
1208   BLOCK_COMMENT("} call_VM_leaf");
1209 }
1210 
1211 void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions) {
1212   call_VM_base(oop_result, noreg, entry_point, check_exceptions);
1213 }
1214 
1215 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1,
1216                              bool check_exceptions) {
1217   // R3_ARG1 is reserved for the thread.
1218   mr_if_needed(R4_ARG2, arg_1);
1219   call_VM(oop_result, entry_point, check_exceptions);
1220 }
1221 
1222 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2,
1223                              bool check_exceptions) {
1224   // R3_ARG1 is reserved for the thread
1225   mr_if_needed(R4_ARG2, arg_1);
1226   assert(arg_2 != R4_ARG2, "smashed argument");
1227   mr_if_needed(R5_ARG3, arg_2);
1228   call_VM(oop_result, entry_point, check_exceptions);
1229 }
1230 
1231 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3,
1232                              bool check_exceptions) {
1233   // R3_ARG1 is reserved for the thread
1234   mr_if_needed(R4_ARG2, arg_1);
1235   assert(arg_2 != R4_ARG2, "smashed argument");
1236   mr_if_needed(R5_ARG3, arg_2);
1237   mr_if_needed(R6_ARG4, arg_3);
1238   call_VM(oop_result, entry_point, check_exceptions);
1239 }
1240 
1241 void MacroAssembler::call_VM_leaf(address entry_point) {
1242   call_VM_leaf_base(entry_point);
1243 }
1244 
1245 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) {
1246   mr_if_needed(R3_ARG1, arg_1);
1247   call_VM_leaf(entry_point);
1248 }
1249 
1250 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) {
1251   mr_if_needed(R3_ARG1, arg_1);
1252   assert(arg_2 != R3_ARG1, "smashed argument");
1253   mr_if_needed(R4_ARG2, arg_2);
1254   call_VM_leaf(entry_point);
1255 }
1256 
1257 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) {
1258   mr_if_needed(R3_ARG1, arg_1);
1259   assert(arg_2 != R3_ARG1, "smashed argument");
1260   mr_if_needed(R4_ARG2, arg_2);
1261   assert(arg_3 != R3_ARG1 && arg_3 != R4_ARG2, "smashed argument");
1262   mr_if_needed(R5_ARG3, arg_3);
1263   call_VM_leaf(entry_point);
1264 }
1265 
1266 // Check whether instruction is a read access to the polling page
1267 // which was emitted by load_from_polling_page(..).
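// The poll is a zero-displacement ld into R0 from the polling page register,
// which is exactly the shape the checks below accept.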
1268 bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext,
1269                                                address* polling_address_ptr) {
1270   if (!is_ld(instruction))
1271     return false; // It's not a ld. Fail.
1272 
1273   int rt = inv_rt_field(instruction);
1274   int ra = inv_ra_field(instruction);
1275   int ds = inv_ds_field(instruction);
1276   if (!(ds == 0 && ra != 0 && rt == 0)) {
1277     return false; // It's not a ld(r0, X, ra). Fail.
1278   }
1279 
1280   if (!ucontext) {
1281     // Set polling address.
1282     if (polling_address_ptr != NULL) {
1283       *polling_address_ptr = NULL;
1284     }
1285     return true; // No ucontext given. Can't check value of ra. Assume true.
1286   }
1287 
1288 #ifdef LINUX
  // Ucontext given. Check that register ra contains the address of
  // the safepoint polling page.
1291   ucontext_t* uc = (ucontext_t*) ucontext;
1292   // Set polling address.
1293   address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds;
1294   if (polling_address_ptr != NULL) {
1295     *polling_address_ptr = addr;
1296   }
1297   return os::is_poll_address(addr);
1298 #else
1299   // Not on Linux, ucontext must be NULL.
1300   ShouldNotReachHere();
1301   return false;
1302 #endif
1303 }
1304 
1305 bool MacroAssembler::is_memory_serialization(int instruction, JavaThread* thread, void* ucontext) {
1306 #ifdef LINUX
1307   ucontext_t* uc = (ucontext_t*) ucontext;
1308 
1309   if (is_stwx(instruction) || is_stwux(instruction)) {
1310     int ra = inv_ra_field(instruction);
1311     int rb = inv_rb_field(instruction);
1312 
1313     // look up content of ra and rb in ucontext
1314     address ra_val=(address)uc->uc_mcontext.regs->gpr[ra];
1315     long rb_val=(long)uc->uc_mcontext.regs->gpr[rb];
1316     return os::is_memory_serialize_page(thread, ra_val+rb_val);
1317   } else if (is_stw(instruction) || is_stwu(instruction)) {
1318     int ra = inv_ra_field(instruction);
1319     int d1 = inv_d1_field(instruction);
1320 
1321     // look up content of ra in ucontext
1322     address ra_val=(address)uc->uc_mcontext.regs->gpr[ra];
1323     return os::is_memory_serialize_page(thread, ra_val+d1);
1324   } else {
1325     return false;
1326   }
1327 #else
1328   // workaround not needed on !LINUX :-)
1329   ShouldNotCallThis();
1330   return false;
1331 #endif
1332 }
1333 
1334 void MacroAssembler::bang_stack_with_offset(int offset) {
1335   // When increasing the stack, the old stack pointer will be written
  // to the new top of stack according to the PPC64 ABI.
1337   // Therefore, stack banging is not necessary when increasing
1338   // the stack by <= os::vm_page_size() bytes.
1339   // When increasing the stack by a larger amount, this method is
1340   // called repeatedly to bang the intermediate pages.
1341 
1342   // Stack grows down, caller passes positive offset.
1343   assert(offset > 0, "must bang with positive offset");
1344 
1345   long stdoffset = -offset;
1346 
1347   if (is_simm(stdoffset, 16)) {
1348     // Signed 16 bit offset, a simple std is ok.
1349     if (UseLoadInstructionsForStackBangingPPC64) {
1350       ld(R0, (int)(signed short)stdoffset, R1_SP);
1351     } else {
1352       std(R0,(int)(signed short)stdoffset, R1_SP);
1353     }
1354   } else if (is_simm(stdoffset, 31)) {
1355     const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset);
1356     const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset);
1357 
1358     Register tmp = R11;
1359     addis(tmp, R1_SP, hi);
1360     if (UseLoadInstructionsForStackBangingPPC64) {
1361       ld(R0,  lo, tmp);
1362     } else {
1363       std(R0, lo, tmp);
1364     }
1365   } else {
1366     ShouldNotReachHere();
1367   }
1368 }
1369 
1370 // If instruction is a stack bang of the form
1371 //    std    R0,    x(Ry),       (see bang_stack_with_offset())
1372 //    stdu   R1_SP, x(R1_SP),    (see push_frame(), resize_frame())
1373 // or stdux  R1_SP, Rx, R1_SP    (see push_frame(), resize_frame())
1374 // return the banged address. Otherwise, return 0.
1375 address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) {
1376 #ifdef LINUX
1377   ucontext_t* uc = (ucontext_t*) ucontext;
1378   int rs = inv_rs_field(instruction);
1379   int ra = inv_ra_field(instruction);
1380   if (   (is_ld(instruction)   && rs == 0 &&  UseLoadInstructionsForStackBangingPPC64)
1381       || (is_std(instruction)  && rs == 0 && !UseLoadInstructionsForStackBangingPPC64)
1382       || (is_stdu(instruction) && rs == 1)) {
1383     int ds = inv_ds_field(instruction);
1384     // return banged address
1385     return ds+(address)uc->uc_mcontext.regs->gpr[ra];
1386   } else if (is_stdux(instruction) && rs == 1) {
1387     int rb = inv_rb_field(instruction);
1388     address sp = (address)uc->uc_mcontext.regs->gpr[1];
1389     long rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
1390     return ra != 1 || rb_val >= 0 ? NULL         // not a stack bang
1391                                   : sp + rb_val; // banged address
1392   }
1393   return NULL; // not a stack bang
1394 #else
1395   // workaround not needed on !LINUX :-)
1396   ShouldNotCallThis();
1397   return NULL;
1398 #endif
1399 }
1400 
1401 void MacroAssembler::reserved_stack_check(Register return_pc) {
1402   // Test if reserved zone needs to be enabled.
1403   Label no_reserved_zone_enabling;
1404 
1405   ld_ptr(R0, JavaThread::reserved_stack_activation_offset(), R16_thread);
1406   cmpld(CCR0, R1_SP, R0);
1407   blt_predict_taken(CCR0, no_reserved_zone_enabling);
1408 
1409   // Enable reserved zone again, throw stack overflow exception.
1410   push_frame_reg_args(0, R0);
1411   call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), R16_thread);
1412   pop_frame();
1413   mtlr(return_pc);
1414   load_const_optimized(R0, StubRoutines::throw_delayed_StackOverflowError_entry());
1415   mtctr(R0);
1416   bctr();
1417 
1418   should_not_reach_here();
1419 
1420   bind(no_reserved_zone_enabling);
1421 }
1422 
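     // Atomically exchange the 64-bit value at addr_base with exchange_value; the previous
     // value is returned in dest_current_value. A sketch of the semantics (pseudocode, not emitted code):
     //   retry: dest_current_value = load_reserve(addr_base);        // ldarx
     //          if (!store_conditional(addr_base, exchange_value))   // stdcx_ sets CCR0
     //            goto retry;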
1423 void MacroAssembler::getandsetd(Register dest_current_value, Register exchange_value, Register addr_base,
1424                                 bool cmpxchgx_hint) {
1425   Label retry;
1426   bind(retry);
1427   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1428   stdcx_(exchange_value, addr_base);
1429   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1430     bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1431   } else {
1432     bne(                  CCR0, retry); // StXcx_ sets CCR0.
1433   }
1434 }
1435 
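     // Atomically add inc_value to the 64-bit value at addr_base; the previous value is
     // returned in dest_current_value. Sketch: old = *addr_base; *addr_base = old + inc_value;
     // implemented as a ldarx/stdcx_ retry loop with the sum kept in tmp.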
1436 void MacroAssembler::getandaddd(Register dest_current_value, Register inc_value, Register addr_base,
1437                                 Register tmp, bool cmpxchgx_hint) {
1438   Label retry;
1439   bind(retry);
1440   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1441   add(tmp, dest_current_value, inc_value);
1442   stdcx_(tmp, addr_base);
1443   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1444     bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1445   } else {
1446     bne(                  CCR0, retry); // StXcx_ sets CCR0.
1447   }
1448 }
1449 
1450 // Word/sub-word atomic helper functions
1451 
1452 // Temps and addr_base are killed if size < 4 and processor does not support respective instructions.
1453 // Only signed types are supported with size < 4.
1454 // Atomic add always kills tmp1.
1455 void MacroAssembler::atomic_get_and_modify_generic(Register dest_current_value, Register exchange_value,
1456                                                    Register addr_base, Register tmp1, Register tmp2, Register tmp3,
1457                                                    bool cmpxchgx_hint, bool is_add, int size) {
1458   // Sub-word instructions are available since Power 8.
1459   // For older processors, instruction_type != size holds, and we
1460   // emulate the sub-word instructions by constructing a 4-byte value
1461   // that leaves the other bytes unchanged.
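       // A sketch of the emulation in C-like pseudocode (little-endian case shown; the
       // shift computation below differs on big-endian):
       //   aligned = addr_base & ~3;                           // 4-byte word containing the sub-word
       //   shift   = (addr_base & 3) * 8;
       //   old32   = load_reserve(aligned);
       //   old     = old32 >> shift;                           // sub-word result, sign-extended at the end
       //   new32   = old32 ^ (((old ^ new) & mask) << shift);  // only the target bytes change
       //   store_conditional(aligned, new32);                  // retry on failure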
1462   const int instruction_type = VM_Version::has_lqarx() ? size : 4;
1463 
1464   Label retry;
1465   Register shift_amount = noreg,
1466            val32 = dest_current_value,
1467            modval = is_add ? tmp1 : exchange_value;
1468 
1469   if (instruction_type != size) {
1470     assert_different_registers(tmp1, tmp2, tmp3, dest_current_value, exchange_value, addr_base);
1471     modval = tmp1;
1472     shift_amount = tmp2;
1473     val32 = tmp3;
1474     // Need some preparation: Compute shift amount, align address. Note: shorts must be 2-byte aligned.
1475 #ifdef VM_LITTLE_ENDIAN
1476     rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
1477     clrrdi(addr_base, addr_base, 2);
1478 #else
1479     xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
1480     clrrdi(addr_base, addr_base, 2);
1481     rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
1482 #endif
1483   }
1484 
1485   // atomic emulation loop
1486   bind(retry);
1487 
1488   switch (instruction_type) {
1489     case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
1490     case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
1491     case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
1492     default: ShouldNotReachHere();
1493   }
1494 
1495   if (instruction_type != size) {
1496     srw(dest_current_value, val32, shift_amount);
1497   }
1498 
1499   if (is_add) { add(modval, dest_current_value, exchange_value); }
1500 
1501   if (instruction_type != size) {
1502     // Transform exchange value such that the replacement can be done by one xor instruction.
1503     xorr(modval, dest_current_value, is_add ? modval : exchange_value);
1504     clrldi(modval, modval, (size == 1) ? 56 : 48);
1505     slw(modval, modval, shift_amount);
1506     xorr(modval, val32, modval);
1507   }
1508 
1509   switch (instruction_type) {
1510     case 4: stwcx_(modval, addr_base); break;
1511     case 2: sthcx_(modval, addr_base); break;
1512     case 1: stbcx_(modval, addr_base); break;
1513     default: ShouldNotReachHere();
1514   }
1515 
1516   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1517     bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1518   } else {
1519     bne(                  CCR0, retry); // StXcx_ sets CCR0.
1520   }
1521 
1522   // l?arx zero-extends, but Java wants byte/short values sign-extended.
1523   if (size == 1) {
1524     extsb(dest_current_value, dest_current_value);
1525   } else if (size == 2) {
1526     extsh(dest_current_value, dest_current_value);
1527   }
1528 }
1529 
1530 // Temps, addr_base and exchange_value are killed if size < 4 and processor does not support respective instructions.
1531 // Only signed types are supported with size < 4.
1532 void MacroAssembler::cmpxchg_loop_body(ConditionRegister flag, Register dest_current_value,
1533                                        Register compare_value, Register exchange_value,
1534                                        Register addr_base, Register tmp1, Register tmp2,
1535                                        Label &retry, Label &failed, bool cmpxchgx_hint, int size) {
1536   // Sub-word instructions are available since Power 8.
1537   // For older processors, instruction_type != size holds, and we
1538   // emulate the sub-word instructions by constructing a 4-byte value
1539   // that leaves the other bytes unchanged.
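       // A sketch of the loop body (pseudocode; the sub-word case reuses the xor-based
       // replacement described in atomic_get_and_modify_generic above):
       //   retry: cur = load_reserve(addr_base);
       //          if (cur != compare_value) goto failed;
       //          store_conditional(addr_base, exchange_value);  // caller branches back to retry on failure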
1540   const int instruction_type = VM_Version::has_lqarx() ? size : 4;
1541 
1542   Register shift_amount = noreg,
1543            val32 = dest_current_value,
1544            modval = exchange_value;
1545 
1546   if (instruction_type != size) {
1547     assert_different_registers(tmp1, tmp2, dest_current_value, compare_value, exchange_value, addr_base);
1548     shift_amount = tmp1;
1549     val32 = tmp2;
1550     modval = tmp2;
1551     // Need some preparation: Compute shift amount, align address. Note: shorts must be 2-byte aligned.
1552 #ifdef VM_LITTLE_ENDIAN
1553     rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
1554     clrrdi(addr_base, addr_base, 2);
1555 #else
1556     xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
1557     clrrdi(addr_base, addr_base, 2);
1558     rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
1559 #endif
1560     // Transform exchange value such that the replacement can be done by one xor instruction.
1561     xorr(exchange_value, compare_value, exchange_value);
1562     clrldi(exchange_value, exchange_value, (size == 1) ? 56 : 48);
1563     slw(exchange_value, exchange_value, shift_amount);
1564   }
1565 
1566   // atomic emulation loop
1567   bind(retry);
1568 
1569   switch (instruction_type) {
1570     case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
1571     case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
1572     case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
1573     default: ShouldNotReachHere();
1574   }
1575 
1576   if (instruction_type != size) {
1577     srw(dest_current_value, val32, shift_amount);
1578   }
1579   if (size == 1) {
1580     extsb(dest_current_value, dest_current_value);
1581   } else if (size == 2) {
1582     extsh(dest_current_value, dest_current_value);
1583   }
1584 
1585   cmpw(flag, dest_current_value, compare_value);
1586   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1587     bne_predict_not_taken(flag, failed);
1588   } else {
1589     bne(                  flag, failed);
1590   }
1591   // branch to done  => (flag == ne), (dest_current_value != compare_value)
1592   // fall through    => (flag == eq), (dest_current_value == compare_value)
1593 
1594   if (instruction_type != size) {
1595     xorr(modval, val32, exchange_value);
1596   }
1597 
1598   switch (instruction_type) {
1599     case 4: stwcx_(modval, addr_base); break;
1600     case 2: sthcx_(modval, addr_base); break;
1601     case 1: stbcx_(modval, addr_base); break;
1602     default: ShouldNotReachHere();
1603   }
1604 }
1605 
1606 // CmpxchgX sets condition register to cmpX(current, compare).
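     // A sketch of the overall semantics for sizes 1/2/4 (pseudocode):
     //   old = *addr_base;
     //   if (old == compare_value) { *addr_base = exchange_value; success = 1; } else { success = 0; }
     // flag is set by the comparison; 'weak' permits spurious failure of the store-conditional.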
1607 void MacroAssembler::cmpxchg_generic(ConditionRegister flag, Register dest_current_value,
1608                                      Register compare_value, Register exchange_value,
1609                                      Register addr_base, Register tmp1, Register tmp2,
1610                                      int semantics, bool cmpxchgx_hint,
1611                                      Register int_flag_success, bool contention_hint, bool weak, int size) {
1612   Label retry;
1613   Label failed;
1614   Label done;
1615 
1616   // Save one branch if result is returned via register and
1617   // result register is different from the other ones.
1618   bool use_result_reg    = (int_flag_success != noreg);
1619   bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value &&
1620                             int_flag_success != exchange_value && int_flag_success != addr_base &&
1621                             int_flag_success != tmp1 && int_flag_success != tmp2);
1622   assert(!weak || flag == CCR0, "weak only supported with CCR0");
1623   assert(size == 1 || size == 2 || size == 4, "unsupported");
1624 
1625   if (use_result_reg && preset_result_reg) {
1626     li(int_flag_success, 0); // preset (assume cas failed)
1627   }
1628 
1629   // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1630   if (contention_hint) { // Don't try to reserve if cmp fails.
1631     switch (size) {
1632       case 1: lbz(dest_current_value, 0, addr_base); extsb(dest_current_value, dest_current_value); break;
1633       case 2: lha(dest_current_value, 0, addr_base); break;
1634       case 4: lwz(dest_current_value, 0, addr_base); break;
1635       default: ShouldNotReachHere();
1636     }
1637     cmpw(flag, dest_current_value, compare_value);
1638     bne(flag, failed);
1639   }
1640 
1641   // release/fence semantics
1642   if (semantics & MemBarRel) {
1643     release();
1644   }
1645 
1646   cmpxchg_loop_body(flag, dest_current_value, compare_value, exchange_value, addr_base, tmp1, tmp2,
1647                     retry, failed, cmpxchgx_hint, size);
1648   if (!weak || use_result_reg) {
1649     if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1650       bne_predict_not_taken(CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
1651     } else {
1652       bne(                  CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
1653     }
1654   }
1655   // fall through    => (flag == eq), (dest_current_value == compare_value), (swapped)
1656 
1657   // Result in register (must do this at the end because int_flag_success can be the
1658   // same register as one above).
1659   if (use_result_reg) {
1660     li(int_flag_success, 1);
1661   }
1662 
1663   if (semantics & MemBarFenceAfter) {
1664     fence();
1665   } else if (semantics & MemBarAcq) {
1666     isync();
1667   }
1668 
1669   if (use_result_reg && !preset_result_reg) {
1670     b(done);
1671   }
1672 
1673   bind(failed);
1674   if (use_result_reg && !preset_result_reg) {
1675     li(int_flag_success, 0);
1676   }
1677 
1678   bind(done);
1679   // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1680   // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1681 }
1682 
1683 // Performs atomic compare exchange:
1684 //   if (compare_value == *addr_base)
1685 //     *addr_base = exchange_value
1686 //     int_flag_success = 1;
1687 //   else
1688 //     int_flag_success = 0;
1689 //
1690 // ConditionRegister flag       = cmp(compare_value, *addr_base)
1691 // Register dest_current_value  = *addr_base
1692 // Register compare_value       Used to compare with value in memory
1693 // Register exchange_value      Written to memory if compare_value == *addr_base
1694 // Register addr_base           The memory location to compareXChange
1695 // Register int_flag_success    Set to 1 if exchange_value was written to *addr_base
1696 //
1697 // To avoid the costly compare exchange, the value can be tested beforehand (see contention_hint).
1698 // Several special cases exist to avoid generating unnecessary code.
1699 //
1700 void MacroAssembler::cmpxchgd(ConditionRegister flag,
1701                               Register dest_current_value, RegisterOrConstant compare_value, Register exchange_value,
1702                               Register addr_base, int semantics, bool cmpxchgx_hint,
1703                               Register int_flag_success, Label* failed_ext, bool contention_hint, bool weak) {
1704   Label retry;
1705   Label failed_int;
1706   Label& failed = (failed_ext != NULL) ? *failed_ext : failed_int;
1707   Label done;
1708 
1709   // Save one branch if result is returned via register and result register is different from the other ones.
1710   bool use_result_reg    = (int_flag_success!=noreg);
1711   bool preset_result_reg = (int_flag_success!=dest_current_value && int_flag_success!=compare_value.register_or_noreg() &&
1712                             int_flag_success!=exchange_value && int_flag_success!=addr_base);
1713   assert(!weak || flag == CCR0, "weak only supported with CCR0");
1714   assert(int_flag_success == noreg || failed_ext == NULL, "cannot have both");
1715 
1716   if (use_result_reg && preset_result_reg) {
1717     li(int_flag_success, 0); // preset (assume cas failed)
1718   }
1719 
1720   // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1721   if (contention_hint) { // Don't try to reserve if cmp fails.
1722     ld(dest_current_value, 0, addr_base);
1723     cmpd(flag, compare_value, dest_current_value);
1724     bne(flag, failed);
1725   }
1726 
1727   // release/fence semantics
1728   if (semantics & MemBarRel) {
1729     release();
1730   }
1731 
1732   // atomic emulation loop
1733   bind(retry);
1734 
1735   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1736   cmpd(flag, compare_value, dest_current_value);
1737   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1738     bne_predict_not_taken(flag, failed);
1739   } else {
1740     bne(                  flag, failed);
1741   }
1742 
1743   stdcx_(exchange_value, addr_base);
1744   if (!weak || use_result_reg || failed_ext) {
1745     if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1746       bne_predict_not_taken(CCR0, weak ? failed : retry); // stXcx_ sets CCR0
1747     } else {
1748       bne(                  CCR0, weak ? failed : retry); // stXcx_ sets CCR0
1749     }
1750   }
1751 
1752   // result in register (must do this at the end because int_flag_success can be the same register as one above)
1753   if (use_result_reg) {
1754     li(int_flag_success, 1);
1755   }
1756 
1757   if (semantics & MemBarFenceAfter) {
1758     fence();
1759   } else if (semantics & MemBarAcq) {
1760     isync();
1761   }
1762 
1763   if (use_result_reg && !preset_result_reg) {
1764     b(done);
1765   }
1766 
1767   bind(failed_int);
1768   if (use_result_reg && !preset_result_reg) {
1769     li(int_flag_success, 0);
1770   }
1771 
1772   bind(done);
1773   // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1774   // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1775 }
1776 
1777 // Look up the method for a megamorphic invokeinterface call.
1778 // The target method is determined by <intf_klass, itable_index>.
1779 // The receiver klass is in recv_klass.
1780 // On success, the result will be in method_result, and execution falls through.
1781 // On failure, execution transfers to the given label.
1782 void MacroAssembler::lookup_interface_method(Register recv_klass,
1783                                              Register intf_klass,
1784                                              RegisterOrConstant itable_index,
1785                                              Register method_result,
1786                                              Register scan_temp,
1787                                              Register temp2,
1788                                              Label& L_no_such_interface,
1789                                              bool return_method) {
1790   assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);
1791 
1792   // Compute start of first itableOffsetEntry (which is at the end of the vtable).
1793   int vtable_base = in_bytes(Klass::vtable_start_offset());
1794   int itentry_off = itableMethodEntry::method_offset_in_bytes();
1795   int logMEsize   = exact_log2(itableMethodEntry::size() * wordSize);
1796   int scan_step   = itableOffsetEntry::size() * wordSize;
1797   int log_vte_size= exact_log2(vtableEntry::size_in_bytes());
1798 
1799   lwz(scan_temp, in_bytes(Klass::vtable_length_offset()), recv_klass);
1800   // %%% We should store the aligned, prescaled offset in the klassoop.
1801   // Then the next several instructions would fold away.
1802 
1803   sldi(scan_temp, scan_temp, log_vte_size);
1804   addi(scan_temp, scan_temp, vtable_base);
1805   add(scan_temp, recv_klass, scan_temp);
1806 
1807   // Adjust recv_klass by scaled itable_index, so we can free itable_index.
1808   if (return_method) {
1809     if (itable_index.is_register()) {
1810       Register itable_offset = itable_index.as_register();
1811       sldi(method_result, itable_offset, logMEsize);
1812       if (itentry_off) { addi(method_result, method_result, itentry_off); }
1813       add(method_result, method_result, recv_klass);
1814     } else {
1815       long itable_offset = (long)itable_index.as_constant();
1816       // static address, no relocation
1817       add_const_optimized(method_result, recv_klass, (itable_offset << logMEsize) + itentry_off, temp2);
1818     }
1819   }
1820 
1821   // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
1822   //   if (scan->interface() == intf) {
1823   //     result = (klass + scan->offset() + itable_index);
1824   //   }
1825   // }
1826   Label search, found_method;
1827 
1828   for (int peel = 1; peel >= 0; peel--) {
1829     // %%%% Could load both offset and interface in one ldx, if they were
1830     // in the opposite order. This would save a load.
1831     ld(temp2, itableOffsetEntry::interface_offset_in_bytes(), scan_temp);
1832 
1833     // Check that this entry is non-null. A null entry means that
1834     // the receiver class doesn't implement the interface, and wasn't the
1835     // same as when the caller was compiled.
1836     cmpd(CCR0, temp2, intf_klass);
1837 
1838     if (peel) {
1839       beq(CCR0, found_method);
1840     } else {
1841       bne(CCR0, search);
1842       // (invert the test to fall through to found_method...)
1843     }
1844 
1845     if (!peel) break;
1846 
1847     bind(search);
1848 
1849     cmpdi(CCR0, temp2, 0);
1850     beq(CCR0, L_no_such_interface);
1851     addi(scan_temp, scan_temp, scan_step);
1852   }
1853 
1854   bind(found_method);
1855 
1856   // Got a hit.
1857   if (return_method) {
1858     int ito_offset = itableOffsetEntry::offset_offset_in_bytes();
1859     lwz(scan_temp, ito_offset, scan_temp);
1860     ldx(method_result, scan_temp, method_result);
1861   }
1862 }
1863 
1864 // virtual method calling
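     // Sketch (pseudocode): R19_method = recv_klass->vtable[vtable_index].method, where the
     // vtable starts at Klass::vtable_start_offset() within the Klass. Note: recv_klass is clobbered.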
1865 void MacroAssembler::lookup_virtual_method(Register recv_klass,
1866                                            RegisterOrConstant vtable_index,
1867                                            Register method_result) {
1868 
1869   assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg());
1870 
1871   const int base = in_bytes(Klass::vtable_start_offset());
1872   assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
1873 
1874   if (vtable_index.is_register()) {
1875     sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord);
1876     add(recv_klass, vtable_index.as_register(), recv_klass);
1877   } else {
1878     addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord);
1879   }
1880   ld(R19_method, base + vtableEntry::method_offset_in_bytes(), recv_klass);
1881 }
1882 
1883 /////////////////////////////////////////// subtype checking ////////////////////////////////////////////
1884 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
1885                                                    Register super_klass,
1886                                                    Register temp1_reg,
1887                                                    Register temp2_reg,
1888                                                    Label* L_success,
1889                                                    Label* L_failure,
1890                                                    Label* L_slow_path,
1891                                                    RegisterOrConstant super_check_offset) {
1892 
1893   const Register check_cache_offset = temp1_reg;
1894   const Register cached_super       = temp2_reg;
1895 
1896   assert_different_registers(sub_klass, super_klass, check_cache_offset, cached_super);
1897 
1898   int sco_offset = in_bytes(Klass::super_check_offset_offset());
1899   int sc_offset  = in_bytes(Klass::secondary_super_cache_offset());
1900 
1901   bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
1902   bool need_slow_path = (must_load_sco || super_check_offset.constant_or_zero() == sco_offset);
1903 
1904   Label L_fallthrough;
1905   int label_nulls = 0;
1906   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
1907   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
1908   if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
1909   assert(label_nulls <= 1 ||
1910          (L_slow_path == &L_fallthrough && label_nulls <= 2 && !need_slow_path),
1911          "at most one NULL in the batch, usually");
1912 
1913   // If the pointers are equal, we are done (e.g., String[] elements).
1914   // This self-check enables sharing of secondary supertype arrays among
1915   // non-primary types such as array-of-interface. Otherwise, each such
1916   // type would need its own customized SSA.
1917   // We move this check to the front of the fast path because many
1918   // type checks are in fact trivially successful in this manner,
1919   // so we get a nicely predicted branch right at the start of the check.
1920   cmpd(CCR0, sub_klass, super_klass);
1921   beq(CCR0, *L_success);
1922 
1923   // Check the supertype display:
1924   if (must_load_sco) {
1925     // The super check offset is always positive...
1926     lwz(check_cache_offset, sco_offset, super_klass);
1927     super_check_offset = RegisterOrConstant(check_cache_offset);
1928     // super_check_offset is register.
1929     assert_different_registers(sub_klass, super_klass, cached_super, super_check_offset.as_register());
1930   }
1931   // The loaded value is the offset from KlassOopDesc.
1932 
1933   ld(cached_super, super_check_offset, sub_klass);
1934   cmpd(CCR0, cached_super, super_klass);
1935 
1936   // This check has worked decisively for primary supers.
1937   // Secondary supers are sought in the super_cache ('super_cache_addr').
1938   // (Secondary supers are interfaces and very deeply nested subtypes.)
1939   // This works in the same check above because of a tricky aliasing
1940   // between the super_cache and the primary super display elements.
1941   // (The 'super_check_addr' can address either, as the case requires.)
1942   // Note that the cache is updated below if it does not help us find
1943   // what we need immediately.
1944   // So if it was a primary super, we can just fail immediately.
1945   // Otherwise, it's the slow path for us (no success at this point).
1946 
1947 #define FINAL_JUMP(label) if (&(label) != &L_fallthrough) { b(label); }
1948 
1949   if (super_check_offset.is_register()) {
1950     beq(CCR0, *L_success);
1951     cmpwi(CCR0, super_check_offset.as_register(), sc_offset);
1952     if (L_failure == &L_fallthrough) {
1953       beq(CCR0, *L_slow_path);
1954     } else {
1955       bne(CCR0, *L_failure);
1956       FINAL_JUMP(*L_slow_path);
1957     }
1958   } else {
1959     if (super_check_offset.as_constant() == sc_offset) {
1960       // Need a slow path; fast failure is impossible.
1961       if (L_slow_path == &L_fallthrough) {
1962         beq(CCR0, *L_success);
1963       } else {
1964         bne(CCR0, *L_slow_path);
1965         FINAL_JUMP(*L_success);
1966       }
1967     } else {
1968       // No slow path; it's a fast decision.
1969       if (L_failure == &L_fallthrough) {
1970         beq(CCR0, *L_success);
1971       } else {
1972         bne(CCR0, *L_failure);
1973         FINAL_JUMP(*L_success);
1974       }
1975     }
1976   }
1977 
1978   bind(L_fallthrough);
1979 #undef FINAL_JUMP
1980 }
1981 
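     // Scan the secondary supers array of sub_klass for super_klass; a hit is cached in the
     // secondary super cache. A sketch (pseudocode):
     //   for (i = 0; i < sub_klass->secondary_supers->length(); i++) {
     //     if (sub_klass->secondary_supers[i] == super_klass) {
     //       sub_klass->secondary_super_cache = super_klass;  // hit: result_reg = 0 / branch to L_success
     //       return;
     //     }
     //   }
     //   // miss: result_reg = 1 / fall through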
1982 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
1983                                                    Register super_klass,
1984                                                    Register temp1_reg,
1985                                                    Register temp2_reg,
1986                                                    Label* L_success,
1987                                                    Register result_reg) {
1988   const Register array_ptr = temp1_reg; // current value from cache array
1989   const Register temp      = temp2_reg;
1990 
1991   assert_different_registers(sub_klass, super_klass, array_ptr, temp);
1992 
1993   int source_offset = in_bytes(Klass::secondary_supers_offset());
1994   int target_offset = in_bytes(Klass::secondary_super_cache_offset());
1995 
1996   int length_offset = Array<Klass*>::length_offset_in_bytes();
1997   int base_offset   = Array<Klass*>::base_offset_in_bytes();
1998 
1999   Label hit, loop, failure, fallthru;
2000 
2001   ld(array_ptr, source_offset, sub_klass);
2002 
2003   // TODO: PPC port: assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated.");
2004   lwz(temp, length_offset, array_ptr);
2005   cmpwi(CCR0, temp, 0);
2006   beq(CCR0, result_reg!=noreg ? failure : fallthru); // length 0
2007 
2008   mtctr(temp); // load ctr
2009 
2010   bind(loop);
2011   // Entries in the table are Klass* and are no longer compressed.
2012   ld(temp, base_offset, array_ptr);
2013   cmpd(CCR0, temp, super_klass);
2014   beq(CCR0, hit);
2015   addi(array_ptr, array_ptr, BytesPerWord);
2016   bdnz(loop);
2017 
2018   bind(failure);
2019   if (result_reg!=noreg) li(result_reg, 1); // load non-zero result (indicates a miss)
2020   b(fallthru);
2021 
2022   bind(hit);
2023   std(super_klass, target_offset, sub_klass); // save result to cache
2024   if (result_reg != noreg) { li(result_reg, 0); } // load zero result (indicates a hit)
2025   if (L_success != NULL) { b(*L_success); }
2026   else if (result_reg == noreg) { blr(); } // return with CR0.eq if neither label nor result reg provided
2027 
2028   bind(fallthru);
2029 }
2030 
2031 // Try fast path, then go to slow one if not successful
2032 void MacroAssembler::check_klass_subtype(Register sub_klass,
2033                          Register super_klass,
2034                          Register temp1_reg,
2035                          Register temp2_reg,
2036                          Label& L_success) {
2037   Label L_failure;
2038   check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success, &L_failure);
2039   check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success);
2040   bind(L_failure); // Fallthru if not successful.
2041 }
2042 
2043 void MacroAssembler::check_method_handle_type(Register mtype_reg, Register mh_reg,
2044                                               Register temp_reg,
2045                                               Label& wrong_method_type) {
2046   assert_different_registers(mtype_reg, mh_reg, temp_reg);
2047   // Compare method type against that of the receiver.
2048   load_heap_oop(temp_reg, delayed_value(java_lang_invoke_MethodHandle::type_offset_in_bytes, temp_reg), mh_reg,
2049                 noreg, noreg, false, IS_NOT_NULL);
2050   cmpd(CCR0, temp_reg, mtype_reg);
2051   bne(CCR0, wrong_method_type);
2052 }
2053 
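     // Computes the byte offset of interpreter stack slot 'arg_slot' (plus extra_slot_offset
     // slots). Sketch: offset = (arg_slot + extra_slot_offset) * Interpreter::stackElementSize;
     // returned either as a constant or in temp_reg.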
2054 RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot,
2055                                                    Register temp_reg,
2056                                                    int extra_slot_offset) {
2057   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
2058   int stackElementSize = Interpreter::stackElementSize;
2059   int offset = extra_slot_offset * stackElementSize;
2060   if (arg_slot.is_constant()) {
2061     offset += arg_slot.as_constant() * stackElementSize;
2062     return offset;
2063   } else {
2064     assert(temp_reg != noreg, "must specify");
2065     sldi(temp_reg, arg_slot.as_register(), exact_log2(stackElementSize));
2066     if (offset != 0)
2067       addi(temp_reg, temp_reg, offset);
2068     return temp_reg;
2069   }
2070 }
2071 
2072 // Supports temp2_reg = R0.
2073 void MacroAssembler::biased_locking_enter(ConditionRegister cr_reg, Register obj_reg,
2074                                           Register mark_reg, Register temp_reg,
2075                                           Register temp2_reg, Label& done, Label* slow_case) {
2076   assert(UseBiasedLocking, "why call this otherwise?");
2077 
2078 #ifdef ASSERT
2079   assert_different_registers(obj_reg, mark_reg, temp_reg, temp2_reg);
2080 #endif
2081 
2082   Label cas_label;
2083 
2084   // Branch to done if fast path fails and no slow_case provided.
2085   Label *slow_case_int = (slow_case != NULL) ? slow_case : &done;
2086 
2087   // Biased locking
2088   // See whether the lock is currently biased toward our thread and
2089   // whether the epoch is still valid
2090   // Note that the runtime guarantees sufficient alignment of JavaThread
2091   // pointers to allow age to be placed into low bits
2092   assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits,
2093          "biased locking makes assumptions about bit layout");
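       // A rough sketch of the fast path emitted below (pseudocode):
       //   if ((mark & biased_lock_mask) != biased_lock_pattern)         goto cas_label;       // not biased
       //   if (((mark ^ (thread | prototype_header)) & ~age_mask) == 0)  goto done;            // biased to us, epoch valid
       //   if (prototype_header no longer has the bias bit set)          goto try_revoke_bias;
       //   if (the epoch bits differ)                                    goto try_rebias;
       //   CAS the presumed-unbiased mark to (thread | unbiased bits); on failure goto slow_case;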
2094 
2095   if (PrintBiasedLockingStatistics) {
2096     load_const(temp2_reg, (address) BiasedLocking::total_entry_count_addr(), temp_reg);
2097     lwzx(temp_reg, temp2_reg);
2098     addi(temp_reg, temp_reg, 1);
2099     stwx(temp_reg, temp2_reg);
2100   }
2101 
2102   andi(temp_reg, mark_reg, markOopDesc::biased_lock_mask_in_place);
2103   cmpwi(cr_reg, temp_reg, markOopDesc::biased_lock_pattern);
2104   bne(cr_reg, cas_label);
2105 
2106   load_klass(temp_reg, obj_reg);
2107 
2108   load_const_optimized(temp2_reg, ~((int) markOopDesc::age_mask_in_place));
2109   ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
2110   orr(temp_reg, R16_thread, temp_reg);
2111   xorr(temp_reg, mark_reg, temp_reg);
2112   andr(temp_reg, temp_reg, temp2_reg);
2113   cmpdi(cr_reg, temp_reg, 0);
2114   if (PrintBiasedLockingStatistics) {
2115     Label l;
2116     bne(cr_reg, l);
2117     load_const(temp2_reg, (address) BiasedLocking::biased_lock_entry_count_addr());
2118     lwzx(mark_reg, temp2_reg);
2119     addi(mark_reg, mark_reg, 1);
2120     stwx(mark_reg, temp2_reg);
2121     // restore mark_reg
2122     ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg);
2123     bind(l);
2124   }
2125   beq(cr_reg, done);
2126 
2127   Label try_revoke_bias;
2128   Label try_rebias;
2129 
2130   // At this point we know that the header has the bias pattern and
2131   // that we are not the bias owner in the current epoch. We need to
2132   // figure out more details about the state of the header in order to
2133   // know what operations can be legally performed on the object's
2134   // header.
2135 
2136   // If the low three bits in the xor result aren't clear, that means
2137   // the prototype header is no longer biased and we have to revoke
2138   // the bias on this object.
2139   andi(temp2_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
2140   cmpwi(cr_reg, temp2_reg, 0);
2141   bne(cr_reg, try_revoke_bias);
2142 
2143   // Biasing is still enabled for this data type. See whether the
2144   // epoch of the current bias is still valid, meaning that the epoch
2145   // bits of the mark word are equal to the epoch bits of the
2146   // prototype header. (Note that the prototype header's epoch bits
2147   // only change at a safepoint.) If not, attempt to rebias the object
2148   // toward the current thread. Note that we must be absolutely sure
2149   // that the current epoch is invalid in order to do this because
2150   // otherwise the manipulations it performs on the mark word are
2151   // illegal.
2152 
2153   int shift_amount = 64 - markOopDesc::epoch_shift;
2154   // rotate epoch bits to right (little) end and set other bits to 0
2155   // [ big part | epoch | little part ] -> [ 0..0 | epoch ]
2156   rldicl_(temp2_reg, temp_reg, shift_amount, 64 - markOopDesc::epoch_bits);
2157   // branch if epoch bits are != 0, i.e. they differ, because the epoch has been incremented
2158   bne(CCR0, try_rebias);
2159 
2160   // The epoch of the current bias is still valid but we know nothing
2161   // about the owner; it might be set or it might be clear. Try to
2162   // acquire the bias of the object using an atomic operation. If this
2163   // fails we will go in to the runtime to revoke the object's bias.
2164   // Note that we first construct the presumed unbiased header so we
2165   // don't accidentally blow away another thread's valid bias.
2166   andi(mark_reg, mark_reg, (markOopDesc::biased_lock_mask_in_place |
2167                                 markOopDesc::age_mask_in_place |
2168                                 markOopDesc::epoch_mask_in_place));
2169   orr(temp_reg, R16_thread, mark_reg);
2170 
2171   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2172 
2173   // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg).
2174   cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
2175            /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
2176            /*where=*/obj_reg,
2177            MacroAssembler::MemBarAcq,
2178            MacroAssembler::cmpxchgx_hint_acquire_lock(),
2179            noreg, slow_case_int); // bail out if failed
2180 
2181   // If the biasing toward our thread failed, this means that
2182   // another thread succeeded in biasing it toward itself and we
2183   // need to revoke that bias. The revocation will occur in the
2184   // interpreter runtime in the slow case.
2185   if (PrintBiasedLockingStatistics) {
2186     load_const(temp2_reg, (address) BiasedLocking::anonymously_biased_lock_entry_count_addr(), temp_reg);
2187     lwzx(temp_reg, temp2_reg);
2188     addi(temp_reg, temp_reg, 1);
2189     stwx(temp_reg, temp2_reg);
2190   }
2191   b(done);
2192 
2193   bind(try_rebias);
2194   // At this point we know the epoch has expired, meaning that the
2195   // current "bias owner", if any, is actually invalid. Under these
2196   // circumstances _only_, we are allowed to use the current header's
2197   // value as the comparison value when doing the cas to acquire the
2198   // bias in the current epoch. In other words, we allow transfer of
2199   // the bias from one thread to another directly in this situation.
2200   load_klass(temp_reg, obj_reg);
2201   andi(temp2_reg, mark_reg, markOopDesc::age_mask_in_place);
2202   orr(temp2_reg, R16_thread, temp2_reg);
2203   ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
2204   orr(temp_reg, temp2_reg, temp_reg);
2205 
2206   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2207 
2208   cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
2209                  /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
2210                  /*where=*/obj_reg,
2211                  MacroAssembler::MemBarAcq,
2212                  MacroAssembler::cmpxchgx_hint_acquire_lock(),
2213                  noreg, slow_case_int); // bail out if failed
2214 
2215   // If the biasing toward our thread failed, this means that
2216   // another thread succeeded in biasing it toward itself and we
2217   // need to revoke that bias. The revocation will occur in the
2218   // interpreter runtime in the slow case.
2219   if (PrintBiasedLockingStatistics) {
2220     load_const(temp2_reg, (address) BiasedLocking::rebiased_lock_entry_count_addr(), temp_reg);
2221     lwzx(temp_reg, temp2_reg);
2222     addi(temp_reg, temp_reg, 1);
2223     stwx(temp_reg, temp2_reg);
2224   }
2225   b(done);
2226 
2227   bind(try_revoke_bias);
2228   // The prototype mark in the klass doesn't have the bias bit set any
2229   // more, indicating that objects of this data type are not supposed
2230   // to be biased any more. We are going to try to reset the mark of
2231   // this object to the prototype value and fall through to the
2232   // CAS-based locking scheme. Note that if our CAS fails, it means
2233   // that another thread raced us for the privilege of revoking the
2234   // bias of this particular object, so it's okay to continue in the
2235   // normal locking code.
2236   load_klass(temp_reg, obj_reg);
2237   ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
2238   andi(temp2_reg, mark_reg, markOopDesc::age_mask_in_place);
2239   orr(temp_reg, temp_reg, temp2_reg);
2240 
2241   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2242 
2243   // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg).
2244   cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
2245                  /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
2246                  /*where=*/obj_reg,
2247                  MacroAssembler::MemBarAcq,
2248                  MacroAssembler::cmpxchgx_hint_acquire_lock());
2249 
2250   // reload markOop in mark_reg before continuing with lightweight locking
2251   ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg);
2252 
2253   // Fall through to the normal CAS-based lock, because no matter what
2254   // the result of the above CAS, some thread must have succeeded in
2255   // removing the bias bit from the object's header.
2256   if (PrintBiasedLockingStatistics) {
2257     Label l;
2258     bne(cr_reg, l);
2259     load_const(temp2_reg, (address) BiasedLocking::revoked_lock_entry_count_addr(), temp_reg);
2260     lwzx(temp_reg, temp2_reg);
2261     addi(temp_reg, temp_reg, 1);
2262     stwx(temp_reg, temp2_reg);
2263     bind(l);
2264   }
2265 
2266   bind(cas_label);
2267 }
2268 
2269 void MacroAssembler::biased_locking_exit (ConditionRegister cr_reg, Register mark_addr, Register temp_reg, Label& done) {
2270   // Check for biased locking unlock case, which is a no-op
2271   // Note: we do not have to check the thread ID for two reasons.
2272   // First, the interpreter checks for IllegalMonitorStateException at
2273   // a higher level. Second, if the bias was revoked while we held the
2274   // lock, the object could not be rebiased toward another thread, so
2275   // the bias bit would be clear.
2276 
2277   ld(temp_reg, 0, mark_addr);
2278   andi(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
2279 
2280   cmpwi(cr_reg, temp_reg, markOopDesc::biased_lock_pattern);
2281   beq(cr_reg, done);
2282 }
2283 
2284 // allocation (for C1)
2285 void MacroAssembler::eden_allocate(
2286   Register obj,                      // result: pointer to object after successful allocation
2287   Register var_size_in_bytes,        // object size in bytes if unknown at compile time; invalid otherwise
2288   int      con_size_in_bytes,        // object size in bytes if   known at compile time
2289   Register t1,                       // temp register
2290   Register t2,                       // temp register
2291   Label&   slow_case                 // continuation point if fast allocation fails
2292 ) {
2293   b(slow_case);
2294 }
2295 
2296 void MacroAssembler::tlab_allocate(
2297   Register obj,                      // result: pointer to object after successful allocation
2298   Register var_size_in_bytes,        // object size in bytes if unknown at compile time; invalid otherwise
2299   int      con_size_in_bytes,        // object size in bytes if   known at compile time
2300   Register t1,                       // temp register
2301   Label&   slow_case                 // continuation point if fast allocation fails
2302 ) {
2303   // make sure arguments make sense
2304   assert_different_registers(obj, var_size_in_bytes, t1);
2305   assert(0 <= con_size_in_bytes && is_simm(con_size_in_bytes, 16), "illegal object size");
2306   assert((con_size_in_bytes & MinObjAlignmentInBytesMask) == 0, "object size is not multiple of alignment");
2307 
2308   const Register new_top = t1;
2309   //verify_tlab(); not implemented
2310 
2311   ld(obj, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
2312   ld(R0, in_bytes(JavaThread::tlab_end_offset()), R16_thread);
2313   if (var_size_in_bytes == noreg) {
2314     addi(new_top, obj, con_size_in_bytes);
2315   } else {
2316     add(new_top, obj, var_size_in_bytes);
2317   }
2318   cmpld(CCR0, new_top, R0);
2319   bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_case);
2320 
2321 #ifdef ASSERT
2322   // make sure new free pointer is properly aligned
2323   {
2324     Label L;
2325     andi_(R0, new_top, MinObjAlignmentInBytesMask);
2326     beq(CCR0, L);
2327     stop("updated TLAB free is not properly aligned", 0x934);
2328     bind(L);
2329   }
2330 #endif // ASSERT
2331 
2332   // update the tlab top pointer
2333   std(new_top, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
2334   //verify_tlab(); not implemented
2335 }
2336 void MacroAssembler::incr_allocated_bytes(RegisterOrConstant size_in_bytes, Register t1, Register t2) {
2337   unimplemented("incr_allocated_bytes");
2338 }
2339 
2340 address MacroAssembler::emit_trampoline_stub(int destination_toc_offset,
2341                                              int insts_call_instruction_offset, Register Rtoc) {
2342   // Start the stub.
2343   address stub = start_a_stub(64);
2344   if (stub == NULL) { return NULL; } // CodeCache full: bail out
2345 
2346   // Create a trampoline stub relocation which relates this trampoline stub
2347   // with the call instruction at insts_call_instruction_offset in the
2348   // instructions code-section.
2349   relocate(trampoline_stub_Relocation::spec(code()->insts()->start() + insts_call_instruction_offset));
2350   const int stub_start_offset = offset();
2351 
2352   // For java_to_interp stubs we use R11_scratch1 as scratch register
2353   // and in call trampoline stubs we use R12_scratch2. This way we
2354   // can distinguish them (see is_NativeCallTrampolineStub_at()).
2355   Register reg_scratch = R12_scratch2;
2356 
2357   // Now, create the trampoline stub's code:
2358   // - load the TOC
2359   // - load the call target from the constant pool
2360   // - call
2361   if (Rtoc == noreg) {
2362     calculate_address_from_global_toc(reg_scratch, method_toc());
2363     Rtoc = reg_scratch;
2364   }
2365 
2366   ld_largeoffset_unchecked(reg_scratch, destination_toc_offset, Rtoc, false);
2367   mtctr(reg_scratch);
2368   bctr();
2369 
2370   const address stub_start_addr = addr_at(stub_start_offset);
2371 
2372   // Assert that the encoded destination_toc_offset can be identified and that it is correct.
2373   assert(destination_toc_offset == NativeCallTrampolineStub_at(stub_start_addr)->destination_toc_offset(),
2374          "encoded offset into the constant pool must match");
2375   // Trampoline_stub_size should be good.
2376   assert((uint)(offset() - stub_start_offset) <= trampoline_stub_size, "should be good size");
2377   assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");
2378 
2379   // End the stub.
2380   end_a_stub();
2381   return stub;
2382 }
2383 
2384 // TM on PPC64.
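     // The two helpers below each use a load-reserve/store-conditional retry loop and leave
     // the new value in 'result'. Sketch: atomic_inc_ptr: *addr += simm16 (64-bit);
     // atomic_ori_int: *addr |= uimm16 (32-bit).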
2385 void MacroAssembler::atomic_inc_ptr(Register addr, Register result, int simm16) {
2386   Label retry;
2387   bind(retry);
2388   ldarx(result, addr, /*hint*/ false);
2389   addi(result, result, simm16);
2390   stdcx_(result, addr);
2391   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
2392     bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
2393   } else {
2394     bne(                  CCR0, retry); // stXcx_ sets CCR0
2395   }
2396 }
2397 
2398 void MacroAssembler::atomic_ori_int(Register addr, Register result, int uimm16) {
2399   Label retry;
2400   bind(retry);
2401   lwarx(result, addr, /*hint*/ false);
2402   ori(result, result, uimm16);
2403   stwcx_(result, addr);
2404   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
2405     bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
2406   } else {
2407     bne(                  CCR0, retry); // stXcx_ sets CCR0
2408   }
2409 }
2410 
2411 #if INCLUDE_RTM_OPT
2412 
2413 // Update rtm_counters based on abort status
2414 // input: abort_status
2415 //        rtm_counters_Reg (RTMLockingCounters*)
2416 void MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters_Reg) {
2417   // Mapping to keep PreciseRTMLockingStatistics similar to x86.
2418   // x86 ppc (! means inverted, ? means not the same)
2419   //  0   31  Set if abort caused by XABORT instruction.
2420   //  1  ! 7  If set, the transaction may succeed on a retry. This bit is always clear if bit 0 is set.
2421   //  2   13  Set if another logical processor conflicted with a memory address that was part of the transaction that aborted.
2422   //  3   10  Set if an internal buffer overflowed.
2423   //  4  ?12  Set if a debug breakpoint was hit.
2424   //  5  ?32  Set if an abort occurred during execution of a nested transaction.
2425   const int failure_bit[] = {tm_tabort, // Signal handler will set this too.
2426                              tm_failure_persistent, // Inverted: transient.
2427                              tm_non_trans_cf,
2428                              tm_trans_cf,
2429                              tm_footprint_of,
2430                              tm_failure_code,
2431                              tm_transaction_level};
2432 
2433   const bool failure_logic_inv[] = {false,  // tabort
2434                                     true,   // failure_persistent
2435                                     false,  // non_trans_cf
2436                                     false,  // trans_cf
2437                                     false,  // footprint_of
2438                                     true,   // failure_code
2439                                     false}; // transaction_level
2440 
2441   const int num_failure_bits = sizeof(failure_bit) / sizeof(int);
2442   const int num_counters = RTMLockingCounters::ABORT_STATUS_LIMIT;
2443 
2444   bool bit2counter_map[][num_counters] =
2445   // counters:
2446   // 0        1        2         3         4         5
2447   // abort  , persist, conflict, overflow, debug   , nested        bits:
2448   {{ true   , false  , false   , false   , false   , false },   // abort
2449    { false  , true   , false   , false   , false   , false },   // failure_persistent
2450    { false  , false  , true    , false   , false   , false },   // non_trans_cf
2451    { false  , false  , true    , false   , false   , false },   // trans_cf
2452    { false  , false  , false   , true    , false   , false },   // footprint_of
2453    { false  , false  , false   , false   , true    , false },   // failure_code = 0xD4
2454    { false  , false  , false   , false   , false   , true  }};  // transaction_level > 1
2455   // ...
2456 
2457   // Move abort_status value to R0 and use abort_status register as a
2458   // temporary register because R0 as third operand in ld/std is treated
2459   // as base address zero (value). Likewise, R0 as second operand in addi
2460   // is problematic because it amounts to li.
2461   const Register temp_Reg = abort_status;
2462   const Register abort_status_R0 = R0;
2463   mr(abort_status_R0, abort_status);
2464 
2465   // Keep track of offsets added to rtm_counters_Reg to restore it back.
2466   int counters_offs = RTMLockingCounters::abort_count_offset();
2467   addi(rtm_counters_Reg, rtm_counters_Reg, counters_offs);
2468 
2469   // Increment total abort counter.
2470   // atomic_inc_ptr(addr_Reg, temp_Reg); We don't increment atomically.
2471   ldx(temp_Reg, rtm_counters_Reg);
2472   addi(temp_Reg, temp_Reg, 1);
2473   stdx(temp_Reg, rtm_counters_Reg);
2474 
2475   // Increment specific abort counters.
2476   if (PrintPreciseRTMLockingStatistics) {
2477 
2478     int abort_offs;
2479 
2480     abort_offs = RTMLockingCounters::abortX_count_offset() - counters_offs;
2481     addi(rtm_counters_Reg, rtm_counters_Reg, abort_offs);
2482 
2483     // Keep track of offsets added to rtm_counters_Reg.
2484     counters_offs += abort_offs;
2485 
2486     for (int nbit = 0; nbit < num_failure_bits; nbit++) {
2487       for (int ncounter = 0; ncounter < num_counters; ncounter++) {
2488         if (bit2counter_map[nbit][ncounter] == true) {
2489 
2490               Label check_abort;
2491 
2492               // Counter offset based on counter number (counter * 8 bytes).
2493               abort_offs = ncounter << 3;
2494 
2495               if (failure_bit[nbit] == tm_transaction_level) {
2496                 // Don't check outer transaction, TL = 1 (bit 63). Hence only
2497                 // 11 bits in the TL field are checked to find out if failure
2498                 // occurred in a nested transaction. This check also matches
2499                 // the case when nesting_of = 1 (nesting overflow).
2500                 rldicr_(temp_Reg, abort_status_R0, failure_bit[nbit], 10);
2501               } else if (failure_bit[nbit] == tm_failure_code) {
2502                 // Check failure code for trap or illegal caught in TM.
2503                 // Bits 0:7 are tested as bit 7 (persistent) is copied from
2504                 // tabort or treclaim source operand.
2505                 // On Linux: trap or illegal is TM_CAUSE_SIGNAL (0xD4).
2506                 rldicl(temp_Reg, abort_status_R0, 8, 56);
2507                 cmpdi(CCR0, temp_Reg, 0xD4);
2508               } else {
2509                 rldicr_(temp_Reg, abort_status_R0, failure_bit[nbit], 0);
2510               }
2511 
2512               if (failure_logic_inv[nbit] == true) {
2513                 bne(CCR0, check_abort);
2514               } else {
2515                 beq(CCR0, check_abort);
2516               }
2517 
2518               // We don't increment atomically.
2519               ld(temp_Reg, abort_offs, rtm_counters_Reg);
2520               addi(temp_Reg, temp_Reg, 1);
2521               std(temp_Reg, abort_offs, rtm_counters_Reg);
2522 
2523               bind(check_abort);
2524         }
2525       }
2526     }
2527   }
2528 
2529   // Restore rtm_counters_Reg and abort_status.
2530   addi(rtm_counters_Reg, rtm_counters_Reg, -counters_offs);
2531   mr(abort_status, abort_status_R0);
2532 }
2533 
2534 // Branch if ((random & (count-1)) != 0); count must be a power of 2.
2535 // tmp and CR0 are killed
2536 void MacroAssembler::branch_on_random_using_tb(Register tmp, int count, Label& brLabel) {
2537   mftb(tmp);
2538   andi_(tmp, tmp, count-1);
2539   bne(CCR0, brLabel);
2540 }
2541 
2542 // Perform abort ratio calculation, set no_rtm bit if high ratio.
2543 // input:  rtm_counters_Reg (RTMLockingCounters* address) - KILLED
2544 void MacroAssembler::rtm_abort_ratio_calculation(Register rtm_counters_Reg,
2545                                                  RTMLockingCounters* rtm_counters,
2546                                                  Metadata* method_data) {
2547   Label L_done, L_check_always_rtm1, L_check_always_rtm2;
2548 
2549   if (RTMLockingCalculationDelay > 0) {
2550     // Delay calculation.
2551     ld(rtm_counters_Reg, (RegisterOrConstant)(intptr_t)RTMLockingCounters::rtm_calculation_flag_addr());
2552     cmpdi(CCR0, rtm_counters_Reg, 0);
2553     beq(CCR0, L_done);
2554     load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload
2555   }
2556   // Abort ratio calculation only if abort_count > RTMAbortThreshold.
2557   //   Aborted transactions = abort_count * 100
2558   //   All transactions = total_count *  RTMTotalCountIncrRate
2559   //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
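       //   Example (assuming the default flag values RTMTotalCountIncrRate = 64 and
       //   RTMAbortRatio = 50): no_rtm is set once abort_count * 100 >= total_count * 64 * 50.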
2560   ld(R0, RTMLockingCounters::abort_count_offset(), rtm_counters_Reg);
2561   if (is_simm(RTMAbortThreshold, 16)) {   // cmpdi can handle 16bit immediate only.
2562     cmpdi(CCR0, R0, RTMAbortThreshold);
2563     blt(CCR0, L_check_always_rtm2);  // reload of rtm_counters_Reg not necessary
2564   } else {
2565     load_const_optimized(rtm_counters_Reg, RTMAbortThreshold);
2566     cmpd(CCR0, R0, rtm_counters_Reg);
2567     blt(CCR0, L_check_always_rtm1);  // reload of rtm_counters_Reg required
2568   }
2569   mulli(R0, R0, 100);
2570 
2571   const Register tmpReg = rtm_counters_Reg;
2572   ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg);
2573   mulli(tmpReg, tmpReg, RTMTotalCountIncrRate); // allowable range: int16
2574   mulli(tmpReg, tmpReg, RTMAbortRatio);         // allowable range: int16
2575   cmpd(CCR0, R0, tmpReg);
2576   blt(CCR0, L_check_always_rtm1); // jump to reload
2577   if (method_data != NULL) {
2578     // Set rtm_state to "no rtm" in MDO.
2579     // Not using a metadata relocation. Method and Class Loader are kept alive anyway.
2580     // (See nmethod::metadata_do and CodeBuffer::finalize_oop_references.)
2581     load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg);
2582     atomic_ori_int(R0, tmpReg, NoRTM);
2583   }
2584   b(L_done);
2585 
2586   bind(L_check_always_rtm1);
2587   load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload
2588   bind(L_check_always_rtm2);
2589   ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg);
2590   int64_t thresholdValue = RTMLockingThreshold / RTMTotalCountIncrRate;
2591   if (is_simm(thresholdValue, 16)) {   // cmpdi can handle 16bit immediate only.
2592     cmpdi(CCR0, tmpReg, thresholdValue);
2593   } else {
2594     load_const_optimized(R0, thresholdValue);
2595     cmpd(CCR0, tmpReg, R0);
2596   }
2597   blt(CCR0, L_done);
2598   if (method_data != NULL) {
2599     // Set rtm_state to "always rtm" in MDO.
2600     // Not using a metadata relocation. See above.
2601     load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg);
2602     atomic_ori_int(R0, tmpReg, UseRTM);
2603   }
2604   bind(L_done);
2605 }
2606 
2607 // Update counters and perform abort ratio calculation.
2608 // input: abort_status_Reg
2609 void MacroAssembler::rtm_profiling(Register abort_status_Reg, Register temp_Reg,
2610                                    RTMLockingCounters* rtm_counters,
2611                                    Metadata* method_data,
2612                                    bool profile_rtm) {
2613 
2614   assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2615   // Update rtm counters based on state at abort.
2616   // Reads abort_status_Reg, updates flags.
2617   assert_different_registers(abort_status_Reg, temp_Reg);
2618   load_const_optimized(temp_Reg, (address)rtm_counters, R0);
2619   rtm_counters_update(abort_status_Reg, temp_Reg);
2620   if (profile_rtm) {
2621     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2622     rtm_abort_ratio_calculation(temp_Reg, rtm_counters, method_data);
2623   }
2624 }
2625 
2626 // Retry on abort if abort's status indicates non-persistent failure.
2627 // inputs: retry_count_Reg
2628 //       : abort_status_Reg
2629 // output: retry_count_Reg decremented by 1
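     // Sketch (pseudocode):
     //   if (abort status says the failure is persistent) return;   // retrying is pointless
     //   if (--retry_count_Reg < 0) return;
     //   goto retryLabel;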
2630 void MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg,
2631                                              Label& retryLabel, Label* checkRetry) {
2632   Label doneRetry;
2633   rldicr_(R0, abort_status_Reg, tm_failure_persistent, 0);
2634   bne(CCR0, doneRetry);
2635   if (checkRetry) { bind(*checkRetry); }
2636   addic_(retry_count_Reg, retry_count_Reg, -1);
2637   blt(CCR0, doneRetry);
2638   b(retryLabel);
2639   bind(doneRetry);
2640 }
2641 
2642 // Spin and retry if lock is busy.
2643 // inputs: owner_addr_Reg (monitor address)
2644 //       : retry_count_Reg
2645 // output: retry_count_Reg decremented by 1
2646 // CTR is killed
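// The spin below runs at low SMT priority and polls the monitor owner word until it
// is observed to be NULL or RTMSpinLoopCount iterations (via CTR) have elapsed, then
// branches back to retryLabel; if retry_count_Reg is exhausted no retry is attempted.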
2647 void MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register owner_addr_Reg, Label& retryLabel) {
2648   Label SpinLoop, doneRetry, doRetry;
2649   addic_(retry_count_Reg, retry_count_Reg, -1);
2650   blt(CCR0, doneRetry);
2651 
2652   if (RTMSpinLoopCount > 1) {
2653     li(R0, RTMSpinLoopCount);
2654     mtctr(R0);
2655   }
2656 
2657   // low thread priority
2658   smt_prio_low();
2659   bind(SpinLoop);
2660 
2661   if (RTMSpinLoopCount > 1) {
2662     bdz(doRetry);
2663     ld(R0, 0, owner_addr_Reg);
2664     cmpdi(CCR0, R0, 0);
2665     bne(CCR0, SpinLoop);
2666   }
2667 
2668   bind(doRetry);
2669 
2670   // restore thread priority to default in userspace
2671 #ifdef LINUX
2672   smt_prio_medium_low();
2673 #else
2674   smt_prio_medium();
2675 #endif
2676 
2677   b(retryLabel);
2678 
2679   bind(doneRetry);
2680 }
2681 
2682 // Use RTM for normal stack locks.
2683 // Input: objReg (object to lock)
2684 void MacroAssembler::rtm_stack_locking(ConditionRegister flag,
2685                                        Register obj, Register mark_word, Register tmp,
2686                                        Register retry_on_abort_count_Reg,
2687                                        RTMLockingCounters* stack_rtm_counters,
2688                                        Metadata* method_data, bool profile_rtm,
2689                                        Label& DONE_LABEL, Label& IsInflated) {
2690   assert(UseRTMForStackLocks, "why call this otherwise?");
2691   assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
2692   Label L_rtm_retry, L_decrement_retry, L_on_abort;
2693 
2694   if (RTMRetryCount > 0) {
2695     load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
2696     bind(L_rtm_retry);
2697   }
2698   andi_(R0, mark_word, markOopDesc::monitor_value);  // inflated vs stack-locked|neutral|biased
2699   bne(CCR0, IsInflated);
2700 
2701   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2702     Label L_noincrement;
2703     if (RTMTotalCountIncrRate > 1) {
2704       branch_on_random_using_tb(tmp, RTMTotalCountIncrRate, L_noincrement);
2705     }
2706     assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
2707     load_const_optimized(tmp, (address)stack_rtm_counters->total_count_addr(), R0);
2708     // We don't increment atomically: atomic_inc_ptr(tmp, /*temp, will be reloaded*/ mark_word) is intentionally not used.
2709     ldx(mark_word, tmp);
2710     addi(mark_word, mark_word, 1);
2711     stdx(mark_word, tmp);
2712     bind(L_noincrement);
2713   }
2714   tbegin_();
2715   beq(CCR0, L_on_abort);
2716   ld(mark_word, oopDesc::mark_offset_in_bytes(), obj);         // Reload in transaction, conflicts need to be tracked.
2717   andi(R0, mark_word, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
2718   cmpwi(flag, R0, markOopDesc::unlocked_value);                // bits = 001 unlocked
2719   beq(flag, DONE_LABEL);                                       // all done if unlocked
2720 
2721   if (UseRTMXendForLockBusy) {
2722     tend_();
2723     b(L_decrement_retry);
2724   } else {
2725     tabort_();
2726   }
2727   bind(L_on_abort);
2728   const Register abort_status_Reg = tmp;
2729   mftexasr(abort_status_Reg);
2730   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2731     rtm_profiling(abort_status_Reg, /*temp*/mark_word, stack_rtm_counters, method_data, profile_rtm);
2732   }
2733   ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // reload
2734   if (RTMRetryCount > 0) {
2735     // Retry on lock abort if abort status is not permanent.
2736     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry, &L_decrement_retry);
2737   } else {
2738     bind(L_decrement_retry);
2739   }
2740 }
2741 
2742 // Use RTM for inflated locks.
2743 // inputs: obj       (object to lock)
2744 //         mark_word (current header - KILLED)
2745 //         boxReg    (on-stack box address (displaced header location) - KILLED)
2746 void MacroAssembler::rtm_inflated_locking(ConditionRegister flag,
2747                                           Register obj, Register mark_word, Register boxReg,
2748                                           Register retry_on_busy_count_Reg, Register retry_on_abort_count_Reg,
2749                                           RTMLockingCounters* rtm_counters,
2750                                           Metadata* method_data, bool profile_rtm,
2751                                           Label& DONE_LABEL) {
2752   assert(UseRTMLocking, "why call this otherwise?");
2753   Label L_rtm_retry, L_decrement_retry, L_on_abort;
2754   // Clean monitor_value bit to get valid pointer.
2755   int owner_offset = ObjectMonitor::owner_offset_in_bytes() - markOopDesc::monitor_value;
2756 
2757   // Store non-null, using boxReg instead of (intptr_t)markOopDesc::unused_mark().
2758   std(boxReg, BasicLock::displaced_header_offset_in_bytes(), boxReg);
2759   const Register tmpReg = boxReg;
2760   const Register owner_addr_Reg = mark_word;
2761   addi(owner_addr_Reg, mark_word, owner_offset);
2762 
2763   if (RTMRetryCount > 0) {
2764     load_const_optimized(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy.
2765     load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort.
2766     bind(L_rtm_retry);
2767   }
2768   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2769     Label L_noincrement;
2770     if (RTMTotalCountIncrRate > 1) {
2771       branch_on_random_using_tb(R0, RTMTotalCountIncrRate, L_noincrement);
2772     }
2773     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2774     load_const(R0, (address)rtm_counters->total_count_addr(), tmpReg);
2775     // We don't increment atomically: atomic_inc_ptr(R0, tmpReg) is intentionally not used.
2776     ldx(tmpReg, R0);
2777     addi(tmpReg, tmpReg, 1);
2778     stdx(tmpReg, R0);
2779     bind(L_noincrement);
2780   }
2781   tbegin_();
2782   beq(CCR0, L_on_abort);
2783   // We don't reload mark word. Will only be reset at safepoint.
2784   ld(R0, 0, owner_addr_Reg); // Load in transaction, conflicts need to be tracked.
2785   cmpdi(flag, R0, 0);
2786   beq(flag, DONE_LABEL);
2787 
2788   if (UseRTMXendForLockBusy) {
2789     tend_();
2790     b(L_decrement_retry);
2791   } else {
2792     tabort_();
2793   }
2794   bind(L_on_abort);
2795   const Register abort_status_Reg = tmpReg;
2796   mftexasr(abort_status_Reg);
2797   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2798     rtm_profiling(abort_status_Reg, /*temp*/ owner_addr_Reg, rtm_counters, method_data, profile_rtm);
2799     // Restore owner_addr_Reg
2800     ld(mark_word, oopDesc::mark_offset_in_bytes(), obj);
2801 #ifdef ASSERT
2802     andi_(R0, mark_word, markOopDesc::monitor_value);
2803     asm_assert_ne("must be inflated", 0xa754); // Deflating only allowed at safepoint.
2804 #endif
2805     addi(owner_addr_Reg, mark_word, owner_offset);
2806   }
2807   if (RTMRetryCount > 0) {
2808     // Retry on lock abort if abort status is not permanent.
2809     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
2810   }
2811 
2812   // Appears unlocked - try to swing _owner from null to non-null.
2813   cmpxchgd(flag, /*current val*/ R0, (intptr_t)0, /*new val*/ R16_thread, owner_addr_Reg,
2814            MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2815            MacroAssembler::cmpxchgx_hint_acquire_lock(), noreg, &L_decrement_retry, true);
2816 
2817   if (RTMRetryCount > 0) {
2818     // On success we are done; otherwise spin and retry.
2819     b(DONE_LABEL);
2820     bind(L_decrement_retry);
2821     // Spin and retry if lock is busy.
2822     rtm_retry_lock_on_busy(retry_on_busy_count_Reg, owner_addr_Reg, L_rtm_retry);
2823   } else {
2824     bind(L_decrement_retry);
2825   }
2826 }
2827 
2828 #endif //  INCLUDE_RTM_OPT
2829 
2830 // "The box" is the space on the stack where we copy the object mark.
2831 void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register oop, Register box,
2832                                                Register temp, Register displaced_header, Register current_header,
2833                                                bool try_bias,
2834                                                RTMLockingCounters* rtm_counters,
2835                                                RTMLockingCounters* stack_rtm_counters,
2836                                                Metadata* method_data,
2837                                                bool use_rtm, bool profile_rtm) {
2838   assert_different_registers(oop, box, temp, displaced_header, current_header);
2839   assert(flag != CCR0, "bad condition register");
2840   Label cont;
2841   Label object_has_monitor;
2842   Label cas_failed;
2843 
2844   // Load markOop from object into displaced_header.
2845   ld(displaced_header, oopDesc::mark_offset_in_bytes(), oop);
2846 
2847 
2848   // Always do locking in runtime.
2849   if (EmitSync & 0x01) {
2850     cmpdi(flag, oop, 0); // Oop can't be 0 here => always false.
2851     return;
2852   }
2853 
2854   if (try_bias) {
2855     biased_locking_enter(flag, oop, displaced_header, temp, current_header, cont);
2856   }
2857 
2858 #if INCLUDE_RTM_OPT
2859   if (UseRTMForStackLocks && use_rtm) {
2860     rtm_stack_locking(flag, oop, displaced_header, temp, /*temp*/ current_header,
2861                       stack_rtm_counters, method_data, profile_rtm,
2862                       cont, object_has_monitor);
2863   }
2864 #endif // INCLUDE_RTM_OPT
2865 
2866   // Handle existing monitor.
2867   if ((EmitSync & 0x02) == 0) {
2868     // The object has an existing monitor iff (mark & monitor_value) != 0.
2869     andi_(temp, displaced_header, markOopDesc::monitor_value);
2870     bne(CCR0, object_has_monitor);
2871   }
2872 
2873   // Set displaced_header to be (markOop of object | UNLOCK_VALUE).
2874   ori(displaced_header, displaced_header, markOopDesc::unlocked_value);
2875 
2876   // displaced_header serves as the compare value for the cmpxchg below.
2877 
2878   // Initialize the box. (Must happen before we update the object mark!)
2879   std(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2880 
2881   // Must fence, otherwise, preceding store(s) may float below cmpxchg.
2882   // Compare object markOop with displaced_header and, if equal, exchange box for it.
2883   cmpxchgd(/*flag=*/flag,
2884            /*current_value=*/current_header,
2885            /*compare_value=*/displaced_header,
2886            /*exchange_value=*/box,
2887            /*where=*/oop,
2888            MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2889            MacroAssembler::cmpxchgx_hint_acquire_lock(),
2890            noreg,
2891            &cas_failed,
2892            /*check without membar and ldarx first*/true);
2893   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2894 
2895   // If the compare-and-exchange succeeded, then we found an unlocked
2896   // object and we have now locked it.
2897   b(cont);
2898 
2899   bind(cas_failed);
2900   // We did not see an unlocked object so try the fast recursive case.
2901 
2902   // Check if the owner is self by comparing the value in the markOop of object
2903   // (current_header) with the stack pointer.
2904   sub(current_header, current_header, R1_SP);
2905   load_const_optimized(temp, ~(os::vm_page_size()-1) | markOopDesc::lock_mask_in_place);
2906 
2907   and_(R0/*==0?*/, current_header, temp);
2908   // If the condition is true we are done (flag is EQ at cont) and hence we can store 0 as the
2909   // displaced header in the box, which indicates that it is a recursive lock.
2910   mcrf(flag, CCR0);
2911   std(R0/*==0, perhaps*/, BasicLock::displaced_header_offset_in_bytes(), box);
2912 
2913   // Handle existing monitor.
2914   if ((EmitSync & 0x02) == 0) {
2915     b(cont);
2916 
2917     bind(object_has_monitor);
2918     // The object's monitor m is unlocked iff m->owner == NULL,
2919     // otherwise m->owner may contain a thread or a stack address.
2920 
2921 #if INCLUDE_RTM_OPT
2922     // Use the same RTM locking code in 32- and 64-bit VM.
2923     if (use_rtm) {
2924       rtm_inflated_locking(flag, oop, displaced_header, box, temp, /*temp*/ current_header,
2925                            rtm_counters, method_data, profile_rtm, cont);
2926     } else {
2927 #endif // INCLUDE_RTM_OPT
2928 
2929     // Try to CAS m->owner from NULL to current thread.
2930     addi(temp, displaced_header, ObjectMonitor::owner_offset_in_bytes()-markOopDesc::monitor_value);
2931     cmpxchgd(/*flag=*/flag,
2932              /*current_value=*/current_header,
2933              /*compare_value=*/(intptr_t)0,
2934              /*exchange_value=*/R16_thread,
2935              /*where=*/temp,
2936              MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2937              MacroAssembler::cmpxchgx_hint_acquire_lock());
2938 
2939     // Store a non-null value into the box.
2940     std(box, BasicLock::displaced_header_offset_in_bytes(), box);
2941 
2942 #   ifdef ASSERT
2943     bne(flag, cont);
2944     // We have acquired the monitor, check some invariants.
2945     addi(/*monitor=*/temp, temp, -ObjectMonitor::owner_offset_in_bytes());
2946     // Invariant 1: _recursions should be 0.
2947     //assert(ObjectMonitor::recursions_size_in_bytes() == 8, "unexpected size");
2948     asm_assert_mem8_is_zero(ObjectMonitor::recursions_offset_in_bytes(), temp,
2949                             "monitor->_recursions should be 0", -1);
2950 #   endif
2951 
2952 #if INCLUDE_RTM_OPT
2953     } // use_rtm()
2954 #endif
2955   }
2956 
2957   bind(cont);
2958   // flag == EQ indicates success
2959   // flag == NE indicates failure
2960 }
2961 
2962 void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register oop, Register box,
2963                                                  Register temp, Register displaced_header, Register current_header,
2964                                                  bool try_bias, bool use_rtm) {
2965   assert_different_registers(oop, box, temp, displaced_header, current_header);
2966   assert(flag != CCR0, "bad condition register");
2967   Label cont;
2968   Label object_has_monitor;
2969 
2970   // Always do locking in runtime.
2971   if (EmitSync & 0x01) {
2972     cmpdi(flag, oop, 0); // Oop can't be 0 here => always false.
2973     return;
2974   }
2975 
2976   if (try_bias) {
2977     biased_locking_exit(flag, oop, current_header, cont);
2978   }
2979 
2980 #if INCLUDE_RTM_OPT
2981   if (UseRTMForStackLocks && use_rtm) {
2982     assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
2983     Label L_regular_unlock;
2984     ld(current_header, oopDesc::mark_offset_in_bytes(), oop);         // fetch markword
2985     andi(R0, current_header, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
2986     cmpwi(flag, R0, markOopDesc::unlocked_value);                     // bits = 001 unlocked
2987     bne(flag, L_regular_unlock);                                      // else RegularLock
2988     tend_();                                                          // otherwise end...
2989     b(cont);                                                          // ... and we're done
2990     bind(L_regular_unlock);
2991   }
2992 #endif
2993 
2994   // Find the lock address and load the displaced header from the stack.
2995   ld(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2996 
2997   // If the displaced header is 0, we have a recursive unlock.
2998   cmpdi(flag, displaced_header, 0);
2999   beq(flag, cont);
3000 
3001   // Handle existing monitor.
3002   if ((EmitSync & 0x02) == 0) {
3003     // The object has an existing monitor iff (mark & monitor_value) != 0.
3004     RTM_OPT_ONLY( if (!(UseRTMForStackLocks && use_rtm)) ) // skip load if already done
3005     ld(current_header, oopDesc::mark_offset_in_bytes(), oop);
3006     andi_(R0, current_header, markOopDesc::monitor_value);
3007     bne(CCR0, object_has_monitor);
3008   }
3009 
3010   // Check if it is still a lightweight lock; this is true if we see
3011   // the stack address of the basicLock in the markOop of the object.
3012   // Cmpxchg sets flag to cmpd(current_header, box).
3013   cmpxchgd(/*flag=*/flag,
3014            /*current_value=*/current_header,
3015            /*compare_value=*/box,
3016            /*exchange_value=*/displaced_header,
3017            /*where=*/oop,
3018            MacroAssembler::MemBarRel,
3019            MacroAssembler::cmpxchgx_hint_release_lock(),
3020            noreg,
3021            &cont);
3022 
3023   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
3024 
3025   // Handle existing monitor.
3026   if ((EmitSync & 0x02) == 0) {
3027     b(cont);
3028 
3029     bind(object_has_monitor);
3030     addi(current_header, current_header, -markOopDesc::monitor_value); // monitor
3031     ld(temp,             ObjectMonitor::owner_offset_in_bytes(), current_header);
3032 
3033     // It's inflated.
3034 #if INCLUDE_RTM_OPT
3035     if (use_rtm) {
3036       Label L_regular_inflated_unlock;
3037       // Clean monitor_value bit to get valid pointer
3038       cmpdi(flag, temp, 0);
3039       bne(flag, L_regular_inflated_unlock);
3040       tend_();
3041       b(cont);
3042       bind(L_regular_inflated_unlock);
3043     }
3044 #endif
3045 
3046     ld(displaced_header, ObjectMonitor::recursions_offset_in_bytes(), current_header);
3047     xorr(temp, R16_thread, temp);      // Will be 0 if we are the owner.
3048     orr(temp, temp, displaced_header); // Will be 0 if there are 0 recursions.
3049     cmpdi(flag, temp, 0);
3050     bne(flag, cont);
3051 
3052     ld(temp,             ObjectMonitor::EntryList_offset_in_bytes(), current_header);
3053     ld(displaced_header, ObjectMonitor::cxq_offset_in_bytes(), current_header);
3054     orr(temp, temp, displaced_header); // Will be 0 if both are 0.
3055     cmpdi(flag, temp, 0);
3056     bne(flag, cont);
3057     release();
3058     std(temp, ObjectMonitor::owner_offset_in_bytes(), current_header);
3059   }
3060 
3061   bind(cont);
3062   // flag == EQ indicates success
3063   // flag == NE indicates failure
3064 }
3065 
3066 // Write serialization page so VM thread can do a pseudo remote membar.
3067 // We use the current thread pointer to calculate a thread-specific
3068 // offset to write to within the page. This minimizes bus traffic
3069 // due to cache line collisions.
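// A sketch in C of the access pattern emitted below (the stored value itself is
// irrelevant, only the page access matters):
//   offset = (thread >> serialize_page_shift) & (page_size - sizeof(int));
//   *(volatile int*)(serialize_page + offset) = <whatever is in R0>;  // after a release barrier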
3070 void MacroAssembler::serialize_memory(Register thread, Register tmp1, Register tmp2) {
3071   srdi(tmp2, thread, os::get_serialize_page_shift_count());
3072 
3073   int mask = os::vm_page_size() - sizeof(int);
3074   if (Assembler::is_simm(mask, 16)) {
3075     andi(tmp2, tmp2, mask);
3076   } else {
3077     lis(tmp1, (int)((signed short) (mask >> 16)));
3078     ori(tmp1, tmp1, mask & 0x0000ffff);
3079     andr(tmp2, tmp2, tmp1);
3080   }
3081 
3082   load_const(tmp1, (long) os::get_memory_serialize_page());
3083   release();
3084   stwx(R0, tmp1, tmp2);
3085 }
3086 
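// Poll for a pending safepoint and branch to slow_path if one is pending.
// With thread-local polling the per-thread polling word is tested for the poll bit;
// otherwise the global SafepointSynchronize state is compared against _not_synchronized.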
3087 void MacroAssembler::safepoint_poll(Label& slow_path, Register temp_reg) {
3088   if (SafepointMechanism::uses_thread_local_poll()) {
3089     ld(temp_reg, in_bytes(Thread::polling_page_offset()), R16_thread);
3090     // Armed page has poll_bit set.
3091     andi_(temp_reg, temp_reg, SafepointMechanism::poll_bit());
3092   } else {
3093     lwz(temp_reg, (RegisterOrConstant)(intptr_t)SafepointSynchronize::address_of_state());
3094     cmpwi(CCR0, temp_reg, SafepointSynchronize::_not_synchronized);
3095   }
3096   bne(CCR0, slow_path);
3097 }
3098 
3099 void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2, bool needs_frame) {
3100   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3101   bs->resolve_jobject(this, value, tmp1, tmp2, needs_frame);
3102 }
3103 
3104 // Values for last_Java_pc, and last_Java_sp must comply to the rules
3105 // in frame_ppc.hpp.
3106 void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) {
3107   // Always set last_Java_pc and flags first because once last_Java_sp
3108   // is visible, has_last_Java_frame is true and users will look at the
3109   // rest of the fields. (Note: flags should always be zero before we
3110   // get here, so they don't need to be set.)
3111 
3112   // Verify that last_Java_pc was zeroed on return to Java
3113   asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread,
3114                           "last_Java_pc not zeroed before leaving Java", 0x200);
3115 
3116   // When returning from calling out from Java mode the frame anchor's
3117   // last_Java_pc will always be set to NULL. It is set here so that
3118   // if we are doing a call to native (not VM) we capture the
3119   // known pc and don't have to rely on the native call having a
3120   // standard frame linkage where we can find the pc.
3121   if (last_Java_pc != noreg)
3122     std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
3123 
3124   // Set last_Java_sp last.
3125   std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
3126 }
3127 
3128 void MacroAssembler::reset_last_Java_frame(void) {
3129   asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()),
3130                              R16_thread, "SP was not set, still zero", 0x202);
3131 
3132   BLOCK_COMMENT("reset_last_Java_frame {");
3133   li(R0, 0);
3134 
3135   // _last_Java_sp = 0
3136   std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
3137 
3138   // _last_Java_pc = 0
3139   std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
3140   BLOCK_COMMENT("} reset_last_Java_frame");
3141 }
3142 
3143 void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1) {
3144   assert_different_registers(sp, tmp1);
3145 
3146   // sp points to a TOP_IJAVA_FRAME, retrieve frame's PC via
3147   // TOP_IJAVA_FRAME_ABI.
3148   // FIXME: assert that we really have a TOP_IJAVA_FRAME here!
3149   address entry = pc();
3150   load_const_optimized(tmp1, entry);
3151 
3152   set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1);
3153 }
3154 
3155 void MacroAssembler::get_vm_result(Register oop_result) {
3156   // Read:
3157   //   R16_thread
3158   //   R16_thread->in_bytes(JavaThread::vm_result_offset())
3159   //
3160   // Updated:
3161   //   oop_result
3162   //   R16_thread->in_bytes(JavaThread::vm_result_offset())
3163 
3164   verify_thread();
3165 
3166   ld(oop_result, in_bytes(JavaThread::vm_result_offset()), R16_thread);
3167   li(R0, 0);
3168   std(R0, in_bytes(JavaThread::vm_result_offset()), R16_thread);
3169 
3170   verify_oop(oop_result);
3171 }
3172 
3173 void MacroAssembler::get_vm_result_2(Register metadata_result) {
3174   // Read:
3175   //   R16_thread
3176   //   R16_thread->in_bytes(JavaThread::vm_result_2_offset())
3177   //
3178   // Updated:
3179   //   metadata_result
3180   //   R16_thread->in_bytes(JavaThread::vm_result_2_offset())
3181 
3182   ld(metadata_result, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
3183   li(R0, 0);
3184   std(R0, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
3185 }
3186 
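// Compressed class pointer encoding (sketch): narrowKlass = (klass - narrow_klass_base) >> narrow_klass_shift.
// Each step below is skipped when the corresponding base or shift is zero.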
3187 Register MacroAssembler::encode_klass_not_null(Register dst, Register src) {
3188   Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided.
3189   if (Universe::narrow_klass_base() != 0) {
3190     // Use dst as temp if it is free.
3191     sub_const_optimized(dst, current, Universe::narrow_klass_base(), R0);
3192     current = dst;
3193   }
3194   if (Universe::narrow_klass_shift() != 0) {
3195     srdi(dst, current, Universe::narrow_klass_shift());
3196     current = dst;
3197   }
3198   return current;
3199 }
3200 
3201 void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) {
3202   if (UseCompressedClassPointers) {
3203     Register compressedKlass = encode_klass_not_null(ck, klass);
3204     stw(compressedKlass, oopDesc::klass_offset_in_bytes(), dst_oop);
3205   } else {
3206     std(klass, oopDesc::klass_offset_in_bytes(), dst_oop);
3207   }
3208 }
3209 
3210 void MacroAssembler::store_klass_gap(Register dst_oop, Register val) {
3211   if (UseCompressedClassPointers) {
3212     if (val == noreg) {
3213       val = R0;
3214       li(val, 0);
3215     }
3216     stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop); // klass gap if compressed
3217   }
3218 }
3219 
3220 int MacroAssembler::instr_size_for_decode_klass_not_null() {
3221   if (!UseCompressedClassPointers) return 0;
3222   int num_instrs = 1;  // shift or move
3223   if (Universe::narrow_klass_base() != 0) num_instrs = 7;  // shift + load const + add
3224   return num_instrs * BytesPerInstWord;
3225 }
3226 
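// Inverse of encode_klass_not_null (sketch): klass = (narrowKlass << narrow_klass_shift) + narrow_klass_base.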
3227 void MacroAssembler::decode_klass_not_null(Register dst, Register src) {
3228   assert(dst != R0, "Dst reg may not be R0, as R0 is used here.");
3229   if (src == noreg) src = dst;
3230   Register shifted_src = src;
3231   if (Universe::narrow_klass_shift() != 0 ||
3232       (Universe::narrow_klass_base() == 0 && src != dst)) {  // Move required.
3233     shifted_src = dst;
3234     sldi(shifted_src, src, Universe::narrow_klass_shift());
3235   }
3236   if (Universe::narrow_klass_base() != 0) {
3237     add_const_optimized(dst, shifted_src, Universe::narrow_klass_base(), R0);
3238   }
3239 }
3240 
3241 void MacroAssembler::load_klass(Register dst, Register src) {
3242   if (UseCompressedClassPointers) {
3243     lwz(dst, oopDesc::klass_offset_in_bytes(), src);
3244     // Attention: no null check here!
3245     decode_klass_not_null(dst, dst);
3246   } else {
3247     ld(dst, oopDesc::klass_offset_in_bytes(), src);
3248   }
3249 }
3250 
3251 // ((OopHandle)result).resolve();
3252 void MacroAssembler::resolve_oop_handle(Register result) {
3253   // OopHandle::resolve is an indirection.
3254   ld(result, 0, result);
3255 }
3256 
3257 void MacroAssembler::load_mirror_from_const_method(Register mirror, Register const_method) {
3258   ld(mirror, in_bytes(ConstMethod::constants_offset()), const_method);
3259   ld(mirror, ConstantPool::pool_holder_offset_in_bytes(), mirror);
3260   ld(mirror, in_bytes(Klass::java_mirror_offset()), mirror);
3261   resolve_oop_handle(mirror);
3262 }
3263 
3264 // Clear Array
3265 // For very short arrays. tmp == R0 is allowed.
3266 void MacroAssembler::clear_memory_unrolled(Register base_ptr, int cnt_dwords, Register tmp, int offset) {
3267   if (cnt_dwords > 0) { li(tmp, 0); }
3268   for (int i = 0; i < cnt_dwords; ++i) { std(tmp, offset + i * 8, base_ptr); }
3269 }
3270 
3271 // Version for constant short array length. Kills base_ptr. tmp == R0 is allowed.
3272 void MacroAssembler::clear_memory_constlen(Register base_ptr, int cnt_dwords, Register tmp) {
3273   if (cnt_dwords < 8) {
3274     clear_memory_unrolled(base_ptr, cnt_dwords, tmp);
3275     return;
3276   }
3277 
3278   Label loop;
3279   const long loopcnt   = cnt_dwords >> 1,
3280              remainder = cnt_dwords & 1;
3281 
3282   li(tmp, loopcnt);
3283   mtctr(tmp);
3284   li(tmp, 0);
3285   bind(loop);
3286     std(tmp, 0, base_ptr);
3287     std(tmp, 8, base_ptr);
3288     addi(base_ptr, base_ptr, 16);
3289     bdnz(loop);
3290   if (remainder) { std(tmp, 0, base_ptr); }
3291 }
3292 
3293 // Kills both input registers. tmp == R0 is allowed.
3294 void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp, long const_cnt) {
3295   // Procedure for large arrays (uses data cache block zero instruction).
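  // Rough outline of the phases below:
  //   1) clear single dwords until base_ptr is cache-line aligned,
  //   2) clear whole cache lines with dcbz,
  //   3) clear the remaining dwords.
  // Small constant lengths are delegated to clear_memory_constlen instead.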
3296     Label startloop, fast, fastloop, small_rest, restloop, done;
3297     const int cl_size         = VM_Version::L1_data_cache_line_size(),
3298               cl_dwords       = cl_size >> 3,
3299               cl_dw_addr_bits = exact_log2(cl_dwords),
3300               dcbz_min        = 1,  // Min count of dcbz executions, needs to be >0.
3301               min_cnt         = ((dcbz_min + 1) << cl_dw_addr_bits) - 1;
3302 
3303   if (const_cnt >= 0) {
3304     // Constant case.
3305     if (const_cnt < min_cnt) {
3306       clear_memory_constlen(base_ptr, const_cnt, tmp);
3307       return;
3308     }
3309     load_const_optimized(cnt_dwords, const_cnt, tmp);
3310   } else {
3311     // cnt_dwords already loaded in register. Need to check size.
3312     cmpdi(CCR1, cnt_dwords, min_cnt); // Big enough? (ensure >= dcbz_min lines included).
3313     blt(CCR1, small_rest);
3314   }
3315     rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits); // Extract dword offset within first cache line.
3316     beq(CCR0, fast);                                  // Already 128byte aligned.
3317 
3318     subfic(tmp, tmp, cl_dwords);
3319     mtctr(tmp);                        // Set ctr to hit 128byte boundary (0<ctr<cl_dwords).
3320     subf(cnt_dwords, tmp, cnt_dwords); // rest.
3321     li(tmp, 0);
3322 
3323   bind(startloop);                     // Clear at the beginning to reach 128byte boundary.
3324     std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
3325     addi(base_ptr, base_ptr, 8);
3326     bdnz(startloop);
3327 
3328   bind(fast);                                  // Clear 128byte blocks.
3329     srdi(tmp, cnt_dwords, cl_dw_addr_bits);    // Loop count for 128byte loop (>0).
3330     andi(cnt_dwords, cnt_dwords, cl_dwords-1); // Rest in dwords.
3331     mtctr(tmp);                                // Load counter.
3332 
3333   bind(fastloop);
3334     dcbz(base_ptr);                    // Clear 128byte aligned block.
3335     addi(base_ptr, base_ptr, cl_size);
3336     bdnz(fastloop);
3337 
3338   bind(small_rest);
3339     cmpdi(CCR0, cnt_dwords, 0);        // size 0?
3340     beq(CCR0, done);                   // rest == 0
3341     li(tmp, 0);
3342     mtctr(cnt_dwords);                 // Load counter.
3343 
3344   bind(restloop);                      // Clear rest.
3345     std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
3346     addi(base_ptr, base_ptr, 8);
3347     bdnz(restloop);
3348 
3349   bind(done);
3350 }
3351 
3352 /////////////////////////////////////////// String intrinsics ////////////////////////////////////////////
3353 
3354 #ifdef COMPILER2
3355 // Intrinsics for CompactStrings
3356 
3357 // Compress char[] to byte[] by compressing 16 bytes at once.
3358 void MacroAssembler::string_compress_16(Register src, Register dst, Register cnt,
3359                                         Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5,
3360                                         Label& Lfailure) {
3361 
3362   const Register tmp0 = R0;
3363   assert_different_registers(src, dst, cnt, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5);
3364   Label Lloop, Lslow;
3365 
3366   // Check if cnt >= 8 (= 16 bytes)
3367   lis(tmp1, 0xFF);                // tmp1 = 0x00FF00FF00FF00FF
3368   srwi_(tmp2, cnt, 3);
3369   beq(CCR0, Lslow);
3370   ori(tmp1, tmp1, 0xFF);
3371   rldimi(tmp1, tmp1, 32, 0);
3372   mtctr(tmp2);
3373 
3374   // 2x unrolled loop
3375   bind(Lloop);
3376   ld(tmp2, 0, src);               // _0_1_2_3 (Big Endian)
3377   ld(tmp4, 8, src);               // _4_5_6_7
3378 
3379   orr(tmp0, tmp2, tmp4);
3380   rldicl(tmp3, tmp2, 6*8, 64-24); // _____1_2
3381   rldimi(tmp2, tmp2, 2*8, 2*8);   // _0_2_3_3
3382   rldicl(tmp5, tmp4, 6*8, 64-24); // _____5_6
3383   rldimi(tmp4, tmp4, 2*8, 2*8);   // _4_6_7_7
3384 
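  // tmp0 holds the OR of 8 UTF-16 chars; andc with the 0x00FF00FF00FF00FF mask keeps
  // only their high bytes, so a non-zero result means a char > 0xFF (not latin1).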
3385   andc_(tmp0, tmp0, tmp1);
3386   bne(CCR0, Lfailure);            // Not latin1.
3387   addi(src, src, 16);
3388 
3389   rlwimi(tmp3, tmp2, 0*8, 24, 31);// _____1_3
3390   srdi(tmp2, tmp2, 3*8);          // ____0_2_
3391   rlwimi(tmp5, tmp4, 0*8, 24, 31);// _____5_7
3392   srdi(tmp4, tmp4, 3*8);          // ____4_6_
3393 
3394   orr(tmp2, tmp2, tmp3);          // ____0123
3395   orr(tmp4, tmp4, tmp5);          // ____4567
3396 
3397   stw(tmp2, 0, dst);
3398   stw(tmp4, 4, dst);
3399   addi(dst, dst, 8);
3400   bdnz(Lloop);
3401 
3402   bind(Lslow);                    // Fallback to slow version
3403 }
3404 
3405 // Compress char[] to byte[]. cnt must be positive int.
3406 void MacroAssembler::string_compress(Register src, Register dst, Register cnt, Register tmp, Label& Lfailure) {
3407   Label Lloop;
3408   mtctr(cnt);
3409 
3410   bind(Lloop);
3411   lhz(tmp, 0, src);
3412   cmplwi(CCR0, tmp, 0xff);
3413   bgt(CCR0, Lfailure);            // Not latin1.
3414   addi(src, src, 2);
3415   stb(tmp, 0, dst);
3416   addi(dst, dst, 1);
3417   bdnz(Lloop);
3418 }
3419 
3420 // Inflate byte[] to char[] by inflating 16 bytes at once.
3421 void MacroAssembler::string_inflate_16(Register src, Register dst, Register cnt,
3422                                        Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) {
3423   const Register tmp0 = R0;
3424   assert_different_registers(src, dst, cnt, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5);
3425   Label Lloop, Lslow;
3426 
3427   // Check if cnt >= 8
3428   srwi_(tmp2, cnt, 3);
3429   beq(CCR0, Lslow);
3430   lis(tmp1, 0xFF);                // tmp1 = 0x00FF00FF
3431   ori(tmp1, tmp1, 0xFF);
3432   mtctr(tmp2);
3433 
3434   // 2x unrolled loop
3435   bind(Lloop);
3436   lwz(tmp2, 0, src);              // ____0123 (Big Endian)
3437   lwz(tmp4, 4, src);              // ____4567
3438   addi(src, src, 8);
3439 
3440   rldicl(tmp3, tmp2, 7*8, 64-8);  // _______2
3441   rlwimi(tmp2, tmp2, 3*8, 16, 23);// ____0113
3442   rldicl(tmp5, tmp4, 7*8, 64-8);  // _______6
3443   rlwimi(tmp4, tmp4, 3*8, 16, 23);// ____4557
3444 
3445   andc(tmp0, tmp2, tmp1);         // ____0_1_
3446   rlwimi(tmp2, tmp3, 2*8, 0, 23); // _____2_3
3447   andc(tmp3, tmp4, tmp1);         // ____4_5_
3448   rlwimi(tmp4, tmp5, 2*8, 0, 23); // _____6_7
3449 
3450   rldimi(tmp2, tmp0, 3*8, 0*8);   // _0_1_2_3
3451   rldimi(tmp4, tmp3, 3*8, 0*8);   // _4_5_6_7
3452 
3453   std(tmp2, 0, dst);
3454   std(tmp4, 8, dst);
3455   addi(dst, dst, 16);
3456   bdnz(Lloop);
3457 
3458   bind(Lslow);                    // Fallback to slow version
3459 }
3460 
3461 // Inflate byte[] to char[]. cnt must be positive int.
3462 void MacroAssembler::string_inflate(Register src, Register dst, Register cnt, Register tmp) {
3463   Label Lloop;
3464   mtctr(cnt);
3465 
3466   bind(Lloop);
3467   lbz(tmp, 0, src);
3468   addi(src, src, 1);
3469   sth(tmp, 0, dst);
3470   addi(dst, dst, 2);
3471   bdnz(Lloop);
3472 }
3473 
3474 void MacroAssembler::string_compare(Register str1, Register str2,
3475                                     Register cnt1, Register cnt2,
3476                                     Register tmp1, Register result, int ae) {
3477   const Register tmp0 = R0,
3478                  diff = tmp1;
3479 
3480   assert_different_registers(str1, str2, cnt1, cnt2, tmp0, tmp1, result);
3481   Label Ldone, Lslow, Lloop, Lreturn_diff;
3482 
3483   // Note: Making use of the fact that compareTo(a, b) == -compareTo(b, a)
3484   // we interchange str1 and str2 in the UL case and negate the result.
3485   // Like this, str1 is always latin1 encoded, except for the UU case.
3486   // In addition, the counts need to be zero-extended (sign extension gives the same result here).
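  // The value computed below is, roughly:
  //   result = first mismatch found ? (chr1 - chr2) : (cnt1 - cnt2);
  // with the result negated at the end for the UL case (see above).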
3487 
3488   if (ae == StrIntrinsicNode::UU) {
3489     srwi(cnt1, cnt1, 1);
3490   } else {
3491     clrldi(cnt1, cnt1, 32);
3492   }
3493 
3494   if (ae != StrIntrinsicNode::LL) {
3495     srwi(cnt2, cnt2, 1);
3496   } else {
3497     clrldi(cnt2, cnt2, 32);
3498   }
3499 
3500   // See if the lengths are different, and calculate min in cnt1.
3501   // Save diff in case we need it for a tie-breaker.
3502   subf_(diff, cnt2, cnt1); // diff = cnt1 - cnt2
3503   // if (diff > 0) { cnt1 = cnt2; }
3504   if (VM_Version::has_isel()) {
3505     isel(cnt1, CCR0, Assembler::greater, /*invert*/ false, cnt2);
3506   } else {
3507     Label Lskip;
3508     blt(CCR0, Lskip);
3509     mr(cnt1, cnt2);
3510     bind(Lskip);
3511   }
3512 
3513   // Rename registers
3514   Register chr1 = result;
3515   Register chr2 = tmp0;
3516 
3517   // Compare multiple characters in fast loop (only implemented for same encoding).
3518   int stride1 = 8, stride2 = 8;
3519   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3520     int log2_chars_per_iter = (ae == StrIntrinsicNode::LL) ? 3 : 2;
3521     Label Lfastloop, Lskipfast;
3522 
3523     srwi_(tmp0, cnt1, log2_chars_per_iter);
3524     beq(CCR0, Lskipfast);
3525     rldicl(cnt2, cnt1, 0, 64 - log2_chars_per_iter); // Remaining characters.
3526     li(cnt1, 1 << log2_chars_per_iter); // Initialize for failure case: Rescan characters from current iteration.
3527     mtctr(tmp0);
3528 
3529     bind(Lfastloop);
3530     ld(chr1, 0, str1);
3531     ld(chr2, 0, str2);
3532     cmpd(CCR0, chr1, chr2);
3533     bne(CCR0, Lslow);
3534     addi(str1, str1, stride1);
3535     addi(str2, str2, stride2);
3536     bdnz(Lfastloop);
3537     mr(cnt1, cnt2); // Remaining characters.
3538     bind(Lskipfast);
3539   }
3540 
3541   // Loop which searches the first difference character by character.
3542   cmpwi(CCR0, cnt1, 0);
3543   beq(CCR0, Lreturn_diff);
3544   bind(Lslow);
3545   mtctr(cnt1);
3546 
3547   switch (ae) {
3548     case StrIntrinsicNode::LL: stride1 = 1; stride2 = 1; break;
3549     case StrIntrinsicNode::UL: // fallthru (see comment above)
3550     case StrIntrinsicNode::LU: stride1 = 1; stride2 = 2; break;
3551     case StrIntrinsicNode::UU: stride1 = 2; stride2 = 2; break;
3552     default: ShouldNotReachHere(); break;
3553   }
3554 
3555   bind(Lloop);
3556   if (stride1 == 1) { lbz(chr1, 0, str1); } else { lhz(chr1, 0, str1); }
3557   if (stride2 == 1) { lbz(chr2, 0, str2); } else { lhz(chr2, 0, str2); }
3558   subf_(result, chr2, chr1); // result = chr1 - chr2
3559   bne(CCR0, Ldone);
3560   addi(str1, str1, stride1);
3561   addi(str2, str2, stride2);
3562   bdnz(Lloop);
3563 
3564   // If strings are equal up to min length, return the length difference.
3565   bind(Lreturn_diff);
3566   mr(result, diff);
3567 
3568   // Otherwise, return the difference between the first mismatched chars.
3569   bind(Ldone);
3570   if (ae == StrIntrinsicNode::UL) {
3571     neg(result, result); // Negate result (see note above).
3572   }
3573 }
3574 
3575 void MacroAssembler::array_equals(bool is_array_equ, Register ary1, Register ary2,
3576                                   Register limit, Register tmp1, Register result, bool is_byte) {
3577   const Register tmp0 = R0;
3578   assert_different_registers(ary1, ary2, limit, tmp0, tmp1, result);
3579   Label Ldone, Lskiploop, Lloop, Lfastloop, Lskipfast;
3580   bool limit_needs_shift = false;
3581 
3582   if (is_array_equ) {
3583     const int length_offset = arrayOopDesc::length_offset_in_bytes();
3584     const int base_offset   = arrayOopDesc::base_offset_in_bytes(is_byte ? T_BYTE : T_CHAR);
3585 
3586     // Return true if the same array.
3587     cmpd(CCR0, ary1, ary2);
3588     beq(CCR0, Lskiploop);
3589 
3590     // Return false if one of them is NULL.
3591     cmpdi(CCR0, ary1, 0);
3592     cmpdi(CCR1, ary2, 0);
3593     li(result, 0);
3594     cror(CCR0, Assembler::equal, CCR1, Assembler::equal);
3595     beq(CCR0, Ldone);
3596 
3597     // Load the lengths of arrays.
3598     lwz(limit, length_offset, ary1);
3599     lwz(tmp0, length_offset, ary2);
3600 
3601     // Return false if the two arrays are not equal length.
3602     cmpw(CCR0, limit, tmp0);
3603     bne(CCR0, Ldone);
3604 
3605     // Load array addresses.
3606     addi(ary1, ary1, base_offset);
3607     addi(ary2, ary2, base_offset);
3608   } else {
3609     limit_needs_shift = !is_byte;
3610     li(result, 0); // Assume not equal.
3611   }
3612 
3613   // Rename registers
3614   Register chr1 = tmp0;
3615   Register chr2 = tmp1;
3616 
3617   // Compare 8 bytes per iteration in fast loop.
3618   const int log2_chars_per_iter = is_byte ? 3 : 2;
3619 
3620   srwi_(tmp0, limit, log2_chars_per_iter + (limit_needs_shift ? 1 : 0));
3621   beq(CCR0, Lskipfast);
3622   mtctr(tmp0);
3623 
3624   bind(Lfastloop);
3625   ld(chr1, 0, ary1);
3626   ld(chr2, 0, ary2);
3627   addi(ary1, ary1, 8);
3628   addi(ary2, ary2, 8);
3629   cmpd(CCR0, chr1, chr2);
3630   bne(CCR0, Ldone);
3631   bdnz(Lfastloop);
3632 
3633   bind(Lskipfast);
3634   rldicl_(limit, limit, limit_needs_shift ? 64 - 1 : 0, 64 - log2_chars_per_iter); // Remaining characters.
3635   beq(CCR0, Lskiploop);
3636   mtctr(limit);
3637 
3638   // Character by character.
3639   bind(Lloop);
3640   if (is_byte) {
3641     lbz(chr1, 0, ary1);
3642     lbz(chr2, 0, ary2);
3643     addi(ary1, ary1, 1);
3644     addi(ary2, ary2, 1);
3645   } else {
3646     lhz(chr1, 0, ary1);
3647     lhz(chr2, 0, ary2);
3648     addi(ary1, ary1, 2);
3649     addi(ary2, ary2, 2);
3650   }
3651   cmpw(CCR0, chr1, chr2);
3652   bne(CCR0, Ldone);
3653   bdnz(Lloop);
3654 
3655   bind(Lskiploop);
3656   li(result, 1); // All characters are equal.
3657   bind(Ldone);
3658 }
3659 
3660 void MacroAssembler::string_indexof(Register result, Register haystack, Register haycnt,
3661                                     Register needle, ciTypeArray* needle_values, Register needlecnt, int needlecntval,
3662                                     Register tmp1, Register tmp2, Register tmp3, Register tmp4, int ae) {
3663 
3664   // Ensure 0<needlecnt<=haycnt in ideal graph as prerequisite!
3665   Label L_TooShort, L_Found, L_NotFound, L_End;
3666   Register last_addr = haycnt, // Kill haycnt at the beginning.
3667   addr      = tmp1,
3668   n_start   = tmp2,
3669   ch1       = tmp3,
3670   ch2       = R0;
3671 
3672   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
3673   const int h_csize = (ae == StrIntrinsicNode::LL) ? 1 : 2;
3674   const int n_csize = (ae == StrIntrinsicNode::UU) ? 2 : 1;
3675 
3676   // **************************************************************************************************
3677   // Prepare for main loop: optimized for needle count >=2, bail out otherwise.
3678   // **************************************************************************************************
3679 
3680   // Compute last haystack addr to use if no match gets found.
3681   clrldi(haycnt, haycnt, 32);         // Ensure positive int is valid as 64 bit value.
3682   addi(addr, haystack, -h_csize);     // Accesses use pre-increment.
3683   if (needlecntval == 0) { // variable needlecnt
3684    cmpwi(CCR6, needlecnt, 2);
3685    clrldi(needlecnt, needlecnt, 32);  // Ensure positive int is valid as 64 bit value.
3686    blt(CCR6, L_TooShort);             // Variable needlecnt: handle short needle separately.
3687   }
3688 
3689   if (n_csize == 2) { lwz(n_start, 0, needle); } else { lhz(n_start, 0, needle); } // Load first 2 characters of needle.
3690 
3691   if (needlecntval == 0) { // variable needlecnt
3692    subf(ch1, needlecnt, haycnt);      // Last character index to compare is haycnt-needlecnt.
3693    addi(needlecnt, needlecnt, -2);    // Rest of needle.
3694   } else { // constant needlecnt
3695   guarantee(needlecntval != 1, "IndexOf with single-character needle must be handled separately");
3696   assert((needlecntval & 0x7fff) == needlecntval, "wrong immediate");
3697    addi(ch1, haycnt, -needlecntval);  // Last character index to compare is haycnt-needlecnt.
3698    if (needlecntval > 3) { li(needlecnt, needlecntval - 2); } // Rest of needle.
3699   }
3700 
3701   if (h_csize == 2) { slwi(ch1, ch1, 1); } // Scale to number of bytes.
3702 
3703   if (ae == StrIntrinsicNode::UL) {
3704    srwi(tmp4, n_start, 1*8);          // ___0
3705    rlwimi(n_start, tmp4, 2*8, 0, 23); // _0_1
3706   }
3707 
3708   add(last_addr, haystack, ch1);      // Point to last address to compare (haystack+2*(haycnt-needlecnt)).
3709 
3710   // Main Loop (now we have at least 2 characters).
3711   Label L_OuterLoop, L_InnerLoop, L_FinalCheck, L_Comp1, L_Comp2;
3712   bind(L_OuterLoop); // Search for 1st 2 characters.
3713   Register addr_diff = tmp4;
3714    subf(addr_diff, addr, last_addr);  // Difference between already checked address and last address to check.
3715    addi(addr, addr, h_csize);         // This is the new address we want to use for comparing.
3716    srdi_(ch2, addr_diff, h_csize);
3717    beq(CCR0, L_FinalCheck);           // 2 characters left?
3718    mtctr(ch2);                        // num of characters / 2
3719   bind(L_InnerLoop);                  // Main work horse (2x unrolled search loop)
3720    if (h_csize == 2) {                // Load 2 characters of haystack (ignore alignment).
3721     lwz(ch1, 0, addr);
3722     lwz(ch2, 2, addr);
3723    } else {
3724     lhz(ch1, 0, addr);
3725     lhz(ch2, 1, addr);
3726    }
3727    cmpw(CCR0, ch1, n_start);          // Compare 2 characters (1 would be sufficient but try to reduce branches to CompLoop).
3728    cmpw(CCR1, ch2, n_start);
3729    beq(CCR0, L_Comp1);                // Did we find the needle start?
3730    beq(CCR1, L_Comp2);
3731    addi(addr, addr, 2 * h_csize);
3732    bdnz(L_InnerLoop);
3733   bind(L_FinalCheck);
3734    andi_(addr_diff, addr_diff, h_csize); // Remaining characters not covered by InnerLoop: (num of characters) & 1.
3735    beq(CCR0, L_NotFound);
3736    if (h_csize == 2) { lwz(ch1, 0, addr); } else { lhz(ch1, 0, addr); } // One position left at which we have to compare.
3737    cmpw(CCR1, ch1, n_start);
3738    beq(CCR1, L_Comp1);
3739   bind(L_NotFound);
3740    li(result, -1);                    // not found
3741    b(L_End);
3742 
3743    // **************************************************************************************************
3744    // Special Case: unfortunately, the variable needle case can be called with needlecnt<2
3745    // **************************************************************************************************
3746   if (needlecntval == 0) {           // We have to handle these cases separately.
3747   Label L_OneCharLoop;
3748   bind(L_TooShort);
3749    mtctr(haycnt);
3750    if (n_csize == 2) { lhz(n_start, 0, needle); } else { lbz(n_start, 0, needle); } // First character of needle
3751   bind(L_OneCharLoop);
3752    if (h_csize == 2) { lhzu(ch1, 2, addr); } else { lbzu(ch1, 1, addr); }
3753    cmpw(CCR1, ch1, n_start);
3754    beq(CCR1, L_Found);               // Did we find the one character needle?
3755    bdnz(L_OneCharLoop);
3756    li(result, -1);                   // Not found.
3757    b(L_End);
3758   }
3759 
3760   // **************************************************************************************************
3761   // Regular Case Part II: compare rest of needle (first 2 characters have been compared already)
3762   // **************************************************************************************************
3763 
3764   // Compare the rest
3765   bind(L_Comp2);
3766    addi(addr, addr, h_csize);        // First comparison has failed, 2nd one hit.
3767   bind(L_Comp1);                     // Addr points to possible needle start.
3768   if (needlecntval != 2) {           // Const needlecnt==2?
3769    if (needlecntval != 3) {
3770     if (needlecntval == 0) { beq(CCR6, L_Found); } // Variable needlecnt==2?
3771     Register n_ind = tmp4,
3772              h_ind = n_ind;
3773     li(n_ind, 2 * n_csize);          // First 2 characters are already compared, use index 2.
3774     mtctr(needlecnt);                // Decremented by 2, still > 0.
3775    Label L_CompLoop;
3776    bind(L_CompLoop);
3777     if (ae == StrIntrinsicNode::UL) {
3778       h_ind = ch1;
3779       sldi(h_ind, n_ind, 1);
3780     }
3781     if (n_csize == 2) { lhzx(ch2, needle, n_ind); } else { lbzx(ch2, needle, n_ind); }
3782     if (h_csize == 2) { lhzx(ch1, addr, h_ind); } else { lbzx(ch1, addr, h_ind); }
3783     cmpw(CCR1, ch1, ch2);
3784     bne(CCR1, L_OuterLoop);
3785     addi(n_ind, n_ind, n_csize);
3786     bdnz(L_CompLoop);
3787    } else { // No loop required if there's only one needle character left.
3788     if (n_csize == 2) { lhz(ch2, 2 * 2, needle); } else { lbz(ch2, 2 * 1, needle); }
3789     if (h_csize == 2) { lhz(ch1, 2 * 2, addr); } else { lbz(ch1, 2 * 1, addr); }
3790     cmpw(CCR1, ch1, ch2);
3791     bne(CCR1, L_OuterLoop);
3792    }
3793   }
3794   // Return index ...
3795   bind(L_Found);
3796    subf(result, haystack, addr);     // relative to haystack, ...
3797    if (h_csize == 2) { srdi(result, result, 1); } // in characters.
3798   bind(L_End);
3799 } // string_indexof
3800 
3801 void MacroAssembler::string_indexof_char(Register result, Register haystack, Register haycnt,
3802                                          Register needle, jchar needleChar, Register tmp1, Register tmp2, bool is_byte) {
3803   assert_different_registers(haystack, haycnt, needle, tmp1, tmp2);
3804 
3805   Label L_InnerLoop, L_FinalCheck, L_Found1, L_Found2, L_NotFound, L_End;
3806   Register addr = tmp1,
3807            ch1 = tmp2,
3808            ch2 = R0;
3809 
3810   const int h_csize = is_byte ? 1 : 2;
3811 
3812 //4:
3813    srwi_(tmp2, haycnt, 1);   // Shift right by exact_log2(UNROLL_FACTOR).
3814    mr(addr, haystack);
3815    beq(CCR0, L_FinalCheck);
3816    mtctr(tmp2);              // Move to count register.
3817 //8:
3818   bind(L_InnerLoop);         // Main work horse (2x unrolled search loop).
3819    if (!is_byte) {
3820     lhz(ch1, 0, addr);
3821     lhz(ch2, 2, addr);
3822    } else {
3823     lbz(ch1, 0, addr);
3824     lbz(ch2, 1, addr);
3825    }
3826    (needle != R0) ? cmpw(CCR0, ch1, needle) : cmplwi(CCR0, ch1, (unsigned int)needleChar);
3827    (needle != R0) ? cmpw(CCR1, ch2, needle) : cmplwi(CCR1, ch2, (unsigned int)needleChar);
3828    beq(CCR0, L_Found1);      // Did we find the needle?
3829    beq(CCR1, L_Found2);
3830    addi(addr, addr, 2 * h_csize);
3831    bdnz(L_InnerLoop);
3832 //16:
3833   bind(L_FinalCheck);
3834    andi_(R0, haycnt, 1);
3835    beq(CCR0, L_NotFound);
3836    if (!is_byte) { lhz(ch1, 0, addr); } else { lbz(ch1, 0, addr); } // One position left at which we have to compare.
3837    (needle != R0) ? cmpw(CCR1, ch1, needle) : cmplwi(CCR1, ch1, (unsigned int)needleChar);
3838    beq(CCR1, L_Found1);
3839 //21:
3840   bind(L_NotFound);
3841    li(result, -1);           // Not found.
3842    b(L_End);
3843 
3844   bind(L_Found2);
3845    addi(addr, addr, h_csize);
3846 //24:
3847   bind(L_Found1);            // Return index ...
3848    subf(result, haystack, addr); // relative to haystack, ...
3849    if (!is_byte) { srdi(result, result, 1); } // in characters.
3850   bind(L_End);
3851 } // string_indexof_char
3852 
3853 
3854 void MacroAssembler::has_negatives(Register src, Register cnt, Register result,
3855                                    Register tmp1, Register tmp2) {
3856   const Register tmp0 = R0;
3857   assert_different_registers(src, result, cnt, tmp0, tmp1, tmp2);
3858   Label Lfastloop, Lslow, Lloop, Lnoneg, Ldone;
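  // Fast path idea (sketch): OR 16 bytes together and AND the result with
  // 0x8080808080808080; a non-zero value means some byte has its sign bit set.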
3859 
3860   // Check if cnt >= 16 bytes (the fast loop processes 16 bytes per iteration).
3861   lis(tmp1, (int)(short)0x8080);  // tmp1 = 0x8080808080808080
3862   srwi_(tmp2, cnt, 4);
3863   li(result, 1);                  // Assume there's a negative byte.
3864   beq(CCR0, Lslow);
3865   ori(tmp1, tmp1, 0x8080);
3866   rldimi(tmp1, tmp1, 32, 0);
3867   mtctr(tmp2);
3868 
3869   // 2x unrolled loop
3870   bind(Lfastloop);
3871   ld(tmp2, 0, src);
3872   ld(tmp0, 8, src);
3873 
3874   orr(tmp0, tmp2, tmp0);
3875 
3876   and_(tmp0, tmp0, tmp1);
3877   bne(CCR0, Ldone);               // Found negative byte.
3878   addi(src, src, 16);
3879 
3880   bdnz(Lfastloop);
3881 
3882   bind(Lslow);                    // Fallback to slow version
3883   rldicl_(tmp0, cnt, 0, 64-4);
3884   beq(CCR0, Lnoneg);
3885   mtctr(tmp0);
3886   bind(Lloop);
3887   lbz(tmp0, 0, src);
3888   addi(src, src, 1);
3889   andi_(tmp0, tmp0, 0x80);
3890   bne(CCR0, Ldone);               // Found negative byte.
3891   bdnz(Lloop);
3892   bind(Lnoneg);
3893   li(result, 0);
3894 
3895   bind(Ldone);
3896 }
3897 
3898 #endif // COMPILER2
3899 
3900 // Helpers for Intrinsic Emitters
3901 //
3902 // Revert the byte order of a 32bit value in a register
3903 //   src: 0x44556677
3904 //   dst: 0x77665544
3905 // Three steps to obtain the result:
3906 //  1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word
3907 //     into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared.
3908 //     This value initializes dst.
3909 //  2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost
3910 //     byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go.
3911 //     This value is mask inserted into dst with a [0..23] mask of 1s.
3912 //  3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position.
3913 //     This value is mask inserted into dst with a [8..15] mask of 1s.
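// Worked example for src = 0x44556677:
//   after step 1: dst = 0x00000044
//   after step 2: dst = 0x77445544
//   after step 3: dst = 0x77665544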
3914 void MacroAssembler::load_reverse_32(Register dst, Register src) {
3915   assert_different_registers(dst, src);
3916 
3917   rldicl(dst, src, (4+1)*8, 56);       // Rotate byte 4 into position 7 (rightmost), clear all to the left.
3918   rlwimi(dst, src,     3*8,  0, 23);   // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone.
3919   rlwimi(dst, src,     1*8,  8, 15);   // Insert byte 6 into position 5, leave the rest alone.
3920 }
3921 
3922 // Calculate the column addresses of the crc32 lookup table into distinct registers.
3923 // This loop-invariant calculation is moved out of the loop body, reducing the loop
3924 // body size from 20 to 16 instructions.
3925 // Returns the offset that was used to calculate the address of column tc3.
3926 // Due to register shortage, setting tc3 may overwrite table. With the return offset
3927 // at hand, the original table address can be easily reconstructed.
3928 int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) {
3929 
3930 #ifdef VM_LITTLE_ENDIAN
3931   // This is what we implement (the DOLIT4 part):
3932   // ========================================================================= */
3933   // #define DOLIT4 c ^= *buf4++; \
3934   //         c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^ \
3935   //             crc_table[1][(c >> 16) & 0xff] ^ crc_table[0][c >> 24]
3936   // #define DOLIT32 DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4
3937   // ========================================================================= */
3938   const int ix0 = 3*(4*CRC32_COLUMN_SIZE);
3939   const int ix1 = 2*(4*CRC32_COLUMN_SIZE);
3940   const int ix2 = 1*(4*CRC32_COLUMN_SIZE);
3941   const int ix3 = 0*(4*CRC32_COLUMN_SIZE);
3942 #else
3943   // This is what we implement (the DOBIG4 part):
3944   // =========================================================================
3945   // #define DOBIG4 c ^= *++buf4; \
3946   //         c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \
3947   //             crc_table[6][(c >> 16) & 0xff] ^ crc_table[7][c >> 24]
3948   // #define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4
3949   // =========================================================================
3950   const int ix0 = 4*(4*CRC32_COLUMN_SIZE);
3951   const int ix1 = 5*(4*CRC32_COLUMN_SIZE);
3952   const int ix2 = 6*(4*CRC32_COLUMN_SIZE);
3953   const int ix3 = 7*(4*CRC32_COLUMN_SIZE);
3954 #endif
3955   assert_different_registers(table, tc0, tc1, tc2);
3956   assert(table == tc3, "must be!");
3957 
3958   addi(tc0, table, ix0);
3959   addi(tc1, table, ix1);
3960   addi(tc2, table, ix2);
3961   if (ix3 != 0) addi(tc3, table, ix3);
3962 
3963   return ix3;
3964 }
3965 
3966 /**
3967  * uint32_t crc;
3968  * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
3969  */
3970 void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) {
3971   assert_different_registers(crc, table, tmp);
3972   assert_different_registers(val, table);
3973 
3974   if (crc == val) {                   // Must rotate first to use the unmodified value.
3975     rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
3976                                       // As we use a word (4-byte) instruction, we have to adapt the mask bit positions.
3977     srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
3978   } else {
3979     srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
3980     rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
3981   }
3982   lwzx(tmp, table, tmp);
3983   xorr(crc, crc, tmp);
3984 }
3985 
3986 /**
3987  * uint32_t crc;
3988  * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
3989  */
3990 void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
3991   fold_byte_crc32(crc, crc, table, tmp);
3992 }
3993 
3994 /**
3995  * Emits code to update CRC-32 with a byte value according to constants in table.
3996  *
3997  * @param [in,out]crc   Register containing the crc.
3998  * @param [in]val       Register containing the byte to fold into the CRC.
3999  * @param [in]table     Register containing the table of crc constants.
4000  *
4001  * uint32_t crc;
4002  * val = crc_table[(val ^ crc) & 0xFF];
4003  * crc = val ^ (crc >> 8);
4004  */
4005 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
4006   BLOCK_COMMENT("update_byte_crc32:");
4007   xorr(val, val, crc);
4008   fold_byte_crc32(crc, val, table, val);
4009 }
4010 
4011 /**
4012  * @param crc   register containing existing CRC (32-bit)
4013  * @param buf   register pointing to input byte buffer (byte*)
4014  * @param len   register containing number of bytes
4015  * @param table register pointing to CRC table
4016  */
4017 void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table,
4018                                            Register data, bool loopAlignment) {
4019   assert_different_registers(crc, buf, len, table, data);
4020 
4021   Label L_mainLoop, L_done;
4022   const int mainLoop_stepping  = 1;
4023   const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4;
4024 
4025   // Process all bytes in a single-byte loop.
4026   clrldi_(len, len, 32);                         // Enforce 32 bit. Anything to do?
4027   beq(CCR0, L_done);
4028 
4029   mtctr(len);
4030   align(mainLoop_alignment);
4031   BIND(L_mainLoop);
4032     lbz(data, 0, buf);                           // Byte from buffer, zero-extended.
4033     addi(buf, buf, mainLoop_stepping);           // Advance buffer position.
4034     update_byte_crc32(crc, data, table);
4035     bdnz(L_mainLoop);                            // Iterate.
4036 
4037   bind(L_done);
4038 }
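
// Reference sketch in C (comment only; 'table' is assumed to be the zlib-style
// single-column table used by update_byte_crc32 above):
//   uint32_t crc32_byte_loop(uint32_t crc, const uint8_t* buf, size_t len,
//                            const uint32_t* table) {
//     while (len--) {
//       crc = table[(crc ^ *buf++) & 0xff] ^ (crc >> 8);
//     }
//     return crc;
//   }
// The emitted loop implements this per-byte recurrence with CTR as the trip counter.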
4039 
4040 /**
4041  * Emits code to update CRC-32 with a 4-byte value according to constants in table
4042  * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c
4043  */
4044 // A note on the lookup table address(es):
4045 // The lookup table consists of two sets of four columns each.
4046 // The columns {0..3} are used for little-endian machines.
4047 // The columns {4..7} are used for big-endian machines.
4048 // To save the effort of adding the column offset to the table address each time
4049 // a table element is looked up, it is possible to pass the pre-calculated
4050 // column addresses.
4051 // Uses R9..R12 as work registers. They must be saved/restored by the caller, if necessary.
4052 void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc,
4053                                         Register t0,  Register t1,  Register t2,  Register t3,
4054                                         Register tc0, Register tc1, Register tc2, Register tc3) {
4055   assert_different_registers(crc, t3);
4056 
4057   // XOR crc with next four bytes of buffer.
4058   lwz(t3, bufDisp, buf);
4059   if (bufInc != 0) {
4060     addi(buf, buf, bufInc);
4061   }
4062   xorr(t3, t3, crc);
4063 
4064   // Chop the xor result (in t3) into 4 single-byte pieces, shifted left 2 bits, to form the table indices.
4065   rlwinm(t0, t3,  2,         24-2, 31-2);  // ((t3 >>  0) & 0xff) << 2
4066   rlwinm(t1, t3,  32+(2- 8), 24-2, 31-2);  // ((t3 >>  8) & 0xff) << 2
4067   rlwinm(t2, t3,  32+(2-16), 24-2, 31-2);  // ((t3 >> 16) & 0xff) << 2
4068   rlwinm(t3, t3,  32+(2-24), 24-2, 31-2);  // ((t3 >> 24) & 0xff) << 2
4069 
4070   // Use the pre-calculated column addresses.
4071   // Load pre-calculated table values.
4072   lwzx(t0, tc0, t0);
4073   lwzx(t1, tc1, t1);
4074   lwzx(t2, tc2, t2);
4075   lwzx(t3, tc3, t3);
4076 
4077   // Calculate new crc from table values.
4078   xorr(t0,  t0, t1);
4079   xorr(t2,  t2, t3);
4080   xorr(crc, t0, t2);  // Now crc contains the final checksum value.
4081 }
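
// Reference sketch in C (comment only) of the word-at-a-time step emitted above,
// following the zlib DOLIT4 scheme; t[k] denotes crc_table column k as in zlib, and
// crc32_table_columns maps tc0..tc3 onto these columns:
//   uint32_t crc32_word_step(uint32_t c, const uint32_t* buf4,
//                            const uint32_t* const t[4]) {
//     c ^= *buf4;
//     return t[3][ c        & 0xff] ^ t[2][(c >>  8) & 0xff] ^
//            t[1][(c >> 16) & 0xff] ^ t[0][ c >> 24        ];
//   }
// The rlwinm instructions form the four byte indices (pre-scaled by 4 for word-sized
// table entries), and the lwzx loads fetch the corresponding column entries.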
4082 
4083 /**
4084  * @param crc   register containing existing CRC (32-bit)
4085  * @param buf   register pointing to input byte buffer (byte*)
4086  * @param len   register containing number of bytes
4087  * @param table register pointing to CRC table
4088  *
4089  * Uses R9..R12 as work registers. They must be saved/restored by the caller!
4090  */
4091 void MacroAssembler::kernel_crc32_2word(Register crc, Register buf, Register len, Register table,
4092                                         Register t0,  Register t1,  Register t2,  Register t3,
4093                                         Register tc0, Register tc1, Register tc2, Register tc3,
4094                                         bool invertCRC) {
4095   assert_different_registers(crc, buf, len, table);
4096 
4097   Label L_mainLoop, L_tail;
4098   Register  tmp  = t0;
4099   Register  data = t0;
4100   Register  tmp2 = t1;
4101   const int mainLoop_stepping  = 8;
4102   const int tailLoop_stepping  = 1;
4103   const int log_stepping       = exact_log2(mainLoop_stepping);
4104   const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
4105   const int complexThreshold   = 2*mainLoop_stepping;
4106 
4107   // Don't test for len <= 0 here. This pathological case should not occur anyway.
4108   // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles
4109   // for all well-behaved cases. The situation itself is detected and handled correctly
4110   // within update_byteLoop_crc32.
4111   assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
4112 
4113   BLOCK_COMMENT("kernel_crc32_2word {");
4114 
4115   if (invertCRC) {
4116     nand(crc, crc, crc);                      // 1s complement of crc
4117   }
4118 
4119   // Check for short (<mainLoop_stepping) buffer.
4120   cmpdi(CCR0, len, complexThreshold);
4121   blt(CCR0, L_tail);
4122 
4123   // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
4124   // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
4125   {
4126     // Align buf addr to mainLoop_stepping boundary.
4127     neg(tmp2, buf);                           // Calculate # preLoop iterations for alignment.
4128     rldicl(tmp2, tmp2, 0, 64-log_stepping);   // Rotate tmp2 by 0 bits and clear all but the low log_stepping bits (#bytes to the next stepping boundary).
4129 
4130     if (complexThreshold > mainLoop_stepping) {
4131       sub(len, len, tmp2);                       // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4132     } else {
4133       sub(tmp, len, tmp2);                       // Remaining bytes for main loop.
4134       cmpdi(CCR0, tmp, mainLoop_stepping);
4135       blt(CCR0, L_tail);                         // For less than one mainloop_stepping left, do only tail processing
4136       mr(len, tmp);                              // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4137     }
4138     update_byteLoop_crc32(crc, buf, tmp2, table, data, false);
4139   }
4140 
4141   srdi(tmp2, len, log_stepping);                 // #iterations for mainLoop
4142   andi(len, len, mainLoop_stepping-1);           // remaining bytes for tailLoop
4143   mtctr(tmp2);
4144 
4145 #ifdef VM_LITTLE_ENDIAN
4146   Register crc_rv = crc;
4147 #else
4148   Register crc_rv = tmp;                         // Load_reverse needs separate registers to work on.
4149                                                  // Occupies tmp, but frees up crc.
4150   load_reverse_32(crc_rv, crc);                  // Reverse byte order because we are dealing with big-endian data.
4151   tmp = crc;
4152 #endif
4153 
4154   int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
4155 
4156   align(mainLoop_alignment);                     // Octoword-aligned loop address. Shows 2% improvement.
4157   BIND(L_mainLoop);
4158     update_1word_crc32(crc_rv, buf, table, 0, 0, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
4159     update_1word_crc32(crc_rv, buf, table, 4, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
4160     bdnz(L_mainLoop);
4161 
4162 #ifndef VM_LITTLE_ENDIAN
4163   load_reverse_32(crc, crc_rv);                  // Reverse byte order because we are dealing with big-endian data.
4164   tmp = crc_rv;                                  // tmp uses its original register again.
4165 #endif
4166 
4167   // Restore original table address for tailLoop.
4168   if (reconstructTableOffset != 0) {
4169     addi(table, table, -reconstructTableOffset);
4170   }
4171 
4172   // Process last few (<complexThreshold) bytes of buffer.
4173   BIND(L_tail);
4174   update_byteLoop_crc32(crc, buf, len, table, data, false);
4175 
4176   if (invertCRC) {
4177     nand(crc, crc, crc);                      // 1s complement of crc
4178   }
4179   BLOCK_COMMENT("} kernel_crc32_2word");
4180 }
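
// Overall structure of the kernel above, as a C sketch (comment only; byte_step and
// word_step stand for the updates emitted by update_byteLoop_crc32 and
// update_1word_crc32; buffer pre-alignment and the big-endian byte reversal are omitted):
//   uint32_t kernel_2word_ref(uint32_t crc, const uint8_t* buf, size_t len) {
//     crc = ~crc;                                                // if invertCRC
//     while (len >= 8) {
//       crc = word_step(crc, (const uint32_t*)buf);
//       crc = word_step(crc, (const uint32_t*)(buf + 4));
//       buf += 8; len -= 8;
//     }
//     while (len--) crc = byte_step(crc, *buf++);                // tail
//     return ~crc;                                               // if invertCRC
//   }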
4181 
4182 /**
4183  * @param crc   register containing existing CRC (32-bit)
4184  * @param buf   register pointing to input byte buffer (byte*)
4185  * @param len   register containing number of bytes
4186  * @param table register pointing to CRC table
4187  *
4188  * Uses R9..R12 as work registers. They must be saved/restored by the caller!
4189  */
4190 void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table,
4191                                         Register t0,  Register t1,  Register t2,  Register t3,
4192                                         Register tc0, Register tc1, Register tc2, Register tc3,
4193                                         bool invertCRC) {
4194   assert_different_registers(crc, buf, len, table);
4195 
4196   Label L_mainLoop, L_tail;
4197   Register  tmp          = t0;
4198   Register  data         = t0;
4199   Register  tmp2         = t1;
4200   const int mainLoop_stepping  = 4;
4201   const int tailLoop_stepping  = 1;
4202   const int log_stepping       = exact_log2(mainLoop_stepping);
4203   const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
4204   const int complexThreshold   = 2*mainLoop_stepping;
4205 
4206   // Don't test for len <= 0 here. This pathological case should not occur anyway.
4207   // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles
4208   // for all well-behaved cases. The situation itself is detected and handled correctly
4209   // within update_byteLoop_crc32.
4210   assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
4211 
4212   BLOCK_COMMENT("kernel_crc32_1word {");
4213 
4214   if (invertCRC) {
4215     nand(crc, crc, crc);                      // 1s complement of crc
4216   }
4217 
4218   // Check for short (<mainLoop_stepping) buffer.
4219   cmpdi(CCR0, len, complexThreshold);
4220   blt(CCR0, L_tail);
4221 
4222   // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
4223   // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
4224   {
4225     // Align buf addr to mainLoop_stepping boundary.
4226     neg(tmp2, buf);                              // Calculate # preLoop iterations for alignment.
4227     rldicl(tmp2, tmp2, 0, 64-log_stepping);      // Rotate tmp2 by 0 bits and clear all but the low log_stepping bits (#bytes to the next stepping boundary).
4228 
4229     if (complexThreshold > mainLoop_stepping) {
4230       sub(len, len, tmp2);                       // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4231     } else {
4232       sub(tmp, len, tmp2);                       // Remaining bytes for main loop.
4233       cmpdi(CCR0, tmp, mainLoop_stepping);
4234       blt(CCR0, L_tail);                         // For less than one mainloop_stepping left, do only tail processing
4235       mr(len, tmp);                              // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4236     }
4237     update_byteLoop_crc32(crc, buf, tmp2, table, data, false);
4238   }
4239 
4240   srdi(tmp2, len, log_stepping);                 // #iterations for mainLoop
4241   andi(len, len, mainLoop_stepping-1);           // remaining bytes for tailLoop
4242   mtctr(tmp2);
4243 
4244 #ifdef VM_LITTLE_ENDIAN
4245   Register crc_rv = crc;
4246 #else
4247   Register crc_rv = tmp;                         // Load_reverse needs separate registers to work on.
4248                                                  // Occupies tmp, but frees up crc.
4249   load_reverse_32(crc_rv, crc);                  // Reverse byte order because we are dealing with big-endian data.
4250   tmp = crc;
4251 #endif
4252 
4253   int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
4254 
4255   align(mainLoop_alignment);                     // Octoword-aligned loop address. Shows 2% improvement.
4256   BIND(L_mainLoop);
4257     update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
4258     bdnz(L_mainLoop);
4259 
4260 #ifndef VM_LITTLE_ENDIAN
4261   load_reverse_32(crc, crc_rv);                  // Reverse byte order because we are dealing with big-endian data.
4262   tmp = crc_rv;                                  // tmp uses its original register again.
4263 #endif
4264 
4265   // Restore original table address for tailLoop.
4266   if (reconstructTableOffset != 0) {
4267     addi(table, table, -reconstructTableOffset);
4268   }
4269 
4270   // Process last few (<complexThreshold) bytes of buffer.
4271   BIND(L_tail);
4272   update_byteLoop_crc32(crc, buf, len, table, data, false);
4273 
4274   if (invertCRC) {
4275     nand(crc, crc, crc);                      // 1s complement of crc
4276   }
4277   BLOCK_COMMENT("} kernel_crc32_1word");
4278 }
4279 
4280 /**
4281  * @param crc   register containing existing CRC (32-bit)
4282  * @param buf   register pointing to input byte buffer (byte*)
4283  * @param len   register containing number of bytes
4284  * @param table register pointing to CRC table
4285  *
4286  * Uses R7_ARG5, R8_ARG6 as work registers.
4287  */
4288 void MacroAssembler::kernel_crc32_1byte(Register crc, Register buf, Register len, Register table,
4289                                         Register t0,  Register t1,  Register t2,  Register t3,
4290                                         bool invertCRC) {
4291   assert_different_registers(crc, buf, len, table);
4292 
4293   Register  data = t0;                   // Holds the current byte to be folded into crc.
4294 
4295   BLOCK_COMMENT("kernel_crc32_1byte {");
4296 
4297   if (invertCRC) {
4298     nand(crc, crc, crc);                      // 1s complement of crc
4299   }
4300 
4301   // Process all bytes in a single-byte loop.
4302   update_byteLoop_crc32(crc, buf, len, table, data, true);
4303 
4304   if (invertCRC) {
4305     nand(crc, crc, crc);                      // 1s complement of crc
4306   }
4307   BLOCK_COMMENT("} kernel_crc32_1byte");
4308 }
4309 
4310 /**
4311  * @param crc             register containing existing CRC (32-bit)
4312  * @param buf             register pointing to input byte buffer (byte*)
4313  * @param len             register containing number of bytes
4314  * @param table           register pointing to CRC table
4315  * @param constants       register pointing to CRC table for 128-bit aligned memory
4316  * @param barretConstants register pointing to table for Barrett reduction
4317  * @param t0-t4           temp registers
4318  */
4319 void MacroAssembler::kernel_crc32_1word_vpmsum(Register crc, Register buf, Register len, Register table,
4320                                                Register constants, Register barretConstants,
4321                                                Register t0, Register t1, Register t2, Register t3, Register t4,
4322                                                bool invertCRC) {
4323   assert_different_registers(crc, buf, len, table);
4324 
4325   Label L_alignedHead, L_tail;
4326 
4327   BLOCK_COMMENT("kernel_crc32_1word_vpmsum {");
4328 
4329   // 1. ~c
4330   if (invertCRC) {
4331     nand(crc, crc, crc);                      // 1s complement of crc
4332   }
4333 
4334   // 2. use kernel_crc32_1word for short len
4335   clrldi(len, len, 32);
4336   cmpdi(CCR0, len, 512);
4337   blt(CCR0, L_tail);
4338 
4339   // 3. calculate from 0 to first aligned address
4340   const int alignment = 16;
4341   Register prealign = t0;
4342 
4343   andi_(prealign, buf, alignment - 1);
4344   beq(CCR0, L_alignedHead);
4345   subfic(prealign, prealign, alignment);
4346 
4347   subf(len, prealign, len);
4348   update_byteLoop_crc32(crc, buf, prealign, table, t2, false);
4349 
4350   // 4. calculate from first aligned address as far as possible
4351   BIND(L_alignedHead);
4352   kernel_crc32_1word_aligned(crc, buf, len, constants, barretConstants, t0, t1, t2, t3, t4);
4353 
4354   // 5. remaining bytes
4355   BIND(L_tail);
4356   Register tc0 = t4;
4357   Register tc1 = constants;
4358   Register tc2 = barretConstants;
4359   kernel_crc32_1word(crc, buf, len, table, t0, t1, t2, t3, tc0, tc1, tc2, table, false);
4360 
4361   // 6. ~c
4362   if (invertCRC) {
4363     nand(crc, crc, crc);                      // 1s complement of crc
4364   }
4365 
4366   BLOCK_COMMENT("} kernel_crc32_1word_vpmsum");
4367 }
4368 
4369 /**
4370  * @param crc             register containing existing CRC (32-bit)
4371  * @param buf             register pointing to input byte buffer (byte*)
4372  * @param len             register containing number of bytes (will get updated to remaining bytes)
4373  * @param constants       register pointing to CRC table for 128-bit aligned memory
4374  * @param barretConstants register pointing to table for Barrett reduction
4375  * @param t0-t4           temp registers
4376  * Precondition: len should be >= 512. Otherwise, nothing will be done.
4377  */
4378 void MacroAssembler::kernel_crc32_1word_aligned(Register crc, Register buf, Register len,
4379     Register constants, Register barretConstants,
4380     Register t0, Register t1, Register t2, Register t3, Register t4) {
4381 
4382   // Save non-volatile vector registers (frameless).
4383   Register offset = t1;
4384   int offsetInt = 0;
4385   offsetInt -= 16; li(offset, offsetInt); stvx(VR20, offset, R1_SP);
4386   offsetInt -= 16; li(offset, offsetInt); stvx(VR21, offset, R1_SP);
4387   offsetInt -= 16; li(offset, offsetInt); stvx(VR22, offset, R1_SP);
4388   offsetInt -= 16; li(offset, offsetInt); stvx(VR23, offset, R1_SP);
4389   offsetInt -= 16; li(offset, offsetInt); stvx(VR24, offset, R1_SP);
4390   offsetInt -= 16; li(offset, offsetInt); stvx(VR25, offset, R1_SP);
4391 #ifndef VM_LITTLE_ENDIAN
4392   offsetInt -= 16; li(offset, offsetInt); stvx(VR26, offset, R1_SP);
4393 #endif
4394   offsetInt -= 8; std(R14, offsetInt, R1_SP);
4395   offsetInt -= 8; std(R15, offsetInt, R1_SP);
4396   offsetInt -= 8; std(R16, offsetInt, R1_SP);
4397   offsetInt -= 8; std(R17, offsetInt, R1_SP);
4398 
4399   // The implementation uses an inner loop which processes between 256 and 16 * unroll_factor
4400   // bytes per iteration. The basic scheme is:
4401   // lvx: load vector (Big Endian needs byte reversal)
4402   // vpmsumw: carry-less 32 bit multiplications with a constant representing a large CRC shift
4403   // vxor: xor partial results together to get unroll_factor2 vectors
4404 
4405   // Outer loop performs the CRC shifts needed to combine the unroll_factor2 vectors.
4406 
4407   // Using 16 * unroll_factor / unroll_factor_2 bytes for constants.
4408   const int unroll_factor = 2048;
4409   const int unroll_factor2 = 8;
4410 
4411   // Support registers.
4412   Register offs[] = { noreg, t0, t1, t2, t3, t4, crc /* will live in VCRC */, R14 };
4413   Register num_bytes = R15,
4414            loop_count = R16,
4415            cur_const = R17;
4416   // Constant array for outer loop: unroll_factor2 - 1 registers,
4417   // Constant array for inner loop: unroll_factor / unroll_factor2 registers.
4418   VectorRegister consts0[] = { VR16, VR17, VR18, VR19, VR20, VR21, VR22 },
4419                  consts1[] = { VR23, VR24 };
4420   // Data register arrays: 2 arrays with unroll_factor2 registers.
4421   VectorRegister data0[] = { VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7 },
4422                  data1[] = { VR8, VR9, VR10, VR11, VR12, VR13, VR14, VR15 };
4423 
4424   VectorRegister VCRC = data0[0];
4425   VectorRegister Vc = VR25;
4426   VectorRegister swap_bytes = VR26; // Only for Big Endian.
4427 
4428   // We have at least 1 iteration (ensured by caller).
4429   Label L_outer_loop, L_inner_loop, L_last;
4430 
4431   // If supported, set the DSCR prefetch depth to deepest.
4432   if (VM_Version::has_mfdscr()) {
4433     load_const_optimized(t0, VM_Version::_dscr_val | 7);
4434     mtdscr(t0);
4435   }
4436 
4437   mtvrwz(VCRC, crc); // crc lives in VCRC now.
4438 
4439   for (int i = 1; i < unroll_factor2; ++i) {
4440     li(offs[i], 16 * i);
4441   }
4442 
4443   // Load consts for outer loop
4444   lvx(consts0[0], constants);
4445   for (int i = 1; i < unroll_factor2 - 1; ++i) {
4446     lvx(consts0[i], offs[i], constants);
4447   }
4448   addi(constants, constants, (unroll_factor2 - 1) * 16);
4449 
4450   load_const_optimized(num_bytes, 16 * unroll_factor);
4451   load_const_optimized(loop_count, unroll_factor / (2 * unroll_factor2) - 1); // One double-iteration peeled off.
4452 
4453   // Reuse data registers outside of the loop.
4454   VectorRegister Vtmp = data1[0];
4455   VectorRegister Vtmp2 = data1[1];
4456   VectorRegister zeroes = data1[2];
4457 
4458   vspltisb(Vtmp, 0);
4459   vsldoi(VCRC, Vtmp, VCRC, 8); // 96 bit zeroes, 32 bit CRC.
4460 
4461   // Load vector for vpermxor (to xor both 64 bit parts together)
4462   lvsl(Vtmp, buf);   // 000102030405060708090a0b0c0d0e0f
4463   vspltisb(Vc, 4);
4464   vsl(Vc, Vtmp, Vc); // 00102030405060708090a0b0c0d0e0f0
4465   xxspltd(Vc->to_vsr(), Vc->to_vsr(), 0);
4466   vor(Vc, Vtmp, Vc); // 001122334455667708192a3b4c5d6e7f
4467 
4468 #ifdef VM_LITTLE_ENDIAN
4469 #define BE_swap_bytes(x)
4470 #else
4471   vspltisb(Vtmp2, 0xf);
4472   vxor(swap_bytes, Vtmp, Vtmp2);
4473 #define BE_swap_bytes(x) vperm(x, x, x, swap_bytes)
4474 #endif
4475 
4476   cmpd(CCR0, len, num_bytes);
4477   blt(CCR0, L_last);
4478 
4479   // ********** Main loop start **********
4480   align(32);
4481   bind(L_outer_loop);
4482 
4483   // Begin of unrolled first iteration (no xor).
4484   lvx(data1[0], buf);
4485   mr(cur_const, constants);
4486   for (int i = 1; i < unroll_factor2 / 2; ++i) {
4487     lvx(data1[i], offs[i], buf);
4488   }
4489   vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
4490   lvx(consts1[0], cur_const);
4491   mtctr(loop_count);
4492   for (int i = 0; i < unroll_factor2 / 2; ++i) {
4493     BE_swap_bytes(data1[i]);
4494     if (i == 0) { vxor(data1[0], data1[0], VCRC); } // xor in previous CRC.
4495     lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf);
4496     vpmsumw(data0[i], data1[i], consts1[0]);
4497   }
4498   addi(buf, buf, 16 * unroll_factor2);
4499   subf(len, num_bytes, len);
4500   lvx(consts1[1], offs[1], cur_const);
4501   addi(cur_const, cur_const, 32);
4502   // Begin of unrolled second iteration (head).
4503   for (int i = 0; i < unroll_factor2 / 2; ++i) {
4504     BE_swap_bytes(data1[i + unroll_factor2 / 2]);
4505     if (i == 0) { lvx(data1[0], buf); } else { lvx(data1[i], offs[i], buf); }
4506     vpmsumw(data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[0]);
4507   }
4508   for (int i = 0; i < unroll_factor2 / 2; ++i) {
4509     BE_swap_bytes(data1[i]);
4510     lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf);
4511     vpmsumw(data1[i], data1[i], consts1[1]);
4512   }
4513   addi(buf, buf, 16 * unroll_factor2);
4514 
4515   // Generate the most performance-relevant code. Loads + half of the vpmsumw have been generated.
4516   // Double-iteration allows using the 2 constant registers alternately.
4517   align(32);
4518   bind(L_inner_loop);
4519   for (int j = 1; j < 3; ++j) { // j < unroll_factor / unroll_factor2 - 1 for complete unrolling.
4520     if (j & 1) {
4521       lvx(consts1[0], cur_const);
4522     } else {
4523       lvx(consts1[1], offs[1], cur_const);
4524       addi(cur_const, cur_const, 32);
4525     }
4526     for (int i = 0; i < unroll_factor2; ++i) {
4527       int idx = i + unroll_factor2 / 2, inc = 0; // For modulo-scheduled input.
4528       if (idx >= unroll_factor2) { idx -= unroll_factor2; inc = 1; }
4529       BE_swap_bytes(data1[idx]);
4530       vxor(data0[i], data0[i], data1[i]);
4531       if (i == 0) lvx(data1[0], buf); else lvx(data1[i], offs[i], buf);
4532       vpmsumw(data1[idx], data1[idx], consts1[(j + inc) & 1]);
4533     }
4534     addi(buf, buf, 16 * unroll_factor2);
4535   }
4536   bdnz(L_inner_loop);
4537 
4538   // Tail of last iteration (no loads).
4539   for (int i = 0; i < unroll_factor2 / 2; ++i) {
4540     BE_swap_bytes(data1[i + unroll_factor2 / 2]);
4541     vxor(data0[i], data0[i], data1[i]);
4542     vpmsumw(data1[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[1]);
4543   }
4544   for (int i = 0; i < unroll_factor2 / 2; ++i) {
4545     vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]); // First half of fixup shifts.
4546     vxor(data0[i + unroll_factor2 / 2], data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2]);
4547   }
4548 
4549   // Last data register is ok, other ones need fixup shift.
4550   for (int i = unroll_factor2 / 2; i < unroll_factor2 - 1; ++i) {
4551     vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]);
4552   }
4553 
4554   // Combine to 128 bit result vector VCRC = data0[0].
4555   for (int i = 1; i < unroll_factor2; i<<=1) {
4556     for (int j = 0; j <= unroll_factor2 - 2*i; j+=2*i) {
4557       vxor(data0[j], data0[j], data0[j+i]);
4558     }
4559   }
4560   cmpd(CCR0, len, num_bytes);
4561   bge(CCR0, L_outer_loop);
4562 
4563   // Last chance with lower num_bytes.
4564   bind(L_last);
4565   srdi(loop_count, len, exact_log2(16 * 2 * unroll_factor2)); // Use double-iterations.
4566   add_const_optimized(constants, constants, 16 * (unroll_factor / unroll_factor2)); // Point behind last one.
4567   sldi(R0, loop_count, exact_log2(16 * 2)); // Bytes of constants to be used.
4568   clrrdi(num_bytes, len, exact_log2(16 * 2 * unroll_factor2));
4569   subf(constants, R0, constants); // Point to constant to be used first.
4570 
4571   addic_(loop_count, loop_count, -1); // One double-iteration peeled off.
4572   bgt(CCR0, L_outer_loop);
4573   // ********** Main loop end **********
4574 #undef BE_swap_bytes
4575 
4576   // Restore DSCR pre-fetch value.
4577   if (VM_Version::has_mfdscr()) {
4578     load_const_optimized(t0, VM_Version::_dscr_val);
4579     mtdscr(t0);
4580   }
4581 
4582   vspltisb(zeroes, 0);
4583 
4584   // Combine to 64 bit result.
4585   vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
4586 
4587   // Reduce to 32 bit CRC: Remainder by multiply-high.
4588   lvx(Vtmp, barretConstants);
4589   vsldoi(Vtmp2, zeroes, VCRC, 12);  // Extract high 32 bit.
4590   vpmsumd(Vtmp2, Vtmp2, Vtmp);      // Multiply by inverse long poly.
4591   vsldoi(Vtmp2, zeroes, Vtmp2, 12); // Extract high 32 bit.
4592   vsldoi(Vtmp, zeroes, Vtmp, 8);
4593   vpmsumd(Vtmp2, Vtmp2, Vtmp);      // Multiply quotient by long poly.
4594   vxor(VCRC, VCRC, Vtmp2);          // Remainder fits into 32 bit.
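  // The sequence above is a Barrett-style reduction (high-level sketch only): a
  // carry-less multiply of the upper bits by a pre-computed inverse of the CRC
  // polynomial yields a quotient estimate; multiplying that quotient back by the
  // polynomial and xor-ing with the folded value leaves the remainder, i.e. the
  // 32-bit CRC.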
4595 
4596   // Move result. len is already updated.
4597   vsldoi(VCRC, VCRC, zeroes, 8);
4598   mfvrd(crc, VCRC);
4599 
4600   // Restore non-volatile Vector registers (frameless).
4601   offsetInt = 0;
4602   offsetInt -= 16; li(offset, offsetInt); lvx(VR20, offset, R1_SP);
4603   offsetInt -= 16; li(offset, offsetInt); lvx(VR21, offset, R1_SP);
4604   offsetInt -= 16; li(offset, offsetInt); lvx(VR22, offset, R1_SP);
4605   offsetInt -= 16; li(offset, offsetInt); lvx(VR23, offset, R1_SP);
4606   offsetInt -= 16; li(offset, offsetInt); lvx(VR24, offset, R1_SP);
4607   offsetInt -= 16; li(offset, offsetInt); lvx(VR25, offset, R1_SP);
4608 #ifndef VM_LITTLE_ENDIAN
4609   offsetInt -= 16; li(offset, offsetInt); lvx(VR26, offset, R1_SP);
4610 #endif
4611   offsetInt -= 8;  ld(R14, offsetInt, R1_SP);
4612   offsetInt -= 8;  ld(R15, offsetInt, R1_SP);
4613   offsetInt -= 8;  ld(R16, offsetInt, R1_SP);
4614   offsetInt -= 8;  ld(R17, offsetInt, R1_SP);
4615 }
4616 
4617 void MacroAssembler::kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp, bool invertCRC) {
4618   assert_different_registers(crc, buf, /* len,  not used!! */ table, tmp);
4619 
4620   BLOCK_COMMENT("kernel_crc32_singleByte:");
4621   if (invertCRC) {
4622     nand(crc, crc, crc);                // 1s complement of crc
4623   }
4624 
4625   lbz(tmp, 0, buf);                     // Byte from buffer, zero-extended.
4626   update_byte_crc32(crc, tmp, table);
4627 
4628   if (invertCRC) {
4629     nand(crc, crc, crc);                // 1s complement of crc
4630   }
4631 }
4632 
4633 void MacroAssembler::kernel_crc32_singleByteReg(Register crc, Register val, Register table, bool invertCRC) {
4634   assert_different_registers(crc, val, table);
4635 
4636   BLOCK_COMMENT("kernel_crc32_singleByteReg:");
4637   if (invertCRC) {
4638     nand(crc, crc, crc);                // 1s complement of crc
4639   }
4640 
4641   update_byte_crc32(crc, val, table);
4642 
4643   if (invertCRC) {
4644     nand(crc, crc, crc);                // 1s complement of crc
4645   }
4646 }
4647 
4648 // dest_lo += src1 + src2
4649 // dest_hi += carry1 + carry2
4650 void MacroAssembler::add2_with_carry(Register dest_hi,
4651                                      Register dest_lo,
4652                                      Register src1, Register src2) {
4653   li(R0, 0);
4654   addc(dest_lo, dest_lo, src1);
4655   adde(dest_hi, dest_hi, R0);
4656   addc(dest_lo, dest_lo, src2);
4657   adde(dest_hi, dest_hi, R0);
4658 }
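
// Reference sketch in C (comment only): the four instructions above perform a 128-bit
// accumulation, equivalent to (assuming compiler __int128 support; names match the
// register arguments):
//   unsigned __int128 acc = ((unsigned __int128)dest_hi << 64) | dest_lo;
//   acc += src1;
//   acc += src2;
//   dest_lo = (uint64_t)acc;
//   dest_hi = (uint64_t)(acc >> 64);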
4659 
4660 // Multiply 64 bit by 64 bit first loop.
4661 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart,
4662                                            Register x_xstart,
4663                                            Register y, Register y_idx,
4664                                            Register z,
4665                                            Register carry,
4666                                            Register product_high, Register product,
4667                                            Register idx, Register kdx,
4668                                            Register tmp) {
4669   //  jlong carry, x[], y[], z[];
4670   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
4671   //    huge_128 product = y[idx] * x[xstart] + carry;
4672   //    z[kdx] = (jlong)product;
4673   //    carry  = (jlong)(product >>> 64);
4674   //  }
4675   //  z[xstart] = carry;
4676 
4677   Label L_first_loop, L_first_loop_exit;
4678   Label L_one_x, L_one_y, L_multiply;
4679 
4680   addic_(xstart, xstart, -1);
4681   blt(CCR0, L_one_x);   // Special case: length of x is 1.
4682 
4683   // Load next two integers of x.
4684   sldi(tmp, xstart, LogBytesPerInt);
4685   ldx(x_xstart, x, tmp);
4686 #ifdef VM_LITTLE_ENDIAN
4687   rldicl(x_xstart, x_xstart, 32, 0);
4688 #endif
4689 
4690   align(32, 16);
4691   bind(L_first_loop);
4692 
4693   cmpdi(CCR0, idx, 1);
4694   blt(CCR0, L_first_loop_exit);
4695   addi(idx, idx, -2);
4696   beq(CCR0, L_one_y);
4697 
4698   // Load next two integers of y.
4699   sldi(tmp, idx, LogBytesPerInt);
4700   ldx(y_idx, y, tmp);
4701 #ifdef VM_LITTLE_ENDIAN
4702   rldicl(y_idx, y_idx, 32, 0);
4703 #endif
4704 
4705 
4706   bind(L_multiply);
4707   multiply64(product_high, product, x_xstart, y_idx);
4708 
4709   li(tmp, 0);
4710   addc(product, product, carry);         // Add carry to result.
4711   adde(product_high, product_high, tmp); // Add carry of the last addition.
4712   addi(kdx, kdx, -2);
4713 
4714   // Store result.
4715 #ifdef VM_LITTLE_ENDIAN
4716   rldicl(product, product, 32, 0);
4717 #endif
4718   sldi(tmp, kdx, LogBytesPerInt);
4719   stdx(product, z, tmp);
4720   mr_if_needed(carry, product_high);
4721   b(L_first_loop);
4722 
4723 
4724   bind(L_one_y); // Load one 32 bit portion of y as (0,value).
4725 
4726   lwz(y_idx, 0, y);
4727   b(L_multiply);
4728 
4729 
4730   bind(L_one_x); // Load one 32 bit portion of x as (0,value).
4731 
4732   lwz(x_xstart, 0, x);
4733   b(L_first_loop);
4734 
4735   bind(L_first_loop_exit);
4736 }
4737 
4738 // Multiply 64 bit by 64 bit and add 128 bit.
4739 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y,
4740                                             Register z, Register yz_idx,
4741                                             Register idx, Register carry,
4742                                             Register product_high, Register product,
4743                                             Register tmp, int offset) {
4744 
4745   //  huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
4746   //  z[kdx] = (jlong)product;
4747 
4748   sldi(tmp, idx, LogBytesPerInt);
4749   if (offset) {
4750     addi(tmp, tmp, offset);
4751   }
4752   ldx(yz_idx, y, tmp);
4753 #ifdef VM_LITTLE_ENDIAN
4754   rldicl(yz_idx, yz_idx, 32, 0);
4755 #endif
4756 
4757   multiply64(product_high, product, x_xstart, yz_idx);
4758   ldx(yz_idx, z, tmp);
4759 #ifdef VM_LITTLE_ENDIAN
4760   rldicl(yz_idx, yz_idx, 32, 0);
4761 #endif
4762 
4763   add2_with_carry(product_high, product, carry, yz_idx);
4764 
4765   sldi(tmp, idx, LogBytesPerInt);
4766   if (offset) {
4767     addi(tmp, tmp, offset);
4768   }
4769 #ifdef VM_LITTLE_ENDIAN
4770   rldicl(product, product, 32, 0);
4771 #endif
4772   stdx(product, z, tmp);
4773 }
4774 
4775 // Multiply 128 bit by 128 bit. Unrolled inner loop.
4776 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart,
4777                                              Register y, Register z,
4778                                              Register yz_idx, Register idx, Register carry,
4779                                              Register product_high, Register product,
4780                                              Register carry2, Register tmp) {
4781 
4782   //  jlong carry, x[], y[], z[];
4783   //  int kdx = ystart+1;
4784   //  for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
4785   //    huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
4786   //    z[kdx+idx+1] = (jlong)product;
4787   //    jlong carry2 = (jlong)(product >>> 64);
4788   //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
4789   //    z[kdx+idx] = (jlong)product;
4790   //    carry = (jlong)(product >>> 64);
4791   //  }
4792   //  idx += 2;
4793   //  if (idx > 0) {
4794   //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
4795   //    z[kdx+idx] = (jlong)product;
4796   //    carry = (jlong)(product >>> 64);
4797   //  }
4798 
4799   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
4800   const Register jdx = R0;
4801 
4802   // Scale the index.
4803   srdi_(jdx, idx, 2);
4804   beq(CCR0, L_third_loop_exit);
4805   mtctr(jdx);
4806 
4807   align(32, 16);
4808   bind(L_third_loop);
4809 
4810   addi(idx, idx, -4);
4811 
4812   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8);
4813   mr_if_needed(carry2, product_high);
4814 
4815   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0);
4816   mr_if_needed(carry, product_high);
4817   bdnz(L_third_loop);
4818 
4819   bind(L_third_loop_exit);  // Handle any left-over operand parts.
4820 
4821   andi_(idx, idx, 0x3);
4822   beq(CCR0, L_post_third_loop_done);
4823 
4824   Label L_check_1;
4825 
4826   addic_(idx, idx, -2);
4827   blt(CCR0, L_check_1);
4828 
4829   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0);
4830   mr_if_needed(carry, product_high);
4831 
4832   bind(L_check_1);
4833 
4834   addi(idx, idx, 0x2);
4835   andi_(idx, idx, 0x1);
4836   addic_(idx, idx, -1);
4837   blt(CCR0, L_post_third_loop_done);
4838 
4839   sldi(tmp, idx, LogBytesPerInt);
4840   lwzx(yz_idx, y, tmp);
4841   multiply64(product_high, product, x_xstart, yz_idx);
4842   lwzx(yz_idx, z, tmp);
4843 
4844   add2_with_carry(product_high, product, yz_idx, carry);
4845 
4846   sldi(tmp, idx, LogBytesPerInt);
4847   stwx(product, z, tmp);
4848   srdi(product, product, 32);
4849 
4850   sldi(product_high, product_high, 32);
4851   orr(product, product, product_high);
4852   mr_if_needed(carry, product);
4853 
4854   bind(L_post_third_loop_done);
4855 }   // multiply_128_x_128_loop
4856 
4857 void MacroAssembler::muladd(Register out, Register in,
4858                             Register offset, Register len, Register k,
4859                             Register tmp1, Register tmp2, Register carry) {
4860 
4861   // Labels
4862   Label LOOP, SKIP;
4863 
4864   // Make sure length is positive.
4865   cmpdi  (CCR0,    len,     0);
4866 
4867   // Prepare variables
4868   subi   (offset,  offset,  4);
4869   li     (carry,   0);
4870   ble    (CCR0,    SKIP);
4871 
4872   mtctr  (len);
4873   subi   (len,     len,     1    );
4874   sldi   (len,     len,     2    );
4875 
4876   // Main loop
4877   bind(LOOP);
4878   lwzx   (tmp1,    len,     in   );
4879   lwzx   (tmp2,    offset,  out  );
4880   mulld  (tmp1,    tmp1,    k    );
4881   add    (tmp2,    carry,   tmp2 );
4882   add    (tmp2,    tmp1,    tmp2 );
4883   stwx   (tmp2,    offset,  out  );
4884   srdi   (carry,   tmp2,    32   );
4885   subi   (offset,  offset,  4    );
4886   subi   (len,     len,     4    );
4887   bdnz   (LOOP);
4888   bind(SKIP);
4889 }
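
// Reference sketch in C (comment only) of what the loop above computes, treating in
// and out as arrays of 32-bit words with offset/len given in elements (the assembly
// uses byte offsets) and assuming offset points one past the last out word to update:
//   uint64_t carry = 0;
//   for (int j = len - 1; j >= 0; j--) {
//     uint64_t product = (uint64_t)in[j] * (uint32_t)k + out[offset - 1] + carry;
//     out[--offset] = (uint32_t)product;
//     carry = product >> 32;
//   }
//   // the carry register holds (uint32_t)carry on exit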
4890 
4891 void MacroAssembler::multiply_to_len(Register x, Register xlen,
4892                                      Register y, Register ylen,
4893                                      Register z, Register zlen,
4894                                      Register tmp1, Register tmp2,
4895                                      Register tmp3, Register tmp4,
4896                                      Register tmp5, Register tmp6,
4897                                      Register tmp7, Register tmp8,
4898                                      Register tmp9, Register tmp10,
4899                                      Register tmp11, Register tmp12,
4900                                      Register tmp13) {
4901 
4902   ShortBranchVerifier sbv(this);
4903 
4904   assert_different_registers(x, xlen, y, ylen, z, zlen,
4905                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
4906   assert_different_registers(x, xlen, y, ylen, z, zlen,
4907                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp7);
4908   assert_different_registers(x, xlen, y, ylen, z, zlen,
4909                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp8);
4910 
4911   const Register idx = tmp1;
4912   const Register kdx = tmp2;
4913   const Register xstart = tmp3;
4914 
4915   const Register y_idx = tmp4;
4916   const Register carry = tmp5;
4917   const Register product = tmp6;
4918   const Register product_high = tmp7;
4919   const Register x_xstart = tmp8;
4920   const Register tmp = tmp9;
4921 
4922   // First Loop.
4923   //
4924   //  final static long LONG_MASK = 0xffffffffL;
4925   //  int xstart = xlen - 1;
4926   //  int ystart = ylen - 1;
4927   //  long carry = 0;
4928   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
4929   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
4930   //    z[kdx] = (int)product;
4931   //    carry = product >>> 32;
4932   //  }
4933   //  z[xstart] = (int)carry;
4934 
4935   mr_if_needed(idx, ylen);        // idx = ylen
4936   mr_if_needed(kdx, zlen);        // kdx = xlen + ylen
4937   li(carry, 0);                   // carry = 0
4938 
4939   Label L_done;
4940 
4941   addic_(xstart, xlen, -1);
4942   blt(CCR0, L_done);
4943 
4944   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z,
4945                         carry, product_high, product, idx, kdx, tmp);
4946 
4947   Label L_second_loop;
4948 
4949   cmpdi(CCR0, kdx, 0);
4950   beq(CCR0, L_second_loop);
4951 
4952   Label L_carry;
4953 
4954   addic_(kdx, kdx, -1);
4955   beq(CCR0, L_carry);
4956 
4957   // Store lower 32 bits of carry.
4958   sldi(tmp, kdx, LogBytesPerInt);
4959   stwx(carry, z, tmp);
4960   srdi(carry, carry, 32);
4961   addi(kdx, kdx, -1);
4962 
4963 
4964   bind(L_carry);
4965 
4966   // Store upper 32 bits of carry.
4967   sldi(tmp, kdx, LogBytesPerInt);
4968   stwx(carry, z, tmp);
4969 
4970   // Second and third (nested) loops.
4971   //
4972   //  for (int i = xstart-1; i >= 0; i--) { // Second loop
4973   //    carry = 0;
4974   //    for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
4975   //      long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
4976   //                     (z[k] & LONG_MASK) + carry;
4977   //      z[k] = (int)product;
4978   //      carry = product >>> 32;
4979   //    }
4980   //    z[i] = (int)carry;
4981   //  }
4982   //
4983   //  i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx
4984 
4985   bind(L_second_loop);
4986 
4987   li(carry, 0);                   // carry = 0;
4988 
4989   addic_(xstart, xstart, -1);     // i = xstart-1;
4990   blt(CCR0, L_done);
4991 
4992   Register zsave = tmp10;
4993 
4994   mr(zsave, z);
4995 
4996 
4997   Label L_last_x;
4998 
4999   sldi(tmp, xstart, LogBytesPerInt);
5000   add(z, z, tmp);                 // z = z + k - j
5001   addi(z, z, 4);
5002   addic_(xstart, xstart, -1);     // i = xstart-1;
5003   blt(CCR0, L_last_x);
5004 
5005   sldi(tmp, xstart, LogBytesPerInt);
5006   ldx(x_xstart, x, tmp);
5007 #ifdef VM_LITTLE_ENDIAN
5008   rldicl(x_xstart, x_xstart, 32, 0);
5009 #endif
5010 
5011 
5012   Label L_third_loop_prologue;
5013 
5014   bind(L_third_loop_prologue);
5015 
5016   Register xsave = tmp11;
5017   Register xlensave = tmp12;
5018   Register ylensave = tmp13;
5019 
5020   mr(xsave, x);
5021   mr(xlensave, xstart);
5022   mr(ylensave, ylen);
5023 
5024 
5025   multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen,
5026                           carry, product_high, product, x, tmp);
5027 
5028   mr(z, zsave);
5029   mr(x, xsave);
5030   mr(xlen, xlensave);   // This is the decrement of the loop counter!
5031   mr(ylen, ylensave);
5032 
5033   addi(tmp3, xlen, 1);
5034   sldi(tmp, tmp3, LogBytesPerInt);
5035   stwx(carry, z, tmp);
5036   addic_(tmp3, tmp3, -1);
5037   blt(CCR0, L_done);
5038 
5039   srdi(carry, carry, 32);
5040   sldi(tmp, tmp3, LogBytesPerInt);
5041   stwx(carry, z, tmp);
5042   b(L_second_loop);
5043 
5044   // Infrequently executed code is moved outside the loops.
5045   bind(L_last_x);
5046 
5047   lwz(x_xstart, 0, x);
5048   b(L_third_loop_prologue);
5049 
5050   bind(L_done);
5051 }   // multiply_to_len
5052 
5053 void MacroAssembler::asm_assert(bool check_equal, const char *msg, int id) {
5054 #ifdef ASSERT
5055   Label ok;
5056   if (check_equal) {
5057     beq(CCR0, ok);
5058   } else {
5059     bne(CCR0, ok);
5060   }
5061   stop(msg, id);
5062   bind(ok);
5063 #endif
5064 }
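
// Typical (hypothetical) use: emit a compare that sets CCR0, then assert on it, e.g.
//   cmpdi(CCR0, reg, 0);
//   asm_assert(true, "reg must be zero", 0x1234);
// stops the VM with msg if reg is not zero. Without ASSERT the call emits no code.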
5065 
5066 void MacroAssembler::asm_assert_mems_zero(bool check_equal, int size, int mem_offset,
5067                                           Register mem_base, const char* msg, int id) {
5068 #ifdef ASSERT
5069   switch (size) {
5070     case 4:
5071       lwz(R0, mem_offset, mem_base);
5072       cmpwi(CCR0, R0, 0);
5073       break;
5074     case 8:
5075       ld(R0, mem_offset, mem_base);
5076       cmpdi(CCR0, R0, 0);
5077       break;
5078     default:
5079       ShouldNotReachHere();
5080   }
5081   asm_assert(check_equal, msg, id);
5082 #endif // ASSERT
5083 }
5084 
5085 void MacroAssembler::verify_thread() {
5086   if (VerifyThread) {
5087     unimplemented("'VerifyThread' currently not implemented on PPC");
5088   }
5089 }
5090 
5091 // Reads: oop. Kills: R0, and possibly the volatile floating point registers.
5092 void MacroAssembler::verify_oop(Register oop, const char* msg) {
5093   if (!VerifyOops) {
5094     return;
5095   }
5096 
5097   address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
5098   const Register tmp = R11; // Will be preserved.
5099   const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
5100   save_volatile_gprs(R1_SP, -nbytes_save); // except R0
5101 
5102   mr_if_needed(R4_ARG2, oop);
5103   save_LR_CR(tmp); // save in old frame
5104   push_frame_reg_args(nbytes_save, tmp);
5105   // load FunctionDescriptor** / entry_address *
5106   load_const_optimized(tmp, fd, R0);
5107   // load FunctionDescriptor* / entry_address
5108   ld(tmp, 0, tmp);
5109   load_const_optimized(R3_ARG1, (address)msg, R0);
5110   // Call destination for its side effect.
5111   call_c(tmp);
5112 
5113   pop_frame();
5114   restore_LR_CR(tmp);
5115   restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
5116 }
5117 
5118 void MacroAssembler::verify_oop_addr(RegisterOrConstant offs, Register base, const char* msg) {
5119   if (!VerifyOops) {
5120     return;
5121   }
5122 
5123   address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
5124   const Register tmp = R11; // Will be preserved.
5125   const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
5126   save_volatile_gprs(R1_SP, -nbytes_save); // except R0
5127 
5128   ld(R4_ARG2, offs, base);
5129   save_LR_CR(tmp); // save in old frame
5130   push_frame_reg_args(nbytes_save, tmp);
5131   // load FunctionDescriptor** / entry_address *
5132   load_const_optimized(tmp, fd, R0);
5133   // load FunctionDescriptor* / entry_address
5134   ld(tmp, 0, tmp);
5135   load_const_optimized(R3_ARG1, (address)msg, R0);
5136   // Call destination for its side effect.
5137   call_c(tmp);
5138 
5139   pop_frame();
5140   restore_LR_CR(tmp);
5141   restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
5142 }
5143 
5144 const char* stop_types[] = {
5145   "stop",
5146   "untested",
5147   "unimplemented",
5148   "shouldnotreachhere"
5149 };
5150 
5151 static void stop_on_request(int tp, const char* msg) {
5152   tty->print("PPC assembly code requires stop: (%s) %s\n", stop_types[tp%/*stop_end*/4], msg);
5153   guarantee(false, "PPC assembly code requires stop: %s", msg);
5154 }
5155 
5156 // Call a C-function that prints output.
5157 void MacroAssembler::stop(int type, const char* msg, int id) {
5158 #ifndef PRODUCT
5159   block_comment(err_msg("stop: %s %s {", stop_types[type%stop_end], msg));
5160 #else
5161   block_comment("stop {");
5162 #endif
5163 
5164   // setup arguments
5165   load_const_optimized(R3_ARG1, type);
5166   load_const_optimized(R4_ARG2, (void *)msg, /*tmp=*/R0);
5167   call_VM_leaf(CAST_FROM_FN_PTR(address, stop_on_request), R3_ARG1, R4_ARG2);
5168   illtrap();
5169   emit_int32(id);
5170   block_comment("} stop;");
5171 }
5172 
5173 #ifndef PRODUCT
5174 // Write pattern 0x0101010101010101 in memory region [low-before, high+after].
5175 // Val, addr are temp registers.
5176 // If low == addr, addr is killed.
5177 // High is preserved.
5178 void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) {
5179   if (!ZapMemory) return;
5180 
5181   assert_different_registers(low, val);
5182 
5183   BLOCK_COMMENT("zap memory region {");
5184   load_const_optimized(val, 0x0101010101010101);
5185   int size = before + after;
5186   if (low == high && size < 5 && size > 0) {
5187     int offset = -before*BytesPerWord;
5188     for (int i = 0; i < size; ++i) {
5189       std(val, offset, low);
5190       offset += (1*BytesPerWord);
5191     }
5192   } else {
5193     addi(addr, low, -before*BytesPerWord);
5194     assert_different_registers(high, val);
5195     if (after) addi(high, high, after * BytesPerWord);
5196     Label loop;
5197     bind(loop);
5198     std(val, 0, addr);
5199     addi(addr, addr, 8);
5200     cmpd(CCR6, addr, high);
5201     ble(CCR6, loop);
5202     if (after) addi(high, high, -after * BytesPerWord);  // Correct back to old value.
5203   }
5204   BLOCK_COMMENT("} zap memory region");
5205 }
5206 
5207 #endif // !PRODUCT
5208 
5209 void SkipIfEqualZero::skip_to_label_if_equal_zero(MacroAssembler* masm, Register temp,
5210                                                   const bool* flag_addr, Label& label) {
5211   int simm16_offset = masm->load_const_optimized(temp, (address)flag_addr, R0, true);
5212   assert(sizeof(bool) == 1, "PowerPC ABI");
5213   masm->lbz(temp, simm16_offset, temp);
5214   masm->cmpwi(CCR0, temp, 0);
5215   masm->beq(CCR0, label);
5216 }
5217 
5218 SkipIfEqualZero::SkipIfEqualZero(MacroAssembler* masm, Register temp, const bool* flag_addr) : _masm(masm), _label() {
5219   skip_to_label_if_equal_zero(masm, temp, flag_addr, _label);
5220 }
5221 
5222 SkipIfEqualZero::~SkipIfEqualZero() {
5223   _masm->bind(_label);
5224 }
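
// Typical (hypothetical) use of the RAII helper above: the code emitted while the
// guard object is live is skipped at run time whenever *flag_addr reads as false:
//   {
//     SkipIfEqualZero skip_if_off(masm, temp_reg, &SomeDiagnosticFlag);
//     // ... emit code that should only execute when SomeDiagnosticFlag is true ...
//   } // the destructor binds the skip target here
// temp_reg and SomeDiagnosticFlag are placeholder names for illustration.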