rev 53441 : 8217459: [PPC64] Cleanup non-vector version of CRC32
Reviewed-by:

   1 /*
   2  * Copyright (c) 1997, 2019, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2012, 2019, SAP SE. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.inline.hpp"
  28 #include "compiler/disassembler.hpp"
  29 #include "gc/shared/collectedHeap.inline.hpp"
  30 #include "gc/shared/barrierSet.hpp"
  31 #include "gc/shared/barrierSetAssembler.hpp"
  32 #include "interpreter/interpreter.hpp"
  33 #include "memory/resourceArea.hpp"
  34 #include "nativeInst_ppc.hpp"
  35 #include "prims/methodHandles.hpp"
  36 #include "runtime/biasedLocking.hpp"
  37 #include "runtime/icache.hpp"
  38 #include "runtime/interfaceSupport.inline.hpp"
  39 #include "runtime/objectMonitor.hpp"
  40 #include "runtime/os.hpp"
  41 #include "runtime/safepoint.hpp"
  42 #include "runtime/safepointMechanism.hpp"
  43 #include "runtime/sharedRuntime.hpp"
  44 #include "runtime/stubRoutines.hpp"
  45 #include "utilities/macros.hpp"
  46 #ifdef COMPILER2
  47 #include "opto/intrinsicnode.hpp"
  48 #endif
  49 
      // BLOCK_COMMENT(str) forwards to block_comment(str) in non-PRODUCT builds
      // and expands to nothing in PRODUCT builds.
   50 #ifdef PRODUCT
   51 #define BLOCK_COMMENT(str) // nothing
   52 #else
   53 #define BLOCK_COMMENT(str) block_comment(str)
   54 #endif
      // BIND(label) binds the label and then emits the label's name followed by
      // ":" as a block comment (two statements, so do not use it as the sole body
      // of an unbraced if/else).
   55 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  56 
   57 #ifdef ASSERT
   58 // On RISC, there's no benefit to verifying instruction boundaries.
      // The platform hook therefore unconditionally returns false. It is only
      // compiled in ASSERT builds.
   59 bool AbstractAssembler::pd_check_instruction_mark() { return false; }
   60 #endif
  61 
      // Emit a doubleword load d = [a + si31] for a non-negative offset of up to
      // 31 bits. Unlike ld_largeoffset, no check is made that d and a differ.
      //
      // - If si31 fits into a signed 16 bit immediate, a single ld is emitted,
      //   followed by a nop when emit_filler_nop is non-zero. With the filler both
      //   paths emit exactly two instructions.
      // - Otherwise the offset is split into a high and a low 16 bit part: addis
      //   adds the high part to a (result in d), ld applies the low part.
   62 void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) {
   63   assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range");
   64   if (Assembler::is_simm(si31, 16)) {
   65     ld(d, si31, a);
   66     if (emit_filler_nop) nop();
   67   } else {
   68     const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31);
   69     const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31);
          // d serves as the intermediate base register for the final load.
   70     addis(d, a, hi);
   71     ld(d, lo, d);
   72   }
   73 }
  74 
      // Checked variant of ld_largeoffset_unchecked: asserts that the destination
      // register d and the base register a are different, then emits the same
      // load sequence.
   75 void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) {
   76   assert_different_registers(d, a);
   77   ld_largeoffset_unchecked(d, si31, a, emit_filler_nop);
   78 }
  79 
  80 void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base,
  81                                       size_t size_in_bytes, bool is_signed) {
  82   switch (size_in_bytes) {
  83   case  8:              ld(dst, offs, base);                         break;
  84   case  4:  is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break;
  85   case  2:  is_signed ? lha(dst, offs, base) : lhz(dst, offs, base); break;
  86   case  1:  lbz(dst, offs, base); if (is_signed) extsb(dst, dst);    break; // lba doesn't exist :(
  87   default:  ShouldNotReachHere();
  88   }
  89 }
  90 
  91 void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base,
  92                                        size_t size_in_bytes) {
  93   switch (size_in_bytes) {
  94   case  8:  std(dst, offs, base); break;
  95   case  4:  stw(dst, offs, base); break;
  96   case  2:  sth(dst, offs, base); break;
  97   case  1:  stb(dst, offs, base); break;
  98   default:  ShouldNotReachHere();
  99   }
 100 }
 101 
 102 void MacroAssembler::align(int modulus, int max, int rem) {
 103   int padding = (rem + modulus - (offset() % modulus)) % modulus;
 104   if (padding > max) return;
 105   for (int c = (padding >> 2); c > 0; --c) { nop(); }
 106 }
 107 
  108 // Issue instructions that calculate given TOC from global TOC.
      //
      // Emits up to two instructions that compute addr as R29_TOC + offset:
      //   hi16: addis dst, R29_TOC, offset.hi
      //   lo16: addi  dst, dst,     offset.lo
      // Each half is emitted only if the corresponding flag is set.
      //
      // The encoded offset is
      //   -128 (a dummy value)        if emit_dummy_addr is set,
      //   offset_to_global_toc(addr)  if addr is not (address)-1,
      //   -1                          otherwise.
      // If add_relocation is set and the addi is emitted, an internal_word
      // relocation for addr is attached to the addi.
  109 void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16,
  110                                                        bool add_relocation, bool emit_dummy_addr) {
  111   int offset = -1;
  112   if (emit_dummy_addr) {
  113     offset = -128; // dummy address
  114   } else if (addr != (address)(intptr_t)-1) {
  115     offset = MacroAssembler::offset_to_global_toc(addr);
  116   }
  117
  118   if (hi16) {
  119     addis(dst, R29_TOC, MacroAssembler::largeoffset_si16_si16_hi(offset));
  120   }
  121   if (lo16) {
  122     if (add_relocation) {
  123       // Relocate at the addi to avoid confusion with a load from the method's TOC.
  124       relocate(internal_word_Relocation::spec(addr));
  125     }
  126     addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset));
  127   }
  128 }
 129 
      // Patch an addis/addi pair emitted by calculate_address_from_global_toc so
      // that it computes addr.
      //   a     - address of the addi (the second instruction of the pair)
      //   bound - lowest address the backward search for the addis may inspect
      //   addr  - new target address, encoded as an offset to the global TOC
      // Returns the address of the patched addis.
      // NOTE(review): if no matching addis is found at or above bound, only the
      // assert below catches it; the loop then leaves inst1_addr below bound.
  130 address MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) {
  131   const int offset = MacroAssembler::offset_to_global_toc(addr);
  132
  133   const address inst2_addr = a;
  134   const int inst2 = *(int *)inst2_addr;
  135
  136   // The relocation points to the second instruction, the addi,
  137   // and the addi reads and writes the same register dst.
  138   const int dst = inv_rt_field(inst2);
  139   assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");
  140
  141   // Now, find the preceding addis which writes to dst.
  142   int inst1 = 0;
  143   address inst1_addr = inst2_addr - BytesPerInstWord;
  144   while (inst1_addr >= bound) {
  145     inst1 = *(int *) inst1_addr;
  146     if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
  147       // Stop, found the addis which writes dst.
  148       break;
  149     }
  150     inst1_addr -= BytesPerInstWord;
  151   }
  152
  153   assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
        // Rewrite the immediates of both instructions with the split offset.
  154   set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset));
  155   set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset));
  156   return inst1_addr;
  157 }
 158 
      // Decode the address computed by an addis/addi pair emitted by
      // calculate_address_from_global_toc.
      //   a     - address of the addi (the second instruction of the pair)
      //   bound - lowest address the backward search for the addis may inspect
      // Returns global_toc() + encoded offset, or (address)-1 if the encoded
      // offset is -1.
  159 address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) {
  160   const address inst2_addr = a;
  161   const int inst2 = *(int *)inst2_addr;
  162
  163   // The relocation points to the second instruction, the addi,
  164   // and the addi reads and writes the same register dst.
  165   const int dst = inv_rt_field(inst2);
  166   assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");
  167
  168   // Now, find the preceding addis which writes to dst.
  169   int inst1 = 0;
  170   address inst1_addr = inst2_addr - BytesPerInstWord;
  171   while (inst1_addr >= bound) {
  172     inst1 = *(int *) inst1_addr;
  173     if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
  174       // stop, found the addis which writes dst
  175       break;
  176     }
  177     inst1_addr -= BytesPerInstWord;
  178   }
  179
  180   assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
  181
        // Recombine the offset from the addis immediate (upper part) and the addi
        // immediate (lower part).
  182   int offset = (get_imm(inst1_addr, 0) << 16) + get_imm(inst2_addr, 0);
  183   // -1 is a special case
  184   if (offset == -1) {
  185     return (address)(intptr_t)-1;
  186   } else {
  187     return global_toc() + offset;
  188   }
  189 }
 190 
 191 #ifdef _LP64
  192 // Patch compressed oops or klass constants.
  193 // Assembler sequence is
  194 // 1) compressed oops:
  195 //    lis  rx = const.hi
  196 //    ori rx = rx | const.lo
  197 // 2) compressed klass:
  198 //    lis  rx = const.hi
  199 //    clrldi rx = rx & 0xFFFFffff // clearMS32b, optional
  200 //    ori rx = rx | const.lo
  201 // The clrldi, if present, is skipped by the backward search for the lis.
      //
      //   a     - address of the ori (the relocated instruction)
      //   bound - lowest address the backward search for the lis may inspect
      //   data  - new 32 bit value; bits 31..16 go into the lis, bits 15..0 into the ori
      // Returns the address of the patched lis.
  202 address MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) {
  203   assert(UseCompressedOops, "Should only patch compressed oops");
  204
  205   const address inst2_addr = a;
  206   const int inst2 = *(int *)inst2_addr;
  207
  208   // The relocation points to the second instruction, the ori,
  209   // and the ori reads and writes the same register dst.
  210   const int dst = inv_rta_field(inst2);
  211   assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
  212   // Now, find the preceding lis which writes to dst.
  213   int inst1 = 0;
  214   address inst1_addr = inst2_addr - BytesPerInstWord;
  215   bool inst1_found = false;
  216   while (inst1_addr >= bound) {
  217     inst1 = *(int *)inst1_addr;
  218     if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
  219     inst1_addr -= BytesPerInstWord;
  220   }
  221   assert(inst1_found, "inst is not lis");
  222
        // Split the value into its upper (xc) and lower (xd) 16 bit halves.
  223   int xc = (data >> 16) & 0xffff;
  224   int xd = (data >>  0) & 0xffff;
  225
  226   set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo
  227   set_imm((int *)inst2_addr,        (xd)); // unsigned int
  228   return inst1_addr;
  229 }
 230 
  231 // Get compressed oop or klass constant.
      // Inverse of patch_set_narrow_oop: reads the 16 bit immediates of the ori at a
      // and of the preceding lis that writes the same register (searched backwards
      // down to bound) and recombines them into a 32 bit value.
  232 narrowOop MacroAssembler::get_narrow_oop(address a, address bound) {
  233   assert(UseCompressedOops, "Should only patch compressed oops");
  234
  235   const address inst2_addr = a;
  236   const int inst2 = *(int *)inst2_addr;
  237
  238   // The relocation points to the second instruction, the ori,
  239   // and the ori reads and writes the same register dst.
  240   const int dst = inv_rta_field(inst2);
  241   assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
  242   // Now, find the preceding lis which writes to dst.
  243   int inst1 = 0;
  244   address inst1_addr = inst2_addr - BytesPerInstWord;
  245   bool inst1_found = false;
  246
  247   while (inst1_addr >= bound) {
  248     inst1 = *(int *) inst1_addr;
  249     if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break;}
  250     inst1_addr -= BytesPerInstWord;
  251   }
  252   assert(inst1_found, "inst is not lis");
  253
        // xl: low 16 bits from the ori; xh: high 16 bits from the lis.
  254   uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff));
  255   uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16);
  256
  257   return (int) (xl | xh);
  258 }
 259 #endif // _LP64
 260 
  261 // Returns true if successful.
      // Places the value of a in the constant section and emits a load of that
      // entry into dst, addressed relative to toc. The relocation of a is attached
      // to the load, not to the constant pool entry. fixed_size is passed on as the
      // emit_filler_nop argument of ld_largeoffset_unchecked.
      // Returns false if the constant could not be allocated.
  262 bool MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a,
  263                                                 Register toc, bool fixed_size) {
  264   int toc_offset = 0;
  265   // Use RelocationHolder::none for the constant pool entry, otherwise
  266   // we will end up with a failing NativeCall::verify(x) where x is
  267   // the address of the constant pool entry.
  268   // FIXME: We should insert relocation information for oops at the constant
  269   // pool entries instead of inserting it at the loads; patching of a constant
  270   // pool entry should be less expensive.
  271   address const_address = address_constant((address)a.value(), RelocationHolder::none);
  272   if (const_address == NULL) { return false; } // allocation failure
  273   // Relocate at the pc of the load.
  274   relocate(a.rspec());
        // The offset is measured from the start of the constant section.
  275   toc_offset = (int)(const_address - code()->consts()->start());
  276   ld_largeoffset_unchecked(dst, toc_offset, toc, fixed_size);
  277   return true;
  278 }
 279 
 280 bool MacroAssembler::is_load_const_from_method_toc_at(address a) {
 281   const address inst1_addr = a;
 282   const int inst1 = *(int *)inst1_addr;
 283 
 284    // The relocation points to the ld or the addis.
 285    return (is_ld(inst1)) ||
 286           (is_addis(inst1) && inv_ra_field(inst1) != 0);
 287 }
 288 
      // Decode the TOC offset used by a load_const_from_method_toc sequence at a.
      //   - Single ld: the offset is the displacement of the ld.
      //   - addis + ld: the offset is (addis immediate << 16) + ld displacement,
      //     where the ld is the next one that both reads and writes the addis
      //     destination register.
      // NOTE(review): the forward search for the ld has no upper bound; it relies
      // on the sequence being well formed.
  289 int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) {
  290   assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc");
  291
  292   const address inst1_addr = a;
  293   const int inst1 = *(int *)inst1_addr;
  294
  295   if (is_ld(inst1)) {
  296     return inv_d1_field(inst1);
  297   } else if (is_addis(inst1)) {
  298     const int dst = inv_rt_field(inst1);
  299
  300     // Now, find the succeeding ld which reads and writes to dst.
  301     address inst2_addr = inst1_addr + BytesPerInstWord;
  302     int inst2 = 0;
  303     while (true) {
  304       inst2 = *(int *) inst2_addr;
  305       if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) {
  306         // Stop, found the ld which reads and writes dst.
  307         break;
  308       }
  309       inst2_addr += BytesPerInstWord;
  310     }
  311     return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2);
  312   }
  313   ShouldNotReachHere();
  314   return 0;
  315 }
 316 
 317 // Get the constant from a `load_const' sequence.
 318 long MacroAssembler::get_const(address a) {
 319   assert(is_load_const_at(a), "not a load of a constant");
 320   const int *p = (const int*) a;
 321   unsigned long x = (((unsigned long) (get_imm(a,0) & 0xffff)) << 48);
 322   if (is_ori(*(p+1))) {
 323     x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 32);
 324     x |= (((unsigned long) (get_imm(a,3) & 0xffff)) << 16);
 325     x |= (((unsigned long) (get_imm(a,4) & 0xffff)));
 326   } else if (is_lis(*(p+1))) {
 327     x |= (((unsigned long) (get_imm(a,2) & 0xffff)) << 32);
 328     x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 16);
 329     x |= (((unsigned long) (get_imm(a,3) & 0xffff)));
 330   } else {
 331     ShouldNotReachHere();
 332     return (long) 0;
 333   }
 334   return (long) x;
 335 }
 336 
 337 // Patch the 64 bit constant of a `load_const' sequence. This is a low
 338 // level procedure. It neither flushes the instruction cache nor is it
 339 // mt safe.
 340 void MacroAssembler::patch_const(address a, long x) {
 341   assert(is_load_const_at(a), "not a load of a constant");
 342   int *p = (int*) a;
 343   if (is_ori(*(p+1))) {
 344     set_imm(0 + p, (x >> 48) & 0xffff);
 345     set_imm(1 + p, (x >> 32) & 0xffff);
 346     set_imm(3 + p, (x >> 16) & 0xffff);
 347     set_imm(4 + p, x & 0xffff);
 348   } else if (is_lis(*(p+1))) {
 349     set_imm(0 + p, (x >> 48) & 0xffff);
 350     set_imm(2 + p, (x >> 32) & 0xffff);
 351     set_imm(1 + p, (x >> 16) & 0xffff);
 352     set_imm(3 + p, x & 0xffff);
 353   } else {
 354     ShouldNotReachHere();
 355   }
 356 }
 357 
      // Build an AddressLiteral for obj with a metadata relocation whose index is
      // obtained from oop_recorder()->allocate_metadata_index(obj).
      // Requires an oop recorder to be set.
  358 AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) {
  359   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
  360   int index = oop_recorder()->allocate_metadata_index(obj);
  361   RelocationHolder rspec = metadata_Relocation::spec(index);
  362   return AddressLiteral((address)obj, rspec);
  363 }
 364 
      // Build an AddressLiteral for obj with a metadata relocation whose index is
      // looked up with oop_recorder()->find_index(obj).
      // Requires an oop recorder to be set.
  365 AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) {
  366   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
  367   int index = oop_recorder()->find_index(obj);
  368   RelocationHolder rspec = metadata_Relocation::spec(index);
  369   return AddressLiteral((address)obj, rspec);
  370 }
 371 
      // Build an AddressLiteral for obj with an oop relocation whose index is
      // obtained from oop_recorder()->allocate_oop_index(obj).
      // Requires an oop recorder to be set.
  372 AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) {
  373   assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
  374   int oop_index = oop_recorder()->allocate_oop_index(obj);
  375   return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
  376 }
 377 
      // Build an AddressLiteral for obj with an oop relocation whose index is
      // looked up with oop_recorder()->find_index(obj).
      // Requires an oop recorder to be set.
  378 AddressLiteral MacroAssembler::constant_oop_address(jobject obj) {
  379   assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
  380   int oop_index = oop_recorder()->find_index(obj);
  381   return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
  382 }
 383 
      // Produce *delayed_value_addr + offset either as a constant or in tmp.
      // If the value is already non-zero at code generation time, it is returned
      // as a constant and no code is emitted. Otherwise code is emitted that loads
      // the value from delayed_value_addr at run time into tmp and adds offset.
  384 RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
  385                                                       Register tmp, int offset) {
  386   intptr_t value = *delayed_value_addr;
  387   if (value != 0) {
  388     return RegisterOrConstant(value + offset);
  389   }
  390
  391   // Load indirectly to solve generation ordering problem.
  392   // static address, no relocation
        // load_const_optimized returns the remaining 16 bit displacement, which
        // is applied by the ld below.
  393   int simm16_offset = load_const_optimized(tmp, delayed_value_addr, noreg, true);
  394   ld(tmp, simm16_offset, tmp); // must be aligned ((xa & 3) == 0)
  395
  396   if (offset != 0) {
  397     addi(tmp, tmp, offset);
  398   }
  399
  400   return RegisterOrConstant(tmp);
  401 }
 402 
 403 #ifndef PRODUCT
      // Not implemented on PPC: calling this hits Unimplemented().
      // Only compiled in non-PRODUCT builds.
  404 void MacroAssembler::pd_print_patched_instruction(address branch) {
  405   Unimplemented(); // TODO: PPC port
  406 }
 407 #endif // ndef PRODUCT
 408 
  409 // Conditional far branch for destinations encodable in 24+2 bits.
      //
      // Always emits two instructions ("variant 2"): a conditional branch with the
      // inverted condition that skips the next instruction, followed by an
      // unconditional branch to dest.
      //   boint, biint - BO and BI fields of the requested conditional branch
      //   dest         - branch target; may still be unbound
      //   optimize     - bc_far_optimize_on_relocate marks the sequence with a
      //                  runtime_call relocation so it can be optimized when the
      //                  code is relocated
  410 void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) {
  411
  412   // If requested by flag optimize, relocate the bc_far as a
  413   // runtime_call and prepare for optimizing it when the code gets
  414   // relocated.
  415   if (optimize == bc_far_optimize_on_relocate) {
  416     relocate(relocInfo::runtime_call_type);
  417   }
  418
  419   // variant 2:
  420   //
  421   //    b!cxx SKIP
  422   //    bxx   DEST
  423   //  SKIP:
  424   //
  425
        // Invert both the branch hint and the branch condition.
  426   const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
  427                                                 opposite_bcond(inv_boint_bcond(boint)));
  428
  429   // We emit two branches.
  430   // First, a conditional branch which jumps around the far branch.
  431   const address not_taken_pc = pc() + 2 * BytesPerInstWord;
  432   const address bc_pc        = pc();
  433   bc(opposite_boint, biint, not_taken_pc);
  434
        // Re-read the emitted bc and check that it decodes to what was requested.
  435   const int bc_instr = *(int*)bc_pc;
  436   assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition");
  437   assert(opposite_boint == inv_bo_field(bc_instr), "postcondition");
  438   assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))),
  439                                      opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))),
  440          "postcondition");
  441   assert(biint == inv_bi_field(bc_instr), "postcondition");
  442
  443   // Second, an unconditional far branch which jumps to dest.
  444   // Note: target(dest) remembers the current pc (see CodeSection::target)
  445   //       and returns the current pc if the label is not bound yet; when
  446   //       the label gets bound, the unconditional far branch will be patched.
  447   const address target_pc = target(dest);
  448   const address b_pc  = pc();
  449   b(target_pc);
  450
  451   assert(not_taken_pc == pc(),                     "postcondition");
  452   assert(dest.is_bound() || target_pc == b_pc, "postcondition");
  453 }
 454 
  455 // 1 or 2 instructions
      // Emits a single bc when dest is already bound and within reach of a
      // conditional branch from the current pc; otherwise falls back to the two
      // instruction bc_far sequence, marked for optimization on relocation.
  456 void MacroAssembler::bc_far_optimized(int boint, int biint, Label& dest) {
  457   if (dest.is_bound() && is_within_range_of_bcxx(target(dest), pc())) {
  458     bc(boint, biint, dest);
  459   } else {
  460     bc_far(boint, biint, dest, MacroAssembler::bc_far_optimize_on_relocate);
  461   }
  462 }
 463 
 464 bool MacroAssembler::is_bc_far_at(address instruction_addr) {
 465   return is_bc_far_variant1_at(instruction_addr) ||
 466          is_bc_far_variant2_at(instruction_addr) ||
 467          is_bc_far_variant3_at(instruction_addr);
 468 }
 469 
      // Return the branch destination of the bc_far sequence at instruction_addr.
      //   variant 1: destination encoded in the first instruction (the bc)
      //   variant 2: destination of the second instruction (the unconditional branch)
      //   variant 3: the instruction following the two-instruction sequence
      // Any other encoding hits ShouldNotReachHere().
  470 address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) {
  471   if (is_bc_far_variant1_at(instruction_addr)) {
  472     const address instruction_1_addr = instruction_addr;
  473     const int instruction_1 = *(int*)instruction_1_addr;
  474     return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr);
  475   } else if (is_bc_far_variant2_at(instruction_addr)) {
  476     const address instruction_2_addr = instruction_addr + 4;
  477     return bxx_destination(instruction_2_addr);
  478   } else if (is_bc_far_variant3_at(instruction_addr)) {
  479     return instruction_addr + 8;
  480   }
  481   // variant 4 ???
  482   ShouldNotReachHere();
  483   return NULL;
  484 }
      // Re-target the two-instruction bc_far sequence at instruction_addr to dest
      // by re-emitting it in place, choosing the shortest suitable encoding:
      //   - a variant 3 sequence (already nops) is left untouched;
      //   - a variant 2 sequence whose new destination is the instruction right
      //     after the sequence is replaced by nop + endgroup (variant 3);
      //   - otherwise variant 1 (bc + nop) is used when dest is reachable by a
      //     conditional branch and the branch is not still unbound, else variant 2.
      // The instruction cache is flushed for the two patched instructions.
  485 void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) {
  486
  487   if (is_bc_far_variant3_at(instruction_addr)) {
  488     // variant 3, far cond branch to the next instruction, already patched to nops:
  489     //
  490     //    nop
  491     //    endgroup
  492     //  SKIP/DEST:
  493     //
  494     return;
  495   }
  496
  497   // first, extract boint and biint from the current branch
  498   int boint = 0;
  499   int biint = 0;
  500
        // A temporary assembler over the existing two instructions is used to
        // overwrite them in place.
  501   ResourceMark rm;
  502   const int code_size = 2 * BytesPerInstWord;
  503   CodeBuffer buf(instruction_addr, code_size);
  504   MacroAssembler masm(&buf);
  505   if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) {
  506     // Far branch to next instruction: Optimize it by patching nops (produce variant 3).
  507     masm.nop();
  508     masm.endgroup();
  509   } else {
  510     if (is_bc_far_variant1_at(instruction_addr)) {
  511       // variant 1, the 1st instruction contains the destination address:
  512       //
  513       //    bcxx  DEST
  514       //    nop
  515       //
  516       const int instruction_1 = *(int*)(instruction_addr);
  517       boint = inv_bo_field(instruction_1);
  518       biint = inv_bi_field(instruction_1);
  519     } else if (is_bc_far_variant2_at(instruction_addr)) {
  520       // variant 2, the 2nd instruction contains the destination address:
  521       //
  522       //    b!cxx SKIP
  523       //    bxx   DEST
  524       //  SKIP:
  525       //
            // The stored bc carries the inverted condition; invert it back to
            // recover the original boint.
  526       const int instruction_1 = *(int*)(instruction_addr);
  527       boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))),
  528           opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1))));
  529       biint = inv_bi_field(instruction_1);
  530     } else {
  531       // variant 4???
  532       ShouldNotReachHere();
  533     }
  534
  535     // second, set the new branch destination and optimize the code
  536     if (dest != instruction_addr + 4 && // the bc_far is still unbound!
  537         masm.is_within_range_of_bcxx(dest, instruction_addr)) {
  538       // variant 1:
  539       //
  540       //    bcxx  DEST
  541       //    nop
  542       //
  543       masm.bc(boint, biint, dest);
  544       masm.nop();
  545     } else {
  546       // variant 2:
  547       //
  548       //    b!cxx SKIP
  549       //    bxx   DEST
  550       //  SKIP:
  551       //
  552       const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
  553                                                     opposite_bcond(inv_boint_bcond(boint)));
  554       const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord;
  555       masm.bc(opposite_boint, biint, not_taken_pc);
  556       masm.b(dest);
  557     }
  558   }
  559   ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
  560 }
 561 
  562 // Emit a NOT mt-safe patchable 64 bit absolute call/jump.
      //
      // Always emits seven instructions, in one of two layouts:
      //   variant 2 (pc-relative; only if ReoptimizeCallSequences is set and dest
      //              is within reach of a b/bl):
      //     call: 6 x nop, bl dest        jump: b dest, 6 x nop
      //   variant 1b (destination relative to the global TOC):
      //     mr R0,R11; addis/addi R11 <- dest; mtctr R11; mr R11,R0; nop; bctr[l]
      //
      //   dest - call/jump target
      //   rt   - relocation type attached at the start of the sequence
      //          (nothing is attached for relocInfo::none)
      //   link - true emits a call (bl/bctrl), false a jump (b/bctr)
  563 void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) {
  564   // get current pc
  565   uint64_t start_pc = (uint64_t) pc();
  566
  567   const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last
  568   const address pc_of_b  = (address) (start_pc + (0*BytesPerInstWord)); // b is first
  569
  570   // relocate here
  571   if (rt != relocInfo::none) {
  572     relocate(rt);
  573   }
  574
        // Reachability is checked from the position the branch will actually
        // occupy within the sequence.
  575   if ( ReoptimizeCallSequences &&
  576        (( link && is_within_range_of_b(dest, pc_of_bl)) ||
  577         (!link && is_within_range_of_b(dest, pc_of_b)))) {
  578     // variant 2:
  579     // Emit an optimized, pc-relative call/jump.
  580
  581     if (link) {
  582       // some padding
  583       nop();
  584       nop();
  585       nop();
  586       nop();
  587       nop();
  588       nop();
  589
  590       // do the call
  591       assert(pc() == pc_of_bl, "just checking");
  592       bl(dest, relocInfo::none);
  593     } else {
  594       // do the jump
  595       assert(pc() == pc_of_b, "just checking");
  596       b(dest, relocInfo::none);
  597
  598       // some padding
  599       nop();
  600       nop();
  601       nop();
  602       nop();
  603       nop();
  604       nop();
  605     }
  606
  607     // Assert that we can identify the emitted call/jump.
  608     assert(is_bxx64_patchable_variant2_at((address)start_pc, link),
  609            "can't identify emitted call");
  610   } else {
  611     // variant 1:
          // R11 is used to form the destination and is preserved around the
          // sequence via R0.
  612     mr(R0, R11);  // spill R11 -> R0.
  613
  614     // Load the destination address into CTR,
  615     // calculate destination relative to global toc.
  616     calculate_address_from_global_toc(R11, dest, true, true, false);
  617
  618     mtctr(R11);
  619     mr(R11, R0);  // spill R11 <- R0.
  620     nop();
  621
  622     // do the call/jump
  623     if (link) {
  624       bctrl();
  625     } else{
  626       bctr();
  627     }
  628     // Assert that we can identify the emitted call/jump.
  629     assert(is_bxx64_patchable_variant1b_at((address)start_pc, link),
  630            "can't identify emitted call");
  631   }
  632
  633   // Assert that we can identify the emitted call/jump.
  634   assert(is_bxx64_patchable_at((address)start_pc, link),
  635          "can't identify emitted call");
  636   assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest,
  637          "wrong encoding of dest address");
  638 }
 639 
 640 // Identify a bxx64_patchable instruction.
 641 bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) {
 642   return is_bxx64_patchable_variant1b_at(instruction_addr, link)
 643     //|| is_bxx64_patchable_variant1_at(instruction_addr, link)
 644       || is_bxx64_patchable_variant2_at(instruction_addr, link);
 645 }
 646 
  647 // Does the call64_patchable instruction use a pc-relative encoding of
  648 // the call destination?
      // True exactly when the sequence at instruction_addr is variant 2.
  649 bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) {
  650   // variant 2 is pc-relative
  651   return is_bxx64_patchable_variant2_at(instruction_addr, link);
  652 }
 653 
  654 // Identify variant 1.
      // Matches a load_const sequence at instruction_addr, an mtctr as the sixth
      // instruction and bctrl (link) or bctr (no link) as the seventh.
  655 bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) {
  656   unsigned int* instr = (unsigned int*) instruction_addr;
  657   return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
  658       && is_mtctr(instr[5]) // mtctr
  659     && is_load_const_at(instruction_addr);
  660 }
 661 
  662 // Identify variant 1b: load destination relative to global toc.
      // Matches bctrl (link) or bctr (no link) as the seventh instruction, an mtctr
      // as the fourth, and a global-TOC address calculation whose addi is the third
      // instruction (the search for its addis is bounded by instruction_addr).
  663 bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) {
  664   unsigned int* instr = (unsigned int*) instruction_addr;
  665   return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
  666     && is_mtctr(instr[3]) // mtctr
  667     && is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr);
  668 }
 669 
 670 // Identify variant 2.
 671 bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) {
 672   unsigned int* instr = (unsigned int*) instruction_addr;
 673   if (link) {
 674     return is_bl (instr[6])  // bl dest is last
 675       && is_nop(instr[0])  // nop
 676       && is_nop(instr[1])  // nop
 677       && is_nop(instr[2])  // nop
 678       && is_nop(instr[3])  // nop
 679       && is_nop(instr[4])  // nop
 680       && is_nop(instr[5]); // nop
 681   } else {
 682     return is_b  (instr[0])  // b  dest is first
 683       && is_nop(instr[1])  // nop
 684       && is_nop(instr[2])  // nop
 685       && is_nop(instr[3])  // nop
 686       && is_nop(instr[4])  // nop
 687       && is_nop(instr[5])  // nop
 688       && is_nop(instr[6]); // nop
 689   }
 690 }
 691 
  692 // Set dest address of a bxx64_patchable instruction.
      // Re-emits the whole bxx64_patchable sequence in place (without relocation)
      // through a temporary assembler over the existing code, then flushes the
      // instruction cache for the bxx64_patchable_size bytes that were rewritten.
  693 void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) {
  694   ResourceMark rm;
  695   int code_size = MacroAssembler::bxx64_patchable_size;
  696   CodeBuffer buf(instruction_addr, code_size);
  697   MacroAssembler masm(&buf);
  698   masm.bxx64_patchable(dest, relocInfo::none, link);
  699   ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
  700 }
 701 
  702 // Get dest address of a bxx64_patchable instruction.
      //   variant 1:  the constant of the load_const sequence
      //   variant 2:  the pc-relative target of the bl (last instruction, link) or
      //               of the b (first instruction, no link)
      //   variant 1b: the address encoded relative to the global TOC
      // Any other encoding hits ShouldNotReachHere().
  703 address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) {
  704   if (is_bxx64_patchable_variant1_at(instruction_addr, link)) {
  705     return (address) (unsigned long) get_const(instruction_addr);
  706   } else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) {
  707     unsigned int* instr = (unsigned int*) instruction_addr;
          // branch_destination is evaluated with pc 0, so it yields the relative
          // offset; the position of the branch within the sequence is added back.
  708     if (link) {
  709       const int instr_idx = 6; // bl is last
  710       int branchoffset = branch_destination(instr[instr_idx], 0);
  711       return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
  712     } else {
  713       const int instr_idx = 0; // b is first
  714       int branchoffset = branch_destination(instr[instr_idx], 0);
  715       return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
  716     }
  717   // Load dest relative to global toc.
  718   } else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) {
  719     return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord,
  720                                                                instruction_addr);
  721   } else {
  722     ShouldNotReachHere();
  723     return NULL;
  724   }
  725 }
 726 
  727 // Uses ordering which corresponds to ABI:
  728 //    _savegpr0_14:  std  r14,-144(r1)
  729 //    _savegpr0_15:  std  r15,-136(r1)
  730 //    _savegpr0_16:  std  r16,-128(r1)
      //
      // Stores R14..R31 followed by F14..F31 to consecutive 8 byte slots starting
      // at dst + offset. Despite the name, the floating point registers are saved
      // as well: 36 slots, 288 bytes in total.
  731 void MacroAssembler::save_nonvolatile_gprs(Register dst, int offset) {
  732   std(R14, offset, dst);   offset += 8;
  733   std(R15, offset, dst);   offset += 8;
  734   std(R16, offset, dst);   offset += 8;
  735   std(R17, offset, dst);   offset += 8;
  736   std(R18, offset, dst);   offset += 8;
  737   std(R19, offset, dst);   offset += 8;
  738   std(R20, offset, dst);   offset += 8;
  739   std(R21, offset, dst);   offset += 8;
  740   std(R22, offset, dst);   offset += 8;
  741   std(R23, offset, dst);   offset += 8;
  742   std(R24, offset, dst);   offset += 8;
  743   std(R25, offset, dst);   offset += 8;
  744   std(R26, offset, dst);   offset += 8;
  745   std(R27, offset, dst);   offset += 8;
  746   std(R28, offset, dst);   offset += 8;
  747   std(R29, offset, dst);   offset += 8;
  748   std(R30, offset, dst);   offset += 8;
  749   std(R31, offset, dst);   offset += 8;
  750
        // FP registers
  751   stfd(F14, offset, dst);   offset += 8;
  752   stfd(F15, offset, dst);   offset += 8;
  753   stfd(F16, offset, dst);   offset += 8;
  754   stfd(F17, offset, dst);   offset += 8;
  755   stfd(F18, offset, dst);   offset += 8;
  756   stfd(F19, offset, dst);   offset += 8;
  757   stfd(F20, offset, dst);   offset += 8;
  758   stfd(F21, offset, dst);   offset += 8;
  759   stfd(F22, offset, dst);   offset += 8;
  760   stfd(F23, offset, dst);   offset += 8;
  761   stfd(F24, offset, dst);   offset += 8;
  762   stfd(F25, offset, dst);   offset += 8;
  763   stfd(F26, offset, dst);   offset += 8;
  764   stfd(F27, offset, dst);   offset += 8;
  765   stfd(F28, offset, dst);   offset += 8;
  766   stfd(F29, offset, dst);   offset += 8;
  767   stfd(F30, offset, dst);   offset += 8;
  768   stfd(F31, offset, dst);
  769 }
 770 
  771 // Uses ordering which corresponds to ABI:
  772 //    _restgpr0_14:  ld   r14,-144(r1)
  773 //    _restgpr0_15:  ld   r15,-136(r1)
  774 //    _restgpr0_16:  ld   r16,-128(r1)
      //
      // Counterpart of save_nonvolatile_gprs: reloads R14..R31 followed by F14..F31
      // from consecutive 8 byte slots starting at src + offset (36 slots, 288 bytes
      // in total), using the same slot order.
  775 void MacroAssembler::restore_nonvolatile_gprs(Register src, int offset) {
  776   ld(R14, offset, src);   offset += 8;
  777   ld(R15, offset, src);   offset += 8;
  778   ld(R16, offset, src);   offset += 8;
  779   ld(R17, offset, src);   offset += 8;
  780   ld(R18, offset, src);   offset += 8;
  781   ld(R19, offset, src);   offset += 8;
  782   ld(R20, offset, src);   offset += 8;
  783   ld(R21, offset, src);   offset += 8;
  784   ld(R22, offset, src);   offset += 8;
  785   ld(R23, offset, src);   offset += 8;
  786   ld(R24, offset, src);   offset += 8;
  787   ld(R25, offset, src);   offset += 8;
  788   ld(R26, offset, src);   offset += 8;
  789   ld(R27, offset, src);   offset += 8;
  790   ld(R28, offset, src);   offset += 8;
  791   ld(R29, offset, src);   offset += 8;
  792   ld(R30, offset, src);   offset += 8;
  793   ld(R31, offset, src);   offset += 8;
  794
  795   // FP registers
  796   lfd(F14, offset, src);   offset += 8;
  797   lfd(F15, offset, src);   offset += 8;
  798   lfd(F16, offset, src);   offset += 8;
  799   lfd(F17, offset, src);   offset += 8;
  800   lfd(F18, offset, src);   offset += 8;
  801   lfd(F19, offset, src);   offset += 8;
  802   lfd(F20, offset, src);   offset += 8;
  803   lfd(F21, offset, src);   offset += 8;
  804   lfd(F22, offset, src);   offset += 8;
  805   lfd(F23, offset, src);   offset += 8;
  806   lfd(F24, offset, src);   offset += 8;
  807   lfd(F25, offset, src);   offset += 8;
  808   lfd(F26, offset, src);   offset += 8;
  809   lfd(F27, offset, src);   offset += 8;
  810   lfd(F28, offset, src);   offset += 8;
  811   lfd(F29, offset, src);   offset += 8;
  812   lfd(F30, offset, src);   offset += 8;
  813   lfd(F31, offset, src);
  814 }
 815 
 816 // For verify_oops.
 817 void MacroAssembler::save_volatile_gprs(Register dst, int offset) {
 818   std(R2,  offset, dst);   offset += 8;
 819   std(R3,  offset, dst);   offset += 8;
 820   std(R4,  offset, dst);   offset += 8;
 821   std(R5,  offset, dst);   offset += 8;
 822   std(R6,  offset, dst);   offset += 8;
 823   std(R7,  offset, dst);   offset += 8;
 824   std(R8,  offset, dst);   offset += 8;
 825   std(R9,  offset, dst);   offset += 8;
 826   std(R10, offset, dst);   offset += 8;
 827   std(R11, offset, dst);   offset += 8;
 828   std(R12, offset, dst);   offset += 8;
 829 
 830   stfd(F0, offset, dst);   offset += 8;
 831   stfd(F1, offset, dst);   offset += 8;
 832   stfd(F2, offset, dst);   offset += 8;
 833   stfd(F3, offset, dst);   offset += 8;
 834   stfd(F4, offset, dst);   offset += 8;
 835   stfd(F5, offset, dst);   offset += 8;
 836   stfd(F6, offset, dst);   offset += 8;
 837   stfd(F7, offset, dst);   offset += 8;
 838   stfd(F8, offset, dst);   offset += 8;
 839   stfd(F9, offset, dst);   offset += 8;
 840   stfd(F10, offset, dst);  offset += 8;
 841   stfd(F11, offset, dst);  offset += 8;
 842   stfd(F12, offset, dst);  offset += 8;
 843   stfd(F13, offset, dst);
 844 }
 845 
 846 // For verify_oops.
 847 void MacroAssembler::restore_volatile_gprs(Register src, int offset) {
 848   ld(R2,  offset, src);   offset += 8;
 849   ld(R3,  offset, src);   offset += 8;
 850   ld(R4,  offset, src);   offset += 8;
 851   ld(R5,  offset, src);   offset += 8;
 852   ld(R6,  offset, src);   offset += 8;
 853   ld(R7,  offset, src);   offset += 8;
 854   ld(R8,  offset, src);   offset += 8;
 855   ld(R9,  offset, src);   offset += 8;
 856   ld(R10, offset, src);   offset += 8;
 857   ld(R11, offset, src);   offset += 8;
 858   ld(R12, offset, src);   offset += 8;
 859 
 860   lfd(F0, offset, src);   offset += 8;
 861   lfd(F1, offset, src);   offset += 8;
 862   lfd(F2, offset, src);   offset += 8;
 863   lfd(F3, offset, src);   offset += 8;
 864   lfd(F4, offset, src);   offset += 8;
 865   lfd(F5, offset, src);   offset += 8;
 866   lfd(F6, offset, src);   offset += 8;
 867   lfd(F7, offset, src);   offset += 8;
 868   lfd(F8, offset, src);   offset += 8;
 869   lfd(F9, offset, src);   offset += 8;
 870   lfd(F10, offset, src);  offset += 8;
 871   lfd(F11, offset, src);  offset += 8;
 872   lfd(F12, offset, src);  offset += 8;
 873   lfd(F13, offset, src);
 874 }
 875 
 876 void MacroAssembler::save_LR_CR(Register tmp) {
 877   mfcr(tmp);
 878   std(tmp, _abi(cr), R1_SP);
 879   mflr(tmp);
 880   std(tmp, _abi(lr), R1_SP);
 881   // Tmp must contain lr on exit! (see return_addr and prolog in ppc64.ad)
 882 }
 883 
 884 void MacroAssembler::restore_LR_CR(Register tmp) {
 885   assert(tmp != R1_SP, "must be distinct");
 886   ld(tmp, _abi(lr), R1_SP);
 887   mtlr(tmp);
 888   ld(tmp, _abi(cr), R1_SP);
 889   mtcr(tmp);
 890 }
 891 
 892 address MacroAssembler::get_PC_trash_LR(Register result) {
 893   Label L;
 894   bl(L);
 895   bind(L);
 896   address lr_pc = pc();
 897   mflr(result);
 898   return lr_pc;
 899 }
 900 
 901 void MacroAssembler::resize_frame(Register offset, Register tmp) {
 902 #ifdef ASSERT
 903   assert_different_registers(offset, tmp, R1_SP);
 904   andi_(tmp, offset, frame::alignment_in_bytes-1);
 905   asm_assert_eq("resize_frame: unaligned", 0x204);
 906 #endif
 907 
 908   // tmp <- *(SP)
 909   ld(tmp, _abi(callers_sp), R1_SP);
 910   // addr <- SP + offset;
 911   // *(addr) <- tmp;
 912   // SP <- addr
 913   stdux(tmp, R1_SP, offset);
 914 }
 915 
 916 void MacroAssembler::resize_frame(int offset, Register tmp) {
 917   assert(is_simm(offset, 16), "too big an offset");
 918   assert_different_registers(tmp, R1_SP);
 919   assert((offset & (frame::alignment_in_bytes-1))==0, "resize_frame: unaligned");
 920   // tmp <- *(SP)
 921   ld(tmp, _abi(callers_sp), R1_SP);
 922   // addr <- SP + offset;
 923   // *(addr) <- tmp;
 924   // SP <- addr
 925   stdu(tmp, offset, R1_SP);
 926 }
 927 
 928 void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) {
 929   // (addr == tmp1) || (addr == tmp2) is allowed here!
 930   assert(tmp1 != tmp2, "must be distinct");
 931 
 932   // compute offset w.r.t. current stack pointer
 933   // tmp_1 <- addr - SP (!)
 934   subf(tmp1, R1_SP, addr);
 935 
 936   // atomically update SP keeping back link.
 937   resize_frame(tmp1/* offset */, tmp2/* tmp */);
 938 }
 939 
 940 void MacroAssembler::push_frame(Register bytes, Register tmp) {
 941 #ifdef ASSERT
 942   assert(bytes != R0, "r0 not allowed here");
 943   andi_(R0, bytes, frame::alignment_in_bytes-1);
 944   asm_assert_eq("push_frame(Reg, Reg): unaligned", 0x203);
 945 #endif
 946   neg(tmp, bytes);
 947   stdux(R1_SP, R1_SP, tmp);
 948 }
 949 
 950 // Push a frame of size `bytes'.
 951 void MacroAssembler::push_frame(unsigned int bytes, Register tmp) {
 952   long offset = align_addr(bytes, frame::alignment_in_bytes);
 953   if (is_simm(-offset, 16)) {
 954     stdu(R1_SP, -offset, R1_SP);
 955   } else {
 956     load_const_optimized(tmp, -offset);
 957     stdux(R1_SP, R1_SP, tmp);
 958   }
 959 }
 960 
 961 // Push a frame of size `bytes' plus abi_reg_args on top.
 962 void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) {
 963   push_frame(bytes + frame::abi_reg_args_size, tmp);
 964 }
 965 
 966 // Setup up a new C frame with a spill area for non-volatile GPRs and
 967 // additional space for local variables.
 968 void MacroAssembler::push_frame_reg_args_nonvolatiles(unsigned int bytes,
 969                                                       Register tmp) {
 970   push_frame(bytes + frame::abi_reg_args_size + frame::spill_nonvolatiles_size, tmp);
 971 }
 972 
 973 // Pop current C frame.
 974 void MacroAssembler::pop_frame() {
 975   ld(R1_SP, _abi(callers_sp), R1_SP);
 976 }
 977 
 978 #if defined(ABI_ELFv2)
 979 address MacroAssembler::branch_to(Register r_function_entry, bool and_link) {
 980   // TODO(asmundak): make sure the caller uses R12 as function descriptor
 981   // most of the times.
 982   if (R12 != r_function_entry) {
 983     mr(R12, r_function_entry);
 984   }
 985   mtctr(R12);
 986   // Do a call or a branch.
 987   if (and_link) {
 988     bctrl();
 989   } else {
 990     bctr();
 991   }
 992   _last_calls_return_pc = pc();
 993 
 994   return _last_calls_return_pc;
 995 }
 996 
 997 // Call a C function via a function descriptor and use full C
 998 // calling conventions. Updates and returns _last_calls_return_pc.
 999 address MacroAssembler::call_c(Register r_function_entry) {
1000   return branch_to(r_function_entry, /*and_link=*/true);
1001 }
1002 
1003 // For tail calls: only branch, don't link, so callee returns to caller of this function.
1004 address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) {
1005   return branch_to(r_function_entry, /*and_link=*/false);
1006 }
1007 
1008 address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) {
1009   load_const(R12, function_entry, R0);
1010   return branch_to(R12,  /*and_link=*/true);
1011 }
1012 
1013 #else
1014 // Generic version of a call to C function via a function descriptor
1015 // with variable support for C calling conventions (TOC, ENV, etc.).
1016 // Updates and returns _last_calls_return_pc.
1017 address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call,
1018                                   bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) {
1019   // we emit standard ptrgl glue code here
1020   assert((function_descriptor != R0), "function_descriptor cannot be R0");
1021 
1022   // retrieve necessary entries from the function descriptor
1023   ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor);
1024   mtctr(R0);
1025 
1026   if (load_toc_of_callee) {
1027     ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor);
1028   }
1029   if (load_env_of_callee) {
1030     ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor);
1031   } else if (load_toc_of_callee) {
1032     li(R11, 0);
1033   }
1034 
1035   // do a call or a branch
1036   if (and_link) {
1037     bctrl();
1038   } else {
1039     bctr();
1040   }
1041   _last_calls_return_pc = pc();
1042 
1043   return _last_calls_return_pc;
1044 }
1045 
1046 // Call a C function via a function descriptor and use full C calling
1047 // conventions.
1048 // We don't use the TOC in generated code, so there is no need to save
1049 // and restore its value.
1050 address MacroAssembler::call_c(Register fd) {
1051   return branch_to(fd, /*and_link=*/true,
1052                        /*save toc=*/false,
1053                        /*restore toc=*/false,
1054                        /*load toc=*/true,
1055                        /*load env=*/true);
1056 }
1057 
1058 address MacroAssembler::call_c_and_return_to_caller(Register fd) {
1059   return branch_to(fd, /*and_link=*/false,
1060                        /*save toc=*/false,
1061                        /*restore toc=*/false,
1062                        /*load toc=*/true,
1063                        /*load env=*/true);
1064 }
1065 
1066 address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) {
1067   if (rt != relocInfo::none) {
1068     // this call needs to be relocatable
1069     if (!ReoptimizeCallSequences
1070         || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
1071         || fd == NULL   // support code-size estimation
1072         || !fd->is_friend_function()
1073         || fd->entry() == NULL) {
1074       // it's not a friend function as defined by class FunctionDescriptor,
1075       // so do a full call-c here.
1076       load_const(R11, (address)fd, R0);
1077 
1078       bool has_env = (fd != NULL && fd->env() != NULL);
1079       return branch_to(R11, /*and_link=*/true,
1080                             /*save toc=*/false,
1081                             /*restore toc=*/false,
1082                             /*load toc=*/true,
1083                             /*load env=*/has_env);
1084     } else {
1085       // It's a friend function. Load the entry point and don't care about
1086       // toc and env. Use an optimizable call instruction, but ensure the
1087       // same code-size as in the case of a non-friend function.
1088       nop();
1089       nop();
1090       nop();
1091       bl64_patchable(fd->entry(), rt);
1092       _last_calls_return_pc = pc();
1093       return _last_calls_return_pc;
1094     }
1095   } else {
1096     // This call does not need to be relocatable, do more aggressive
1097     // optimizations.
1098     if (!ReoptimizeCallSequences
1099       || !fd->is_friend_function()) {
1100       // It's not a friend function as defined by class FunctionDescriptor,
1101       // so do a full call-c here.
1102       load_const(R11, (address)fd, R0);
1103       return branch_to(R11, /*and_link=*/true,
1104                             /*save toc=*/false,
1105                             /*restore toc=*/false,
1106                             /*load toc=*/true,
1107                             /*load env=*/true);
1108     } else {
1109       // it's a friend function, load the entry point and don't care about
1110       // toc and env.
1111       address dest = fd->entry();
1112       if (is_within_range_of_b(dest, pc())) {
1113         bl(dest);
1114       } else {
1115         bl64_patchable(dest, rt);
1116       }
1117       _last_calls_return_pc = pc();
1118       return _last_calls_return_pc;
1119     }
1120   }
1121 }
1122 
1123 // Call a C function.  All constants needed reside in TOC.
1124 //
1125 // Read the address to call from the TOC.
1126 // Read env from TOC, if fd specifies an env.
1127 // Read new TOC from TOC.
1128 address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd,
1129                                          relocInfo::relocType rt, Register toc) {
1130   if (!ReoptimizeCallSequences
1131     || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
1132     || !fd->is_friend_function()) {
1133     // It's not a friend function as defined by class FunctionDescriptor,
1134     // so do a full call-c here.
1135     assert(fd->entry() != NULL, "function must be linked");
1136 
1137     AddressLiteral fd_entry(fd->entry());
1138     bool success = load_const_from_method_toc(R11, fd_entry, toc, /*fixed_size*/ true);
1139     mtctr(R11);
1140     if (fd->env() == NULL) {
1141       li(R11, 0);
1142       nop();
1143     } else {
1144       AddressLiteral fd_env(fd->env());
1145       success = success && load_const_from_method_toc(R11, fd_env, toc, /*fixed_size*/ true);
1146     }
1147     AddressLiteral fd_toc(fd->toc());
1148     // Set R2_TOC (load from toc)
1149     success = success && load_const_from_method_toc(R2_TOC, fd_toc, toc, /*fixed_size*/ true);
1150     bctrl();
1151     _last_calls_return_pc = pc();
1152     if (!success) { return NULL; }
1153   } else {
1154     // It's a friend function, load the entry point and don't care about
1155     // toc and env. Use an optimizable call instruction, but ensure the
1156     // same code-size as in the case of a non-friend function.
1157     nop();
1158     bl64_patchable(fd->entry(), rt);
1159     _last_calls_return_pc = pc();
1160   }
1161   return _last_calls_return_pc;
1162 }
1163 #endif // ABI_ELFv2
1164 
1165 void MacroAssembler::call_VM_base(Register oop_result,
1166                                   Register last_java_sp,
1167                                   address  entry_point,
1168                                   bool     check_exceptions) {
1169   BLOCK_COMMENT("call_VM {");
1170   // Determine last_java_sp register.
1171   if (!last_java_sp->is_valid()) {
1172     last_java_sp = R1_SP;
1173   }
1174   set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1);
1175 
1176   // ARG1 must hold thread address.
1177   mr(R3_ARG1, R16_thread);
1178 #if defined(ABI_ELFv2)
1179   address return_pc = call_c(entry_point, relocInfo::none);
1180 #else
1181   address return_pc = call_c((FunctionDescriptor*)entry_point, relocInfo::none);
1182 #endif
1183 
1184   reset_last_Java_frame();
1185 
1186   // Check for pending exceptions.
1187   if (check_exceptions) {
1188     // We don't check for exceptions here.
1189     ShouldNotReachHere();
1190   }
1191 
1192   // Get oop result if there is one and reset the value in the thread.
1193   if (oop_result->is_valid()) {
1194     get_vm_result(oop_result);
1195   }
1196 
1197   _last_calls_return_pc = return_pc;
1198   BLOCK_COMMENT("} call_VM");
1199 }
1200 
1201 void MacroAssembler::call_VM_leaf_base(address entry_point) {
1202   BLOCK_COMMENT("call_VM_leaf {");
1203 #if defined(ABI_ELFv2)
1204   call_c(entry_point, relocInfo::none);
1205 #else
1206   call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, entry_point), relocInfo::none);
1207 #endif
1208   BLOCK_COMMENT("} call_VM_leaf");
1209 }
1210 
1211 void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions) {
1212   call_VM_base(oop_result, noreg, entry_point, check_exceptions);
1213 }
1214 
1215 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1,
1216                              bool check_exceptions) {
1217   // R3_ARG1 is reserved for the thread.
1218   mr_if_needed(R4_ARG2, arg_1);
1219   call_VM(oop_result, entry_point, check_exceptions);
1220 }
1221 
1222 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2,
1223                              bool check_exceptions) {
1224   // R3_ARG1 is reserved for the thread
1225   mr_if_needed(R4_ARG2, arg_1);
1226   assert(arg_2 != R4_ARG2, "smashed argument");
1227   mr_if_needed(R5_ARG3, arg_2);
1228   call_VM(oop_result, entry_point, check_exceptions);
1229 }
1230 
1231 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3,
1232                              bool check_exceptions) {
1233   // R3_ARG1 is reserved for the thread
1234   mr_if_needed(R4_ARG2, arg_1);
1235   assert(arg_2 != R4_ARG2, "smashed argument");
1236   mr_if_needed(R5_ARG3, arg_2);
1237   mr_if_needed(R6_ARG4, arg_3);
1238   call_VM(oop_result, entry_point, check_exceptions);
1239 }
1240 
1241 void MacroAssembler::call_VM_leaf(address entry_point) {
1242   call_VM_leaf_base(entry_point);
1243 }
1244 
1245 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) {
1246   mr_if_needed(R3_ARG1, arg_1);
1247   call_VM_leaf(entry_point);
1248 }
1249 
1250 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) {
1251   mr_if_needed(R3_ARG1, arg_1);
1252   assert(arg_2 != R3_ARG1, "smashed argument");
1253   mr_if_needed(R4_ARG2, arg_2);
1254   call_VM_leaf(entry_point);
1255 }
1256 
1257 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) {
1258   mr_if_needed(R3_ARG1, arg_1);
1259   assert(arg_2 != R3_ARG1, "smashed argument");
1260   mr_if_needed(R4_ARG2, arg_2);
1261   assert(arg_3 != R3_ARG1 && arg_3 != R4_ARG2, "smashed argument");
1262   mr_if_needed(R5_ARG3, arg_3);
1263   call_VM_leaf(entry_point);
1264 }
1265 
1266 // Check whether instruction is a read access to the polling page
1267 // which was emitted by load_from_polling_page(..).
1268 bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext,
1269                                                address* polling_address_ptr) {
1270   if (!is_ld(instruction))
1271     return false; // It's not a ld. Fail.
1272 
1273   int rt = inv_rt_field(instruction);
1274   int ra = inv_ra_field(instruction);
1275   int ds = inv_ds_field(instruction);
1276   if (!(ds == 0 && ra != 0 && rt == 0)) {
1277     return false; // It's not a ld(r0, X, ra). Fail.
1278   }
1279 
1280   if (!ucontext) {
1281     // Set polling address.
1282     if (polling_address_ptr != NULL) {
1283       *polling_address_ptr = NULL;
1284     }
1285     return true; // No ucontext given. Can't check value of ra. Assume true.
1286   }
1287 
1288 #ifdef LINUX
1289   // Ucontext given. Check that register ra contains the address of
1290   // the safepoing polling page.
1291   ucontext_t* uc = (ucontext_t*) ucontext;
1292   // Set polling address.
1293   address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds;
1294   if (polling_address_ptr != NULL) {
1295     *polling_address_ptr = addr;
1296   }
1297   return os::is_poll_address(addr);
1298 #else
1299   // Not on Linux, ucontext must be NULL.
1300   ShouldNotReachHere();
1301   return false;
1302 #endif
1303 }
1304 
1305 void MacroAssembler::bang_stack_with_offset(int offset) {
1306   // When increasing the stack, the old stack pointer will be written
1307   // to the new top of stack according to the PPC64 abi.
1308   // Therefore, stack banging is not necessary when increasing
1309   // the stack by <= os::vm_page_size() bytes.
1310   // When increasing the stack by a larger amount, this method is
1311   // called repeatedly to bang the intermediate pages.
1312 
1313   // Stack grows down, caller passes positive offset.
1314   assert(offset > 0, "must bang with positive offset");
1315 
1316   long stdoffset = -offset;
1317 
1318   if (is_simm(stdoffset, 16)) {
1319     // Signed 16 bit offset, a simple std is ok.
1320     if (UseLoadInstructionsForStackBangingPPC64) {
1321       ld(R0, (int)(signed short)stdoffset, R1_SP);
1322     } else {
1323       std(R0,(int)(signed short)stdoffset, R1_SP);
1324     }
1325   } else if (is_simm(stdoffset, 31)) {
1326     const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset);
1327     const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset);
1328 
1329     Register tmp = R11;
1330     addis(tmp, R1_SP, hi);
1331     if (UseLoadInstructionsForStackBangingPPC64) {
1332       ld(R0,  lo, tmp);
1333     } else {
1334       std(R0, lo, tmp);
1335     }
1336   } else {
1337     ShouldNotReachHere();
1338   }
1339 }
1340 
1341 // If instruction is a stack bang of the form
1342 //    std    R0,    x(Ry),       (see bang_stack_with_offset())
1343 //    stdu   R1_SP, x(R1_SP),    (see push_frame(), resize_frame())
1344 // or stdux  R1_SP, Rx, R1_SP    (see push_frame(), resize_frame())
1345 // return the banged address. Otherwise, return 0.
1346 address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) {
1347 #ifdef LINUX
1348   ucontext_t* uc = (ucontext_t*) ucontext;
1349   int rs = inv_rs_field(instruction);
1350   int ra = inv_ra_field(instruction);
1351   if (   (is_ld(instruction)   && rs == 0 &&  UseLoadInstructionsForStackBangingPPC64)
1352       || (is_std(instruction)  && rs == 0 && !UseLoadInstructionsForStackBangingPPC64)
1353       || (is_stdu(instruction) && rs == 1)) {
1354     int ds = inv_ds_field(instruction);
1355     // return banged address
1356     return ds+(address)uc->uc_mcontext.regs->gpr[ra];
1357   } else if (is_stdux(instruction) && rs == 1) {
1358     int rb = inv_rb_field(instruction);
1359     address sp = (address)uc->uc_mcontext.regs->gpr[1];
1360     long rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
1361     return ra != 1 || rb_val >= 0 ? NULL         // not a stack bang
1362                                   : sp + rb_val; // banged address
1363   }
1364   return NULL; // not a stack bang
1365 #else
1366   // workaround not needed on !LINUX :-)
1367   ShouldNotCallThis();
1368   return NULL;
1369 #endif
1370 }
1371 
1372 void MacroAssembler::reserved_stack_check(Register return_pc) {
1373   // Test if reserved zone needs to be enabled.
1374   Label no_reserved_zone_enabling;
1375 
1376   ld_ptr(R0, JavaThread::reserved_stack_activation_offset(), R16_thread);
1377   cmpld(CCR0, R1_SP, R0);
1378   blt_predict_taken(CCR0, no_reserved_zone_enabling);
1379 
1380   // Enable reserved zone again, throw stack overflow exception.
1381   push_frame_reg_args(0, R0);
1382   call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), R16_thread);
1383   pop_frame();
1384   mtlr(return_pc);
1385   load_const_optimized(R0, StubRoutines::throw_delayed_StackOverflowError_entry());
1386   mtctr(R0);
1387   bctr();
1388 
1389   should_not_reach_here();
1390 
1391   bind(no_reserved_zone_enabling);
1392 }
1393 
1394 void MacroAssembler::getandsetd(Register dest_current_value, Register exchange_value, Register addr_base,
1395                                 bool cmpxchgx_hint) {
1396   Label retry;
1397   bind(retry);
1398   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1399   stdcx_(exchange_value, addr_base);
1400   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1401     bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1402   } else {
1403     bne(                  CCR0, retry); // StXcx_ sets CCR0.
1404   }
1405 }
1406 
1407 void MacroAssembler::getandaddd(Register dest_current_value, Register inc_value, Register addr_base,
1408                                 Register tmp, bool cmpxchgx_hint) {
1409   Label retry;
1410   bind(retry);
1411   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1412   add(tmp, dest_current_value, inc_value);
1413   stdcx_(tmp, addr_base);
1414   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1415     bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1416   } else {
1417     bne(                  CCR0, retry); // StXcx_ sets CCR0.
1418   }
1419 }
1420 
1421 // Word/sub-word atomic helper functions
1422 
1423 // Temps and addr_base are killed if size < 4 and processor does not support respective instructions.
1424 // Only signed types are supported with size < 4.
1425 // Atomic add always kills tmp1.
1426 void MacroAssembler::atomic_get_and_modify_generic(Register dest_current_value, Register exchange_value,
1427                                                    Register addr_base, Register tmp1, Register tmp2, Register tmp3,
1428                                                    bool cmpxchgx_hint, bool is_add, int size) {
1429   // Sub-word instructions are available since Power 8.
1430   // For older processors, instruction_type != size holds, and we
1431   // emulate the sub-word instructions by constructing a 4-byte value
1432   // that leaves the other bytes unchanged.
1433   const int instruction_type = VM_Version::has_lqarx() ? size : 4;
1434 
1435   Label retry;
1436   Register shift_amount = noreg,
1437            val32 = dest_current_value,
1438            modval = is_add ? tmp1 : exchange_value;
1439 
1440   if (instruction_type != size) {
1441     assert_different_registers(tmp1, tmp2, tmp3, dest_current_value, exchange_value, addr_base);
1442     modval = tmp1;
1443     shift_amount = tmp2;
1444     val32 = tmp3;
1445     // Need some preperation: Compute shift amount, align address. Note: shorts must be 2 byte aligned.
1446 #ifdef VM_LITTLE_ENDIAN
1447     rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
1448     clrrdi(addr_base, addr_base, 2);
1449 #else
1450     xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
1451     clrrdi(addr_base, addr_base, 2);
1452     rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
1453 #endif
1454   }
1455 
1456   // atomic emulation loop
1457   bind(retry);
1458 
1459   switch (instruction_type) {
1460     case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
1461     case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
1462     case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
1463     default: ShouldNotReachHere();
1464   }
1465 
1466   if (instruction_type != size) {
1467     srw(dest_current_value, val32, shift_amount);
1468   }
1469 
1470   if (is_add) { add(modval, dest_current_value, exchange_value); }
1471 
1472   if (instruction_type != size) {
1473     // Transform exchange value such that the replacement can be done by one xor instruction.
1474     xorr(modval, dest_current_value, is_add ? modval : exchange_value);
1475     clrldi(modval, modval, (size == 1) ? 56 : 48);
1476     slw(modval, modval, shift_amount);
1477     xorr(modval, val32, modval);
1478   }
1479 
1480   switch (instruction_type) {
1481     case 4: stwcx_(modval, addr_base); break;
1482     case 2: sthcx_(modval, addr_base); break;
1483     case 1: stbcx_(modval, addr_base); break;
1484     default: ShouldNotReachHere();
1485   }
1486 
1487   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1488     bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1489   } else {
1490     bne(                  CCR0, retry); // StXcx_ sets CCR0.
1491   }
1492 
1493   // l?arx zero-extends, but Java wants byte/short values sign-extended.
1494   if (size == 1) {
1495     extsb(dest_current_value, dest_current_value);
1496   } else if (size == 2) {
1497     extsh(dest_current_value, dest_current_value);
1498   };
1499 }
1500 
1501 // Temps, addr_base and exchange_value are killed if size < 4 and processor does not support respective instructions.
1502 // Only signed types are supported with size < 4.
1503 void MacroAssembler::cmpxchg_loop_body(ConditionRegister flag, Register dest_current_value,
1504                                        Register compare_value, Register exchange_value,
1505                                        Register addr_base, Register tmp1, Register tmp2,
1506                                        Label &retry, Label &failed, bool cmpxchgx_hint, int size) {
1507   // Sub-word instructions are available since Power 8.
1508   // For older processors, instruction_type != size holds, and we
1509   // emulate the sub-word instructions by constructing a 4-byte value
1510   // that leaves the other bytes unchanged.
1511   const int instruction_type = VM_Version::has_lqarx() ? size : 4;
1512 
1513   Register shift_amount = noreg,
1514            val32 = dest_current_value,
1515            modval = exchange_value;
1516 
1517   if (instruction_type != size) {
1518     assert_different_registers(tmp1, tmp2, dest_current_value, compare_value, exchange_value, addr_base);
1519     shift_amount = tmp1;
1520     val32 = tmp2;
1521     modval = tmp2;
1522     // Need some preperation: Compute shift amount, align address. Note: shorts must be 2 byte aligned.
1523 #ifdef VM_LITTLE_ENDIAN
1524     rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
1525     clrrdi(addr_base, addr_base, 2);
1526 #else
1527     xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
1528     clrrdi(addr_base, addr_base, 2);
1529     rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
1530 #endif
1531     // Transform exchange value such that the replacement can be done by one xor instruction.
1532     xorr(exchange_value, compare_value, exchange_value);
1533     clrldi(exchange_value, exchange_value, (size == 1) ? 56 : 48);
1534     slw(exchange_value, exchange_value, shift_amount);
1535   }
1536 
1537   // atomic emulation loop
1538   bind(retry);
1539 
1540   switch (instruction_type) {
1541     case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
1542     case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
1543     case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
1544     default: ShouldNotReachHere();
1545   }
1546 
1547   if (instruction_type != size) {
1548     srw(dest_current_value, val32, shift_amount);
1549   }
1550   if (size == 1) {
1551     extsb(dest_current_value, dest_current_value);
1552   } else if (size == 2) {
1553     extsh(dest_current_value, dest_current_value);
1554   };
1555 
1556   cmpw(flag, dest_current_value, compare_value);
1557   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1558     bne_predict_not_taken(flag, failed);
1559   } else {
1560     bne(                  flag, failed);
1561   }
1562   // branch to done  => (flag == ne), (dest_current_value != compare_value)
1563   // fall through    => (flag == eq), (dest_current_value == compare_value)
1564 
1565   if (instruction_type != size) {
1566     xorr(modval, val32, exchange_value);
1567   }
1568 
1569   switch (instruction_type) {
1570     case 4: stwcx_(modval, addr_base); break;
1571     case 2: sthcx_(modval, addr_base); break;
1572     case 1: stbcx_(modval, addr_base); break;
1573     default: ShouldNotReachHere();
1574   }
1575 }
1576 
1577 // CmpxchgX sets condition register to cmpX(current, compare).
     //
     // Emits a compare-and-exchange sequence for a 1, 2 or 4 byte value located at addr_base
     // (any other size trips the assert below).
     //
     //   flag               - condition register field that receives the compare result
     //   dest_current_value - receives the value loaded from memory
     //   compare_value      - expected value
     //   exchange_value     - value to store if the compare succeeds
     //   addr_base          - address of the memory location
     //   tmp1, tmp2         - temporaries, passed through to cmpxchg_loop_body
     //   semantics          - bit set; MemBarRel emits release() before the loop,
     //                        MemBarFenceAfter / MemBarAcq emit fence() / isync() after it
     //   cmpxchgx_hint      - passed through to cmpxchg_loop_body
     //   int_flag_success   - if != noreg: set to 1 on success, 0 on failure
     //   contention_hint    - if true, a plain load + compare is emitted first and the
     //                        reservation is skipped when the values already differ
     //   weak               - if true, a failed store-conditional is not retried
     //                        (only allowed with flag == CCR0)
     //   size               - operand size in bytes
1578 void MacroAssembler::cmpxchg_generic(ConditionRegister flag, Register dest_current_value,
1579                                      Register compare_value, Register exchange_value,
1580                                      Register addr_base, Register tmp1, Register tmp2,
1581                                      int semantics, bool cmpxchgx_hint,
1582                                      Register int_flag_success, bool contention_hint, bool weak, int size) {
1583   Label retry;
1584   Label failed;
1585   Label done;
1586 
1587   // Save one branch if result is returned via register and
1588   // result register is different from the other ones.
1589   bool use_result_reg    = (int_flag_success != noreg);
1590   bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value &&
1591                             int_flag_success != exchange_value && int_flag_success != addr_base &&
1592                             int_flag_success != tmp1 && int_flag_success != tmp2);
1593   assert(!weak || flag == CCR0, "weak only supported with CCR0");
1594   assert(size == 1 || size == 2 || size == 4, "unsupported");
1595 
1596   if (use_result_reg && preset_result_reg) {
1597     li(int_flag_success, 0); // preset (assume cas failed)
1598   }
1599 
1600   // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1601   if (contention_hint) { // Don't try to reserve if cmp fails.
       // The loaded value is compared as a word below, so the byte case is extended with extsb
       // and the short case uses lha instead of a plain halfword load.
1602     switch (size) {
1603       case 1: lbz(dest_current_value, 0, addr_base); extsb(dest_current_value, dest_current_value); break;
1604       case 2: lha(dest_current_value, 0, addr_base); break;
1605       case 4: lwz(dest_current_value, 0, addr_base); break;
1606       default: ShouldNotReachHere();
1607     }
1608     cmpw(flag, dest_current_value, compare_value);
1609     bne(flag, failed);
1610   }
1611 
1612   // release/fence semantics
1613   if (semantics & MemBarRel) {
1614     release();
1615   }
1616 
     // The loop body binds 'retry' itself and branches to 'failed' when the compare fails.
1617   cmpxchg_loop_body(flag, dest_current_value, compare_value, exchange_value, addr_base, tmp1, tmp2,
1618                     retry, failed, cmpxchgx_hint, size);
     // Strong CAS: retry after a failed store-conditional. Weak CAS: no retry; branch to
     // 'failed' only when a result register has to be set. A weak CAS without result register
     // emits no branch here (presumably the caller then inspects CCR0 itself).
1619   if (!weak || use_result_reg) {
1620     if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1621       bne_predict_not_taken(CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
1622     } else {
1623       bne(                  CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
1624     }
1625   }
1626   // fall through    => (flag == eq), (dest_current_value == compare_value), (swapped)
1627 
1628   // Result in register (must do this at the end because int_flag_success can be the
1629   // same register as one above).
1630   if (use_result_reg) {
1631     li(int_flag_success, 1);
1632   }
1633 
     // Barrier after a successful exchange; MemBarFenceAfter takes precedence over MemBarAcq.
1634   if (semantics & MemBarFenceAfter) {
1635     fence();
1636   } else if (semantics & MemBarAcq) {
1637     isync();
1638   }
1639 
     // Without a preset result the success path has to jump over the 'li 0' of the failure path.
1640   if (use_result_reg && !preset_result_reg) {
1641     b(done);
1642   }
1643 
1644   bind(failed);
1645   if (use_result_reg && !preset_result_reg) {
1646     li(int_flag_success, 0);
1647   }
1648 
1649   bind(done);
1650   // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1651   // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1652 }
1653 
1654 // Performs atomic compare exchange:
1655 //   if (compare_value == *addr_base)
1656 //     *addr_base = exchange_value
1657 //     int_flag_success = 1;
1658 //   else
1659 //     int_flag_success = 0;
1660 //
1661 // ConditionRegister flag       = cmp(compare_value, *addr_base)
1662 // Register dest_current_value  = *addr_base
1663 // Register compare_value       Used to compare with value in memory
1664 // Register exchange_value      Written to memory if compare_value == *addr_base
1665 // Register addr_base           The memory location to compareXChange
1666 // Register int_flag_success    Set to 1 if exchange_value was written to *addr_base
     // int semantics                Bit set: MemBarRel emits release() before the loop,
     //                              MemBarFenceAfter / MemBarAcq emit fence() / isync() after it
     // bool cmpxchgx_hint           Passed to ldarx
     // Label* failed_ext            Optional branch target for the failure case; must not be
     //                              combined with int_flag_success (see assert)
     // bool contention_hint         Emit a plain load + compare before reserving
     // bool weak                    Do not retry after a failed store-conditional
     //                              (only allowed with flag == CCR0)
1667 //
1668 // To avoid the costly compare exchange the value is tested beforehand.
1669 // Several special cases exist to avoid that unnecessary information is generated.
1670 //
1671 void MacroAssembler::cmpxchgd(ConditionRegister flag,
1672                               Register dest_current_value, RegisterOrConstant compare_value, Register exchange_value,
1673                               Register addr_base, int semantics, bool cmpxchgx_hint,
1674                               Register int_flag_success, Label* failed_ext, bool contention_hint, bool weak) {
1675   Label retry;
1676   Label failed_int;
     // Failure branches go to the caller's label if one was supplied, else to the local one.
1677   Label& failed = (failed_ext != NULL) ? *failed_ext : failed_int;
1678   Label done;
1679 
1680   // Save one branch if result is returned via register and result register is different from the other ones.
1681   bool use_result_reg    = (int_flag_success!=noreg);
1682   bool preset_result_reg = (int_flag_success!=dest_current_value && int_flag_success!=compare_value.register_or_noreg() &&
1683                             int_flag_success!=exchange_value && int_flag_success!=addr_base);
1684   assert(!weak || flag == CCR0, "weak only supported with CCR0");
1685   assert(int_flag_success == noreg || failed_ext == NULL, "cannot have both");
1686 
1687   if (use_result_reg && preset_result_reg) {
1688     li(int_flag_success, 0); // preset (assume cas failed)
1689   }
1690 
1691   // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1692   if (contention_hint) { // Don't try to reserve if cmp fails.
1693     ld(dest_current_value, 0, addr_base);
1694     cmpd(flag, compare_value, dest_current_value);
1695     bne(flag, failed);
1696   }
1697 
1698   // release/fence semantics
1699   if (semantics & MemBarRel) {
1700     release();
1701   }
1702 
1703   // atomic emulation loop
1704   bind(retry);
1705 
1706   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1707   cmpd(flag, compare_value, dest_current_value);
1708   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1709     bne_predict_not_taken(flag, failed);
1710   } else {
1711     bne(                  flag, failed);
1712   }
1713 
1714   stdcx_(exchange_value, addr_base);
     // Strong CAS: retry after a failed store-conditional. Weak CAS: branch to 'failed' only
     // if a result register or an external failure label is in use; otherwise no branch is emitted.
1715   if (!weak || use_result_reg || failed_ext) {
1716     if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1717       bne_predict_not_taken(CCR0, weak ? failed : retry); // stXcx_ sets CCR0
1718     } else {
1719       bne(                  CCR0, weak ? failed : retry); // stXcx_ sets CCR0
1720     }
1721   }
1722 
1723   // result in register (must do this at the end because int_flag_success can be the same register as one above)
1724   if (use_result_reg) {
1725     li(int_flag_success, 1);
1726   }
1727 
     // Barrier after a successful exchange; MemBarFenceAfter takes precedence over MemBarAcq.
1728   if (semantics & MemBarFenceAfter) {
1729     fence();
1730   } else if (semantics & MemBarAcq) {
1731     isync();
1732   }
1733 
1734   if (use_result_reg && !preset_result_reg) {
1735     b(done);
1736   }
1737 
     // Only the internal label is bound here; a label passed via failed_ext is left unbound
     // (presumably the caller binds it).
1738   bind(failed_int);
1739   if (use_result_reg && !preset_result_reg) {
1740     li(int_flag_success, 0);
1741   }
1742 
1743   bind(done);
1744   // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1745   // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1746 }
1747 
1748 // Look up the method for a megamorphic invokeinterface call.
1749 // The target method is determined by <intf_klass, itable_index>.
1750 // The receiver klass is in recv_klass.
1751 // On success, the result will be in method_result, and execution falls through.
1752 // On failure, execution transfers to the given label.
     //
     //   recv_klass          - receiver klass; only read by the emitted code
     //   intf_klass          - interface klass searched for in the itable
     //   itable_index        - register or constant index of the method (used only if return_method)
     //   method_result       - receives the method if return_method is true
     //   scan_temp           - overwritten; pointer to the itableOffsetEntry currently examined
     //   temp2               - overwritten; holds the interface of the current entry and serves
     //                         as temporary for add_const_optimized
     //   L_no_such_interface - branch target when an entry with a null interface is reached
     //   return_method       - if false, only the interface scan is emitted and
     //                         method_result is not written
     // CCR0 is overwritten by the emitted compares.
1753 void MacroAssembler::lookup_interface_method(Register recv_klass,
1754                                              Register intf_klass,
1755                                              RegisterOrConstant itable_index,
1756                                              Register method_result,
1757                                              Register scan_temp,
1758                                              Register temp2,
1759                                              Label& L_no_such_interface,
1760                                              bool return_method) {
1761   assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);
1762 
1763   // Compute start of first itableOffsetEntry (which is at the end of the vtable).
1764   int vtable_base = in_bytes(Klass::vtable_start_offset());
1765   int itentry_off = itableMethodEntry::method_offset_in_bytes();
1766   int logMEsize   = exact_log2(itableMethodEntry::size() * wordSize);
1767   int scan_step   = itableOffsetEntry::size() * wordSize;
1768   int log_vte_size= exact_log2(vtableEntry::size_in_bytes());
1769 
1770   lwz(scan_temp, in_bytes(Klass::vtable_length_offset()), recv_klass);
1771   // %%% We should store the aligned, prescaled offset in the klassoop.
1772   // Then the next several instructions would fold away.
1773 
     // scan_temp = recv_klass + vtable_base + (vtable length << log_vte_size)
1774   sldi(scan_temp, scan_temp, log_vte_size);
1775   addi(scan_temp, scan_temp, vtable_base);
1776   add(scan_temp, recv_klass, scan_temp);
1777 
1778   // Adjust recv_klass by scaled itable_index, so we can free itable_index.
     // (The adjusted address is built in method_result; recv_klass itself is not modified.)
1779   if (return_method) {
1780     if (itable_index.is_register()) {
1781       Register itable_offset = itable_index.as_register();
1782       sldi(method_result, itable_offset, logMEsize);
1783       if (itentry_off) { addi(method_result, method_result, itentry_off); }
1784       add(method_result, method_result, recv_klass);
1785     } else {
1786       long itable_offset = (long)itable_index.as_constant();
1787       // static address, no relocation
1788       add_const_optimized(method_result, recv_klass, (itable_offset << logMEsize) + itentry_off, temp2);
1789     }
1790   }
1791 
1792   // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
1793   //   if (scan->interface() == intf) {
1794   //     result = (klass + scan->offset() + itable_index);
1795   //   }
1796   // }
1797   Label search, found_method;
1798 
     // The probe is emitted twice. peel == 1 emits the first probe, which branches forward to
     // found_method on a hit, followed by the null check and the pointer increment at 'search'.
     // peel == 0 emits the repeated probe, which branches back to 'search' on a miss and
     // falls through to found_method on a hit.
1799   for (int peel = 1; peel >= 0; peel--) {
1800     // %%%% Could load both offset and interface in one ldx, if they were
1801     // in the opposite order. This would save a load.
1802     ld(temp2, itableOffsetEntry::interface_offset_in_bytes(), scan_temp);
1803 
1804     // Check that this entry is non-null. A null entry means that
1805     // the receiver class doesn't implement the interface, and wasn't the
1806     // same as when the caller was compiled.
1807     cmpd(CCR0, temp2, intf_klass);
1808 
1809     if (peel) {
1810       beq(CCR0, found_method);
1811     } else {
1812       bne(CCR0, search);
1813       // (invert the test to fall through to found_method...)
1814     }
1815 
1816     if (!peel) break;
1817 
1818     bind(search);
1819 
       // A null interface entry ends the scan without a match.
1820     cmpdi(CCR0, temp2, 0);
1821     beq(CCR0, L_no_such_interface);
1822     addi(scan_temp, scan_temp, scan_step);
1823   }
1824 
1825   bind(found_method);
1826 
1827   // Got a hit.
1828   if (return_method) {
       // Load the offset stored in the matching entry and use it to index from the
       // pre-computed address in method_result.
1829     int ito_offset = itableOffsetEntry::offset_offset_in_bytes();
1830     lwz(scan_temp, ito_offset, scan_temp);
1831     ldx(method_result, scan_temp, method_result);
1832   }
1833 }
1834 
1835 // Virtual method lookup.
     // Emits code that loads the method slot of the vtable entry selected by vtable_index,
     // starting from recv_klass. recv_klass is overwritten (the scaled index is added to it);
     // if vtable_index is a register, that register is overwritten with the scaled index too.
     // NOTE(review): the result is loaded into R19_method; method_result only takes part in
     // the register-distinctness assert. Confirm that callers pass R19_method.
1836 void MacroAssembler::lookup_virtual_method(Register recv_klass,
1837                                            RegisterOrConstant vtable_index,
1838                                            Register method_result) {
       assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg());
       assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");

       // Displacement of the method slot of vtable entry 0, relative to the klass.
       const int first_method_disp = in_bytes(Klass::vtable_start_offset()) + vtableEntry::method_offset_in_bytes();

       if (!vtable_index.is_register()) {
         // Constant index: fold the scaling into the immediate.
         addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord);
       } else {
         // Register index: scale in place, then add to the klass pointer.
         const Register index_reg = vtable_index.as_register();
         sldi(index_reg, index_reg, LogBytesPerWord);
         add(recv_klass, index_reg, recv_klass);
       }

       ld(R19_method, first_method_disp, recv_klass);
1852 }
1853 
1854 /////////////////////////////////////////// subtype checking ////////////////////////////////////////////
     // Emits the fast path of a subtype check of sub_klass against super_klass.
     //
     //   sub_klass, super_klass - klasses to check; only read by the emitted code
     //   temp1_reg              - receives the super check offset when it has to be loaded
     //                            from super_klass (super_check_offset is the constant -1)
     //   temp2_reg              - receives the value loaded from sub_klass at the super check offset
     //   L_success, L_failure,
     //   L_slow_path            - branch targets for the three outcomes; NULL means "fall through"
     //   super_check_offset     - register or constant; the constant -1 requests loading the
     //                            offset from super_klass
     //
     // CCR0 is overwritten by the emitted compares.
1855 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
1856                                                    Register super_klass,
1857                                                    Register temp1_reg,
1858                                                    Register temp2_reg,
1859                                                    Label* L_success,
1860                                                    Label* L_failure,
1861                                                    Label* L_slow_path,
1862                                                    RegisterOrConstant super_check_offset) {
1863 
1864   const Register check_cache_offset = temp1_reg;
1865   const Register cached_super       = temp2_reg;
1866 
1867   assert_different_registers(sub_klass, super_klass, check_cache_offset, cached_super);
1868 
1869   int sco_offset = in_bytes(Klass::super_check_offset_offset());
1870   int sc_offset  = in_bytes(Klass::secondary_super_cache_offset());
1871 
1872   bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
     // A slow path is needed when the offset is unknown at code generation time or when it
     // addresses the secondary super cache (sc_offset). This matches the constant case below,
     // which also tests against sc_offset. need_slow_path only feeds the assert on the labels.
1873   bool need_slow_path = (must_load_sco || super_check_offset.constant_or_zero() == sc_offset);
1874 
1875   Label L_fallthrough;
1876   int label_nulls = 0;
1877   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
1878   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
1879   if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
1880   assert(label_nulls <= 1 ||
1881          (L_slow_path == &L_fallthrough && label_nulls <= 2 && !need_slow_path),
1882          "at most one NULL in the batch, usually");
1883 
1884   // If the pointers are equal, we are done (e.g., String[] elements).
1885   // This self-check enables sharing of secondary supertype arrays among
1886   // non-primary types such as array-of-interface. Otherwise, each such
1887   // type would need its own customized SSA.
1888   // We move this check to the front of the fast path because many
1889   // type checks are in fact trivially successful in this manner,
1890   // so we get a nicely predicted branch right at the start of the check.
1891   cmpd(CCR0, sub_klass, super_klass);
1892   beq(CCR0, *L_success);
1893 
1894   // Check the supertype display:
1895   if (must_load_sco) {
1896     // The super check offset is always positive...
1897     lwz(check_cache_offset, sco_offset, super_klass);
1898     super_check_offset = RegisterOrConstant(check_cache_offset);
1899     // super_check_offset is register.
1900     assert_different_registers(sub_klass, super_klass, cached_super, super_check_offset.as_register());
1901   }
1902   // The loaded value is the offset from KlassOopDesc.
1903 
1904   ld(cached_super, super_check_offset, sub_klass);
1905   cmpd(CCR0, cached_super, super_klass);
1906 
1907   // This check has worked decisively for primary supers.
1908   // Secondary supers are sought in the super_cache ('super_cache_addr').
1909   // (Secondary supers are interfaces and very deeply nested subtypes.)
1910   // This works in the same check above because of a tricky aliasing
1911   // between the super_cache and the primary super display elements.
1912   // (The 'super_check_addr' can address either, as the case requires.)
1913   // Note that the cache is updated below if it does not help us find
1914   // what we need immediately.
1915   // So if it was a primary super, we can just fail immediately.
1916   // Otherwise, it's the slow path for us (no success at this point).
1917 
     // Emits an unconditional branch unless the target is the fall-through label.
1918 #define FINAL_JUMP(label) if (&(label) != &L_fallthrough) { b(label); }
1919 
1920   if (super_check_offset.is_register()) {
       // Offset only known at run time: a hit is a success; on a miss the offset value
       // decides between slow path (it addressed the secondary super cache) and failure.
1921     beq(CCR0, *L_success);
1922     cmpwi(CCR0, super_check_offset.as_register(), sc_offset);
1923     if (L_failure == &L_fallthrough) {
1924       beq(CCR0, *L_slow_path);
1925     } else {
1926       bne(CCR0, *L_failure);
1927       FINAL_JUMP(*L_slow_path);
1928     }
1929   } else {
1930     if (super_check_offset.as_constant() == sc_offset) {
1931       // Need a slow path; fast failure is impossible.
1932       if (L_slow_path == &L_fallthrough) {
1933         beq(CCR0, *L_success);
1934       } else {
1935         bne(CCR0, *L_slow_path);
1936         FINAL_JUMP(*L_success);
1937       }
1938     } else {
1939       // No slow path; it's a fast decision.
1940       if (L_failure == &L_fallthrough) {
1941         beq(CCR0, *L_success);
1942       } else {
1943         bne(CCR0, *L_failure);
1944         FINAL_JUMP(*L_success);
1945       }
1946     }
1947   }
1948 
1949   bind(L_fallthrough);
1950 #undef FINAL_JUMP
1951 }
1952 
1953 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
1954                                                    Register super_klass,
1955                                                    Register temp1_reg,
1956                                                    Register temp2_reg,
1957                                                    Label* L_success,
1958                                                    Register result_reg) {
     // Emits a linear scan over the secondary supers array of sub_klass, looking for super_klass.
     //   Hit:  super_klass is stored into the secondary super cache of sub_klass;
     //         result_reg (if != noreg) is set to 0; then either a branch to *L_success
     //         (if non-NULL), or a blr (if neither L_success nor result_reg is given),
     //         or fall through.
     //   Miss: result_reg (if != noreg) is set to 1; execution falls through.
     // temp1_reg, temp2_reg, CCR0 and CTR (loop counter) are overwritten.
1959   const Register array_ptr = temp1_reg; // current value from cache array
1960   const Register temp      = temp2_reg;
1961 
1962   assert_different_registers(sub_klass, super_klass, array_ptr, temp);
1963 
1964   int source_offset = in_bytes(Klass::secondary_supers_offset());
1965   int target_offset = in_bytes(Klass::secondary_super_cache_offset());
1966 
1967   int length_offset = Array<Klass*>::length_offset_in_bytes();
1968   int base_offset   = Array<Klass*>::base_offset_in_bytes();
1969 
1970   Label hit, loop, failure, fallthru;
1971 
1972   ld(array_ptr, source_offset, sub_klass);
1973 
1974   // TODO: PPC port: assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated.");
1975   lwz(temp, length_offset, array_ptr);
1976   cmpwi(CCR0, temp, 0);
     // An empty array is a miss; go through 'failure' only if a result value has to be set.
1977   beq(CCR0, result_reg!=noreg ? failure : fallthru); // length 0
1978 
1979   mtctr(temp); // load ctr
1980 
1981   bind(loop);
1982   // Oops in table are NO MORE compressed.
1983   ld(temp, base_offset, array_ptr);
1984   cmpd(CCR0, temp, super_klass);
1985   beq(CCR0, hit);
1986   addi(array_ptr, array_ptr, BytesPerWord);
1987   bdnz(loop);
1988 
1989   bind(failure);
1990   if (result_reg!=noreg) li(result_reg, 1); // load non-zero result (indicates a miss)
1991   b(fallthru);
1992 
1993   bind(hit);
1994   std(super_klass, target_offset, sub_klass); // save result to cache
1995   if (result_reg != noreg) { li(result_reg, 0); } // load zero result (indicates a hit)
1996   if (L_success != NULL) { b(*L_success); }
1997   else if (result_reg == noreg) { blr(); } // return with CR0.eq if neither label nor result reg provided
1998 
1999   bind(fallthru);
2000 }
2001 
2002 // Try fast path, then go to slow one if not successful
     // Branches to L_success if sub_klass is a subtype of super_klass; otherwise execution
     // falls through past the emitted code. temp1_reg and temp2_reg are passed to both paths
     // as temporaries.
2003 void MacroAssembler::check_klass_subtype(Register sub_klass,
2004                          Register super_klass,
2005                          Register temp1_reg,
2006                          Register temp2_reg,
2007                          Label& L_success) {
2008   Label L_failure;
     // No slow-path label and no super check offset are passed here, so the declared defaults
     // apply (not visible in this part of the file; presumably the undecided case falls
     // through into the slow path emitted next).
2009   check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success, &L_failure);
2010   check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success);
2011   bind(L_failure); // Fallthru if not successful.
2012 }
2013 
2014 void MacroAssembler::check_method_handle_type(Register mtype_reg, Register mh_reg,
2015                                               Register temp_reg,
2016                                               Label& wrong_method_type) {
     // Emits a check that the type oop loaded from the method handle in mh_reg (at
     // java_lang_invoke_MethodHandle::type_offset_in_bytes) equals mtype_reg.
     // Branches to wrong_method_type on mismatch, falls through on match.
     // temp_reg receives the loaded oop; CCR0 is overwritten by the compare.
2017   assert_different_registers(mtype_reg, mh_reg, temp_reg);
2018   // Compare method type against that of the receiver.
2019   load_heap_oop(temp_reg, delayed_value(java_lang_invoke_MethodHandle::type_offset_in_bytes, temp_reg), mh_reg,
2020                 noreg, noreg, false, IS_NOT_NULL);
2021   cmpd(CCR0, temp_reg, mtype_reg);
2022   bne(CCR0, wrong_method_type);
2023 }
2024 
2025 RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot,
2026                                                    Register temp_reg,
2027                                                    int extra_slot_offset) {
       // Computes the byte offset of stack slot (arg_slot + extra_slot_offset), where one slot
       // occupies Interpreter::stackElementSize bytes.
       //   - arg_slot constant: nothing is emitted; the offset is returned as a constant.
       //   - arg_slot register: code is emitted that leaves the offset in temp_reg, which is
       //     returned (temp_reg must not be noreg in this case).
       // cf. TemplateTable::prepare_invoke(), if (load_receiver).
       const int slot_size = Interpreter::stackElementSize;
       int byte_offset = extra_slot_offset * slot_size;

       if (arg_slot.is_constant()) {
         // Everything is known at code generation time: fold into a single constant.
         byte_offset += arg_slot.as_constant() * slot_size;
         return byte_offset;
       }

       // The slot index lives in a register: scale it into temp_reg, then add the constant part.
       assert(temp_reg != noreg, "must specify");
       sldi(temp_reg, arg_slot.as_register(), exact_log2(slot_size));
       if (byte_offset != 0) {
         addi(temp_reg, temp_reg, byte_offset);
       }
       return temp_reg;
2041 }
2042 
2043 // Supports temp2_reg = R0.
     //
     // Emits the biased-locking part of monitor enter.
     //   cr_reg     - condition register field used for most compares (the epoch test uses CCR0)
     //   obj_reg    - object to lock
     //   mark_reg   - expected to hold the object's mark word on entry (it is tested against
     //                the bias pattern right away); it is modified on some paths and reloaded
     //                from the object before falling through to cas_label
     //   temp_reg,
     //   temp2_reg  - temporaries, overwritten
     //   done       - branch target once the lock is held through biasing
     //   slow_case  - branch target if a bias-acquiring CAS fails; if NULL, 'done' is used
     // Execution falls through (at cas_label) when the mark word does not carry the bias
     // pattern or after the bias was revoked, so that CAS-based locking can follow.
2044 void MacroAssembler::biased_locking_enter(ConditionRegister cr_reg, Register obj_reg,
2045                                           Register mark_reg, Register temp_reg,
2046                                           Register temp2_reg, Label& done, Label* slow_case) {
2047   assert(UseBiasedLocking, "why call this otherwise?");
2048 
2049 #ifdef ASSERT
2050   assert_different_registers(obj_reg, mark_reg, temp_reg, temp2_reg);
2051 #endif
2052 
2053   Label cas_label;
2054 
2055   // Branch to done if fast path fails and no slow_case provided.
2056   Label *slow_case_int = (slow_case != NULL) ? slow_case : &done;
2057 
2058   // Biased locking
2059   // See whether the lock is currently biased toward our thread and
2060   // whether the epoch is still valid
2061   // Note that the runtime guarantees sufficient alignment of JavaThread
2062   // pointers to allow age to be placed into low bits
2063   assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits,
2064          "biased locking makes assumptions about bit layout");
2065 
2066   if (PrintBiasedLockingStatistics) {
2067     load_const(temp2_reg, (address) BiasedLocking::total_entry_count_addr(), temp_reg);
2068     lwzx(temp_reg, temp2_reg);
2069     addi(temp_reg, temp_reg, 1);
2070     stwx(temp_reg, temp2_reg);
2071   }
2072 
     // Not the bias pattern at all => skip everything and fall through to cas_label.
2073   andi(temp_reg, mark_reg, markOopDesc::biased_lock_mask_in_place);
2074   cmpwi(cr_reg, temp_reg, markOopDesc::biased_lock_pattern);
2075   bne(cr_reg, cas_label);
2076 
2077   load_klass(temp_reg, obj_reg);
2078 
     // temp_reg = ((prototype header | thread) ^ mark) with the age bits masked out;
     // zero means the object is already biased toward this thread in the current epoch.
2079   load_const_optimized(temp2_reg, ~((int) markOopDesc::age_mask_in_place));
2080   ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
2081   orr(temp_reg, R16_thread, temp_reg);
2082   xorr(temp_reg, mark_reg, temp_reg);
2083   andr(temp_reg, temp_reg, temp2_reg);
2084   cmpdi(cr_reg, temp_reg, 0);
2085   if (PrintBiasedLockingStatistics) {
2086     Label l;
2087     bne(cr_reg, l);
       // mark_reg serves as scratch for the counter update and is reloaded afterwards.
2088     load_const(temp2_reg, (address) BiasedLocking::biased_lock_entry_count_addr());
2089     lwzx(mark_reg, temp2_reg);
2090     addi(mark_reg, mark_reg, 1);
2091     stwx(mark_reg, temp2_reg);
2092     // restore mark_reg
2093     ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg);
2094     bind(l);
2095   }
2096   beq(cr_reg, done);
2097 
2098   Label try_revoke_bias;
2099   Label try_rebias;
2100 
2101   // At this point we know that the header has the bias pattern and
2102   // that we are not the bias owner in the current epoch. We need to
2103   // figure out more details about the state of the header in order to
2104   // know what operations can be legally performed on the object's
2105   // header.
2106 
2107   // If the low three bits in the xor result aren't clear, that means
2108   // the prototype header is no longer biased and we have to revoke
2109   // the bias on this object.
2110   andi(temp2_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
2111   cmpwi(cr_reg, temp2_reg, 0);
2112   bne(cr_reg, try_revoke_bias);
2113 
2114   // Biasing is still enabled for this data type. See whether the
2115   // epoch of the current bias is still valid, meaning that the epoch
2116   // bits of the mark word are equal to the epoch bits of the
2117   // prototype header. (Note that the prototype header's epoch bits
2118   // only change at a safepoint.) If not, attempt to rebias the object
2119   // toward the current thread. Note that we must be absolutely sure
2120   // that the current epoch is invalid in order to do this because
2121   // otherwise the manipulations it performs on the mark word are
2122   // illegal.
2123 
2124   int shift_amount = 64 - markOopDesc::epoch_shift;
2125   // rotate epoch bits to right (little) end and set other bits to 0
2126   // [ big part | epoch | little part ] -> [ 0..0 | epoch ]
2127   rldicl_(temp2_reg, temp_reg, shift_amount, 64 - markOopDesc::epoch_bits);
2128   // branch if epoch bits are != 0, i.e. they differ, because the epoch has been incremented
     // (this test reads CCR0, not cr_reg)
2129   bne(CCR0, try_rebias);
2130 
2131   // The epoch of the current bias is still valid but we know nothing
2132   // about the owner; it might be set or it might be clear. Try to
2133   // acquire the bias of the object using an atomic operation. If this
2134   // fails we will go in to the runtime to revoke the object's bias.
2135   // Note that we first construct the presumed unbiased header so we
2136   // don't accidentally blow away another thread's valid bias.
2137   andi(mark_reg, mark_reg, (markOopDesc::biased_lock_mask_in_place |
2138                                 markOopDesc::age_mask_in_place |
2139                                 markOopDesc::epoch_mask_in_place));
2140   orr(temp_reg, R16_thread, mark_reg);
2141 
2142   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2143 
2144   // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg).
2145   cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
2146            /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
2147            /*where=*/obj_reg,
2148            MacroAssembler::MemBarAcq,
2149            MacroAssembler::cmpxchgx_hint_acquire_lock(),
2150            noreg, slow_case_int); // bail out if failed
2151 
2152   // If the biasing toward our thread failed, this means that
2153   // another thread succeeded in biasing it toward itself and we
2154   // need to revoke that bias. The revocation will occur in the
2155   // interpreter runtime in the slow case.
2156   if (PrintBiasedLockingStatistics) {
2157     load_const(temp2_reg, (address) BiasedLocking::anonymously_biased_lock_entry_count_addr(), temp_reg);
2158     lwzx(temp_reg, temp2_reg);
2159     addi(temp_reg, temp_reg, 1);
2160     stwx(temp_reg, temp2_reg);
2161   }
2162   b(done);
2163 
2164   bind(try_rebias);
2165   // At this point we know the epoch has expired, meaning that the
2166   // current "bias owner", if any, is actually invalid. Under these
2167   // circumstances _only_, we are allowed to use the current header's
2168   // value as the comparison value when doing the cas to acquire the
2169   // bias in the current epoch. In other words, we allow transfer of
2170   // the bias from one thread to another directly in this situation.
     // New header = prototype header | thread | age bits of the current mark word.
2171   load_klass(temp_reg, obj_reg);
2172   andi(temp2_reg, mark_reg, markOopDesc::age_mask_in_place);
2173   orr(temp2_reg, R16_thread, temp2_reg);
2174   ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
2175   orr(temp_reg, temp2_reg, temp_reg);
2176 
2177   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2178 
2179   cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
2180                  /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
2181                  /*where=*/obj_reg,
2182                  MacroAssembler::MemBarAcq,
2183                  MacroAssembler::cmpxchgx_hint_acquire_lock(),
2184                  noreg, slow_case_int); // bail out if failed
2185 
2186   // If the biasing toward our thread failed, this means that
2187   // another thread succeeded in biasing it toward itself and we
2188   // need to revoke that bias. The revocation will occur in the
2189   // interpreter runtime in the slow case.
2190   if (PrintBiasedLockingStatistics) {
2191     load_const(temp2_reg, (address) BiasedLocking::rebiased_lock_entry_count_addr(), temp_reg);
2192     lwzx(temp_reg, temp2_reg);
2193     addi(temp_reg, temp_reg, 1);
2194     stwx(temp_reg, temp2_reg);
2195   }
2196   b(done);
2197 
2198   bind(try_revoke_bias);
2199   // The prototype mark in the klass doesn't have the bias bit set any
2200   // more, indicating that objects of this data type are not supposed
2201   // to be biased any more. We are going to try to reset the mark of
2202   // this object to the prototype value and fall through to the
2203   // CAS-based locking scheme. Note that if our CAS fails, it means
2204   // that another thread raced us for the privilege of revoking the
2205   // bias of this particular object, so it's okay to continue in the
2206   // normal locking code.
     // New header = prototype header | age bits of the current mark word.
2207   load_klass(temp_reg, obj_reg);
2208   ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
2209   andi(temp2_reg, mark_reg, markOopDesc::age_mask_in_place);
2210   orr(temp_reg, temp_reg, temp2_reg);
2211 
2212   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2213 
2214   // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg).
     // No failure label is passed here: the result of this CAS does not change control flow.
2215   cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
2216                  /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
2217                  /*where=*/obj_reg,
2218                  MacroAssembler::MemBarAcq,
2219                  MacroAssembler::cmpxchgx_hint_acquire_lock());
2220 
2221   // reload markOop in mark_reg before continuing with lightweight locking
2222   ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg);
2223 
2224   // Fall through to the normal CAS-based lock, because no matter what
2225   // the result of the above CAS, some thread must have succeeded in
2226   // removing the bias bit from the object's header.
2227   if (PrintBiasedLockingStatistics) {
2228     Label l;
       // Count the revocation only if the CAS above succeeded (cr_reg == eq).
2229     bne(cr_reg, l);
2230     load_const(temp2_reg, (address) BiasedLocking::revoked_lock_entry_count_addr(), temp_reg);
2231     lwzx(temp_reg, temp2_reg);
2232     addi(temp_reg, temp_reg, 1);
2233     stwx(temp_reg, temp2_reg);
2234     bind(l);
2235   }
2236 
2237   bind(cas_label);
2238 }
2239 
2240 void MacroAssembler::biased_locking_exit (ConditionRegister cr_reg, Register mark_addr, Register temp_reg, Label& done) {
     // Emits the biased-locking part of monitor exit: loads the doubleword at mark_addr into
     // temp_reg, masks it with biased_lock_mask_in_place and branches to 'done' if the result
     // equals biased_lock_pattern. Otherwise execution falls through.
     // temp_reg and cr_reg are overwritten.
2241   // Check for biased locking unlock case, which is a no-op
2242   // Note: we do not have to check the thread ID for two reasons.
2243   // First, the interpreter checks for IllegalMonitorStateException at
2244   // a higher level. Second, if the bias was revoked while we held the
2245   // lock, the object could not be rebiased toward another thread, so
2246   // the bias bit would be clear.
2247 
2248   ld(temp_reg, 0, mark_addr);
2249   andi(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
2250 
2251   cmpwi(cr_reg, temp_reg, markOopDesc::biased_lock_pattern);
2252   beq(cr_reg, done);
2253 }
2254 
2255 // allocation (for C1)
     // No inline eden allocation is emitted here: the generated code is a single
     // unconditional branch to slow_case. All other parameters are unused.
2256 void MacroAssembler::eden_allocate(
2257   Register obj,                      // result: pointer to object after successful allocation
2258   Register var_size_in_bytes,        // object size in bytes if unknown at compile time; invalid otherwise
2259   int      con_size_in_bytes,        // object size in bytes if   known at compile time
2260   Register t1,                       // temp register
2261   Register t2,                       // temp register
2262   Label&   slow_case                 // continuation point if fast allocation fails
2263 ) {
2264   b(slow_case);
2265 }
2266 
2267 void MacroAssembler::tlab_allocate(
2268   Register obj,                      // result: pointer to object after successful allocation
2269   Register var_size_in_bytes,        // object size in bytes if unknown at compile time; invalid otherwise
2270   int      con_size_in_bytes,        // object size in bytes if   known at compile time
2271   Register t1,                       // temp register
2272   Label&   slow_case                 // continuation point if fast allocation fails
2273 ) {
2274   // make sure arguments make sense
2275   assert_different_registers(obj, var_size_in_bytes, t1);
2276   assert(0 <= con_size_in_bytes && is_simm16(con_size_in_bytes), "illegal object size");
2277   assert((con_size_in_bytes & MinObjAlignmentInBytesMask) == 0, "object size is not multiple of alignment");
2278 
2279   const Register new_top = t1;
2280   //verify_tlab(); not implemented
2281 
2282   ld(obj, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
2283   ld(R0, in_bytes(JavaThread::tlab_end_offset()), R16_thread);
2284   if (var_size_in_bytes == noreg) {
2285     addi(new_top, obj, con_size_in_bytes);
2286   } else {
2287     add(new_top, obj, var_size_in_bytes);
2288   }
2289   cmpld(CCR0, new_top, R0);
2290   bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_case);
2291 
2292 #ifdef ASSERT
2293   // make sure new free pointer is properly aligned
2294   {
2295     Label L;
2296     andi_(R0, new_top, MinObjAlignmentInBytesMask);
2297     beq(CCR0, L);
2298     stop("updated TLAB free is not properly aligned", 0x934);
2299     bind(L);
2300   }
2301 #endif // ASSERT
2302 
2303   // update the tlab top pointer
2304   std(new_top, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
2305   //verify_tlab(); not implemented
2306 }
2307 void MacroAssembler::incr_allocated_bytes(RegisterOrConstant size_in_bytes, Register t1, Register t2) {
2308   unimplemented("incr_allocated_bytes");
2309 }
2310 
2311 address MacroAssembler::emit_trampoline_stub(int destination_toc_offset,
2312                                              int insts_call_instruction_offset, Register Rtoc) {
2313   // Start the stub.
2314   address stub = start_a_stub(64);
2315   if (stub == NULL) { return NULL; } // CodeCache full: bail out
2316 
2317   // Create a trampoline stub relocation which relates this trampoline stub
2318   // with the call instruction at insts_call_instruction_offset in the
2319   // instructions code-section.
2320   relocate(trampoline_stub_Relocation::spec(code()->insts()->start() + insts_call_instruction_offset));
2321   const int stub_start_offset = offset();
2322 
2323   // For java_to_interp stubs we use R11_scratch1 as scratch register
2324   // and in call trampoline stubs we use R12_scratch2. This way we
2325   // can distinguish them (see is_NativeCallTrampolineStub_at()).
2326   Register reg_scratch = R12_scratch2;
2327 
2328   // Now, create the trampoline stub's code:
2329   // - load the TOC
2330   // - load the call target from the constant pool
2331   // - call
2332   if (Rtoc == noreg) {
2333     calculate_address_from_global_toc(reg_scratch, method_toc());
2334     Rtoc = reg_scratch;
2335   }
2336 
2337   ld_largeoffset_unchecked(reg_scratch, destination_toc_offset, Rtoc, false);
2338   mtctr(reg_scratch);
2339   bctr();
2340 
2341   const address stub_start_addr = addr_at(stub_start_offset);
2342 
2343   // Assert that the encoded destination_toc_offset can be identified and that it is correct.
2344   assert(destination_toc_offset == NativeCallTrampolineStub_at(stub_start_addr)->destination_toc_offset(),
2345          "encoded offset into the constant pool must match");
2346   // Trampoline_stub_size should be good.
2347   assert((uint)(offset() - stub_start_offset) <= trampoline_stub_size, "should be good size");
2348   assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");
2349 
2350   // End the stub.
2351   end_a_stub();
2352   return stub;
2353 }
2354 
2355 // TM on PPC64.
2356 void MacroAssembler::atomic_inc_ptr(Register addr, Register result, int simm16) {
2357   Label retry;
2358   bind(retry);
2359   ldarx(result, addr, /*hint*/ false);
2360   addi(result, result, simm16);
2361   stdcx_(result, addr);
2362   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
2363     bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
2364   } else {
2365     bne(                  CCR0, retry); // stXcx_ sets CCR0
2366   }
2367 }
2368 
2369 void MacroAssembler::atomic_ori_int(Register addr, Register result, int uimm16) {
2370   Label retry;
2371   bind(retry);
2372   lwarx(result, addr, /*hint*/ false);
2373   ori(result, result, uimm16);
2374   stwcx_(result, addr);
2375   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
2376     bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
2377   } else {
2378     bne(                  CCR0, retry); // stXcx_ sets CCR0
2379   }
2380 }
2381 
2382 #if INCLUDE_RTM_OPT
2383 
2384 // Update rtm_counters based on abort status
2385 // input: abort_status
2386 //        rtm_counters_Reg (RTMLockingCounters*)
2387 void MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters_Reg) {
2388   // Mapping to keep PreciseRTMLockingStatistics similar to x86.
2389   // x86 ppc (! means inverted, ? means not the same)
2390   //  0   31  Set if abort caused by XABORT instruction.
2391   //  1  ! 7  If set, the transaction may succeed on a retry. This bit is always clear if bit 0 is set.
2392   //  2   13  Set if another logical processor conflicted with a memory address that was part of the transaction that aborted.
2393   //  3   10  Set if an internal buffer overflowed.
2394   //  4  ?12  Set if a debug breakpoint was hit.
2395   //  5  ?32  Set if an abort occurred during execution of a nested transaction.
2396   const int failure_bit[] = {tm_tabort, // Signal handler will set this too.
2397                              tm_failure_persistent,
2398                              tm_non_trans_cf,
2399                              tm_trans_cf,
2400                              tm_footprint_of,
2401                              tm_failure_code,
2402                              tm_transaction_level};
2403 
2404   const int num_failure_bits = sizeof(failure_bit) / sizeof(int);
2405   const int num_counters = RTMLockingCounters::ABORT_STATUS_LIMIT;
2406 
2407   const int bit2counter_map[][num_counters] =
2408   // 0 = no map; 1 = mapped, no inverted logic; -1 = mapped, inverted logic
2409   // Inverted logic means that if a bit is set don't count it, or vice-versa.
2410   // Care must be taken when mapping bits to counters as bits for a given
2411   // counter must be mutually exclusive. Otherwise, the counter will be
2412   // incremented more than once.
2413   // counters:
2414   // 0        1        2         3         4         5
2415   // abort  , persist, conflict, overflow, debug   , nested         bits:
2416   {{ 1      , 0      , 0       , 0       , 0       , 0      },   // abort
2417    { 0      , -1     , 0       , 0       , 0       , 0      },   // failure_persistent
2418    { 0      , 0      , 1       , 0       , 0       , 0      },   // non_trans_cf
2419    { 0      , 0      , 1       , 0       , 0       , 0      },   // trans_cf
2420    { 0      , 0      , 0       , 1       , 0       , 0      },   // footprint_of
2421    { 0      , 0      , 0       , 0       , -1      , 0      },   // failure_code = 0xD4
2422    { 0      , 0      , 0       , 0       , 0       , 1      }};  // transaction_level > 1
2423   // ...
2424 
2425   // Move abort_status value to R0 and use abort_status register as a
2426   // temporary register because R0 as third operand in ld/std is treated
2427   // as base address zero (value). Likewise, R0 as second operand in addi
2428   // is problematic because it amounts to li.
2429   const Register temp_Reg = abort_status;
2430   const Register abort_status_R0 = R0;
2431   mr(abort_status_R0, abort_status);
2432 
2433   // Increment total abort counter.
2434   int counters_offs = RTMLockingCounters::abort_count_offset();
2435   ld(temp_Reg, counters_offs, rtm_counters_Reg);
2436   addi(temp_Reg, temp_Reg, 1);
2437   std(temp_Reg, counters_offs, rtm_counters_Reg);
2438 
2439   // Increment specific abort counters.
2440   if (PrintPreciseRTMLockingStatistics) {
2441 
2442     // #0 counter offset.
2443     int abortX_offs = RTMLockingCounters::abortX_count_offset();
2444 
2445     for (int nbit = 0; nbit < num_failure_bits; nbit++) {
2446       for (int ncounter = 0; ncounter < num_counters; ncounter++) {
2447         if (bit2counter_map[nbit][ncounter] != 0) {
2448           Label check_abort;
2449           int abort_counter_offs = abortX_offs + (ncounter << 3);
2450 
2451           if (failure_bit[nbit] == tm_transaction_level) {
2452             // Don't check outer transaction, TL = 1 (bit 63). Hence only
2453             // 11 bits in the TL field are checked to find out if failure
2454             // occured in a nested transaction. This check also matches
2455             // the case when nesting_of = 1 (nesting overflow).
2456             rldicr_(temp_Reg, abort_status_R0, failure_bit[nbit], 10);
2457           } else if (failure_bit[nbit] == tm_failure_code) {
2458             // Check failure code for trap or illegal caught in TM.
2459             // Bits 0:7 are tested as bit 7 (persistent) is copied from
2460             // tabort or treclaim source operand.
2461             // On Linux: trap or illegal is TM_CAUSE_SIGNAL (0xD4).
2462             rldicl(temp_Reg, abort_status_R0, 8, 56);
2463             cmpdi(CCR0, temp_Reg, 0xD4);
2464           } else {
2465             rldicr_(temp_Reg, abort_status_R0, failure_bit[nbit], 0);
2466           }
2467 
2468           if (bit2counter_map[nbit][ncounter] == 1) {
2469             beq(CCR0, check_abort);
2470           } else {
2471             bne(CCR0, check_abort);
2472           }
2473 
2474           // We don't increment atomically.
2475           ld(temp_Reg, abort_counter_offs, rtm_counters_Reg);
2476           addi(temp_Reg, temp_Reg, 1);
2477           std(temp_Reg, abort_counter_offs, rtm_counters_Reg);
2478 
2479           bind(check_abort);
2480         }
2481       }
2482     }
2483   }
2484   // Restore abort_status.
2485   mr(abort_status, abort_status_R0);
2486 }
2487 
2488 // Branch if (random & (count-1) != 0), count is 2^n
2489 // tmp and CR0 are killed
2490 void MacroAssembler::branch_on_random_using_tb(Register tmp, int count, Label& brLabel) {
2491   mftb(tmp);
2492   andi_(tmp, tmp, count-1);
2493   bne(CCR0, brLabel);
2494 }
2495 
2496 // Perform abort ratio calculation, set no_rtm bit if high ratio.
2497 // input:  rtm_counters_Reg (RTMLockingCounters* address) - KILLED
2498 void MacroAssembler::rtm_abort_ratio_calculation(Register rtm_counters_Reg,
2499                                                  RTMLockingCounters* rtm_counters,
2500                                                  Metadata* method_data) {
2501   Label L_done, L_check_always_rtm1, L_check_always_rtm2;
2502 
2503   if (RTMLockingCalculationDelay > 0) {
2504     // Delay calculation.
2505     ld(rtm_counters_Reg, (RegisterOrConstant)(intptr_t)RTMLockingCounters::rtm_calculation_flag_addr());
2506     cmpdi(CCR0, rtm_counters_Reg, 0);
2507     beq(CCR0, L_done);
2508     load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload
2509   }
2510   // Abort ratio calculation only if abort_count > RTMAbortThreshold.
2511   //   Aborted transactions = abort_count * 100
2512   //   All transactions = total_count *  RTMTotalCountIncrRate
2513   //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
2514   ld(R0, RTMLockingCounters::abort_count_offset(), rtm_counters_Reg);
2515   if (is_simm(RTMAbortThreshold, 16)) {   // cmpdi can handle 16bit immediate only.
2516     cmpdi(CCR0, R0, RTMAbortThreshold);
2517     blt(CCR0, L_check_always_rtm2);  // reload of rtm_counters_Reg not necessary
2518   } else {
2519     load_const_optimized(rtm_counters_Reg, RTMAbortThreshold);
2520     cmpd(CCR0, R0, rtm_counters_Reg);
2521     blt(CCR0, L_check_always_rtm1);  // reload of rtm_counters_Reg required
2522   }
2523   mulli(R0, R0, 100);
2524 
2525   const Register tmpReg = rtm_counters_Reg;
2526   ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg);
2527   mulli(tmpReg, tmpReg, RTMTotalCountIncrRate); // allowable range: int16
2528   mulli(tmpReg, tmpReg, RTMAbortRatio);         // allowable range: int16
2529   cmpd(CCR0, R0, tmpReg);
2530   blt(CCR0, L_check_always_rtm1); // jump to reload
2531   if (method_data != NULL) {
2532     // Set rtm_state to "no rtm" in MDO.
2533     // Not using a metadata relocation. Method and Class Loader are kept alive anyway.
2534     // (See nmethod::metadata_do and CodeBuffer::finalize_oop_references.)
2535     load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg);
2536     atomic_ori_int(R0, tmpReg, NoRTM);
2537   }
2538   b(L_done);
2539 
2540   bind(L_check_always_rtm1);
2541   load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload
2542   bind(L_check_always_rtm2);
2543   ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg);
2544   int64_t thresholdValue = RTMLockingThreshold / RTMTotalCountIncrRate;
2545   if (is_simm(thresholdValue, 16)) {   // cmpdi can handle 16bit immediate only.
2546     cmpdi(CCR0, tmpReg, thresholdValue);
2547   } else {
2548     load_const_optimized(R0, thresholdValue);
2549     cmpd(CCR0, tmpReg, R0);
2550   }
2551   blt(CCR0, L_done);
2552   if (method_data != NULL) {
2553     // Set rtm_state to "always rtm" in MDO.
2554     // Not using a metadata relocation. See above.
2555     load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg);
2556     atomic_ori_int(R0, tmpReg, UseRTM);
2557   }
2558   bind(L_done);
2559 }
2560 
2561 // Update counters and perform abort ratio calculation.
2562 // input: abort_status_Reg
2563 void MacroAssembler::rtm_profiling(Register abort_status_Reg, Register temp_Reg,
2564                                    RTMLockingCounters* rtm_counters,
2565                                    Metadata* method_data,
2566                                    bool profile_rtm) {
2567 
2568   assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2569   // Update rtm counters based on state at abort.
2570   // Reads abort_status_Reg, updates flags.
2571   assert_different_registers(abort_status_Reg, temp_Reg);
2572   load_const_optimized(temp_Reg, (address)rtm_counters, R0);
2573   rtm_counters_update(abort_status_Reg, temp_Reg);
2574   if (profile_rtm) {
2575     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2576     rtm_abort_ratio_calculation(temp_Reg, rtm_counters, method_data);
2577   }
2578 }
2579 
2580 // Retry on abort if abort's status indicates non-persistent failure.
2581 // inputs: retry_count_Reg
2582 //       : abort_status_Reg
2583 // output: retry_count_Reg decremented by 1
2584 void MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg,
2585                                              Label& retryLabel, Label* checkRetry) {
2586   Label doneRetry;
2587 
2588   // Don't retry if failure is persistent.
2589   // The persistent bit is set when a (A) Disallowed operation is performed in
2590   // transactional state, like for instance trying to write the TFHAR after a
2591   // transaction is started; or when there is (B) a Nesting Overflow (too many
2592   // nested transactions); or when (C) the Footprint overflows (too many
2593   // addressess touched in TM state so there is no more space in the footprint
2594   // area to track them); or in case of (D) a Self-Induced Conflict, i.e. a
2595   // store is performed to a given address in TM state, then once in suspended
2596   // state the same address is accessed. Failure (A) is very unlikely to occur
2597   // in the JVM. Failure (D) will never occur because Suspended state is never
2598   // used in the JVM. Thus mostly (B) a Nesting Overflow or (C) a Footprint
2599   // Overflow will set the persistent bit.
2600   rldicr_(R0, abort_status_Reg, tm_failure_persistent, 0);
2601   bne(CCR0, doneRetry);
2602 
2603   // Don't retry if transaction was deliberately aborted, i.e. caused by a
2604   // tabort instruction.
2605   rldicr_(R0, abort_status_Reg, tm_tabort, 0);
2606   bne(CCR0, doneRetry);
2607 
2608   // Retry if transaction aborted due to a conflict with another thread.
2609   if (checkRetry) { bind(*checkRetry); }
2610   addic_(retry_count_Reg, retry_count_Reg, -1);
2611   blt(CCR0, doneRetry);
2612   b(retryLabel);
2613   bind(doneRetry);
2614 }
2615 
2616 // Spin and retry if lock is busy.
2617 // inputs: owner_addr_Reg (monitor address)
2618 //       : retry_count_Reg
2619 // output: retry_count_Reg decremented by 1
2620 // CTR is killed
2621 void MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register owner_addr_Reg, Label& retryLabel) {
2622   Label SpinLoop, doneRetry, doRetry;
2623   addic_(retry_count_Reg, retry_count_Reg, -1);
2624   blt(CCR0, doneRetry);
2625 
2626   if (RTMSpinLoopCount > 1) {
2627     li(R0, RTMSpinLoopCount);
2628     mtctr(R0);
2629   }
2630 
2631   // low thread priority
2632   smt_prio_low();
2633   bind(SpinLoop);
2634 
2635   if (RTMSpinLoopCount > 1) {
2636     bdz(doRetry);
2637     ld(R0, 0, owner_addr_Reg);
2638     cmpdi(CCR0, R0, 0);
2639     bne(CCR0, SpinLoop);
2640   }
2641 
2642   bind(doRetry);
2643 
2644   // restore thread priority to default in userspace
2645 #ifdef LINUX
2646   smt_prio_medium_low();
2647 #else
2648   smt_prio_medium();
2649 #endif
2650 
2651   b(retryLabel);
2652 
2653   bind(doneRetry);
2654 }
2655 
2656 // Use RTM for normal stack locks.
2657 // Input: objReg (object to lock)
2658 void MacroAssembler::rtm_stack_locking(ConditionRegister flag,
2659                                        Register obj, Register mark_word, Register tmp,
2660                                        Register retry_on_abort_count_Reg,
2661                                        RTMLockingCounters* stack_rtm_counters,
2662                                        Metadata* method_data, bool profile_rtm,
2663                                        Label& DONE_LABEL, Label& IsInflated) {
2664   assert(UseRTMForStackLocks, "why call this otherwise?");
2665   assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
2666   Label L_rtm_retry, L_decrement_retry, L_on_abort;
2667 
2668   if (RTMRetryCount > 0) {
2669     load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
2670     bind(L_rtm_retry);
2671   }
2672   andi_(R0, mark_word, markOopDesc::monitor_value);  // inflated vs stack-locked|neutral|biased
2673   bne(CCR0, IsInflated);
2674 
2675   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2676     Label L_noincrement;
2677     if (RTMTotalCountIncrRate > 1) {
2678       branch_on_random_using_tb(tmp, RTMTotalCountIncrRate, L_noincrement);
2679     }
2680     assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
2681     load_const_optimized(tmp, (address)stack_rtm_counters->total_count_addr(), R0);
2682     //atomic_inc_ptr(tmp, /*temp, will be reloaded*/mark_word); We don't increment atomically
2683     ldx(mark_word, tmp);
2684     addi(mark_word, mark_word, 1);
2685     stdx(mark_word, tmp);
2686     bind(L_noincrement);
2687   }
2688   tbegin_();
2689   beq(CCR0, L_on_abort);
2690   ld(mark_word, oopDesc::mark_offset_in_bytes(), obj);         // Reload in transaction, conflicts need to be tracked.
2691   andi(R0, mark_word, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
2692   cmpwi(flag, R0, markOopDesc::unlocked_value);                // bits = 001 unlocked
2693   beq(flag, DONE_LABEL);                                       // all done if unlocked
2694 
2695   if (UseRTMXendForLockBusy) {
2696     tend_();
2697     b(L_decrement_retry);
2698   } else {
2699     tabort_();
2700   }
2701   bind(L_on_abort);
2702   const Register abort_status_Reg = tmp;
2703   mftexasr(abort_status_Reg);
2704   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2705     rtm_profiling(abort_status_Reg, /*temp*/mark_word, stack_rtm_counters, method_data, profile_rtm);
2706   }
2707   ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // reload
2708   if (RTMRetryCount > 0) {
2709     // Retry on lock abort if abort status is not permanent.
2710     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry, &L_decrement_retry);
2711   } else {
2712     bind(L_decrement_retry);
2713   }
2714 }
2715 
2716 // Use RTM for inflating locks
2717 // inputs: obj       (object to lock)
2718 //         mark_word (current header - KILLED)
2719 //         boxReg    (on-stack box address (displaced header location) - KILLED)
2720 void MacroAssembler::rtm_inflated_locking(ConditionRegister flag,
2721                                           Register obj, Register mark_word, Register boxReg,
2722                                           Register retry_on_busy_count_Reg, Register retry_on_abort_count_Reg,
2723                                           RTMLockingCounters* rtm_counters,
2724                                           Metadata* method_data, bool profile_rtm,
2725                                           Label& DONE_LABEL) {
2726   assert(UseRTMLocking, "why call this otherwise?");
2727   Label L_rtm_retry, L_decrement_retry, L_on_abort;
2728   // Clean monitor_value bit to get valid pointer.
2729   int owner_offset = ObjectMonitor::owner_offset_in_bytes() - markOopDesc::monitor_value;
2730 
2731   // Store non-null, using boxReg instead of (intptr_t)markOopDesc::unused_mark().
2732   std(boxReg, BasicLock::displaced_header_offset_in_bytes(), boxReg);
2733   const Register tmpReg = boxReg;
2734   const Register owner_addr_Reg = mark_word;
2735   addi(owner_addr_Reg, mark_word, owner_offset);
2736 
2737   if (RTMRetryCount > 0) {
2738     load_const_optimized(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy.
2739     load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort.
2740     bind(L_rtm_retry);
2741   }
2742   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2743     Label L_noincrement;
2744     if (RTMTotalCountIncrRate > 1) {
2745       branch_on_random_using_tb(R0, RTMTotalCountIncrRate, L_noincrement);
2746     }
2747     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2748     load_const(R0, (address)rtm_counters->total_count_addr(), tmpReg);
2749     //atomic_inc_ptr(R0, tmpReg); We don't increment atomically
2750     ldx(tmpReg, R0);
2751     addi(tmpReg, tmpReg, 1);
2752     stdx(tmpReg, R0);
2753     bind(L_noincrement);
2754   }
2755   tbegin_();
2756   beq(CCR0, L_on_abort);
2757   // We don't reload mark word. Will only be reset at safepoint.
2758   ld(R0, 0, owner_addr_Reg); // Load in transaction, conflicts need to be tracked.
2759   cmpdi(flag, R0, 0);
2760   beq(flag, DONE_LABEL);
2761 
2762   if (UseRTMXendForLockBusy) {
2763     tend_();
2764     b(L_decrement_retry);
2765   } else {
2766     tabort_();
2767   }
2768   bind(L_on_abort);
2769   const Register abort_status_Reg = tmpReg;
2770   mftexasr(abort_status_Reg);
2771   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2772     rtm_profiling(abort_status_Reg, /*temp*/ owner_addr_Reg, rtm_counters, method_data, profile_rtm);
2773     // Restore owner_addr_Reg
2774     ld(mark_word, oopDesc::mark_offset_in_bytes(), obj);
2775 #ifdef ASSERT
2776     andi_(R0, mark_word, markOopDesc::monitor_value);
2777     asm_assert_ne("must be inflated", 0xa754); // Deflating only allowed at safepoint.
2778 #endif
2779     addi(owner_addr_Reg, mark_word, owner_offset);
2780   }
2781   if (RTMRetryCount > 0) {
2782     // Retry on lock abort if abort status is not permanent.
2783     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
2784   }
2785 
2786   // Appears unlocked - try to swing _owner from null to non-null.
2787   cmpxchgd(flag, /*current val*/ R0, (intptr_t)0, /*new val*/ R16_thread, owner_addr_Reg,
2788            MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2789            MacroAssembler::cmpxchgx_hint_acquire_lock(), noreg, &L_decrement_retry, true);
2790 
2791   if (RTMRetryCount > 0) {
2792     // success done else retry
2793     b(DONE_LABEL);
2794     bind(L_decrement_retry);
2795     // Spin and retry if lock is busy.
2796     rtm_retry_lock_on_busy(retry_on_busy_count_Reg, owner_addr_Reg, L_rtm_retry);
2797   } else {
2798     bind(L_decrement_retry);
2799   }
2800 }
2801 
2802 #endif //  INCLUDE_RTM_OPT
2803 
2804 // "The box" is the space on the stack where we copy the object mark.
2805 void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register oop, Register box,
2806                                                Register temp, Register displaced_header, Register current_header,
2807                                                bool try_bias,
2808                                                RTMLockingCounters* rtm_counters,
2809                                                RTMLockingCounters* stack_rtm_counters,
2810                                                Metadata* method_data,
2811                                                bool use_rtm, bool profile_rtm) {
2812   assert_different_registers(oop, box, temp, displaced_header, current_header);
2813   assert(flag != CCR0, "bad condition register");
2814   Label cont;
2815   Label object_has_monitor;
2816   Label cas_failed;
2817 
2818   // Load markOop from object into displaced_header.
2819   ld(displaced_header, oopDesc::mark_offset_in_bytes(), oop);
2820 
2821 
2822   if (try_bias) {
2823     biased_locking_enter(flag, oop, displaced_header, temp, current_header, cont);
2824   }
2825 
2826 #if INCLUDE_RTM_OPT
2827   if (UseRTMForStackLocks && use_rtm) {
2828     rtm_stack_locking(flag, oop, displaced_header, temp, /*temp*/ current_header,
2829                       stack_rtm_counters, method_data, profile_rtm,
2830                       cont, object_has_monitor);
2831   }
2832 #endif // INCLUDE_RTM_OPT
2833 
2834   // Handle existing monitor.
2835   // The object has an existing monitor iff (mark & monitor_value) != 0.
2836   andi_(temp, displaced_header, markOopDesc::monitor_value);
2837   bne(CCR0, object_has_monitor);
2838 
2839   // Set displaced_header to be (markOop of object | UNLOCK_VALUE).
2840   ori(displaced_header, displaced_header, markOopDesc::unlocked_value);
2841 
2842   // Load Compare Value application register.
2843 
2844   // Initialize the box. (Must happen before we update the object mark!)
2845   std(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2846 
2847   // Must fence, otherwise, preceding store(s) may float below cmpxchg.
2848   // Compare object markOop with mark and if equal exchange scratch1 with object markOop.
2849   cmpxchgd(/*flag=*/flag,
2850            /*current_value=*/current_header,
2851            /*compare_value=*/displaced_header,
2852            /*exchange_value=*/box,
2853            /*where=*/oop,
2854            MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2855            MacroAssembler::cmpxchgx_hint_acquire_lock(),
2856            noreg,
2857            &cas_failed,
2858            /*check without membar and ldarx first*/true);
2859   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2860 
2861   // If the compare-and-exchange succeeded, then we found an unlocked
2862   // object and we have now locked it.
2863   b(cont);
2864 
2865   bind(cas_failed);
2866   // We did not see an unlocked object so try the fast recursive case.
2867 
2868   // Check if the owner is self by comparing the value in the markOop of object
2869   // (current_header) with the stack pointer.
2870   sub(current_header, current_header, R1_SP);
2871   load_const_optimized(temp, ~(os::vm_page_size()-1) | markOopDesc::lock_mask_in_place);
2872 
2873   and_(R0/*==0?*/, current_header, temp);
2874   // If condition is true we are cont and hence we can store 0 as the
2875   // displaced header in the box, which indicates that it is a recursive lock.
2876   mcrf(flag,CCR0);
2877   std(R0/*==0, perhaps*/, BasicLock::displaced_header_offset_in_bytes(), box);
2878 
2879   // Handle existing monitor.
2880   b(cont);
2881 
2882   bind(object_has_monitor);
2883   // The object's monitor m is unlocked iff m->owner == NULL,
2884   // otherwise m->owner may contain a thread or a stack address.
2885 
2886 #if INCLUDE_RTM_OPT
2887   // Use the same RTM locking code in 32- and 64-bit VM.
2888   if (use_rtm) {
2889     rtm_inflated_locking(flag, oop, displaced_header, box, temp, /*temp*/ current_header,
2890                          rtm_counters, method_data, profile_rtm, cont);
2891   } else {
2892 #endif // INCLUDE_RTM_OPT
2893 
2894   // Try to CAS m->owner from NULL to current thread.
2895   addi(temp, displaced_header, ObjectMonitor::owner_offset_in_bytes()-markOopDesc::monitor_value);
2896   cmpxchgd(/*flag=*/flag,
2897            /*current_value=*/current_header,
2898            /*compare_value=*/(intptr_t)0,
2899            /*exchange_value=*/R16_thread,
2900            /*where=*/temp,
2901            MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2902            MacroAssembler::cmpxchgx_hint_acquire_lock());
2903 
2904   // Store a non-null value into the box.
2905   std(box, BasicLock::displaced_header_offset_in_bytes(), box);
2906 
2907 # ifdef ASSERT
2908   bne(flag, cont);
2909   // We have acquired the monitor, check some invariants.
2910   addi(/*monitor=*/temp, temp, -ObjectMonitor::owner_offset_in_bytes());
2911   // Invariant 1: _recursions should be 0.
2912   //assert(ObjectMonitor::recursions_size_in_bytes() == 8, "unexpected size");
2913   asm_assert_mem8_is_zero(ObjectMonitor::recursions_offset_in_bytes(), temp,
2914                             "monitor->_recursions should be 0", -1);
2915 # endif
2916 
2917 #if INCLUDE_RTM_OPT
2918   } // use_rtm()
2919 #endif
2920 
2921   bind(cont);
2922   // flag == EQ indicates success
2923   // flag == NE indicates failure
2924 }
2925 
2926 void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register oop, Register box,
2927                                                  Register temp, Register displaced_header, Register current_header,
2928                                                  bool try_bias, bool use_rtm) {
2929   assert_different_registers(oop, box, temp, displaced_header, current_header);
2930   assert(flag != CCR0, "bad condition register");
2931   Label cont;
2932   Label object_has_monitor;
2933 
2934   if (try_bias) {
2935     biased_locking_exit(flag, oop, current_header, cont);
2936   }
2937 
2938 #if INCLUDE_RTM_OPT
2939   if (UseRTMForStackLocks && use_rtm) {
2940     assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
2941     Label L_regular_unlock;
2942     ld(current_header, oopDesc::mark_offset_in_bytes(), oop);         // fetch markword
2943     andi(R0, current_header, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
2944     cmpwi(flag, R0, markOopDesc::unlocked_value);                     // bits = 001 unlocked
2945     bne(flag, L_regular_unlock);                                      // else RegularLock
2946     tend_();                                                          // otherwise end...
2947     b(cont);                                                          // ... and we're done
2948     bind(L_regular_unlock);
2949   }
2950 #endif
2951 
2952   // Find the lock address and load the displaced header from the stack.
2953   ld(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2954 
2955   // If the displaced header is 0, we have a recursive unlock.
2956   cmpdi(flag, displaced_header, 0);
2957   beq(flag, cont);
2958 
2959   // Handle existing monitor.
2960   // The object has an existing monitor iff (mark & monitor_value) != 0.
2961   RTM_OPT_ONLY( if (!(UseRTMForStackLocks && use_rtm)) ) // skip load if already done
2962   ld(current_header, oopDesc::mark_offset_in_bytes(), oop);
2963   andi_(R0, current_header, markOopDesc::monitor_value);
2964   bne(CCR0, object_has_monitor);
2965 
2966   // Check if it is still a light weight lock, this is is true if we see
2967   // the stack address of the basicLock in the markOop of the object.
2968   // Cmpxchg sets flag to cmpd(current_header, box).
2969   cmpxchgd(/*flag=*/flag,
2970            /*current_value=*/current_header,
2971            /*compare_value=*/box,
2972            /*exchange_value=*/displaced_header,
2973            /*where=*/oop,
2974            MacroAssembler::MemBarRel,
2975            MacroAssembler::cmpxchgx_hint_release_lock(),
2976            noreg,
2977            &cont);
2978 
2979   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2980 
2981   // Handle existing monitor.
2982   b(cont);
2983 
2984   bind(object_has_monitor);
2985   addi(current_header, current_header, -markOopDesc::monitor_value); // monitor
2986   ld(temp,             ObjectMonitor::owner_offset_in_bytes(), current_header);
2987 
2988     // It's inflated.
2989 #if INCLUDE_RTM_OPT
2990   if (use_rtm) {
2991     Label L_regular_inflated_unlock;
2992     // Clean monitor_value bit to get valid pointer
2993     cmpdi(flag, temp, 0);
2994     bne(flag, L_regular_inflated_unlock);
2995     tend_();
2996     b(cont);
2997     bind(L_regular_inflated_unlock);
2998   }
2999 #endif
3000 
3001   ld(displaced_header, ObjectMonitor::recursions_offset_in_bytes(), current_header);
3002   xorr(temp, R16_thread, temp);      // Will be 0 if we are the owner.
3003   orr(temp, temp, displaced_header); // Will be 0 if there are 0 recursions.
3004   cmpdi(flag, temp, 0);
3005   bne(flag, cont);
3006 
3007   ld(temp,             ObjectMonitor::EntryList_offset_in_bytes(), current_header);
3008   ld(displaced_header, ObjectMonitor::cxq_offset_in_bytes(), current_header);
3009   orr(temp, temp, displaced_header); // Will be 0 if both are 0.
3010   cmpdi(flag, temp, 0);
3011   bne(flag, cont);
3012   release();
3013   std(temp, ObjectMonitor::owner_offset_in_bytes(), current_header);
3014 
3015   bind(cont);
3016   // flag == EQ indicates success
3017   // flag == NE indicates failure
3018 }
3019 
3020 void MacroAssembler::safepoint_poll(Label& slow_path, Register temp_reg) {
3021   if (SafepointMechanism::uses_thread_local_poll()) {
3022     ld(temp_reg, in_bytes(Thread::polling_page_offset()), R16_thread);
3023     // Armed page has poll_bit set.
3024     andi_(temp_reg, temp_reg, SafepointMechanism::poll_bit());
3025   } else {
3026     lwz(temp_reg, (RegisterOrConstant)(intptr_t)SafepointSynchronize::address_of_state());
3027     cmpwi(CCR0, temp_reg, SafepointSynchronize::_not_synchronized);
3028   }
3029   bne(CCR0, slow_path);
3030 }
3031 
3032 void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2, bool needs_frame) {
3033   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3034   bs->resolve_jobject(this, value, tmp1, tmp2, needs_frame);
3035 }
3036 
3037 // Values for last_Java_pc, and last_Java_sp must comply to the rules
3038 // in frame_ppc.hpp.
3039 void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) {
3040   // Always set last_Java_pc and flags first because once last_Java_sp
3041   // is visible has_last_Java_frame is true and users will look at the
3042   // rest of the fields. (Note: flags should always be zero before we
3043   // get here so doesn't need to be set.)
3044 
3045   // Verify that last_Java_pc was zeroed on return to Java
3046   asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread,
3047                           "last_Java_pc not zeroed before leaving Java", 0x200);
3048 
3049   // When returning from calling out from Java mode the frame anchor's
3050   // last_Java_pc will always be set to NULL. It is set here so that
3051   // if we are doing a call to native (not VM) that we capture the
3052   // known pc and don't have to rely on the native call having a
3053   // standard frame linkage where we can find the pc.
3054   if (last_Java_pc != noreg)
3055     std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
3056 
3057   // Set last_Java_sp last.
3058   std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
3059 }
3060 
3061 void MacroAssembler::reset_last_Java_frame(void) {
3062   asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()),
3063                              R16_thread, "SP was not set, still zero", 0x202);
3064 
3065   BLOCK_COMMENT("reset_last_Java_frame {");
3066   li(R0, 0);
3067 
3068   // _last_Java_sp = 0
3069   std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
3070 
3071   // _last_Java_pc = 0
3072   std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
3073   BLOCK_COMMENT("} reset_last_Java_frame");
3074 }
3075 
3076 void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1) {
3077   assert_different_registers(sp, tmp1);
3078 
3079   // sp points to a TOP_IJAVA_FRAME, retrieve frame's PC via
3080   // TOP_IJAVA_FRAME_ABI.
3081   // FIXME: assert that we really have a TOP_IJAVA_FRAME here!
3082   address entry = pc();
3083   load_const_optimized(tmp1, entry);
3084 
3085   set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1);
3086 }
3087 
3088 void MacroAssembler::get_vm_result(Register oop_result) {
3089   // Read:
3090   //   R16_thread
3091   //   R16_thread->in_bytes(JavaThread::vm_result_offset())
3092   //
3093   // Updated:
3094   //   oop_result
3095   //   R16_thread->in_bytes(JavaThread::vm_result_offset())
3096 
3097   verify_thread();
3098 
3099   ld(oop_result, in_bytes(JavaThread::vm_result_offset()), R16_thread);
3100   li(R0, 0);
3101   std(R0, in_bytes(JavaThread::vm_result_offset()), R16_thread);
3102 
3103   verify_oop(oop_result);
3104 }
3105 
3106 void MacroAssembler::get_vm_result_2(Register metadata_result) {
3107   // Read:
3108   //   R16_thread
3109   //   R16_thread->in_bytes(JavaThread::vm_result_2_offset())
3110   //
3111   // Updated:
3112   //   metadata_result
3113   //   R16_thread->in_bytes(JavaThread::vm_result_2_offset())
3114 
3115   ld(metadata_result, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
3116   li(R0, 0);
3117   std(R0, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
3118 }
3119 
3120 Register MacroAssembler::encode_klass_not_null(Register dst, Register src) {
3121   Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided.
3122   if (Universe::narrow_klass_base() != 0) {
3123     // Use dst as temp if it is free.
3124     sub_const_optimized(dst, current, Universe::narrow_klass_base(), R0);
3125     current = dst;
3126   }
3127   if (Universe::narrow_klass_shift() != 0) {
3128     srdi(dst, current, Universe::narrow_klass_shift());
3129     current = dst;
3130   }
3131   return current;
3132 }
3133 
3134 void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) {
3135   if (UseCompressedClassPointers) {
3136     Register compressedKlass = encode_klass_not_null(ck, klass);
3137     stw(compressedKlass, oopDesc::klass_offset_in_bytes(), dst_oop);
3138   } else {
3139     std(klass, oopDesc::klass_offset_in_bytes(), dst_oop);
3140   }
3141 }
3142 
3143 void MacroAssembler::store_klass_gap(Register dst_oop, Register val) {
3144   if (UseCompressedClassPointers) {
3145     if (val == noreg) {
3146       val = R0;
3147       li(val, 0);
3148     }
3149     stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop); // klass gap if compressed
3150   }
3151 }
3152 
3153 int MacroAssembler::instr_size_for_decode_klass_not_null() {
3154   if (!UseCompressedClassPointers) return 0;
3155   int num_instrs = 1;  // shift or move
3156   if (Universe::narrow_klass_base() != 0) num_instrs = 7;  // shift + load const + add
3157   return num_instrs * BytesPerInstWord;
3158 }
3159 
3160 void MacroAssembler::decode_klass_not_null(Register dst, Register src) {
3161   assert(dst != R0, "Dst reg may not be R0, as R0 is used here.");
3162   if (src == noreg) src = dst;
3163   Register shifted_src = src;
3164   if (Universe::narrow_klass_shift() != 0 ||
3165       Universe::narrow_klass_base() == 0 && src != dst) {  // Move required.
3166     shifted_src = dst;
3167     sldi(shifted_src, src, Universe::narrow_klass_shift());
3168   }
3169   if (Universe::narrow_klass_base() != 0) {
3170     add_const_optimized(dst, shifted_src, Universe::narrow_klass_base(), R0);
3171   }
3172 }
3173 
3174 void MacroAssembler::load_klass(Register dst, Register src) {
3175   if (UseCompressedClassPointers) {
3176     lwz(dst, oopDesc::klass_offset_in_bytes(), src);
3177     // Attention: no null check here!
3178     decode_klass_not_null(dst, dst);
3179   } else {
3180     ld(dst, oopDesc::klass_offset_in_bytes(), src);
3181   }
3182 }
3183 
3184 // ((OopHandle)result).resolve();
3185 void MacroAssembler::resolve_oop_handle(Register result) {
3186   // OopHandle::resolve is an indirection.
3187   ld(result, 0, result);
3188 }
3189 
3190 void MacroAssembler::load_mirror_from_const_method(Register mirror, Register const_method) {
3191   ld(mirror, in_bytes(ConstMethod::constants_offset()), const_method);
3192   ld(mirror, ConstantPool::pool_holder_offset_in_bytes(), mirror);
3193   ld(mirror, in_bytes(Klass::java_mirror_offset()), mirror);
3194   resolve_oop_handle(mirror);
3195 }
3196 
3197 // Clear Array
3198 // For very short arrays. tmp == R0 is allowed.
3199 void MacroAssembler::clear_memory_unrolled(Register base_ptr, int cnt_dwords, Register tmp, int offset) {
3200   if (cnt_dwords > 0) { li(tmp, 0); }
3201   for (int i = 0; i < cnt_dwords; ++i) { std(tmp, offset + i * 8, base_ptr); }
3202 }
3203 
3204 // Version for constant short array length. Kills base_ptr. tmp == R0 is allowed.
3205 void MacroAssembler::clear_memory_constlen(Register base_ptr, int cnt_dwords, Register tmp) {
3206   if (cnt_dwords < 8) {
3207     clear_memory_unrolled(base_ptr, cnt_dwords, tmp);
3208     return;
3209   }
3210 
3211   Label loop;
3212   const long loopcnt   = cnt_dwords >> 1,
3213              remainder = cnt_dwords & 1;
3214 
3215   li(tmp, loopcnt);
3216   mtctr(tmp);
3217   li(tmp, 0);
3218   bind(loop);
3219     std(tmp, 0, base_ptr);
3220     std(tmp, 8, base_ptr);
3221     addi(base_ptr, base_ptr, 16);
3222     bdnz(loop);
3223   if (remainder) { std(tmp, 0, base_ptr); }
3224 }
3225 
3226 // Kills both input registers. tmp == R0 is allowed.
3227 void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp, long const_cnt) {
3228   // Procedure for large arrays (uses data cache block zero instruction).
3229     Label startloop, fast, fastloop, small_rest, restloop, done;
3230     const int cl_size         = VM_Version::L1_data_cache_line_size(),
3231               cl_dwords       = cl_size >> 3,
3232               cl_dw_addr_bits = exact_log2(cl_dwords),
3233               dcbz_min        = 1,  // Min count of dcbz executions, needs to be >0.
3234               min_cnt         = ((dcbz_min + 1) << cl_dw_addr_bits) - 1;
3235 
3236   if (const_cnt >= 0) {
3237     // Constant case.
3238     if (const_cnt < min_cnt) {
3239       clear_memory_constlen(base_ptr, const_cnt, tmp);
3240       return;
3241     }
3242     load_const_optimized(cnt_dwords, const_cnt, tmp);
3243   } else {
3244     // cnt_dwords already loaded in register. Need to check size.
3245     cmpdi(CCR1, cnt_dwords, min_cnt); // Big enough? (ensure >= dcbz_min lines included).
3246     blt(CCR1, small_rest);
3247   }
3248     rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits); // Extract dword offset within first cache line.
3249     beq(CCR0, fast);                                  // Already 128byte aligned.
3250 
3251     subfic(tmp, tmp, cl_dwords);
3252     mtctr(tmp);                        // Set ctr to hit 128byte boundary (0<ctr<cl_dwords).
3253     subf(cnt_dwords, tmp, cnt_dwords); // rest.
3254     li(tmp, 0);
3255 
3256   bind(startloop);                     // Clear at the beginning to reach 128byte boundary.
3257     std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
3258     addi(base_ptr, base_ptr, 8);
3259     bdnz(startloop);
3260 
3261   bind(fast);                                  // Clear 128byte blocks.
3262     srdi(tmp, cnt_dwords, cl_dw_addr_bits);    // Loop count for 128byte loop (>0).
3263     andi(cnt_dwords, cnt_dwords, cl_dwords-1); // Rest in dwords.
3264     mtctr(tmp);                                // Load counter.
3265 
3266   bind(fastloop);
3267     dcbz(base_ptr);                    // Clear 128byte aligned block.
3268     addi(base_ptr, base_ptr, cl_size);
3269     bdnz(fastloop);
3270 
3271   bind(small_rest);
3272     cmpdi(CCR0, cnt_dwords, 0);        // size 0?
3273     beq(CCR0, done);                   // rest == 0
3274     li(tmp, 0);
3275     mtctr(cnt_dwords);                 // Load counter.
3276 
3277   bind(restloop);                      // Clear rest.
3278     std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
3279     addi(base_ptr, base_ptr, 8);
3280     bdnz(restloop);
3281 
3282   bind(done);
3283 }
3284 
3285 /////////////////////////////////////////// String intrinsics ////////////////////////////////////////////
3286 
3287 #ifdef COMPILER2
3288 // Intrinsics for CompactStrings
3289 
3290 // Compress char[] to byte[] by compressing 16 bytes at once.
3291 void MacroAssembler::string_compress_16(Register src, Register dst, Register cnt,
3292                                         Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5,
3293                                         Label& Lfailure) {
3294 
3295   const Register tmp0 = R0;
3296   assert_different_registers(src, dst, cnt, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5);
3297   Label Lloop, Lslow;
3298 
3299   // Check if cnt >= 8 (= 16 bytes)
3300   lis(tmp1, 0xFF);                // tmp1 = 0x00FF00FF00FF00FF
3301   srwi_(tmp2, cnt, 3);
3302   beq(CCR0, Lslow);
3303   ori(tmp1, tmp1, 0xFF);
3304   rldimi(tmp1, tmp1, 32, 0);
3305   mtctr(tmp2);
3306 
3307   // 2x unrolled loop
3308   bind(Lloop);
3309   ld(tmp2, 0, src);               // _0_1_2_3 (Big Endian)
3310   ld(tmp4, 8, src);               // _4_5_6_7
3311 
3312   orr(tmp0, tmp2, tmp4);
3313   rldicl(tmp3, tmp2, 6*8, 64-24); // _____1_2
3314   rldimi(tmp2, tmp2, 2*8, 2*8);   // _0_2_3_3
3315   rldicl(tmp5, tmp4, 6*8, 64-24); // _____5_6
3316   rldimi(tmp4, tmp4, 2*8, 2*8);   // _4_6_7_7
3317 
3318   andc_(tmp0, tmp0, tmp1);
3319   bne(CCR0, Lfailure);            // Not latin1.
3320   addi(src, src, 16);
3321 
3322   rlwimi(tmp3, tmp2, 0*8, 24, 31);// _____1_3
3323   srdi(tmp2, tmp2, 3*8);          // ____0_2_
3324   rlwimi(tmp5, tmp4, 0*8, 24, 31);// _____5_7
3325   srdi(tmp4, tmp4, 3*8);          // ____4_6_
3326 
3327   orr(tmp2, tmp2, tmp3);          // ____0123
3328   orr(tmp4, tmp4, tmp5);          // ____4567
3329 
3330   stw(tmp2, 0, dst);
3331   stw(tmp4, 4, dst);
3332   addi(dst, dst, 8);
3333   bdnz(Lloop);
3334 
3335   bind(Lslow);                    // Fallback to slow version
3336 }
3337 
3338 // Compress char[] to byte[]. cnt must be positive int.
3339 void MacroAssembler::string_compress(Register src, Register dst, Register cnt, Register tmp, Label& Lfailure) {
3340   Label Lloop;
3341   mtctr(cnt);
3342 
3343   bind(Lloop);
3344   lhz(tmp, 0, src);
3345   cmplwi(CCR0, tmp, 0xff);
3346   bgt(CCR0, Lfailure);            // Not latin1.
3347   addi(src, src, 2);
3348   stb(tmp, 0, dst);
3349   addi(dst, dst, 1);
3350   bdnz(Lloop);
3351 }
3352 
3353 // Inflate byte[] to char[] by inflating 16 bytes at once.
3354 void MacroAssembler::string_inflate_16(Register src, Register dst, Register cnt,
3355                                        Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) {
3356   const Register tmp0 = R0;
3357   assert_different_registers(src, dst, cnt, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5);
3358   Label Lloop, Lslow;
3359 
3360   // Check if cnt >= 8
3361   srwi_(tmp2, cnt, 3);
3362   beq(CCR0, Lslow);
3363   lis(tmp1, 0xFF);                // tmp1 = 0x00FF00FF
3364   ori(tmp1, tmp1, 0xFF);
3365   mtctr(tmp2);
3366 
3367   // 2x unrolled loop
3368   bind(Lloop);
3369   lwz(tmp2, 0, src);              // ____0123 (Big Endian)
3370   lwz(tmp4, 4, src);              // ____4567
3371   addi(src, src, 8);
3372 
3373   rldicl(tmp3, tmp2, 7*8, 64-8);  // _______2
3374   rlwimi(tmp2, tmp2, 3*8, 16, 23);// ____0113
3375   rldicl(tmp5, tmp4, 7*8, 64-8);  // _______6
3376   rlwimi(tmp4, tmp4, 3*8, 16, 23);// ____4557
3377 
3378   andc(tmp0, tmp2, tmp1);         // ____0_1_
3379   rlwimi(tmp2, tmp3, 2*8, 0, 23); // _____2_3
3380   andc(tmp3, tmp4, tmp1);         // ____4_5_
3381   rlwimi(tmp4, tmp5, 2*8, 0, 23); // _____6_7
3382 
3383   rldimi(tmp2, tmp0, 3*8, 0*8);   // _0_1_2_3
3384   rldimi(tmp4, tmp3, 3*8, 0*8);   // _4_5_6_7
3385 
3386   std(tmp2, 0, dst);
3387   std(tmp4, 8, dst);
3388   addi(dst, dst, 16);
3389   bdnz(Lloop);
3390 
3391   bind(Lslow);                    // Fallback to slow version
3392 }
3393 
3394 // Inflate byte[] to char[]. cnt must be positive int.
3395 void MacroAssembler::string_inflate(Register src, Register dst, Register cnt, Register tmp) {
3396   Label Lloop;
3397   mtctr(cnt);
3398 
3399   bind(Lloop);
3400   lbz(tmp, 0, src);
3401   addi(src, src, 1);
3402   sth(tmp, 0, dst);
3403   addi(dst, dst, 2);
3404   bdnz(Lloop);
3405 }
3406 
3407 void MacroAssembler::string_compare(Register str1, Register str2,
3408                                     Register cnt1, Register cnt2,
3409                                     Register tmp1, Register result, int ae) {
3410   const Register tmp0 = R0,
3411                  diff = tmp1;
3412 
3413   assert_different_registers(str1, str2, cnt1, cnt2, tmp0, tmp1, result);
3414   Label Ldone, Lslow, Lloop, Lreturn_diff;
3415 
3416   // Note: Making use of the fact that compareTo(a, b) == -compareTo(b, a)
3417   // we interchange str1 and str2 in the UL case and negate the result.
3418   // Like this, str1 is always latin1 encoded, except for the UU case.
3419   // In addition, we need 0 (or sign which is 0) extend.
3420 
3421   if (ae == StrIntrinsicNode::UU) {
3422     srwi(cnt1, cnt1, 1);
3423   } else {
3424     clrldi(cnt1, cnt1, 32);
3425   }
3426 
3427   if (ae != StrIntrinsicNode::LL) {
3428     srwi(cnt2, cnt2, 1);
3429   } else {
3430     clrldi(cnt2, cnt2, 32);
3431   }
3432 
3433   // See if the lengths are different, and calculate min in cnt1.
3434   // Save diff in case we need it for a tie-breaker.
3435   subf_(diff, cnt2, cnt1); // diff = cnt1 - cnt2
3436   // if (diff > 0) { cnt1 = cnt2; }
3437   if (VM_Version::has_isel()) {
3438     isel(cnt1, CCR0, Assembler::greater, /*invert*/ false, cnt2);
3439   } else {
3440     Label Lskip;
3441     blt(CCR0, Lskip);
3442     mr(cnt1, cnt2);
3443     bind(Lskip);
3444   }
3445 
3446   // Rename registers
3447   Register chr1 = result;
3448   Register chr2 = tmp0;
3449 
3450   // Compare multiple characters in fast loop (only implemented for same encoding).
3451   int stride1 = 8, stride2 = 8;
3452   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3453     int log2_chars_per_iter = (ae == StrIntrinsicNode::LL) ? 3 : 2;
3454     Label Lfastloop, Lskipfast;
3455 
3456     srwi_(tmp0, cnt1, log2_chars_per_iter);
3457     beq(CCR0, Lskipfast);
3458     rldicl(cnt2, cnt1, 0, 64 - log2_chars_per_iter); // Remaining characters.
3459     li(cnt1, 1 << log2_chars_per_iter); // Initialize for failure case: Rescan characters from current iteration.
3460     mtctr(tmp0);
3461 
3462     bind(Lfastloop);
3463     ld(chr1, 0, str1);
3464     ld(chr2, 0, str2);
3465     cmpd(CCR0, chr1, chr2);
3466     bne(CCR0, Lslow);
3467     addi(str1, str1, stride1);
3468     addi(str2, str2, stride2);
3469     bdnz(Lfastloop);
3470     mr(cnt1, cnt2); // Remaining characters.
3471     bind(Lskipfast);
3472   }
3473 
3474   // Loop which searches the first difference character by character.
3475   cmpwi(CCR0, cnt1, 0);
3476   beq(CCR0, Lreturn_diff);
3477   bind(Lslow);
3478   mtctr(cnt1);
3479 
3480   switch (ae) {
3481     case StrIntrinsicNode::LL: stride1 = 1; stride2 = 1; break;
3482     case StrIntrinsicNode::UL: // fallthru (see comment above)
3483     case StrIntrinsicNode::LU: stride1 = 1; stride2 = 2; break;
3484     case StrIntrinsicNode::UU: stride1 = 2; stride2 = 2; break;
3485     default: ShouldNotReachHere(); break;
3486   }
3487 
3488   bind(Lloop);
3489   if (stride1 == 1) { lbz(chr1, 0, str1); } else { lhz(chr1, 0, str1); }
3490   if (stride2 == 1) { lbz(chr2, 0, str2); } else { lhz(chr2, 0, str2); }
3491   subf_(result, chr2, chr1); // result = chr1 - chr2
3492   bne(CCR0, Ldone);
3493   addi(str1, str1, stride1);
3494   addi(str2, str2, stride2);
3495   bdnz(Lloop);
3496 
3497   // If strings are equal up to min length, return the length difference.
3498   bind(Lreturn_diff);
3499   mr(result, diff);
3500 
3501   // Otherwise, return the difference between the first mismatched chars.
3502   bind(Ldone);
3503   if (ae == StrIntrinsicNode::UL) {
3504     neg(result, result); // Negate result (see note above).
3505   }
3506 }
3507 
3508 void MacroAssembler::array_equals(bool is_array_equ, Register ary1, Register ary2,
3509                                   Register limit, Register tmp1, Register result, bool is_byte) {
3510   const Register tmp0 = R0;
3511   assert_different_registers(ary1, ary2, limit, tmp0, tmp1, result);
3512   Label Ldone, Lskiploop, Lloop, Lfastloop, Lskipfast;
3513   bool limit_needs_shift = false;
3514 
3515   if (is_array_equ) {
3516     const int length_offset = arrayOopDesc::length_offset_in_bytes();
3517     const int base_offset   = arrayOopDesc::base_offset_in_bytes(is_byte ? T_BYTE : T_CHAR);
3518 
3519     // Return true if the same array.
3520     cmpd(CCR0, ary1, ary2);
3521     beq(CCR0, Lskiploop);
3522 
3523     // Return false if one of them is NULL.
3524     cmpdi(CCR0, ary1, 0);
3525     cmpdi(CCR1, ary2, 0);
3526     li(result, 0);
3527     cror(CCR0, Assembler::equal, CCR1, Assembler::equal);
3528     beq(CCR0, Ldone);
3529 
3530     // Load the lengths of arrays.
3531     lwz(limit, length_offset, ary1);
3532     lwz(tmp0, length_offset, ary2);
3533 
3534     // Return false if the two arrays are not equal length.
3535     cmpw(CCR0, limit, tmp0);
3536     bne(CCR0, Ldone);
3537 
3538     // Load array addresses.
3539     addi(ary1, ary1, base_offset);
3540     addi(ary2, ary2, base_offset);
3541   } else {
3542     limit_needs_shift = !is_byte;
3543     li(result, 0); // Assume not equal.
3544   }
3545 
3546   // Rename registers
3547   Register chr1 = tmp0;
3548   Register chr2 = tmp1;
3549 
3550   // Compare 8 bytes per iteration in fast loop.
3551   const int log2_chars_per_iter = is_byte ? 3 : 2;
3552 
3553   srwi_(tmp0, limit, log2_chars_per_iter + (limit_needs_shift ? 1 : 0));
3554   beq(CCR0, Lskipfast);
3555   mtctr(tmp0);
3556 
3557   bind(Lfastloop);
3558   ld(chr1, 0, ary1);
3559   ld(chr2, 0, ary2);
3560   addi(ary1, ary1, 8);
3561   addi(ary2, ary2, 8);
3562   cmpd(CCR0, chr1, chr2);
3563   bne(CCR0, Ldone);
3564   bdnz(Lfastloop);
3565 
3566   bind(Lskipfast);
3567   rldicl_(limit, limit, limit_needs_shift ? 64 - 1 : 0, 64 - log2_chars_per_iter); // Remaining characters.
3568   beq(CCR0, Lskiploop);
3569   mtctr(limit);
3570 
3571   // Character by character.
3572   bind(Lloop);
3573   if (is_byte) {
3574     lbz(chr1, 0, ary1);
3575     lbz(chr2, 0, ary2);
3576     addi(ary1, ary1, 1);
3577     addi(ary2, ary2, 1);
3578   } else {
3579     lhz(chr1, 0, ary1);
3580     lhz(chr2, 0, ary2);
3581     addi(ary1, ary1, 2);
3582     addi(ary2, ary2, 2);
3583   }
3584   cmpw(CCR0, chr1, chr2);
3585   bne(CCR0, Ldone);
3586   bdnz(Lloop);
3587 
3588   bind(Lskiploop);
3589   li(result, 1); // All characters are equal.
3590   bind(Ldone);
3591 }
3592 
3593 void MacroAssembler::string_indexof(Register result, Register haystack, Register haycnt,
3594                                     Register needle, ciTypeArray* needle_values, Register needlecnt, int needlecntval,
3595                                     Register tmp1, Register tmp2, Register tmp3, Register tmp4, int ae) {
3596 
3597   // Ensure 0<needlecnt<=haycnt in ideal graph as prerequisite!
3598   Label L_TooShort, L_Found, L_NotFound, L_End;
3599   Register last_addr = haycnt, // Kill haycnt at the beginning.
3600   addr      = tmp1,
3601   n_start   = tmp2,
3602   ch1       = tmp3,
3603   ch2       = R0;
3604 
3605   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
3606   const int h_csize = (ae == StrIntrinsicNode::LL) ? 1 : 2;
3607   const int n_csize = (ae == StrIntrinsicNode::UU) ? 2 : 1;
3608 
3609   // **************************************************************************************************
3610   // Prepare for main loop: optimized for needle count >=2, bail out otherwise.
3611   // **************************************************************************************************
3612 
3613   // Compute last haystack addr to use if no match gets found.
3614   clrldi(haycnt, haycnt, 32);         // Ensure positive int is valid as 64 bit value.
3615   addi(addr, haystack, -h_csize);     // Accesses use pre-increment.
3616   if (needlecntval == 0) { // variable needlecnt
3617    cmpwi(CCR6, needlecnt, 2);
3618    clrldi(needlecnt, needlecnt, 32);  // Ensure positive int is valid as 64 bit value.
3619    blt(CCR6, L_TooShort);             // Variable needlecnt: handle short needle separately.
3620   }
3621 
3622   if (n_csize == 2) { lwz(n_start, 0, needle); } else { lhz(n_start, 0, needle); } // Load first 2 characters of needle.
3623 
3624   if (needlecntval == 0) { // variable needlecnt
3625    subf(ch1, needlecnt, haycnt);      // Last character index to compare is haycnt-needlecnt.
3626    addi(needlecnt, needlecnt, -2);    // Rest of needle.
3627   } else { // constant needlecnt
3628   guarantee(needlecntval != 1, "IndexOf with single-character needle must be handled separately");
3629   assert((needlecntval & 0x7fff) == needlecntval, "wrong immediate");
3630    addi(ch1, haycnt, -needlecntval);  // Last character index to compare is haycnt-needlecnt.
3631    if (needlecntval > 3) { li(needlecnt, needlecntval - 2); } // Rest of needle.
3632   }
3633 
3634   if (h_csize == 2) { slwi(ch1, ch1, 1); } // Scale to number of bytes.
3635 
3636   if (ae ==StrIntrinsicNode::UL) {
3637    srwi(tmp4, n_start, 1*8);          // ___0
3638    rlwimi(n_start, tmp4, 2*8, 0, 23); // _0_1
3639   }
3640 
3641   add(last_addr, haystack, ch1);      // Point to last address to compare (haystack+2*(haycnt-needlecnt)).
3642 
3643   // Main Loop (now we have at least 2 characters).
3644   Label L_OuterLoop, L_InnerLoop, L_FinalCheck, L_Comp1, L_Comp2;
3645   bind(L_OuterLoop); // Search for 1st 2 characters.
3646   Register addr_diff = tmp4;
3647    subf(addr_diff, addr, last_addr);  // Difference between already checked address and last address to check.
3648    addi(addr, addr, h_csize);         // This is the new address we want to use for comparing.
3649    srdi_(ch2, addr_diff, h_csize);
3650    beq(CCR0, L_FinalCheck);           // 2 characters left?
3651    mtctr(ch2);                        // num of characters / 2
3652   bind(L_InnerLoop);                  // Main work horse (2x unrolled search loop)
3653    if (h_csize == 2) {                // Load 2 characters of haystack (ignore alignment).
3654     lwz(ch1, 0, addr);
3655     lwz(ch2, 2, addr);
3656    } else {
3657     lhz(ch1, 0, addr);
3658     lhz(ch2, 1, addr);
3659    }
3660    cmpw(CCR0, ch1, n_start);          // Compare 2 characters (1 would be sufficient but try to reduce branches to CompLoop).
3661    cmpw(CCR1, ch2, n_start);
3662    beq(CCR0, L_Comp1);                // Did we find the needle start?
3663    beq(CCR1, L_Comp2);
3664    addi(addr, addr, 2 * h_csize);
3665    bdnz(L_InnerLoop);
3666   bind(L_FinalCheck);
3667    andi_(addr_diff, addr_diff, h_csize); // Remaining characters not covered by InnerLoop: (num of characters) & 1.
3668    beq(CCR0, L_NotFound);
3669    if (h_csize == 2) { lwz(ch1, 0, addr); } else { lhz(ch1, 0, addr); } // One position left at which we have to compare.
3670    cmpw(CCR1, ch1, n_start);
3671    beq(CCR1, L_Comp1);
3672   bind(L_NotFound);
3673    li(result, -1);                    // not found
3674    b(L_End);
3675 
3676    // **************************************************************************************************
3677    // Special Case: unfortunately, the variable needle case can be called with needlecnt<2
3678    // **************************************************************************************************
3679   if (needlecntval == 0) {           // We have to handle these cases separately.
3680   Label L_OneCharLoop;
3681   bind(L_TooShort);
3682    mtctr(haycnt);
3683    if (n_csize == 2) { lhz(n_start, 0, needle); } else { lbz(n_start, 0, needle); } // First character of needle
3684   bind(L_OneCharLoop);
3685    if (h_csize == 2) { lhzu(ch1, 2, addr); } else { lbzu(ch1, 1, addr); }
3686    cmpw(CCR1, ch1, n_start);
3687    beq(CCR1, L_Found);               // Did we find the one character needle?
3688    bdnz(L_OneCharLoop);
3689    li(result, -1);                   // Not found.
3690    b(L_End);
3691   }
3692 
3693   // **************************************************************************************************
3694   // Regular Case Part II: compare rest of needle (first 2 characters have been compared already)
3695   // **************************************************************************************************
3696 
3697   // Compare the rest
3698   bind(L_Comp2);
3699    addi(addr, addr, h_csize);        // First comparison has failed, 2nd one hit.
3700   bind(L_Comp1);                     // Addr points to possible needle start.
3701   if (needlecntval != 2) {           // Const needlecnt==2?
3702    if (needlecntval != 3) {
3703     if (needlecntval == 0) { beq(CCR6, L_Found); } // Variable needlecnt==2?
3704     Register n_ind = tmp4,
3705              h_ind = n_ind;
3706     li(n_ind, 2 * n_csize);          // First 2 characters are already compared, use index 2.
3707     mtctr(needlecnt);                // Decremented by 2, still > 0.
3708    Label L_CompLoop;
3709    bind(L_CompLoop);
3710     if (ae ==StrIntrinsicNode::UL) {
3711       h_ind = ch1;
3712       sldi(h_ind, n_ind, 1);
3713     }
3714     if (n_csize == 2) { lhzx(ch2, needle, n_ind); } else { lbzx(ch2, needle, n_ind); }
3715     if (h_csize == 2) { lhzx(ch1, addr, h_ind); } else { lbzx(ch1, addr, h_ind); }
3716     cmpw(CCR1, ch1, ch2);
3717     bne(CCR1, L_OuterLoop);
3718     addi(n_ind, n_ind, n_csize);
3719     bdnz(L_CompLoop);
3720    } else { // No loop required if there's only one needle character left.
3721     if (n_csize == 2) { lhz(ch2, 2 * 2, needle); } else { lbz(ch2, 2 * 1, needle); }
3722     if (h_csize == 2) { lhz(ch1, 2 * 2, addr); } else { lbz(ch1, 2 * 1, addr); }
3723     cmpw(CCR1, ch1, ch2);
3724     bne(CCR1, L_OuterLoop);
3725    }
3726   }
3727   // Return index ...
3728   bind(L_Found);
3729    subf(result, haystack, addr);     // relative to haystack, ...
3730    if (h_csize == 2) { srdi(result, result, 1); } // in characters.
3731   bind(L_End);
3732 } // string_indexof
3733 
3734 void MacroAssembler::string_indexof_char(Register result, Register haystack, Register haycnt,
3735                                          Register needle, jchar needleChar, Register tmp1, Register tmp2, bool is_byte) {
       // Emits code that scans the haystack front to back for a single character and
       // leaves the character index of the first match in result, or -1 if there is none.
       //
       //   result     - receives the index (in characters) or -1.
       //   haystack   - address of the first haystack character; not modified.
       //   haycnt     - number of haystack characters (word instructions are used to examine it).
       //   needle     - register holding the character to look for. If R0 is passed here,
       //                the immediate needleChar is compared instead (decided at code
       //                generation time by the "needle != R0" selections below).
       //   needleChar - immediate character, only used when needle == R0.
       //   tmp1, tmp2 - scratch registers (tmp1 = running address, tmp2 = count / first char).
       //   is_byte    - true: 1-byte characters, false: 2-byte characters.
       //
       // R0 is used as a scratch register (second character, final odd-count test);
       // CTR, CCR0 and CCR1 are used as well.
3736   assert_different_registers(haystack, haycnt, needle, tmp1, tmp2);
3737 
3738   Label L_InnerLoop, L_FinalCheck, L_Found1, L_Found2, L_NotFound, L_End;
3739   Register addr = tmp1,
3740            ch1 = tmp2,
3741            ch2 = R0;
3742 
3743   const int h_csize = is_byte ? 1 : 2;
3744 
3745 //4:
3746    srwi_(tmp2, haycnt, 1);   // Shift right by exact_log2(UNROLL_FACTOR).
3747    mr(addr, haystack);
3748    beq(CCR0, L_FinalCheck);  // Fewer than 2 characters: no iteration of the unrolled loop.
3749    mtctr(tmp2);              // Move to count register.
3750 //8:
3751   bind(L_InnerLoop);         // Main work horse (2x unrolled search loop).
3752    if (!is_byte) {
3753     lhz(ch1, 0, addr);
3754     lhz(ch2, 2, addr);
3755    } else {
3756     lbz(ch1, 0, addr);
3757     lbz(ch2, 1, addr);
3758    }
       // Compare both characters first (into separate condition registers), then branch.
3759    (needle != R0) ? cmpw(CCR0, ch1, needle) : cmplwi(CCR0, ch1, (unsigned int)needleChar);
3760    (needle != R0) ? cmpw(CCR1, ch2, needle) : cmplwi(CCR1, ch2, (unsigned int)needleChar);
3761    beq(CCR0, L_Found1);      // Did we find the needle?
3762    beq(CCR1, L_Found2);
3763    addi(addr, addr, 2 * h_csize);
3764    bdnz(L_InnerLoop);
3765 //16:
3766   bind(L_FinalCheck);
3767    andi_(R0, haycnt, 1);     // Odd character count? Then one position is still unchecked.
3768    beq(CCR0, L_NotFound);
3769    if (!is_byte) { lhz(ch1, 0, addr); } else { lbz(ch1, 0, addr); } // One position left at which we have to compare.
3770    (needle != R0) ? cmpw(CCR1, ch1, needle) : cmplwi(CCR1, ch1, (unsigned int)needleChar);
3771    beq(CCR1, L_Found1);
3772 //21:
3773   bind(L_NotFound);
3774    li(result, -1);           // Not found.
3775    b(L_End);
3776 
3777   bind(L_Found2);            // Second character of the pair matched: step addr onto it.
3778    addi(addr, addr, h_csize);
3779 //24:
3780   bind(L_Found1);            // Return index ...
3781    subf(result, haystack, addr); // relative to haystack, ...
3782    if (!is_byte) { srdi(result, result, 1); } // in characters.
3783   bind(L_End);
3784 } // string_indexof_char
3785 
3786 
3787 void MacroAssembler::has_negatives(Register src, Register cnt, Register result,
3788                                    Register tmp1, Register tmp2) {
       // Emits code that sets result to 1 if any of the cnt bytes starting at src has
       // its most significant bit (0x80) set, and to 0 otherwise.
       //
       //   src        - address of the first byte; advanced while scanning.
       //   cnt        - number of bytes; not modified.
       //   result     - receives 1 (negative byte found) or 0.
       //   tmp1, tmp2 - scratch registers; R0 is used as a third scratch register.
       //
       // CTR and CCR0 are used as well.
3789   const Register tmp0 = R0;
3790   assert_different_registers(src, result, cnt, tmp0, tmp1, tmp2);
3791   Label Lfastloop, Lslow, Lloop, Lnoneg, Ldone;
3792 
3793   // Check if cnt >= 16 (the fast loop consumes 16 bytes per iteration).
3794   lis(tmp1, (int)(short)0x8080);  // Start building tmp1 = 0x8080808080808080 (completed by ori/rldimi below).
3795   srwi_(tmp2, cnt, 4);
3796   li(result, 1);                  // Assume there's a negative byte.
3797   beq(CCR0, Lslow);               // Fewer than 16 bytes: the mask in tmp1 is not needed.
3798   ori(tmp1, tmp1, 0x8080);
3799   rldimi(tmp1, tmp1, 32, 0);
3800   mtctr(tmp2);
3801 
3802   // 2x unrolled loop
3803   bind(Lfastloop);
3804   ld(tmp2, 0, src);
3805   ld(tmp0, 8, src);
3806 
3807   orr(tmp0, tmp2, tmp0);          // Merge both doublewords so one mask test covers all 16 bytes.
3808 
3809   and_(tmp0, tmp0, tmp1);
3810   bne(CCR0, Ldone);               // Found negative byte.
3811   addi(src, src, 16);
3812 
3813   bdnz(Lfastloop);
3814 
3815   bind(Lslow);                    // Fallback to slow version
3816   rldicl_(tmp0, cnt, 0, 64-4);    // Remaining bytes = cnt & 15.
3817   beq(CCR0, Lnoneg);
3818   mtctr(tmp0);
3819   bind(Lloop);
3820   lbz(tmp0, 0, src);
3821   addi(src, src, 1);
3822   andi_(tmp0, tmp0, 0x80);
3823   bne(CCR0, Ldone);               // Found negative byte.
3824   bdnz(Lloop);
3825   bind(Lnoneg);
3826   li(result, 0);
3827 
3828   bind(Ldone);
3829 }
3830 
3831 #endif // Compiler2
3832 
3833 // Helpers for Intrinsic Emitters
3834 //
3835 // Reverse the byte order of a 32bit value in a register
3836 //   src: 0x44556677
3837 //   dst: 0x77665544
     // Despite the name, no memory load is emitted; this is a register-to-register operation.
     // dst and src must be different registers (asserted). src is not modified.
     // Bytes are numbered 0..7 from left (most significant) to right within the doubleword,
     // so the 32bit value occupies bytes 4..7.
3838 // Three steps to obtain the result:
3839 //  1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word
3840 //     into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared.
3841 //     This value initializes dst.
3842 //  2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost
3843 //     byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go.
3844 //     This value is mask inserted into dst with a [0..23] mask of 1s.
3845 //  3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position.
3846 //     This value is mask inserted into dst with a [8..15] mask of 1s.
3847 void MacroAssembler::load_reverse_32(Register dst, Register src) {
3848   assert_different_registers(dst, src);
3849 
3850   rldicl(dst, src, (4+1)*8, 56);       // Rotate byte 4 into position 7 (rightmost), clear all to the left.
3851   rlwimi(dst, src,     3*8,  0, 23);   // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone.
3852   rlwimi(dst, src,     1*8,  8, 15);   // Insert byte 6 into position 5, leave the rest alone.
3853 }
3854 
3855 // Calculate the column addresses of the crc32 lookup table into distinct registers.
3856 // This loop-invariant calculation is moved out of the loop body, reducing the loop
3857 // body size from 20 to 16 instructions.
3858 // Returns the offset that was used to calculate the address of column tc3.
3859 // Due to register shortage, tc3 must be the same register as table (asserted below), so
3860 // table is overwritten whenever that offset is non-zero. With the return offset
     // at hand, the original table address can be easily reconstructed
     // (table = tc3 - returned offset).
     //   table         - address of the lookup table; aliased by tc3.
     //   tc0, tc1, tc2 - receive the column addresses; must differ from table and from each other.
     //   tc3           - receives the fourth column address; must be the same register as table.
3861 int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) {

3862 


3863 #ifdef VM_LITTLE_ENDIAN
3864   // This is what we implement (the DOLIT4 part):
3865   // ========================================================================= */
3866   // #define DOLIT4 c ^= *buf4++; \
3867   //         c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^ \
3868   //             crc_table[1][(c >> 16) & 0xff] ^ crc_table[0][c >> 24]
3869   // #define DOLIT32 DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4
3870   // ========================================================================= */
       // Byte offsets of the columns used for byte 0 (least significant) .. byte 3 of the value.
       // NOTE(review): presumably a column holds CRC32_COLUMN_SIZE entries of 4 bytes each - confirm.
3871   const int ix0 = 3*(4*CRC32_COLUMN_SIZE);
3872   const int ix1 = 2*(4*CRC32_COLUMN_SIZE);
3873   const int ix2 = 1*(4*CRC32_COLUMN_SIZE);
3874   const int ix3 = 0*(4*CRC32_COLUMN_SIZE);
3875 #else
3876   // This is what we implement (the DOBIG4 part):
3877   // =========================================================================
3878   // #define DOBIG4 c ^= *++buf4; \
3879   //         c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \
3880   //             crc_table[6][(c >> 16) & 0xff] ^ crc_table[7][c >> 24]
3881   // #define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4
3882   // =========================================================================
3883   const int ix0 = 4*(4*CRC32_COLUMN_SIZE);
3884   const int ix1 = 5*(4*CRC32_COLUMN_SIZE);
3885   const int ix2 = 6*(4*CRC32_COLUMN_SIZE);
3886   const int ix3 = 7*(4*CRC32_COLUMN_SIZE);
3887 #endif
3888   assert_different_registers(table, tc0, tc1, tc2);
3889   assert(table == tc3, "must be!");
3890 
       // tc3 (== table) must be written last: the other columns are still computed from table.
3891   addi(tc0, table, ix0);
3892   addi(tc1, table, ix1);
3893   addi(tc2, table, ix2);
3894   if (ix3 != 0) addi(tc3, table, ix3);
3895 
3896   return ix3;
3897 }
3898 
3899 /**
3900  * Emits code for one table-driven CRC folding step (crc handled as uint32_t):
3901  *   crc = table[val & 0xFF] ^ (crc >> 8);
      *
      * @param [in,out]crc  Register containing the crc.
      * @param [in]val      Register whose rightmost byte selects the table entry. May be the
      *                     same register as crc or as tmp; must differ from table.
      * @param [in]table    Register containing the address of a table of 4-byte entries.
      * @param tmp          Work register, clobbered. Must differ from crc and table.
3902  */
3903 void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) {
3904   assert_different_registers(crc, table, tmp);
3905   assert_different_registers(val, table);
3906 
3907   if (crc == val) {                   // Must rotate first to use the unmodified value.
3908     rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
3909                                       // As we use a word (4-byte) instruction, we have to adapt the mask bit positions.
3910     srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
3911   } else {
3912     srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
3913     rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
3914   }
3915   lwzx(tmp, table, tmp);              // tmp = table entry at byte offset (val & 0xFF) * 4.
3916   xorr(crc, crc, tmp);
3917 }
3918 
3919 /**
3920  * Emits code for one folding step that consumes the rightmost byte of crc itself
      * (crc handled as uint32_t):
3921  *   crc = table[crc & 0xFF] ^ (crc >> 8);
      *
      * Delegates to fold_byte_crc32 with val == crc; tmp is clobbered.
3922  */
3923 void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
3924   fold_byte_crc32(crc, crc, table, tmp);
3925 }
3926 
3927 /**
3928  * Emits code to update CRC-32 with a byte value according to constants in table.
3929  *
3930  * @param [in,out]crc   Register containing the crc.
3931  * @param [in]val       Register containing the byte to fold into the CRC.
      *                      Clobbered: it is xor-ed with crc and then reused as work register.
3932  * @param [in]table     Register containing the table of crc constants.
3933  *
3934  * uint32_t crc;
3935  * val = crc_table[(val ^ crc) & 0xFF];
3936  * crc = val ^ (crc >> 8);
3937  */
3938 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
3939   BLOCK_COMMENT("update_byte_crc32:");
3940   xorr(val, val, crc);
3941   fold_byte_crc32(crc, val, table, val);  // val serves as both the index source and the tmp register.
3942 }
3943 
3944 /**
      * Emits a loop that folds len bytes, one at a time, into crc.
      * If len (taken as 32-bit value) is zero, the emitted code does nothing.
      *
3945  * @param crc   register containing existing CRC (32-bit)
3946  * @param buf   register pointing to input byte buffer (byte*); advanced past the processed bytes
3947  * @param len   register containing number of bytes; its upper 32 bits are cleared
3948  * @param table register pointing to CRC table
      * @param data  work register receiving each byte; clobbered
      * @param loopAlignment if true, the loop head is aligned to 32 bytes, otherwise to 4 bytes
      *
      * Uses CTR as loop counter and CCR0 for the zero-length test.
3949  */
3950 void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table,
3951                                            Register data, bool loopAlignment) {
3952   assert_different_registers(crc, buf, len, table, data);
3953 
3954   Label L_mainLoop, L_done;
3955   const int mainLoop_stepping  = 1;
3956   const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4;
3957 
3958   // Process all bytes in a single-byte loop.
3959   clrldi_(len, len, 32);                         // Enforce 32 bit. Anything to do?
3960   beq(CCR0, L_done);
3961 
3962   mtctr(len);
3963   align(mainLoop_alignment);
3964   BIND(L_mainLoop);
3965     lbz(data, 0, buf);                           // Byte from buffer, zero-extended.
3966     addi(buf, buf, mainLoop_stepping);           // Advance buffer position.
3967     update_byte_crc32(crc, data, table);
3968     bdnz(L_mainLoop);                            // Iterate.
3969 
3970   bind(L_done);
3971 }
3972 
3973 /**
3974  * Emits code to update CRC-32 with a 4-byte value according to constants in table
3975  * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c
      *
      * @param crc     register containing the crc (in/out)
      * @param buf     register pointing to the input buffer; incremented by bufInc if bufInc != 0
      * @param table   not referenced by the emitted code; the column addresses tc0..tc3 are used instead
      * @param bufDisp displacement (relative to buf) of the word to load
      * @param bufInc  number of bytes to advance buf by after the load
      * @param t0-t3   work registers, clobbered. t3 must differ from crc (asserted)
      * @param tc0-tc3 pre-calculated column addresses of the lookup table
3976  */
3977 // A note on the lookup table address(es):
3978 // The lookup table consists of two sets of four columns each.
3979 // The columns {0..3} are used for little-endian machines.
3980 // The columns {4..7} are used for big-endian machines.
3981 // To save the effort of adding the column offset to the table address each time
3982 // a table element is looked up, it is possible to pass the pre-calculated
3983 // column addresses.
3984 // Uses t0..t3 as work registers. Must be saved/restored by caller, if necessary.
3985 void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc,
3986                                         Register t0,  Register t1,  Register t2,  Register t3,
3987                                         Register tc0, Register tc1, Register tc2, Register tc3) {
3988   assert_different_registers(crc, t3);
3989 
3990   // XOR crc with next four bytes of buffer.
3991   lwz(t3, bufDisp, buf);
3992   if (bufInc != 0) {
3993     addi(buf, buf, bufInc);
3994   }
3995   xorr(t3, t3, crc);
3996 
3997   // Chop crc into 4 single-byte pieces, shifted left 2 bits, to form the table indices.
       // t3 is the source of all four extractions and is therefore overwritten last.
3998   rlwinm(t0, t3,  2,         24-2, 31-2);  // ((t3 >>  0) & 0xff) << 2
3999   rlwinm(t1, t3,  32+(2- 8), 24-2, 31-2);  // ((t3 >>  8) & 0xff) << 2
4000   rlwinm(t2, t3,  32+(2-16), 24-2, 31-2);  // ((t3 >> 16) & 0xff) << 2
4001   rlwinm(t3, t3,  32+(2-24), 24-2, 31-2);  // ((t3 >> 24) & 0xff) << 2
4002 
4003   // Use the pre-calculated column addresses.
4004   // Load pre-calculated table values.
4005   lwzx(t0, tc0, t0);
4006   lwzx(t1, tc1, t1);
4007   lwzx(t2, tc2, t2);
4008   lwzx(t3, tc3, t3);
4009 
4010   // Calculate new crc from table values.
4011   xorr(t0,  t0, t1);
4012   xorr(t2,  t2, t3);
4013   xorr(crc, t0, t2);  // Now crc contains the final checksum value.
4014 }
4015 
4016 /**
      * Emits a CRC-32 kernel that processes the bulk of the buffer 4 bytes per iteration
      * and the unaligned head and the tail byte by byte.
      *
4017  * @param crc   register containing existing CRC (32-bit)
4018  * @param buf   register pointing to input byte buffer (byte*)
4019  * @param len   register containing number of bytes
4020  * @param table register pointing to CRC table
      * @param t0-t3   work registers, clobbered
      * @param tc0-tc3 receive the column addresses of the table; tc3 must be the same
      *                register as table (asserted in crc32_table_columns)
      * @param invertCRC if true, crc is complemented before and after the calculation
4021  *
4022  * uses t0..t3 and tc0..tc3 as work registers. Must be saved/restored by caller!
4023  */
4024 void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table,
4025                                         Register t0,  Register t1,  Register t2,  Register t3,
4026                                         Register tc0, Register tc1, Register tc2, Register tc3,
4027                                         bool invertCRC) {
4028   assert_different_registers(crc, buf, len, table);
4029 
4030   Label L_mainLoop, L_tail;
4031   Register  tmp          = t0;
4032   Register  data         = t0;
4033   Register  tmp2         = t1;
4034   const int mainLoop_stepping  = 4;
4035   const int tailLoop_stepping  = 1;
4036   const int log_stepping       = exact_log2(mainLoop_stepping);
4037   const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
4038   const int complexThreshold   = 2*mainLoop_stepping;
4039 
4040   // Don't test for len <= 0 here. This pathological case should not occur anyway.
4041   // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles
4042   // for all well-behaved cases. The situation itself is detected and handled correctly
4043   // within update_byteLoop_crc32.
4044   assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
4045 
4046   BLOCK_COMMENT("kernel_crc32_1word {");
4047 
4048   if (invertCRC) {
4049     nand(crc, crc, crc);                      // 1s complement of crc
4050   }
4051 
4052   // Check for short (<complexThreshold) buffer.
4053   cmpdi(CCR0, len, complexThreshold);
4054   blt(CCR0, L_tail);
4055 
4056   // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
4057   // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
4058   {
4059     // Align buf addr to mainLoop_stepping boundary.
4060     neg(tmp2, buf);                              // Calculate # preLoop iterations for alignment.
4061     rldicl(tmp2, tmp2, 0, 64-log_stepping);      // Rotate tmp2 0 bits, insert into tmp2, anding with mask with 1s from 62..63.
4062 
         // Compile-time selection; with complexThreshold == 2*mainLoop_stepping the first branch is taken.
4063     if (complexThreshold > mainLoop_stepping) {
4064       sub(len, len, tmp2);                       // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4065     } else {
4066       sub(tmp, len, tmp2);                       // Remaining bytes for main loop.
4067       cmpdi(CCR0, tmp, mainLoop_stepping);
4068       blt(CCR0, L_tail);                         // For less than one mainloop_stepping left, do only tail processing
4069       mr(len, tmp);                              // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4070     }
4071     update_byteLoop_crc32(crc, buf, tmp2, table, data, false);
4072   }
4073 
4074   srdi(tmp2, len, log_stepping);                 // #iterations for mainLoop
4075   andi(len, len, mainLoop_stepping-1);           // remaining bytes for tailLoop
4076   mtctr(tmp2);
4077 
4078 #ifdef VM_LITTLE_ENDIAN
4079   Register crc_rv = crc;
4080 #else
4081   Register crc_rv = tmp;                         // Load_reverse needs separate registers to work on.
4082                                                  // Occupies tmp, but frees up crc.
4083   load_reverse_32(crc_rv, crc);                  // Revert byte order because we are dealing with big-endian data.
4084   tmp = crc;
4085 #endif
4086 
       // From here on table (== tc3) may hold a column address instead of the table base.
4087   int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
4088 
4089   align(mainLoop_alignment);                     // Octoword-aligned loop address. Shows 2% improvement.
4090   BIND(L_mainLoop);
         // crc_rv is passed both as crc and as first work register.
4091     update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
4092     bdnz(L_mainLoop);
4093 
4094 #ifndef VM_LITTLE_ENDIAN
4095   load_reverse_32(crc, crc_rv);                  // Revert byte order because we are dealing with big-endian data.
4096   tmp = crc_rv;                                  // Tmp uses it's original register again.
4097 #endif
4098 
4099   // Restore original table address for tailLoop.
4100   if (reconstructTableOffset != 0) {
4101     addi(table, table, -reconstructTableOffset);
4102   }
4103 
4104   // Process last few (<complexThreshold) bytes of buffer.
4105   BIND(L_tail);
4106   update_byteLoop_crc32(crc, buf, len, table, data, false);
4107 
4108   if (invertCRC) {
4109     nand(crc, crc, crc);                      // 1s complement of crc
4110   }
4111   BLOCK_COMMENT("} kernel_crc32_1word");
4112 }
4113 
4114 /**
      * Emits a CRC-32 kernel that processes the whole buffer byte by byte.
      *
4115  * @param crc   register containing existing CRC (32-bit)
4116  * @param buf   register pointing to input byte buffer (byte*)
4117  * @param len   register containing number of bytes
4118  * @param table register pointing to CRC table
      * @param t0    work register holding the current byte; clobbered
      * @param t1-t3 not referenced by this kernel
      * @param invertCRC if true, crc is complemented before and after the calculation
4119  *
4120  * Uses t0 as work register.
4121  */
4122 void MacroAssembler::kernel_crc32_1byte(Register crc, Register buf, Register len, Register table,
4123                                         Register t0,  Register t1,  Register t2,  Register t3,
4124                                         bool invertCRC) {
4125   assert_different_registers(crc, buf, len, table);
4126 
4127   Register  data = t0;                   // Holds the current byte to be folded into crc.
4128 
4129   BLOCK_COMMENT("kernel_crc32_1byte {");
4130 
4131   if (invertCRC) {
4132     nand(crc, crc, crc);                      // 1s complement of crc
4133   }
4134 
4135   // Process all bytes in a single-byte loop.
4136   update_byteLoop_crc32(crc, buf, len, table, data, true);
4137 
4138   if (invertCRC) {
4139     nand(crc, crc, crc);                      // 1s complement of crc
4140   }
4141   BLOCK_COMMENT("} kernel_crc32_1byte");
4142 }
4143 
4144 /**
      * Emits a CRC-32 kernel built around kernel_crc32_vpmsum_aligned: bytes up to the next
      * 16-byte boundary and the bytes left over afterwards are handled by the byte loop.
      * If fewer than 32 bytes would remain after alignment, only the byte loop is used.
      *
4145  * @param crc             register containing existing CRC (32-bit)
4146  * @param buf             register pointing to input byte buffer (byte*)
4147  * @param len             register containing number of bytes
4148  * @param table           register pointing to CRC table
4149  * @param constants       register pointing to CRC table for 128-bit aligned memory
4150  * @param t0-t5           temp registers
      * @param invertCRC       if true, crc is complemented before and after the calculation
4151  */
4152 void MacroAssembler::kernel_crc32_vpmsum(Register crc, Register buf, Register len, Register table,
4153                                          Register constants, Register t0, Register t1, Register t2,
4154                                          Register t3, Register t4, Register t5, bool invertCRC) {
4155   assert_different_registers(crc, buf, len, table);
4156 
4157   Label L_tail;
4158 
4159   BLOCK_COMMENT("kernel_crc32_vpmsum {");
4160 
4161   if (invertCRC) {
4162     nand(crc, crc, crc);                      // 1s complement of crc
4163   }
4164 
4165   // Enforce 32 bit.
4166   clrldi(len, len, 32);
4167 
4168   // Align if we have enough bytes for the fast version.
4169   const int alignment = 16,
4170             threshold = 32;
4171   Register prealign = t0;
4172 
4173   neg(prealign, buf);
4174   addi(t1, len, -threshold);
4175   andi(prealign, prealign, alignment - 1);   // Number of bytes up to the next 16-byte boundary.
4176   cmpw(CCR0, t1, prealign);
4177   blt(CCR0, L_tail); // len - prealign < threshold?
4178 
4179   subf(len, prealign, len);
4180   update_byteLoop_crc32(crc, buf, prealign, table, t2, false);
4181 
4182   // Calculate from first aligned address as far as possible.
4183   kernel_crc32_vpmsum_aligned(crc, buf, len, constants, t0, t1, t2, t3, t4, t5);


4184 
4185   // Remaining bytes.
4186   BIND(L_tail);
4187   update_byteLoop_crc32(crc, buf, len, table, t2, false);
4188 
4189   if (invertCRC) {
4190     nand(crc, crc, crc);                      // 1s complement of crc
4191   }
4192 
4193   BLOCK_COMMENT("} kernel_crc32_vpmsum");
4194 }
4195 
4196 /**
      * Emits the vector (vpmsum) part of the CRC-32 calculation for a 16-byte aligned buffer.
      * Processes as many complete 16-byte blocks as possible; buf is advanced accordingly.
      *
4197  * @param crc             register containing existing CRC (32-bit)
4198  * @param buf             register pointing to input byte buffer (byte*)
4199  * @param len             register containing number of bytes (will get updated to remaining bytes)
4200  * @param constants       register pointing to CRC table for 128-bit aligned memory
4201  * @param t0-t5           temp registers
      *
      * R14..R16 and VR20..VR25 (plus VR26 on big endian) are saved to and restored from the area
      * below the stack pointer; no frame is pushed. R0, CTR and CCR0 are used as scratch.
      * The lower-numbered vector registers are used without being saved (the exact set depends
      * on CRC32_UNROLL_FACTOR2).
4202  */
4203 void MacroAssembler::kernel_crc32_vpmsum_aligned(Register crc, Register buf, Register len,
4204     Register constants, Register t0, Register t1, Register t2, Register t3, Register t4, Register t5) {
4205 
4206   // Save non-volatile vector registers (frameless).
       // offset aliases t1. It is only a scratch register here; t1 is re-initialized
       // below when the offs[] registers are loaded.
4207   Register offset = t1;
4208   int offsetInt = 0;
4209   offsetInt -= 16; li(offset, offsetInt); stvx(VR20, offset, R1_SP);
4210   offsetInt -= 16; li(offset, offsetInt); stvx(VR21, offset, R1_SP);
4211   offsetInt -= 16; li(offset, offsetInt); stvx(VR22, offset, R1_SP);
4212   offsetInt -= 16; li(offset, offsetInt); stvx(VR23, offset, R1_SP);
4213   offsetInt -= 16; li(offset, offsetInt); stvx(VR24, offset, R1_SP);
4214   offsetInt -= 16; li(offset, offsetInt); stvx(VR25, offset, R1_SP);
4215 #ifndef VM_LITTLE_ENDIAN
4216   offsetInt -= 16; li(offset, offsetInt); stvx(VR26, offset, R1_SP);
4217 #endif
       // The general purpose registers used as num_bytes, loop_count and cur_const follow.
4218   offsetInt -= 8; std(R14, offsetInt, R1_SP);
4219   offsetInt -= 8; std(R15, offsetInt, R1_SP);
4220   offsetInt -= 8; std(R16, offsetInt, R1_SP);
4221 
4222   // Implementation uses an inner loop which uses between 256 and 16 * unroll_factor
4223   // bytes per iteration. The basic scheme is:
4224   // lvx: load vector (Big Endian needs reversal)
4225   // vpmsumw: carry-less 32 bit multiplications with constant representing a large CRC shift
4226   // vxor: xor partial results together to get unroll_factor2 vectors
4227 
4228   // Outer loop performs the CRC shifts needed to combine the unroll_factor2 vectors.
4229 
4230   // Using 16 * unroll_factor / unroll_factor_2 bytes for constants.
4231   const int unroll_factor = CRC32_UNROLL_FACTOR,
4232             unroll_factor2 = CRC32_UNROLL_FACTOR2;
4233 
4234   const int outer_consts_size = (unroll_factor2 - 1) * 16,
4235             inner_consts_size = (unroll_factor / unroll_factor2) * 16;
4236 
4237   // Support registers.
       // offs[i] is loaded with the byte offset 16 * i below; index 0 is never used as a register.
4238   Register offs[] = { noreg, t0, t1, t2, t3, t4, t5, crc /* will live in VCRC */ };
4239   Register num_bytes = R14,
4240            loop_count = R15,
4241            cur_const = R16;
4242   // Constant array for outer loop: unroll_factor2 - 1 registers,
4243   // Constant array for inner loop: unroll_factor / unroll_factor2 registers.
4244   VectorRegister consts0[] = { VR16, VR17, VR18, VR19, VR20, VR21, VR22 },
4245                  consts1[] = { VR23, VR24 };
4246   // Data register arrays: 2 arrays with unroll_factor2 registers.
4247   VectorRegister data0[] = { VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7 },
4248                  data1[] = { VR8, VR9, VR10, VR11, VR12, VR13, VR14, VR15 };
4249 
4250   VectorRegister VCRC = data0[0];
4251   VectorRegister Vc = VR25;
4252   VectorRegister swap_bytes = VR26; // Only for Big Endian.
4253 
4254   // We have at least 1 iteration (ensured by caller).
4255   Label L_outer_loop, L_inner_loop, L_last;
4256 
4257   // If supported set DSCR pre-fetch to deepest.
4258   if (VM_Version::has_mfdscr()) {
4259     load_const_optimized(t0, VM_Version::_dscr_val | 7);
4260     mtdscr(t0);
4261   }
4262 
4263   mtvrwz(VCRC, crc); // crc lives in VCRC, now
4264 
       // The last iteration overwrites crc (offs[unroll_factor2 - 1] for the 8-entry array above),
       // which is why crc had to be moved to VCRC first.
4265   for (int i = 1; i < unroll_factor2; ++i) {
4266     li(offs[i], 16 * i);
4267   }
4268 
4269   // Load consts for outer loop
4270   lvx(consts0[0], constants);
4271   for (int i = 1; i < unroll_factor2 - 1; ++i) {
4272     lvx(consts0[i], offs[i], constants);
4273   }
4274 
4275   load_const_optimized(num_bytes, 16 * unroll_factor);
4276 
4277   // Reuse data registers outside of the loop.
4278   VectorRegister Vtmp = data1[0];
4279   VectorRegister Vtmp2 = data1[1];
4280   VectorRegister zeroes = data1[2];
4281 
4282   vspltisb(Vtmp, 0);
4283   vsldoi(VCRC, Vtmp, VCRC, 8); // 96 bit zeroes, 32 bit CRC.
4284 
4285   // Load vector for vpermxor (to xor both 64 bit parts together)
4286   lvsl(Vtmp, buf);   // 000102030405060708090a0b0c0d0e0f
4287   vspltisb(Vc, 4);
4288   vsl(Vc, Vtmp, Vc); // 00102030405060708090a0b0c0d0e0f0
4289   xxspltd(Vc->to_vsr(), Vc->to_vsr(), 0);
4290   vor(Vc, Vtmp, Vc); // 001122334455667708192a3b4c5d6e7f
4291 
       // BE_swap_bytes(x) expands to nothing on little endian and to a byte-reversing vperm on big endian.
4292 #ifdef VM_LITTLE_ENDIAN
4293 #define BE_swap_bytes(x)
4294 #else
4295   vspltisb(Vtmp2, 0xf);
4296   vxor(swap_bytes, Vtmp, Vtmp2);
4297 #define BE_swap_bytes(x) vperm(x, x, x, swap_bytes)
4298 #endif
4299 
       // Less than one full block of 16 * unroll_factor bytes: try a smaller block size at L_last.
4300   cmpd(CCR0, len, num_bytes);
4301   blt(CCR0, L_last);
4302 
4303   addi(cur_const, constants, outer_consts_size); // Point to consts for inner loop
4304   load_const_optimized(loop_count, unroll_factor / (2 * unroll_factor2) - 1); // One double-iteration peeled off.
4305 
4306   // ********** Main loop start **********
4307   align(32);
4308   bind(L_outer_loop);
4309 
4310   // Begin of unrolled first iteration (no xor).
4311   lvx(data1[0], buf);
4312   for (int i = 1; i < unroll_factor2 / 2; ++i) {
4313     lvx(data1[i], offs[i], buf);
4314   }
4315   vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
4316   lvx(consts1[0], cur_const);
4317   mtctr(loop_count);
4318   for (int i = 0; i < unroll_factor2 / 2; ++i) {
4319     BE_swap_bytes(data1[i]);
4320     if (i == 0) { vxor(data1[0], data1[0], VCRC); } // xor in previous CRC.
4321     lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf);
4322     vpmsumw(data0[i], data1[i], consts1[0]);
4323   }
4324   addi(buf, buf, 16 * unroll_factor2);
4325   subf(len, num_bytes, len);      // Account for the whole block (num_bytes) up front.
4326   lvx(consts1[1], offs[1], cur_const);
4327   addi(cur_const, cur_const, 32);
4328   // Begin of unrolled second iteration (head).
4329   for (int i = 0; i < unroll_factor2 / 2; ++i) {
4330     BE_swap_bytes(data1[i + unroll_factor2 / 2]);
4331     if (i == 0) { lvx(data1[0], buf); } else { lvx(data1[i], offs[i], buf); }
4332     vpmsumw(data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[0]);
4333   }
4334   for (int i = 0; i < unroll_factor2 / 2; ++i) {
4335     BE_swap_bytes(data1[i]);
4336     lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf);
4337     vpmsumw(data1[i], data1[i], consts1[1]);
4338   }
4339   addi(buf, buf, 16 * unroll_factor2);
4340 
4341   // Generate most performance relevant code. Loads + half of the vpmsumw have been generated.
4342   // Double-iteration allows using the 2 constant registers alternatingly.
4343   align(32);
4344   bind(L_inner_loop);
4345   for (int j = 1; j < 3; ++j) { // j < unroll_factor / unroll_factor2 - 1 for complete unrolling.
4346     if (j & 1) {
4347       lvx(consts1[0], cur_const);
4348     } else {
4349       lvx(consts1[1], offs[1], cur_const);
4350       addi(cur_const, cur_const, 32);
4351     }
4352     for (int i = 0; i < unroll_factor2; ++i) {
4353       int idx = i + unroll_factor2 / 2, inc = 0; // For modulo-scheduled input.
4354       if (idx >= unroll_factor2) { idx -= unroll_factor2; inc = 1; }
4355       BE_swap_bytes(data1[idx]);
4356       vxor(data0[i], data0[i], data1[i]);
4357       if (i == 0) lvx(data1[0], buf); else lvx(data1[i], offs[i], buf);
4358       vpmsumw(data1[idx], data1[idx], consts1[(j + inc) & 1]);
4359     }
4360     addi(buf, buf, 16 * unroll_factor2);
4361   }
4362   bdnz(L_inner_loop);
4363 
4364   addi(cur_const, constants, outer_consts_size); // Reset
4365 
4366   // Tail of last iteration (no loads).
4367   for (int i = 0; i < unroll_factor2 / 2; ++i) {
4368     BE_swap_bytes(data1[i + unroll_factor2 / 2]);
4369     vxor(data0[i], data0[i], data1[i]);
4370     vpmsumw(data1[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[1]);
4371   }
4372   for (int i = 0; i < unroll_factor2 / 2; ++i) {
4373     vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]); // First half of fixup shifts.
4374     vxor(data0[i + unroll_factor2 / 2], data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2]);
4375   }
4376 
4377   // Last data register is ok, other ones need fixup shift.
4378   for (int i = unroll_factor2 / 2; i < unroll_factor2 - 1; ++i) {
4379     vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]);
4380   }
4381 
4382   // Combine to 128 bit result vector VCRC = data0[0].
       // Pairwise xor with doubling distance (i = 1, 2, 4, ...) until everything is folded into data0[0].
4383   for (int i = 1; i < unroll_factor2; i<<=1) {
4384     for (int j = 0; j <= unroll_factor2 - 2*i; j+=2*i) {
4385       vxor(data0[j], data0[j], data0[j+i]);
4386     }
4387   }
4388   cmpd(CCR0, len, num_bytes);
4389   bge(CCR0, L_outer_loop);
4390 
4391   // Last chance with lower num_bytes.
4392   bind(L_last);
4393   srdi(loop_count, len, exact_log2(16 * 2 * unroll_factor2)); // Use double-iterations.
4394   // Point behind last const for inner loop.
4395   add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size);
4396   sldi(R0, loop_count, exact_log2(16 * 2)); // Bytes of constants to be used.
4397   clrrdi(num_bytes, len, exact_log2(16 * 2 * unroll_factor2));
4398   subf(cur_const, R0, cur_const); // Point to constant to be used first.
4399 
4400   addic_(loop_count, loop_count, -1); // One double-iteration peeled off.
4401   bgt(CCR0, L_outer_loop);
4402   // ********** Main loop end **********
4403 
4404   // Restore DSCR pre-fetch value.
4405   if (VM_Version::has_mfdscr()) {
4406     load_const_optimized(t0, VM_Version::_dscr_val);
4407     mtdscr(t0);
4408   }
4409 
4410   // ********** Simple loop for remaining 16 byte blocks **********
4411   {
4412     Label L_loop, L_done;
4413 
4414     srdi_(t0, len, 4); // 16 bytes per iteration
4415     clrldi(len, len, 64-4);
4416     beq(CCR0, L_done);
4417 
4418     // Point to const (same as last const for inner loop).
4419     add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size - 16);
4420     mtctr(t0);
4421     lvx(Vtmp2, cur_const);
4422 
4423     align(32);
4424     bind(L_loop);
4425 
4426     lvx(Vtmp, buf);
4427     addi(buf, buf, 16);
4428     vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
4429     BE_swap_bytes(Vtmp);
4430     vxor(VCRC, VCRC, Vtmp);
4431     vpmsumw(VCRC, VCRC, Vtmp2);
4432     bdnz(L_loop);
4433 
4434     bind(L_done);
4435   }
4436   // ********** Simple loop end **********
4437 #undef BE_swap_bytes
4438 
4439   // Point to Barrett constants
4440   add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size);
4441 
4442   vspltisb(zeroes, 0);
4443 
4444   // Combine to 64 bit result.
4445   vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
4446 
4447   // Reduce to 32 bit CRC: Remainder by multiply-high.
4448   lvx(Vtmp, cur_const);
4449   vsldoi(Vtmp2, zeroes, VCRC, 12);  // Extract high 32 bit.
4450   vpmsumd(Vtmp2, Vtmp2, Vtmp);      // Multiply by inverse long poly.
4451   vsldoi(Vtmp2, zeroes, Vtmp2, 12); // Extract high 32 bit.
4452   vsldoi(Vtmp, zeroes, Vtmp, 8);
4453   vpmsumd(Vtmp2, Vtmp2, Vtmp);      // Multiply quotient by long poly.
4454   vxor(VCRC, VCRC, Vtmp2);          // Remainder fits into 32 bit.
4455 
4456   // Move result. len is already updated.
4457   vsldoi(VCRC, VCRC, zeroes, 8);
4458   mfvrd(crc, VCRC);
4459 
4460   // Restore non-volatile Vector registers (frameless).
       // Must mirror the save sequence at the top of this function exactly (same order, same offsets).
4461   offsetInt = 0;
4462   offsetInt -= 16; li(offset, offsetInt); lvx(VR20, offset, R1_SP);
4463   offsetInt -= 16; li(offset, offsetInt); lvx(VR21, offset, R1_SP);
4464   offsetInt -= 16; li(offset, offsetInt); lvx(VR22, offset, R1_SP);
4465   offsetInt -= 16; li(offset, offsetInt); lvx(VR23, offset, R1_SP);
4466   offsetInt -= 16; li(offset, offsetInt); lvx(VR24, offset, R1_SP);
4467   offsetInt -= 16; li(offset, offsetInt); lvx(VR25, offset, R1_SP);
4468 #ifndef VM_LITTLE_ENDIAN
4469   offsetInt -= 16; li(offset, offsetInt); lvx(VR26, offset, R1_SP);
4470 #endif
4471   offsetInt -= 8;  ld(R14, offsetInt, R1_SP);
4472   offsetInt -= 8;  ld(R15, offsetInt, R1_SP);
4473   offsetInt -= 8;  ld(R16, offsetInt, R1_SP);
4474 }
4475 
4476 void MacroAssembler::crc32(Register crc, Register buf, Register len, Register t0, Register t1, Register t2,
4477                            Register t3, Register t4, Register t5, Register t6, Register t7, bool is_crc32c) {
       // Emits a CRC update of crc over (buf, len) by delegating to one of two kernels.
       // is_crc32c selects which table / constants are loaded; both kernels are
       // called with !is_crc32c as their last (boolean) argument.
       // t0..t7 are handed to the kernel (t0 and t1 are overwritten here already). KILL: R0.

       // t0 := address of the lookup table for the selected variant (R0 is the temp of the load).
4478   load_const_optimized(t0, is_crc32c ? StubRoutines::crc32c_table_addr()
4479                                      : StubRoutines::crc_table_addr()   , R0);
4480
4481   if (VM_Version::has_vpmsumb()) {
         // vpmsumb is available: t1 := address of the constants for the selected variant.
4482     load_const_optimized(t1, is_crc32c ? StubRoutines::ppc64::crc32c_constants()
4483                                        : StubRoutines::ppc64::crc_constants()   , R0);
4484     kernel_crc32_vpmsum(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, !is_crc32c);
4485   } else {
         // NOTE(review): t0 is passed twice here (4th and 12th argument) - confirm this is intended.
4486     kernel_crc32_1word(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, t0, !is_crc32c);
4487   }
4488 }
4489 
4490 void MacroAssembler::kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp, bool invertCRC) {
       // Emits code that loads the single byte at address buf into tmp and feeds it,
       // together with crc and table, to update_byte_crc32.
       // If invertCRC is set, crc is bitwise complemented before and after the update.
       // len is accepted but not used. tmp is overwritten.
4491   assert_different_registers(crc, buf, /* len,  not used!! */ table, tmp);
4492
4493   BLOCK_COMMENT("kernel_crc32_singleByte:");
4494   if (invertCRC) {
4495     nand(crc, crc, crc);                // 1s complement of crc
4496   }
4497
4498   lbz(tmp, 0, buf);                     // Byte from buffer, zero-extended.
4499   update_byte_crc32(crc, tmp, table);
4500
4501   if (invertCRC) {
4502     nand(crc, crc, crc);                // 1s complement of crc
4503   }
4504 }
4505 
4506 void MacroAssembler::kernel_crc32_singleByteReg(Register crc, Register val, Register table, bool invertCRC) {
       // Same as kernel_crc32_singleByte, except that the input is taken from
       // register val instead of being loaded from memory.
       // If invertCRC is set, crc is bitwise complemented before and after the update.
4507   assert_different_registers(crc, val, table);
4508
4509   BLOCK_COMMENT("kernel_crc32_singleByteReg:");
4510   if (invertCRC) {
4511     nand(crc, crc, crc);                // 1s complement of crc
4512   }
4513
4514   update_byte_crc32(crc, val, table);
4515
4516   if (invertCRC) {
4517     nand(crc, crc, crc);                // 1s complement of crc
4518   }
4519 }
4520 
4521 // dest_lo += src1 + src2
4522 // dest_hi += carry1 + carry2
     // (dest_hi:dest_lo) is treated as one 128 bit value: each addc into dest_lo is
     // followed by an adde of dest_hi with a zero register, which adds the carry-out
     // of the preceding addition to dest_hi.
     // KILL: R0 (set to 0).
     // NOTE(review): presumably none of the register arguments may be R0; nothing asserts this here.
4523 void MacroAssembler::add2_with_carry(Register dest_hi,
4524                                      Register dest_lo,
4525                                      Register src1, Register src2) {
4526   li(R0, 0);                        // Zero addend, so adde only adds the carry.
4527   addc(dest_lo, dest_lo, src1);     // dest_lo += src1, records carry1.
4528   adde(dest_hi, dest_hi, R0);       // dest_hi += carry1.
4529   addc(dest_lo, dest_lo, src2);     // dest_lo += src2, records carry2.
4530   adde(dest_hi, dest_hi, R0);       // dest_hi += carry2.
4531 }
4532 
4533 // Multiply 64 bit by 64 bit first loop.
     // Emits the first loop of the multiplication: one 64 bit chunk of x (two ints,
     // or a single int if only one is left) is multiplied with y, walking y downwards
     // two ints at a time, and the 64 bit results are stored into z.
     // Modifies: xstart (decremented by 1), x_xstart, y_idx, carry, product_high,
     // product, idx, kdx, tmp, CCR0. On exit carry holds the final high part.
4534 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart,
4535                                            Register x_xstart,
4536                                            Register y, Register y_idx,
4537                                            Register z,
4538                                            Register carry,
4539                                            Register product_high, Register product,
4540                                            Register idx, Register kdx,
4541                                            Register tmp) {
4542   //  jlong carry, x[], y[], z[];
4543   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
4544   //    huge_128 product = y[idx] * x[xstart] + carry;
4545   //    z[kdx] = (jlong)product;
4546   //    carry  = (jlong)(product >>> 64);
4547   //  }
4548   //  z[xstart] = carry;
4549
4550   Label L_first_loop, L_first_loop_exit;
4551   Label L_one_x, L_one_y, L_multiply;
4552
4553   addic_(xstart, xstart, -1);
4554   blt(CCR0, L_one_x);   // Special case: length of x is 1.
4555
4556   // Load next two integers of x.
4557   sldi(tmp, xstart, LogBytesPerInt);   // Byte offset of x[xstart].
4558   ldx(x_xstart, x, tmp);
4559 #ifdef VM_LITTLE_ENDIAN
       // Rotate by 32 bits: swaps the two 32 bit halves of the loaded doubleword.
4560   rldicl(x_xstart, x_xstart, 32, 0);
4561 #endif
4562
4563   align(32, 16);
4564   bind(L_first_loop);
4565
4566   cmpdi(CCR0, idx, 1);
4567   blt(CCR0, L_first_loop_exit);        // idx < 1: nothing left of y.
4568   addi(idx, idx, -2);                  // Does not change CCR0.
4569   beq(CCR0, L_one_y);                  // Still tests the cmpdi above: idx was exactly 1.
4570
4571   // Load next two integers of y.
4572   sldi(tmp, idx, LogBytesPerInt);
4573   ldx(y_idx, y, tmp);
4574 #ifdef VM_LITTLE_ENDIAN
4575   rldicl(y_idx, y_idx, 32, 0);         // Swap 32 bit halves (see above).
4576 #endif
4577
4578
4579   bind(L_multiply);
4580   multiply64(product_high, product, x_xstart, y_idx);
4581
4582   li(tmp, 0);
4583   addc(product, product, carry);         // Add carry to result.
4584   adde(product_high, product_high, tmp); // Add carry of the last addition.
4585   addi(kdx, kdx, -2);
4586
4587   // Store result.
4588 #ifdef VM_LITTLE_ENDIAN
4589   rldicl(product, product, 32, 0);     // Swap 32 bit halves back before storing.
4590 #endif
4591   sldi(tmp, kdx, LogBytesPerInt);
4592   stdx(product, z, tmp);
4593   mr_if_needed(carry, product_high);   // High 64 bits become the next carry.
4594   b(L_first_loop);
4595
4596
4597   bind(L_one_y); // Load one 32 bit portion of y as (0,value).
4598
4599   lwz(y_idx, 0, y);
4600   b(L_multiply);
4601
4602
4603   bind(L_one_x); // Load one 32 bit portion of x as (0,value).
4604
4605   lwz(x_xstart, 0, x);
4606   b(L_first_loop);
4607
4608   bind(L_first_loop_exit);
4609 }
4610 
4611 // Multiply 64 bit by 64 bit and add 128 bit.
     // Emits one step: loads 64 bits of y and of z at byte offset
     // (idx << LogBytesPerInt) + offset, computes y_part * x_xstart + z_part + carry
     // as a 128 bit value in (product_high:product) and stores the low 64 bits back to z
     // at the same offset.
     // Modifies: yz_idx, product_high, product, tmp, and R0 (through add2_with_carry).
     // idx and carry are only read.
4612 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y,
4613                                             Register z, Register yz_idx,
4614                                             Register idx, Register carry,
4615                                             Register product_high, Register product,
4616                                             Register tmp, int offset) {
4617
4618   //  huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
4619   //  z[kdx] = (jlong)product;
4620
4621   sldi(tmp, idx, LogBytesPerInt);      // tmp = byte offset of int index idx.
4622   if (offset) {
4623     addi(tmp, tmp, offset);
4624   }
4625   ldx(yz_idx, y, tmp);
4626 #ifdef VM_LITTLE_ENDIAN
       // Rotate by 32 bits: swaps the two 32 bit halves of the loaded doubleword.
4627   rldicl(yz_idx, yz_idx, 32, 0);
4628 #endif
4629
4630   multiply64(product_high, product, x_xstart, yz_idx);
4631   ldx(yz_idx, z, tmp);                 // yz_idx is reused for the z operand.
4632 #ifdef VM_LITTLE_ENDIAN
4633   rldicl(yz_idx, yz_idx, 32, 0);
4634 #endif
4635
4636   add2_with_carry(product_high, product, carry, yz_idx);
4637
       // Recompute the byte offset for the store.
4638   sldi(tmp, idx, LogBytesPerInt);
4639   if (offset) {
4640     addi(tmp, tmp, offset);
4641   }
4642 #ifdef VM_LITTLE_ENDIAN
4643   rldicl(product, product, 32, 0);     // Swap 32 bit halves back before storing.
4644 #endif
4645   stdx(product, z, tmp);
4646 }
4647 
4648 // Multiply 128 bit by 128 bit. Unrolled inner loop.
     // Emits the inner ("third") loop: each iteration consumes four ints of y/z via two
     // multiply_add_128_x_128 steps; left-over parts (two ints and/or one int) are
     // handled after the loop.
     // Modifies: yz_idx, idx, carry, product_high, product, carry2, tmp, R0, CCR0, CTR.
4649 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart,
4650                                              Register y, Register z,
4651                                              Register yz_idx, Register idx, Register carry,
4652                                              Register product_high, Register product,
4653                                              Register carry2, Register tmp) {
4654
4655   //  jlong carry, x[], y[], z[];
4656   //  int kdx = ystart+1;
4657   //  for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
4658   //    huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
4659   //    z[kdx+idx+1] = (jlong)product;
4660   //    jlong carry2 = (jlong)(product >>> 64);
4661   //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
4662   //    z[kdx+idx] = (jlong)product;
4663   //    carry = (jlong)(product >>> 64);
4664   //  }
4665   //  idx += 2;
4666   //  if (idx > 0) {
4667   //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
4668   //    z[kdx+idx] = (jlong)product;
4669   //    carry = (jlong)(product >>> 64);
4670   //  }
4671
4672   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
       // R0 is only needed until the count is moved to CTR; add2_with_carry
       // overwrites R0 inside the loop.
4673   const Register jdx = R0;
4674
4675   // Scale the index.
4676   srdi_(jdx, idx, 2);                  // Iteration count = idx / 4; also sets CCR0.
4677   beq(CCR0, L_third_loop_exit);        // Fewer than four ints: skip the unrolled loop.
4678   mtctr(jdx);
4679
4680   align(32, 16);
4681   bind(L_third_loop);
4682
4683   addi(idx, idx, -4);
4684
       // Upper pair of ints (byte offset +8); its high part is carried in carry2.
4685   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8);
4686   mr_if_needed(carry2, product_high);
4687
       // Lower pair of ints (byte offset 0); its high part is the carry of the next iteration.
4688   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0);
4689   mr_if_needed(carry, product_high);
4690   bdnz(L_third_loop);
4691
4692   bind(L_third_loop_exit);  // Handle any left-over operand parts.
4693
4694   andi_(idx, idx, 0x3);                // Number of ints left (0..3).
4695   beq(CCR0, L_post_third_loop_done);
4696
4697   Label L_check_1;
4698
4699   addic_(idx, idx, -2);
4700   blt(CCR0, L_check_1);                // Fewer than two ints left.
4701
       // Two ints left: one more 64 bit step.
4702   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0);
4703   mr_if_needed(carry, product_high);
4704
4705   bind(L_check_1);
4706
       // Undo the -2 from above and test whether a single int is left.
4707   addi(idx, idx, 0x2);
4708   andi_(idx, idx, 0x1);
4709   addic_(idx, idx, -1);
4710   blt(CCR0, L_post_third_loop_done);
4711
       // One int left: 32 bit loads of y and z at byte offset idx << LogBytesPerInt.
4712   sldi(tmp, idx, LogBytesPerInt);
4713   lwzx(yz_idx, y, tmp);
4714   multiply64(product_high, product, x_xstart, yz_idx);
4715   lwzx(yz_idx, z, tmp);
4716
4717   add2_with_carry(product_high, product, yz_idx, carry);
4718
4719   sldi(tmp, idx, LogBytesPerInt);
4720   stwx(product, z, tmp);               // Store the low 32 bits of the result.
4721   srdi(product, product, 32);
4722
       // carry = (product_high << 32) | (product >> 32), i.e. the 96 bit result without its lowest word.
4723   sldi(product_high, product_high, 32);
4724   orr(product, product, product_high);
4725   mr_if_needed(carry, product);
4726
4727   bind(L_post_third_loop_done);
4728 }   // multiply_128_x_128_loop
4729 
4730 void MacroAssembler::muladd(Register out, Register in,
4731                             Register offset, Register len, Register k,
4732                             Register tmp1, Register tmp2, Register carry) {
       // Emits a loop that runs len times (counted in CTR). Each iteration loads a
       // 32 bit word from in (starting at byte offset (len-1)*4, moving downwards) and a
       // 32 bit word from out at byte offset `offset` (starting at offset-4, moving
       // downwards), computes in_word * k + out_word + carry, stores the low 32 bits back
       // to out and keeps the bits above bit 31 as the next carry.
       // On exit carry holds the final carry (0 if len <= 0).
       // Modifies: offset, len, tmp1, tmp2, carry, CCR0 and, for len > 0, CTR.
4733
4734   // Labels
4735   Label LOOP, SKIP;
4736
4737   // Make sure length is positive.
4738   cmpdi  (CCR0,    len,     0);
4739
4740   // Prepare variables
4741   subi   (offset,  offset,  4);
4742   li     (carry,   0);
4743   ble    (CCR0,    SKIP);               // Tests the cmpdi above; subi/li leave CCR0 unchanged.
4744
4745   mtctr  (len);                         // Loop count.
4746   subi   (len,     len,     1    );
4747   sldi   (len,     len,     2    );     // len = byte offset of the last int of in.
4748
4749   // Main loop
4750   bind(LOOP);
4751   lwzx   (tmp1,    len,     in   );
4752   lwzx   (tmp2,    offset,  out  );
4753   mulld  (tmp1,    tmp1,    k    );
4754   add    (tmp2,    carry,   tmp2 );
4755   add    (tmp2,    tmp1,    tmp2 );
4756   stwx   (tmp2,    offset,  out  );     // Low 32 bits of the sum.
4757   srdi   (carry,   tmp2,    32   );     // Remaining upper bits.
4758   subi   (offset,  offset,  4    );
4759   subi   (len,     len,     4    );
4760   bdnz   (LOOP);
4761   bind(SKIP);
4762 }
4763 
4764 void MacroAssembler::multiply_to_len(Register x, Register xlen,
4765                                      Register y, Register ylen,
4766                                      Register z, Register zlen,
4767                                      Register tmp1, Register tmp2,
4768                                      Register tmp3, Register tmp4,
4769                                      Register tmp5, Register tmp6,
4770                                      Register tmp7, Register tmp8,
4771                                      Register tmp9, Register tmp10,
4772                                      Register tmp11, Register tmp12,
4773                                      Register tmp13) {
       // Emits code multiplying the int arrays x (xlen ints) and y (ylen ints) into z,
       // following the Java pseudo code quoted in the comments below. The work is done
       // in 64 bit chunks by multiply_64_x_64_loop (first loop) and
       // multiply_128_x_128_loop (nested second/third loops).
       // tmp1..tmp13 are scratch; x, xlen, ylen and z are temporarily modified and
       // restored from save registers inside the second loop.
4774
4775   ShortBranchVerifier sbv(this);
4776
4777   assert_different_registers(x, xlen, y, ylen, z, zlen,
4778                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
4779   assert_different_registers(x, xlen, y, ylen, z, zlen,
4780                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp7);
4781   assert_different_registers(x, xlen, y, ylen, z, zlen,
4782                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp8);
4783
4784   const Register idx = tmp1;
4785   const Register kdx = tmp2;
4786   const Register xstart = tmp3;
4787
4788   const Register y_idx = tmp4;
4789   const Register carry = tmp5;
4790   const Register product = tmp6;
4791   const Register product_high = tmp7;
4792   const Register x_xstart = tmp8;
4793   const Register tmp = tmp9;
4794
4795   // First Loop.
4796   //
4797   //  final static long LONG_MASK = 0xffffffffL;
4798   //  int xstart = xlen - 1;
4799   //  int ystart = ylen - 1;
4800   //  long carry = 0;
4801   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
4802   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
4803   //    z[kdx] = (int)product;
4804   //    carry = product >>> 32;
4805   //  }
4806   //  z[xstart] = (int)carry;
4807
4808   mr_if_needed(idx, ylen);        // idx = ylen
4809   mr_if_needed(kdx, zlen);        // kdx = xlen + ylen
4810   li(carry, 0);                   // carry = 0
4811
4812   Label L_done;
4813
4814   addic_(xstart, xlen, -1);
4815   blt(CCR0, L_done);              // xlen - 1 < 0: nothing to do.
4816
4817   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z,
4818                         carry, product_high, product, idx, kdx, tmp);
4819
4820   Label L_second_loop;
4821
       // Store the carry of the first loop into the remaining one or two ints below kdx.
4822   cmpdi(CCR0, kdx, 0);
4823   beq(CCR0, L_second_loop);       // No int left in front of the stored result.
4824
4825   Label L_carry;
4826
4827   addic_(kdx, kdx, -1);
4828   beq(CCR0, L_carry);             // Exactly one int left.
4829
4830   // Store lower 32 bits of carry.
4831   sldi(tmp, kdx, LogBytesPerInt);
4832   stwx(carry, z, tmp);
4833   srdi(carry, carry, 32);
4834   addi(kdx, kdx, -1);
4835
4836
4837   bind(L_carry);
4838
4839   // Store upper 32 bits of carry.
       // (When reached through the branch above, carry has not been shifted and
       // this stwx stores its low word.)
4840   sldi(tmp, kdx, LogBytesPerInt);
4841   stwx(carry, z, tmp);
4842
4843   // Second and third (nested) loops.
4844   //
4845   //  for (int i = xstart-1; i >= 0; i--) { // Second loop
4846   //    carry = 0;
4847   //    for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
4848   //      long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
4849   //                     (z[k] & LONG_MASK) + carry;
4850   //      z[k] = (int)product;
4851   //      carry = product >>> 32;
4852   //    }
4853   //    z[i] = (int)carry;
4854   //  }
4855   //
4856   //  i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = x_xstart (tmp8)
4857
4858   bind(L_second_loop);
4859
4860   li(carry, 0);                   // carry = 0;
4861
4862   addic_(xstart, xstart, -1);     // i = xstart-1;
4863   blt(CCR0, L_done);
4864
4865   Register zsave = tmp10;
4866
4867   mr(zsave, z);                   // z is advanced below and restored after the third loop.
4868
4869
4870   Label L_last_x;
4871
4872   sldi(tmp, xstart, LogBytesPerInt);
4873   add(z, z, tmp);                 // z = z + k - j
4874   addi(z, z, 4);
4875   addic_(xstart, xstart, -1);     // i = xstart-1;
4876   blt(CCR0, L_last_x);            // Only a single int of x is left.
4877
       // Load the next two ints of x as one 64 bit value.
4878   sldi(tmp, xstart, LogBytesPerInt);
4879   ldx(x_xstart, x, tmp);
4880 #ifdef VM_LITTLE_ENDIAN
4881   rldicl(x_xstart, x_xstart, 32, 0);   // Rotate by 32 bits: swaps the two 32 bit halves.
4882 #endif
4883
4884
4885   Label L_third_loop_prologue;
4886
4887   bind(L_third_loop_prologue);
4888
4889   Register xsave = tmp11;
4890   Register xlensave = tmp12;
4891   Register ylensave = tmp13;
4892
       // The call below uses ylen as its idx and x as its carry2 register,
       // so both are saved here and restored afterwards.
4893   mr(xsave, x);
4894   mr(xlensave, xstart);
4895   mr(ylensave, ylen);
4896
4897
4898   multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen,
4899                           carry, product_high, product, x, tmp);
4900
4901   mr(z, zsave);
4902   mr(x, xsave);
4903   mr(xlen, xlensave);   // This is the decrement of the loop counter!
4904   mr(ylen, ylensave);
4905
       // Store the 64 bit carry into z[xlen+1] (low word) and z[xlen] (high word).
       // tmp3 is the same register as xstart.
4906   addi(tmp3, xlen, 1);
4907   sldi(tmp, tmp3, LogBytesPerInt);
4908   stwx(carry, z, tmp);
4909   addic_(tmp3, tmp3, -1);
4910   blt(CCR0, L_done);
4911
4912   srdi(carry, carry, 32);
4913   sldi(tmp, tmp3, LogBytesPerInt);
4914   stwx(carry, z, tmp);
4915   b(L_second_loop);
4916
4917   // Next infrequent code is moved outside loops.
4918   bind(L_last_x);
4919
       // Load the single remaining int of x as (0,value).
4920   lwz(x_xstart, 0, x);
4921   b(L_third_loop_prologue);
4922
4923   bind(L_done);
4924 }   // multiply_to_len
4925 
4926 void MacroAssembler::asm_assert(bool check_equal, const char *msg, int id) {
       // ASSERT builds only: emits a check of the condition currently held in CCR0.
       // With check_equal the generated code continues if CCR0 is "equal", otherwise
       // if it is "not equal"; in the other case it runs into stop(msg, id).
       // Without ASSERT nothing is emitted.
4927 #ifdef ASSERT
4928   Label ok;
4929   if (check_equal) {
4930     beq(CCR0, ok);
4931   } else {
4932     bne(CCR0, ok);
4933   }
4934   stop(msg, id);
4935   bind(ok);
4936 #endif
4937 }
4938 
4939 void MacroAssembler::asm_assert_mems_zero(bool check_equal, int size, int mem_offset,
4940                                           Register mem_base, const char* msg, int id) {
       // ASSERT builds only: emits a load of the 4 or 8 byte value at
       // mem_base + mem_offset, compares it with 0 and passes the result to asm_assert
       // (check_equal == true: the value must be zero; false: it must be non-zero).
       // Any other size is rejected at code generation time via ShouldNotReachHere().
       // KILL: R0, CCR0.
4941 #ifdef ASSERT
4942   switch (size) {
4943     case 4:
4944       lwz(R0, mem_offset, mem_base);
4945       cmpwi(CCR0, R0, 0);
4946       break;
4947     case 8:
4948       ld(R0, mem_offset, mem_base);
4949       cmpdi(CCR0, R0, 0);
4950       break;
4951     default:
4952       ShouldNotReachHere();
4953   }
4954   asm_assert(check_equal, msg, id);
4955 #endif // ASSERT
4956 }
4957 
4958 void MacroAssembler::verify_thread() {
       // Thread verification is not implemented here: if the VerifyThread flag is
       // set, unimplemented() is called; otherwise this does nothing.
4959   if (VerifyThread) {
4960     unimplemented("'VerifyThread' currently not implemented on PPC");
4961   }
4962 }
4963 
4964 // READ: oop. KILL: R0. Volatile floats perhaps.
     // If VerifyOops is set, emits a call to the routine whose entry address is stored at
     // StubRoutines::verify_oop_subroutine_entry_address(), passing msg in R3_ARG1 and
     // the oop in R4_ARG2. Volatile GPRs, LR and CR are saved before and restored after
     // the call. Emits nothing if VerifyOops is off.
4965 void MacroAssembler::verify_oop(Register oop, const char* msg) {
4966   if (!VerifyOops) {
4967     return;
4968   }
4969
4970   address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
4971   const Register tmp = R11; // Will be preserved.
4972   const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
4973   save_volatile_gprs(R1_SP, -nbytes_save); // except R0
4974
4975   mr_if_needed(R4_ARG2, oop);
4976   save_LR_CR(tmp); // save in old frame
       // The new frame is sized to cover the register save area written below R1_SP above.
4977   push_frame_reg_args(nbytes_save, tmp);
4978   // load FunctionDescriptor** / entry_address *
4979   load_const_optimized(tmp, fd, R0);
4980   // load FunctionDescriptor* / entry_address
4981   ld(tmp, 0, tmp);
4982   load_const_optimized(R3_ARG1, (address)msg, R0);
4983   // Call destination for its side effect.
4984   call_c(tmp);
4985
4986   pop_frame();
4987   restore_LR_CR(tmp);
4988   restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
4989 }
4990 
4991 void MacroAssembler::verify_oop_addr(RegisterOrConstant offs, Register base, const char* msg) {
       // Variant of verify_oop for an oop held in memory: the value to check is loaded
       // from base + offs into R4_ARG2. The rest of the sequence (save registers, push
       // frame, call the verify routine with msg in R3_ARG1, restore) is the same as in
       // verify_oop. Emits nothing if VerifyOops is off. KILL: R0.
4992   if (!VerifyOops) {
4993     return;
4994   }
4995
4996   address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
4997   const Register tmp = R11; // Will be preserved.
4998   const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
4999   save_volatile_gprs(R1_SP, -nbytes_save); // except R0
5000
       // Load after the save above, so that overwriting R4_ARG2 is undone by the restore at the end.
5001   ld(R4_ARG2, offs, base);
5002   save_LR_CR(tmp); // save in old frame
5003   push_frame_reg_args(nbytes_save, tmp);
5004   // load FunctionDescriptor** / entry_address *
5005   load_const_optimized(tmp, fd, R0);
5006   // load FunctionDescriptor* / entry_address
5007   ld(tmp, 0, tmp);
5008   load_const_optimized(R3_ARG1, (address)msg, R0);
5009   // Call destination for its side effect.
5010   call_c(tmp);
5011
5012   pop_frame();
5013   restore_LR_CR(tmp);
5014   restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
5015 }
5016 
     // Printable names of the stop types, indexed by the stop type number.
     // Both users in this file reduce the index modulo the number of entries
     // (stop_end in MacroAssembler::stop, the literal 4 in stop_on_request).
5017 const char* stop_types[] = {
5018   "stop",
5019   "untested",
5020   "unimplemented",
5021   "shouldnotreachhere"
5022 };
5023 
     // Runtime target of the call emitted by MacroAssembler::stop: prints the stop type
     // name and msg to tty and then fails unconditionally via guarantee(false, ...).
     // NOTE(review): tp is reduced with '%' only, so a negative tp would index
     // stop_types out of bounds - presumably callers only pass non-negative types.
5024 static void stop_on_request(int tp, const char* msg) {
5025   tty->print("PPC assembly code requires stop: (%s) %s\n", stop_types[tp%/*stop_end*/4], msg);
5026   guarantee(false, "PPC assembly code requires stop: %s", msg);
5027 }
5028 
5029 // Call a C-function that prints output.
     // Emits a leaf call to stop_on_request(type, msg), followed by an illtrap
     // instruction and the 32 bit value id embedded in the instruction stream.
     // Overwrites R3_ARG1, R4_ARG2 and R0.
5030 void MacroAssembler::stop(int type, const char* msg, int id) {
       // Non-product builds include the stop type name and the message in the block comment.
5031 #ifndef PRODUCT
5032   block_comment(err_msg("stop: %s %s {", stop_types[type%stop_end], msg));
5033 #else
5034   block_comment("stop {");
5035 #endif
5036
5037   // setup arguments
5038   load_const_optimized(R3_ARG1, type);
5039   load_const_optimized(R4_ARG2, (void *)msg, /*tmp=*/R0);
5040   call_VM_leaf(CAST_FROM_FN_PTR(address, stop_on_request), R3_ARG1, R4_ARG2);
5041   illtrap();
5042   emit_int32(id);
5043   block_comment("} stop;");
5044 }
5045 
5046 #ifndef PRODUCT
5047 // Write pattern 0x0101010101010101 in memory region [low-before, high+after].
5048 // Val, addr are temp registers.
5049 // If low == addr, addr is killed.
5050 // High is preserved.
     // before and after are counted in words (they are scaled by BytesPerWord).
     // Emits nothing unless ZapMemory is set. The loop variant uses CCR6.
5051 void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) {
5052   if (!ZapMemory) return;
5053
5054   assert_different_registers(low, val);
5055
5056   BLOCK_COMMENT("zap memory region {");
5057   load_const_optimized(val, 0x0101010101010101);
5058   int size = before + after;
5059   if (low == high && size < 5 && size > 0) {
         // Small region with a single base register: emit up to four stores, no loop.
         // NOTE(review): this writes `size` words starting at low - before words, i.e. up
         // to but excluding low + after words, while the loop variant below also writes
         // the word at high + after words - confirm which bound is intended.
5060     int offset = -before*BytesPerWord;
5061     for (int i = 0; i < size; ++i) {
5062       std(val, offset, low);
5063       offset += (1*BytesPerWord);
5064     }
5065   } else {
         // General case: addr walks from low - before words up to and including the
         // (temporarily adjusted) high; addr is overwritten here.
5066     addi(addr, low, -before*BytesPerWord);
5067     assert_different_registers(high, val);
5068     if (after) addi(high, high, after * BytesPerWord);
5069     Label loop;
5070     bind(loop);
5071     std(val, 0, addr);
5072     addi(addr, addr, 8);
5073     cmpd(CCR6, addr, high);
5074     ble(CCR6, loop);
5075     if (after) addi(high, high, -after * BytesPerWord);  // Correct back to old value.
5076   }
5077   BLOCK_COMMENT("} zap memory region");
5078 }
5079 
5080 #endif // !PRODUCT
5081 
5082 void SkipIfEqualZero::skip_to_label_if_equal_zero(MacroAssembler* masm, Register temp,
5083                                                   const bool* flag_addr, Label& label) {
       // Emits code that loads the byte at flag_addr and branches to label if it is zero.
       // The flag is read when the generated code runs, not at code generation time.
       // KILL: temp, R0, CCR0.
       // NOTE(review): load_const_optimized is presumably asked (last argument true) to
       // return the low part of the address as a 16 bit displacement, which the lbz
       // below then applies - confirm against its declaration.
5084   int simm16_offset = masm->load_const_optimized(temp, (address)flag_addr, R0, true);
5085   assert(sizeof(bool) == 1, "PowerPC ABI");
5086   masm->lbz(temp, simm16_offset, temp);
5087   masm->cmpwi(CCR0, temp, 0);
5088   masm->beq(CCR0, label);
5089 }
5090 
5091 SkipIfEqualZero::SkipIfEqualZero(MacroAssembler* masm, Register temp, const bool* flag_addr) : _masm(masm), _label() {
       // Emits the "branch to _label if *flag_addr == 0" sequence; _label is bound in the destructor.
5092   skip_to_label_if_equal_zero(masm, temp, flag_addr, _label);
5093 }
5094 
5095 SkipIfEqualZero::~SkipIfEqualZero() {
       // Binds the skip target here, so the code emitted during this object's lifetime
       // is what gets jumped over when the flag is zero.
5096   _masm->bind(_label);
5097 }
--- EOF ---