1 /*
   2  * Copyright (c) 1997, 2016, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright 2012, 2015 SAP AG. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.inline.hpp"
  28 #include "compiler/disassembler.hpp"
  29 #include "gc/shared/cardTableModRefBS.hpp"
  30 #include "gc/shared/collectedHeap.inline.hpp"
  31 #include "interpreter/interpreter.hpp"
  32 #include "memory/resourceArea.hpp"
  33 #include "nativeInst_ppc.hpp"
  34 #include "prims/methodHandles.hpp"
  35 #include "runtime/biasedLocking.hpp"
  36 #include "runtime/icache.hpp"
  37 #include "runtime/interfaceSupport.hpp"
  38 #include "runtime/objectMonitor.hpp"
  39 #include "runtime/os.hpp"
  40 #include "runtime/sharedRuntime.hpp"
  41 #include "runtime/stubRoutines.hpp"
  42 #include "utilities/macros.hpp"
  43 #if INCLUDE_ALL_GCS
  44 #include "gc/g1/g1CollectedHeap.inline.hpp"
  45 #include "gc/g1/g1SATBCardTableModRefBS.hpp"
  46 #include "gc/g1/heapRegion.hpp"
  47 #endif // INCLUDE_ALL_GCS
  48 
  49 #ifdef PRODUCT
  50 #define BLOCK_COMMENT(str) // nothing
  51 #else
  52 #define BLOCK_COMMENT(str) block_comment(str)
  53 #endif
  54 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  55 
  56 #ifdef ASSERT
  57 // On RISC, there's no benefit to verifying instruction boundaries.
  58 bool AbstractAssembler::pd_check_instruction_mark() { return false; }
  59 #endif
  60 
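// A short sketch of the hi/lo offset split used below (assuming the usual
// largeoffset_si16_si16_hi/_lo helpers, which choose the high half so that
// the low half fits a signed 16-bit immediate):
//   si31 = 0x12345  =>  hi = 0x1, lo =  0x2345   ((0x1 << 16) + 0x2345 == 0x12345)
//   si31 = 0x1A345  =>  hi = 0x2, lo = -0x5CBB   ((0x2 << 16) - 0x5CBB == 0x1A345)
// yielding the two-instruction form:
//   addis d, a, hi
//   ld    d, lo(d)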
  61 void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) {
  62   assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range");
  63   if (Assembler::is_simm(si31, 16)) {
  64     ld(d, si31, a);
  65     if (emit_filler_nop) nop();
  66   } else {
  67     const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31);
  68     const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31);
  69     addis(d, a, hi);
  70     ld(d, lo, d);
  71   }
  72 }
  73 
  74 void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) {
  75   assert_different_registers(d, a);
  76   ld_largeoffset_unchecked(d, si31, a, emit_filler_nop);
  77 }
  78 
  79 void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base,
  80                                       size_t size_in_bytes, bool is_signed) {
  81   switch (size_in_bytes) {
  82   case  8:              ld(dst, offs, base);                         break;
  83   case  4:  is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break;
  84   case  2:  is_signed ? lha(dst, offs, base) : lhz(dst, offs, base); break;
  85   case  1:  lbz(dst, offs, base); if (is_signed) extsb(dst, dst);    break; // lba doesn't exist :(
  86   default:  ShouldNotReachHere();
  87   }
  88 }
  89 
  90 void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base,
  91                                        size_t size_in_bytes) {
  92   switch (size_in_bytes) {
  93   case  8:  std(dst, offs, base); break;
  94   case  4:  stw(dst, offs, base); break;
  95   case  2:  sth(dst, offs, base); break;
  96   case  1:  stb(dst, offs, base); break;
  97   default:  ShouldNotReachHere();
  98   }
  99 }
 100 
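// A short worked example (hypothetical offsets) for the alignment helper
// below: with modulus == 16, rem == 0 and offset() % 16 == 4, the padding is
// (0 + 16 - 4) % 16 == 12 bytes, i.e. three nops -- provided 12 <= max,
// otherwise nothing is emitted.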
 101 void MacroAssembler::align(int modulus, int max, int rem) {
 102   int padding = (rem + modulus - (offset() % modulus)) % modulus;
 103   if (padding > max) return;
 104   for (int c = (padding >> 2); c > 0; --c) { nop(); }
 105 }
 106 
 107 // Issue instructions that calculate given TOC from global TOC.
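// A minimal sketch of the emitted sequence, assuming both halves (hi16 and
// lo16) are requested and 'off' is the offset of addr from the global TOC:
//   addis dst, R29_TOC, largeoffset_si16_si16_hi(off)
//   addi  dst, dst,     largeoffset_si16_si16_lo(off)   // relocated if requested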
 108 void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16,
 109                                                        bool add_relocation, bool emit_dummy_addr) {
 110   int offset = -1;
 111   if (emit_dummy_addr) {
 112     offset = -128; // dummy address
 113   } else if (addr != (address)(intptr_t)-1) {
 114     offset = MacroAssembler::offset_to_global_toc(addr);
 115   }
 116 
 117   if (hi16) {
 118     addis(dst, R29_TOC, MacroAssembler::largeoffset_si16_si16_hi(offset));
 119   }
 120   if (lo16) {
 121     if (add_relocation) {
 122       // Relocate at the addi to avoid confusion with a load from the method's TOC.
 123       relocate(internal_word_Relocation::spec(addr));
 124     }
 125     addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset));
 126   }
 127 }
 128 
 129 int MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) {
 130   const int offset = MacroAssembler::offset_to_global_toc(addr);
 131 
 132   const address inst2_addr = a;
 133   const int inst2 = *(int *)inst2_addr;
 134 
 135   // The relocation points to the second instruction, the addi,
 136   // and the addi reads and writes the same register dst.
 137   const int dst = inv_rt_field(inst2);
 138   assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");
 139 
 140   // Now, find the preceding addis which writes to dst.
 141   int inst1 = 0;
 142   address inst1_addr = inst2_addr - BytesPerInstWord;
 143   while (inst1_addr >= bound) {
 144     inst1 = *(int *) inst1_addr;
 145     if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
 146       // Stop, found the addis which writes dst.
 147       break;
 148     }
 149     inst1_addr -= BytesPerInstWord;
 150   }
 151 
 152   assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
 153   set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset));
 154   set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset));
 155   return (int)((intptr_t)addr - (intptr_t)inst1_addr);
 156 }
 157 
 158 address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) {
 159   const address inst2_addr = a;
 160   const int inst2 = *(int *)inst2_addr;
 161 
 162   // The relocation points to the second instruction, the addi,
 163   // and the addi reads and writes the same register dst.
 164   const int dst = inv_rt_field(inst2);
 165   assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");
 166 
 167   // Now, find the preceding addis which writes to dst.
 168   int inst1 = 0;
 169   address inst1_addr = inst2_addr - BytesPerInstWord;
 170   while (inst1_addr >= bound) {
 171     inst1 = *(int *) inst1_addr;
 172     if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
 173       // stop, found the addis which writes dst
 174       break;
 175     }
 176     inst1_addr -= BytesPerInstWord;
 177   }
 178 
 179   assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
 180 
 181   int offset = (get_imm(inst1_addr, 0) << 16) + get_imm(inst2_addr, 0);
 182   // -1 is a special case
 183   if (offset == -1) {
 184     return (address)(intptr_t)-1;
 185   } else {
 186     return global_toc() + offset;
 187   }
 188 }
 189 
 190 #ifdef _LP64
// Patch compressed oops or klass constants.
// The assembler sequence is
// 1) compressed oops:
//    lis    rx = const.hi
//    ori    rx = rx | const.lo
// 2) compressed klass:
//    lis    rx = const.hi
//    clrldi rx = rx & 0xFFFFffff // clearMS32b, optional
//    ori    rx = rx | const.lo
// The clrldi, if present, is simply skipped over.
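// A worked example (hypothetical value): patching the narrow oop 0x12345678
// below writes 0x1234 into the lis immediate and 0x5678 into the ori
// immediate; an intervening clrldi, if present, is left untouched.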
 201 int MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) {
 202   assert(UseCompressedOops, "Should only patch compressed oops");
 203 
 204   const address inst2_addr = a;
 205   const int inst2 = *(int *)inst2_addr;
 206 
 207   // The relocation points to the second instruction, the ori,
 208   // and the ori reads and writes the same register dst.
 209   const int dst = inv_rta_field(inst2);
 210   assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
 211   // Now, find the preceding addis which writes to dst.
 212   int inst1 = 0;
 213   address inst1_addr = inst2_addr - BytesPerInstWord;
 214   bool inst1_found = false;
 215   while (inst1_addr >= bound) {
 216     inst1 = *(int *)inst1_addr;
 217     if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
 218     inst1_addr -= BytesPerInstWord;
 219   }
 220   assert(inst1_found, "inst is not lis");
 221 
 222   int xc = (data >> 16) & 0xffff;
 223   int xd = (data >>  0) & 0xffff;
 224 
 225   set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo
 226   set_imm((int *)inst2_addr,        (xd)); // unsigned int
 227   return (int)((intptr_t)inst2_addr - (intptr_t)inst1_addr);
 228 }
 229 
 230 // Get compressed oop or klass constant.
 231 narrowOop MacroAssembler::get_narrow_oop(address a, address bound) {
 232   assert(UseCompressedOops, "Should only patch compressed oops");
 233 
 234   const address inst2_addr = a;
 235   const int inst2 = *(int *)inst2_addr;
 236 
 237   // The relocation points to the second instruction, the ori,
 238   // and the ori reads and writes the same register dst.
 239   const int dst = inv_rta_field(inst2);
 240   assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
 241   // Now, find the preceding lis which writes to dst.
 242   int inst1 = 0;
 243   address inst1_addr = inst2_addr - BytesPerInstWord;
 244   bool inst1_found = false;
 245 
 246   while (inst1_addr >= bound) {
 247     inst1 = *(int *) inst1_addr;
 248     if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break;}
 249     inst1_addr -= BytesPerInstWord;
 250   }
 251   assert(inst1_found, "inst is not lis");
 252 
 253   uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff));
 254   uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16);
 255 
 256   return (int) (xl | xh);
 257 }
 258 #endif // _LP64
 259 
 260 // Returns true if successful.
 261 bool MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a,
 262                                                 Register toc, bool fixed_size) {
 263   int toc_offset = 0;
 264   // Use RelocationHolder::none for the constant pool entry, otherwise
 265   // we will end up with a failing NativeCall::verify(x) where x is
 266   // the address of the constant pool entry.
 267   // FIXME: We should insert relocation information for oops at the constant
 268   // pool entries instead of inserting it at the loads; patching of a constant
 269   // pool entry should be less expensive.
 270   address const_address = address_constant((address)a.value(), RelocationHolder::none);
 271   if (const_address == NULL) { return false; } // allocation failure
 272   // Relocate at the pc of the load.
 273   relocate(a.rspec());
 274   toc_offset = (int)(const_address - code()->consts()->start());
 275   ld_largeoffset_unchecked(dst, toc_offset, toc, fixed_size);
 276   return true;
 277 }
 278 
 279 bool MacroAssembler::is_load_const_from_method_toc_at(address a) {
 280   const address inst1_addr = a;
 281   const int inst1 = *(int *)inst1_addr;
 282 
  // The relocation points to the ld or the addis.
  return (is_ld(inst1)) ||
         (is_addis(inst1) && inv_ra_field(inst1) != 0);
 286 }
 287 
 288 int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) {
 289   assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc");
 290 
 291   const address inst1_addr = a;
 292   const int inst1 = *(int *)inst1_addr;
 293 
 294   if (is_ld(inst1)) {
 295     return inv_d1_field(inst1);
 296   } else if (is_addis(inst1)) {
 297     const int dst = inv_rt_field(inst1);
 298 
 299     // Now, find the succeeding ld which reads and writes to dst.
 300     address inst2_addr = inst1_addr + BytesPerInstWord;
 301     int inst2 = 0;
 302     while (true) {
 303       inst2 = *(int *) inst2_addr;
 304       if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) {
 305         // Stop, found the ld which reads and writes dst.
 306         break;
 307       }
 308       inst2_addr += BytesPerInstWord;
 309     }
 310     return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2);
 311   }
 312   ShouldNotReachHere();
 313   return 0;
 314 }
 315 
 316 // Get the constant from a `load_const' sequence.
 317 long MacroAssembler::get_const(address a) {
 318   assert(is_load_const_at(a), "not a load of a constant");
 319   const int *p = (const int*) a;
 320   unsigned long x = (((unsigned long) (get_imm(a,0) & 0xffff)) << 48);
 321   if (is_ori(*(p+1))) {
 322     x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 32);
 323     x |= (((unsigned long) (get_imm(a,3) & 0xffff)) << 16);
 324     x |= (((unsigned long) (get_imm(a,4) & 0xffff)));
 325   } else if (is_lis(*(p+1))) {
 326     x |= (((unsigned long) (get_imm(a,2) & 0xffff)) << 32);
 327     x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 16);
 328     x |= (((unsigned long) (get_imm(a,3) & 0xffff)));
 329   } else {
 330     ShouldNotReachHere();
 331     return (long) 0;
 332   }
 333   return (long) x;
 334 }
 335 
// Patch the 64-bit constant of a `load_const' sequence. This is a
// low-level procedure: it neither flushes the instruction cache nor
// is it MT-safe.
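// For reference, a sketch of the two load_const shapes assumed here, by
// instruction slot and the constant bits each 16-bit immediate carries:
//   1) slot0 lis  -> x[63:48], slot1 ori -> x[47:32], slot2 sldi (no imm),
//      slot3 oris -> x[31:16], slot4 ori -> x[15:0]
//   2) slot0 lis  -> x[63:48], slot1 lis -> x[31:16], slot2 ori -> x[47:32],
//      slot3 ori  -> x[15:0],  slot4 merges the two halves
// patch_const() below only rewrites those 16-bit immediates, matching get_const() above.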
 339 void MacroAssembler::patch_const(address a, long x) {
 340   assert(is_load_const_at(a), "not a load of a constant");
 341   int *p = (int*) a;
 342   if (is_ori(*(p+1))) {
 343     set_imm(0 + p, (x >> 48) & 0xffff);
 344     set_imm(1 + p, (x >> 32) & 0xffff);
 345     set_imm(3 + p, (x >> 16) & 0xffff);
 346     set_imm(4 + p, x & 0xffff);
 347   } else if (is_lis(*(p+1))) {
 348     set_imm(0 + p, (x >> 48) & 0xffff);
 349     set_imm(2 + p, (x >> 32) & 0xffff);
 350     set_imm(1 + p, (x >> 16) & 0xffff);
 351     set_imm(3 + p, x & 0xffff);
 352   } else {
 353     ShouldNotReachHere();
 354   }
 355 }
 356 
 357 AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) {
 358   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
 359   int index = oop_recorder()->allocate_metadata_index(obj);
 360   RelocationHolder rspec = metadata_Relocation::spec(index);
 361   return AddressLiteral((address)obj, rspec);
 362 }
 363 
 364 AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) {
 365   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
 366   int index = oop_recorder()->find_index(obj);
 367   RelocationHolder rspec = metadata_Relocation::spec(index);
 368   return AddressLiteral((address)obj, rspec);
 369 }
 370 
 371 AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) {
 372   assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
 373   int oop_index = oop_recorder()->allocate_oop_index(obj);
 374   return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
 375 }
 376 
 377 AddressLiteral MacroAssembler::constant_oop_address(jobject obj) {
 378   assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
 379   int oop_index = oop_recorder()->find_index(obj);
 380   return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
 381 }
 382 
 383 RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
 384                                                       Register tmp, int offset) {
 385   intptr_t value = *delayed_value_addr;
 386   if (value != 0) {
 387     return RegisterOrConstant(value + offset);
 388   }
 389 
 390   // Load indirectly to solve generation ordering problem.
 391   // static address, no relocation
 392   int simm16_offset = load_const_optimized(tmp, delayed_value_addr, noreg, true);
 393   ld(tmp, simm16_offset, tmp); // must be aligned ((xa & 3) == 0)
 394 
 395   if (offset != 0) {
 396     addi(tmp, tmp, offset);
 397   }
 398 
 399   return RegisterOrConstant(tmp);
 400 }
 401 
 402 #ifndef PRODUCT
 403 void MacroAssembler::pd_print_patched_instruction(address branch) {
 404   Unimplemented(); // TODO: PPC port
 405 }
 406 #endif // ndef PRODUCT
 407 
 408 // Conditional far branch for destinations encodable in 24+2 bits.
 409 void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) {
 410 
 411   // If requested by flag optimize, relocate the bc_far as a
 412   // runtime_call and prepare for optimizing it when the code gets
 413   // relocated.
 414   if (optimize == bc_far_optimize_on_relocate) {
 415     relocate(relocInfo::runtime_call_type);
 416   }
 417 
 418   // variant 2:
 419   //
 420   //    b!cxx SKIP
 421   //    bxx   DEST
 422   //  SKIP:
 423   //
 424 
 425   const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
 426                                                 opposite_bcond(inv_boint_bcond(boint)));
 427 
 428   // We emit two branches.
 429   // First, a conditional branch which jumps around the far branch.
 430   const address not_taken_pc = pc() + 2 * BytesPerInstWord;
 431   const address bc_pc        = pc();
 432   bc(opposite_boint, biint, not_taken_pc);
 433 
 434   const int bc_instr = *(int*)bc_pc;
 435   assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition");
 436   assert(opposite_boint == inv_bo_field(bc_instr), "postcondition");
 437   assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))),
 438                                      opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))),
 439          "postcondition");
 440   assert(biint == inv_bi_field(bc_instr), "postcondition");
 441 
 442   // Second, an unconditional far branch which jumps to dest.
 443   // Note: target(dest) remembers the current pc (see CodeSection::target)
 444   //       and returns the current pc if the label is not bound yet; when
 445   //       the label gets bound, the unconditional far branch will be patched.
 446   const address target_pc = target(dest);
 447   const address b_pc  = pc();
 448   b(target_pc);
 449 
 450   assert(not_taken_pc == pc(),                     "postcondition");
 451   assert(dest.is_bound() || target_pc == b_pc, "postcondition");
 452 }
 453 
 454 // 1 or 2 instructions
 455 void MacroAssembler::bc_far_optimized(int boint, int biint, Label& dest) {
 456   if (dest.is_bound() && is_within_range_of_bcxx(target(dest), pc())) {
 457     bc(boint, biint, dest);
 458   } else {
 459     bc_far(boint, biint, dest, MacroAssembler::bc_far_optimize_on_relocate);
 460   }
 461 }
 462 
 463 bool MacroAssembler::is_bc_far_at(address instruction_addr) {
 464   return is_bc_far_variant1_at(instruction_addr) ||
 465          is_bc_far_variant2_at(instruction_addr) ||
 466          is_bc_far_variant3_at(instruction_addr);
 467 }
 468 
 469 address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) {
 470   if (is_bc_far_variant1_at(instruction_addr)) {
 471     const address instruction_1_addr = instruction_addr;
 472     const int instruction_1 = *(int*)instruction_1_addr;
 473     return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr);
 474   } else if (is_bc_far_variant2_at(instruction_addr)) {
 475     const address instruction_2_addr = instruction_addr + 4;
 476     return bxx_destination(instruction_2_addr);
 477   } else if (is_bc_far_variant3_at(instruction_addr)) {
 478     return instruction_addr + 8;
 479   }
 480   // variant 4 ???
 481   ShouldNotReachHere();
 482   return NULL;
 483 }

void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) {
 485 
 486   if (is_bc_far_variant3_at(instruction_addr)) {
 487     // variant 3, far cond branch to the next instruction, already patched to nops:
 488     //
 489     //    nop
 490     //    endgroup
 491     //  SKIP/DEST:
 492     //
 493     return;
 494   }
 495 
 496   // first, extract boint and biint from the current branch
 497   int boint = 0;
 498   int biint = 0;
 499 
 500   ResourceMark rm;
 501   const int code_size = 2 * BytesPerInstWord;
 502   CodeBuffer buf(instruction_addr, code_size);
 503   MacroAssembler masm(&buf);
 504   if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) {
 505     // Far branch to next instruction: Optimize it by patching nops (produce variant 3).
 506     masm.nop();
 507     masm.endgroup();
 508   } else {
 509     if (is_bc_far_variant1_at(instruction_addr)) {
 510       // variant 1, the 1st instruction contains the destination address:
 511       //
 512       //    bcxx  DEST
 513       //    nop
 514       //
 515       const int instruction_1 = *(int*)(instruction_addr);
 516       boint = inv_bo_field(instruction_1);
 517       biint = inv_bi_field(instruction_1);
 518     } else if (is_bc_far_variant2_at(instruction_addr)) {
 519       // variant 2, the 2nd instruction contains the destination address:
 520       //
 521       //    b!cxx SKIP
 522       //    bxx   DEST
 523       //  SKIP:
 524       //
 525       const int instruction_1 = *(int*)(instruction_addr);
 526       boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))),
 527           opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1))));
 528       biint = inv_bi_field(instruction_1);
 529     } else {
 530       // variant 4???
 531       ShouldNotReachHere();
 532     }
 533 
 534     // second, set the new branch destination and optimize the code
 535     if (dest != instruction_addr + 4 && // the bc_far is still unbound!
 536         masm.is_within_range_of_bcxx(dest, instruction_addr)) {
 537       // variant 1:
 538       //
 539       //    bcxx  DEST
 540       //    nop
 541       //
 542       masm.bc(boint, biint, dest);
 543       masm.nop();
 544     } else {
 545       // variant 2:
 546       //
 547       //    b!cxx SKIP
 548       //    bxx   DEST
 549       //  SKIP:
 550       //
 551       const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
 552                                                     opposite_bcond(inv_boint_bcond(boint)));
 553       const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord;
 554       masm.bc(opposite_boint, biint, not_taken_pc);
 555       masm.b(dest);
 556     }
 557   }
 558   ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
 559 }
 560 
 561 // Emit a NOT mt-safe patchable 64 bit absolute call/jump.
 562 void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) {
 563   // get current pc
 564   uint64_t start_pc = (uint64_t) pc();
 565 
 566   const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last
 567   const address pc_of_b  = (address) (start_pc + (0*BytesPerInstWord)); // b is first
 568 
 569   // relocate here
 570   if (rt != relocInfo::none) {
 571     relocate(rt);
 572   }
 573 
 574   if ( ReoptimizeCallSequences &&
 575        (( link && is_within_range_of_b(dest, pc_of_bl)) ||
 576         (!link && is_within_range_of_b(dest, pc_of_b)))) {
 577     // variant 2:
 578     // Emit an optimized, pc-relative call/jump.
 579 
 580     if (link) {
 581       // some padding
 582       nop();
 583       nop();
 584       nop();
 585       nop();
 586       nop();
 587       nop();
 588 
 589       // do the call
 590       assert(pc() == pc_of_bl, "just checking");
 591       bl(dest, relocInfo::none);
 592     } else {
 593       // do the jump
 594       assert(pc() == pc_of_b, "just checking");
 595       b(dest, relocInfo::none);
 596 
 597       // some padding
 598       nop();
 599       nop();
 600       nop();
 601       nop();
 602       nop();
 603       nop();
 604     }
 605 
 606     // Assert that we can identify the emitted call/jump.
 607     assert(is_bxx64_patchable_variant2_at((address)start_pc, link),
 608            "can't identify emitted call");
 609   } else {
 610     // variant 1:
 611     mr(R0, R11);  // spill R11 -> R0.
 612 
 613     // Load the destination address into CTR,
 614     // calculate destination relative to global toc.
 615     calculate_address_from_global_toc(R11, dest, true, true, false);
 616 
 617     mtctr(R11);
    mr(R11, R0);  // restore R11 <- R0.
 619     nop();
 620 
 621     // do the call/jump
 622     if (link) {
 623       bctrl();
    } else {
 625       bctr();
 626     }
 627     // Assert that we can identify the emitted call/jump.
 628     assert(is_bxx64_patchable_variant1b_at((address)start_pc, link),
 629            "can't identify emitted call");
 630   }
 631 
 632   // Assert that we can identify the emitted call/jump.
 633   assert(is_bxx64_patchable_at((address)start_pc, link),
 634          "can't identify emitted call");
 635   assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest,
 636          "wrong encoding of dest address");
 637 }
 638 
 639 // Identify a bxx64_patchable instruction.
 640 bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) {
 641   return is_bxx64_patchable_variant1b_at(instruction_addr, link)
 642     //|| is_bxx64_patchable_variant1_at(instruction_addr, link)
 643       || is_bxx64_patchable_variant2_at(instruction_addr, link);
 644 }
 645 
 646 // Does the call64_patchable instruction use a pc-relative encoding of
 647 // the call destination?
 648 bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) {
 649   // variant 2 is pc-relative
 650   return is_bxx64_patchable_variant2_at(instruction_addr, link);
 651 }
 652 
 653 // Identify variant 1.
 654 bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) {
 655   unsigned int* instr = (unsigned int*) instruction_addr;
  return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
    && is_mtctr(instr[5]) // mtctr
    && is_load_const_at(instruction_addr);
 659 }
 660 
 661 // Identify variant 1b: load destination relative to global toc.
 662 bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) {
 663   unsigned int* instr = (unsigned int*) instruction_addr;
 664   return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
 665     && is_mtctr(instr[3]) // mtctr
 666     && is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr);
 667 }
 668 
 669 // Identify variant 2.
 670 bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) {
 671   unsigned int* instr = (unsigned int*) instruction_addr;
 672   if (link) {
 673     return is_bl (instr[6])  // bl dest is last
 674       && is_nop(instr[0])  // nop
 675       && is_nop(instr[1])  // nop
 676       && is_nop(instr[2])  // nop
 677       && is_nop(instr[3])  // nop
 678       && is_nop(instr[4])  // nop
 679       && is_nop(instr[5]); // nop
 680   } else {
 681     return is_b  (instr[0])  // b  dest is first
 682       && is_nop(instr[1])  // nop
 683       && is_nop(instr[2])  // nop
 684       && is_nop(instr[3])  // nop
 685       && is_nop(instr[4])  // nop
 686       && is_nop(instr[5])  // nop
 687       && is_nop(instr[6]); // nop
 688   }
 689 }
 690 
 691 // Set dest address of a bxx64_patchable instruction.
 692 void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) {
 693   ResourceMark rm;
 694   int code_size = MacroAssembler::bxx64_patchable_size;
 695   CodeBuffer buf(instruction_addr, code_size);
 696   MacroAssembler masm(&buf);
 697   masm.bxx64_patchable(dest, relocInfo::none, link);
 698   ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
 699 }
 700 
 701 // Get dest address of a bxx64_patchable instruction.
 702 address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) {
 703   if (is_bxx64_patchable_variant1_at(instruction_addr, link)) {
 704     return (address) (unsigned long) get_const(instruction_addr);
 705   } else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) {
 706     unsigned int* instr = (unsigned int*) instruction_addr;
 707     if (link) {
 708       const int instr_idx = 6; // bl is last
 709       int branchoffset = branch_destination(instr[instr_idx], 0);
 710       return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
 711     } else {
 712       const int instr_idx = 0; // b is first
 713       int branchoffset = branch_destination(instr[instr_idx], 0);
 714       return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
 715     }
 716   // Load dest relative to global toc.
 717   } else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) {
 718     return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord,
 719                                                                instruction_addr);
 720   } else {
 721     ShouldNotReachHere();
 722     return NULL;
 723   }
 724 }
 725 
 726 // Uses ordering which corresponds to ABI:
 727 //    _savegpr0_14:  std  r14,-144(r1)
 728 //    _savegpr0_15:  std  r15,-136(r1)
 729 //    _savegpr0_16:  std  r16,-128(r1)
 730 void MacroAssembler::save_nonvolatile_gprs(Register dst, int offset) {
 731   std(R14, offset, dst);   offset += 8;
 732   std(R15, offset, dst);   offset += 8;
 733   std(R16, offset, dst);   offset += 8;
 734   std(R17, offset, dst);   offset += 8;
 735   std(R18, offset, dst);   offset += 8;
 736   std(R19, offset, dst);   offset += 8;
 737   std(R20, offset, dst);   offset += 8;
 738   std(R21, offset, dst);   offset += 8;
 739   std(R22, offset, dst);   offset += 8;
 740   std(R23, offset, dst);   offset += 8;
 741   std(R24, offset, dst);   offset += 8;
 742   std(R25, offset, dst);   offset += 8;
 743   std(R26, offset, dst);   offset += 8;
 744   std(R27, offset, dst);   offset += 8;
 745   std(R28, offset, dst);   offset += 8;
 746   std(R29, offset, dst);   offset += 8;
 747   std(R30, offset, dst);   offset += 8;
 748   std(R31, offset, dst);   offset += 8;
 749 
 750   stfd(F14, offset, dst);   offset += 8;
 751   stfd(F15, offset, dst);   offset += 8;
 752   stfd(F16, offset, dst);   offset += 8;
 753   stfd(F17, offset, dst);   offset += 8;
 754   stfd(F18, offset, dst);   offset += 8;
 755   stfd(F19, offset, dst);   offset += 8;
 756   stfd(F20, offset, dst);   offset += 8;
 757   stfd(F21, offset, dst);   offset += 8;
 758   stfd(F22, offset, dst);   offset += 8;
 759   stfd(F23, offset, dst);   offset += 8;
 760   stfd(F24, offset, dst);   offset += 8;
 761   stfd(F25, offset, dst);   offset += 8;
 762   stfd(F26, offset, dst);   offset += 8;
 763   stfd(F27, offset, dst);   offset += 8;
 764   stfd(F28, offset, dst);   offset += 8;
 765   stfd(F29, offset, dst);   offset += 8;
 766   stfd(F30, offset, dst);   offset += 8;
 767   stfd(F31, offset, dst);
 768 }
 769 
 770 // Uses ordering which corresponds to ABI:
 771 //    _restgpr0_14:  ld   r14,-144(r1)
 772 //    _restgpr0_15:  ld   r15,-136(r1)
 773 //    _restgpr0_16:  ld   r16,-128(r1)
 774 void MacroAssembler::restore_nonvolatile_gprs(Register src, int offset) {
 775   ld(R14, offset, src);   offset += 8;
 776   ld(R15, offset, src);   offset += 8;
 777   ld(R16, offset, src);   offset += 8;
 778   ld(R17, offset, src);   offset += 8;
 779   ld(R18, offset, src);   offset += 8;
 780   ld(R19, offset, src);   offset += 8;
 781   ld(R20, offset, src);   offset += 8;
 782   ld(R21, offset, src);   offset += 8;
 783   ld(R22, offset, src);   offset += 8;
 784   ld(R23, offset, src);   offset += 8;
 785   ld(R24, offset, src);   offset += 8;
 786   ld(R25, offset, src);   offset += 8;
 787   ld(R26, offset, src);   offset += 8;
 788   ld(R27, offset, src);   offset += 8;
 789   ld(R28, offset, src);   offset += 8;
 790   ld(R29, offset, src);   offset += 8;
 791   ld(R30, offset, src);   offset += 8;
 792   ld(R31, offset, src);   offset += 8;
 793 
 794   // FP registers
 795   lfd(F14, offset, src);   offset += 8;
 796   lfd(F15, offset, src);   offset += 8;
 797   lfd(F16, offset, src);   offset += 8;
 798   lfd(F17, offset, src);   offset += 8;
 799   lfd(F18, offset, src);   offset += 8;
 800   lfd(F19, offset, src);   offset += 8;
 801   lfd(F20, offset, src);   offset += 8;
 802   lfd(F21, offset, src);   offset += 8;
 803   lfd(F22, offset, src);   offset += 8;
 804   lfd(F23, offset, src);   offset += 8;
 805   lfd(F24, offset, src);   offset += 8;
 806   lfd(F25, offset, src);   offset += 8;
 807   lfd(F26, offset, src);   offset += 8;
 808   lfd(F27, offset, src);   offset += 8;
 809   lfd(F28, offset, src);   offset += 8;
 810   lfd(F29, offset, src);   offset += 8;
 811   lfd(F30, offset, src);   offset += 8;
 812   lfd(F31, offset, src);
 813 }
 814 
 815 // For verify_oops.
 816 void MacroAssembler::save_volatile_gprs(Register dst, int offset) {
 817   std(R2,  offset, dst);   offset += 8;
 818   std(R3,  offset, dst);   offset += 8;
 819   std(R4,  offset, dst);   offset += 8;
 820   std(R5,  offset, dst);   offset += 8;
 821   std(R6,  offset, dst);   offset += 8;
 822   std(R7,  offset, dst);   offset += 8;
 823   std(R8,  offset, dst);   offset += 8;
 824   std(R9,  offset, dst);   offset += 8;
 825   std(R10, offset, dst);   offset += 8;
 826   std(R11, offset, dst);   offset += 8;
 827   std(R12, offset, dst);   offset += 8;
 828 
 829   stfd(F0, offset, dst);   offset += 8;
 830   stfd(F1, offset, dst);   offset += 8;
 831   stfd(F2, offset, dst);   offset += 8;
 832   stfd(F3, offset, dst);   offset += 8;
 833   stfd(F4, offset, dst);   offset += 8;
 834   stfd(F5, offset, dst);   offset += 8;
 835   stfd(F6, offset, dst);   offset += 8;
 836   stfd(F7, offset, dst);   offset += 8;
 837   stfd(F8, offset, dst);   offset += 8;
 838   stfd(F9, offset, dst);   offset += 8;
 839   stfd(F10, offset, dst);  offset += 8;
 840   stfd(F11, offset, dst);  offset += 8;
 841   stfd(F12, offset, dst);  offset += 8;
 842   stfd(F13, offset, dst);
 843 }
 844 
 845 // For verify_oops.
 846 void MacroAssembler::restore_volatile_gprs(Register src, int offset) {
 847   ld(R2,  offset, src);   offset += 8;
 848   ld(R3,  offset, src);   offset += 8;
 849   ld(R4,  offset, src);   offset += 8;
 850   ld(R5,  offset, src);   offset += 8;
 851   ld(R6,  offset, src);   offset += 8;
 852   ld(R7,  offset, src);   offset += 8;
 853   ld(R8,  offset, src);   offset += 8;
 854   ld(R9,  offset, src);   offset += 8;
 855   ld(R10, offset, src);   offset += 8;
 856   ld(R11, offset, src);   offset += 8;
 857   ld(R12, offset, src);   offset += 8;
 858 
 859   lfd(F0, offset, src);   offset += 8;
 860   lfd(F1, offset, src);   offset += 8;
 861   lfd(F2, offset, src);   offset += 8;
 862   lfd(F3, offset, src);   offset += 8;
 863   lfd(F4, offset, src);   offset += 8;
 864   lfd(F5, offset, src);   offset += 8;
 865   lfd(F6, offset, src);   offset += 8;
 866   lfd(F7, offset, src);   offset += 8;
 867   lfd(F8, offset, src);   offset += 8;
 868   lfd(F9, offset, src);   offset += 8;
 869   lfd(F10, offset, src);  offset += 8;
 870   lfd(F11, offset, src);  offset += 8;
 871   lfd(F12, offset, src);  offset += 8;
 872   lfd(F13, offset, src);
 873 }
 874 
 875 void MacroAssembler::save_LR_CR(Register tmp) {
 876   mfcr(tmp);
 877   std(tmp, _abi(cr), R1_SP);
 878   mflr(tmp);
 879   std(tmp, _abi(lr), R1_SP);
 880   // Tmp must contain lr on exit! (see return_addr and prolog in ppc64.ad)
 881 }
 882 
 883 void MacroAssembler::restore_LR_CR(Register tmp) {
 884   assert(tmp != R1_SP, "must be distinct");
 885   ld(tmp, _abi(lr), R1_SP);
 886   mtlr(tmp);
 887   ld(tmp, _abi(cr), R1_SP);
 888   mtcr(tmp);
 889 }
 890 
 891 address MacroAssembler::get_PC_trash_LR(Register result) {
 892   Label L;
 893   bl(L);
 894   bind(L);
 895   address lr_pc = pc();
 896   mflr(result);
 897   return lr_pc;
 898 }
 899 
 900 void MacroAssembler::resize_frame(Register offset, Register tmp) {
 901 #ifdef ASSERT
 902   assert_different_registers(offset, tmp, R1_SP);
 903   andi_(tmp, offset, frame::alignment_in_bytes-1);
 904   asm_assert_eq("resize_frame: unaligned", 0x204);
 905 #endif
 906 
 907   // tmp <- *(SP)
 908   ld(tmp, _abi(callers_sp), R1_SP);
 909   // addr <- SP + offset;
 910   // *(addr) <- tmp;
 911   // SP <- addr
 912   stdux(tmp, R1_SP, offset);
 913 }
 914 
 915 void MacroAssembler::resize_frame(int offset, Register tmp) {
 916   assert(is_simm(offset, 16), "too big an offset");
 917   assert_different_registers(tmp, R1_SP);
 918   assert((offset & (frame::alignment_in_bytes-1))==0, "resize_frame: unaligned");
 919   // tmp <- *(SP)
 920   ld(tmp, _abi(callers_sp), R1_SP);
 921   // addr <- SP + offset;
 922   // *(addr) <- tmp;
 923   // SP <- addr
 924   stdu(tmp, offset, R1_SP);
 925 }
 926 
 927 void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) {
 928   // (addr == tmp1) || (addr == tmp2) is allowed here!
 929   assert(tmp1 != tmp2, "must be distinct");
 930 
 931   // compute offset w.r.t. current stack pointer
 932   // tmp_1 <- addr - SP (!)
 933   subf(tmp1, R1_SP, addr);
 934 
 935   // atomically update SP keeping back link.
 936   resize_frame(tmp1/* offset */, tmp2/* tmp */);
 937 }
 938 
 939 void MacroAssembler::push_frame(Register bytes, Register tmp) {
 940 #ifdef ASSERT
 941   assert(bytes != R0, "r0 not allowed here");
 942   andi_(R0, bytes, frame::alignment_in_bytes-1);
 943   asm_assert_eq("push_frame(Reg, Reg): unaligned", 0x203);
 944 #endif
 945   neg(tmp, bytes);
 946   stdux(R1_SP, R1_SP, tmp);
 947 }
 948 
 949 // Push a frame of size `bytes'.
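// For example, assuming the usual 16-byte frame::alignment_in_bytes,
// push_frame(104, tmp) rounds the size up to 112 and emits
//   stdu R1_SP, -112(R1_SP)
// which stores the back link and updates SP in a single atomic update.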
 950 void MacroAssembler::push_frame(unsigned int bytes, Register tmp) {
 951   long offset = align_addr(bytes, frame::alignment_in_bytes);
 952   if (is_simm(-offset, 16)) {
 953     stdu(R1_SP, -offset, R1_SP);
 954   } else {
 955     load_const_optimized(tmp, -offset);
 956     stdux(R1_SP, R1_SP, tmp);
 957   }
 958 }
 959 
 960 // Push a frame of size `bytes' plus abi_reg_args on top.
 961 void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) {
 962   push_frame(bytes + frame::abi_reg_args_size, tmp);
 963 }
 964 
// Set up a new C frame with a spill area for non-volatile GPRs and
 966 // additional space for local variables.
 967 void MacroAssembler::push_frame_reg_args_nonvolatiles(unsigned int bytes,
 968                                                       Register tmp) {
 969   push_frame(bytes + frame::abi_reg_args_size + frame::spill_nonvolatiles_size, tmp);
 970 }
 971 
 972 // Pop current C frame.
 973 void MacroAssembler::pop_frame() {
 974   ld(R1_SP, _abi(callers_sp), R1_SP);
 975 }
 976 
 977 #if defined(ABI_ELFv2)
 978 address MacroAssembler::branch_to(Register r_function_entry, bool and_link) {
  // TODO(asmundak): make sure the caller uses R12 as the function descriptor
  // most of the time.
 981   if (R12 != r_function_entry) {
 982     mr(R12, r_function_entry);
 983   }
 984   mtctr(R12);
 985   // Do a call or a branch.
 986   if (and_link) {
 987     bctrl();
 988   } else {
 989     bctr();
 990   }
 991   _last_calls_return_pc = pc();
 992 
 993   return _last_calls_return_pc;
 994 }
 995 
 996 // Call a C function via a function descriptor and use full C
 997 // calling conventions. Updates and returns _last_calls_return_pc.
 998 address MacroAssembler::call_c(Register r_function_entry) {
 999   return branch_to(r_function_entry, /*and_link=*/true);
1000 }
1001 
1002 // For tail calls: only branch, don't link, so callee returns to caller of this function.
1003 address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) {
1004   return branch_to(r_function_entry, /*and_link=*/false);
1005 }
1006 
1007 address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) {
1008   load_const(R12, function_entry, R0);
1009   return branch_to(R12,  /*and_link=*/true);
1010 }
1011 
1012 #else
1013 // Generic version of a call to C function via a function descriptor
1014 // with variable support for C calling conventions (TOC, ENV, etc.).
1015 // Updates and returns _last_calls_return_pc.
1016 address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call,
1017                                   bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) {
1018   // we emit standard ptrgl glue code here
1019   assert((function_descriptor != R0), "function_descriptor cannot be R0");
1020 
1021   // retrieve necessary entries from the function descriptor
1022   ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor);
1023   mtctr(R0);
1024 
1025   if (load_toc_of_callee) {
1026     ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor);
1027   }
1028   if (load_env_of_callee) {
1029     ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor);
1030   } else if (load_toc_of_callee) {
1031     li(R11, 0);
1032   }
1033 
1034   // do a call or a branch
1035   if (and_link) {
1036     bctrl();
1037   } else {
1038     bctr();
1039   }
1040   _last_calls_return_pc = pc();
1041 
1042   return _last_calls_return_pc;
1043 }
1044 
1045 // Call a C function via a function descriptor and use full C calling
1046 // conventions.
1047 // We don't use the TOC in generated code, so there is no need to save
1048 // and restore its value.
1049 address MacroAssembler::call_c(Register fd) {
1050   return branch_to(fd, /*and_link=*/true,
1051                        /*save toc=*/false,
1052                        /*restore toc=*/false,
1053                        /*load toc=*/true,
1054                        /*load env=*/true);
1055 }
1056 
1057 address MacroAssembler::call_c_and_return_to_caller(Register fd) {
1058   return branch_to(fd, /*and_link=*/false,
1059                        /*save toc=*/false,
1060                        /*restore toc=*/false,
1061                        /*load toc=*/true,
1062                        /*load env=*/true);
1063 }
1064 
1065 address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) {
1066   if (rt != relocInfo::none) {
1067     // this call needs to be relocatable
1068     if (!ReoptimizeCallSequences
1069         || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
1070         || fd == NULL   // support code-size estimation
1071         || !fd->is_friend_function()
1072         || fd->entry() == NULL) {
1073       // it's not a friend function as defined by class FunctionDescriptor,
1074       // so do a full call-c here.
1075       load_const(R11, (address)fd, R0);
1076 
1077       bool has_env = (fd != NULL && fd->env() != NULL);
1078       return branch_to(R11, /*and_link=*/true,
1079                             /*save toc=*/false,
1080                             /*restore toc=*/false,
1081                             /*load toc=*/true,
1082                             /*load env=*/has_env);
1083     } else {
1084       // It's a friend function. Load the entry point and don't care about
1085       // toc and env. Use an optimizable call instruction, but ensure the
1086       // same code-size as in the case of a non-friend function.
1087       nop();
1088       nop();
1089       nop();
1090       bl64_patchable(fd->entry(), rt);
1091       _last_calls_return_pc = pc();
1092       return _last_calls_return_pc;
1093     }
1094   } else {
1095     // This call does not need to be relocatable, do more aggressive
1096     // optimizations.
1097     if (!ReoptimizeCallSequences
1098       || !fd->is_friend_function()) {
1099       // It's not a friend function as defined by class FunctionDescriptor,
1100       // so do a full call-c here.
1101       load_const(R11, (address)fd, R0);
1102       return branch_to(R11, /*and_link=*/true,
1103                             /*save toc=*/false,
1104                             /*restore toc=*/false,
1105                             /*load toc=*/true,
1106                             /*load env=*/true);
1107     } else {
1108       // it's a friend function, load the entry point and don't care about
1109       // toc and env.
1110       address dest = fd->entry();
1111       if (is_within_range_of_b(dest, pc())) {
1112         bl(dest);
1113       } else {
1114         bl64_patchable(dest, rt);
1115       }
1116       _last_calls_return_pc = pc();
1117       return _last_calls_return_pc;
1118     }
1119   }
1120 }
1121 
1122 // Call a C function.  All constants needed reside in TOC.
1123 //
1124 // Read the address to call from the TOC.
1125 // Read env from TOC, if fd specifies an env.
1126 // Read new TOC from TOC.
1127 address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd,
1128                                          relocInfo::relocType rt, Register toc) {
1129   if (!ReoptimizeCallSequences
1130     || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
1131     || !fd->is_friend_function()) {
1132     // It's not a friend function as defined by class FunctionDescriptor,
1133     // so do a full call-c here.
1134     assert(fd->entry() != NULL, "function must be linked");
1135 
1136     AddressLiteral fd_entry(fd->entry());
1137     bool success = load_const_from_method_toc(R11, fd_entry, toc, /*fixed_size*/ true);
1138     mtctr(R11);
1139     if (fd->env() == NULL) {
1140       li(R11, 0);
1141       nop();
1142     } else {
1143       AddressLiteral fd_env(fd->env());
1144       success = success && load_const_from_method_toc(R11, fd_env, toc, /*fixed_size*/ true);
1145     }
1146     AddressLiteral fd_toc(fd->toc());
1147     // Set R2_TOC (load from toc)
1148     success = success && load_const_from_method_toc(R2_TOC, fd_toc, toc, /*fixed_size*/ true);
1149     bctrl();
1150     _last_calls_return_pc = pc();
1151     if (!success) { return NULL; }
1152   } else {
1153     // It's a friend function, load the entry point and don't care about
1154     // toc and env. Use an optimizable call instruction, but ensure the
1155     // same code-size as in the case of a non-friend function.
1156     nop();
1157     bl64_patchable(fd->entry(), rt);
1158     _last_calls_return_pc = pc();
1159   }
1160   return _last_calls_return_pc;
1161 }
1162 #endif // ABI_ELFv2
1163 
1164 void MacroAssembler::call_VM_base(Register oop_result,
1165                                   Register last_java_sp,
1166                                   address  entry_point,
1167                                   bool     check_exceptions) {
1168   BLOCK_COMMENT("call_VM {");
1169   // Determine last_java_sp register.
1170   if (!last_java_sp->is_valid()) {
1171     last_java_sp = R1_SP;
1172   }
1173   set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1);
1174 
1175   // ARG1 must hold thread address.
1176   mr(R3_ARG1, R16_thread);
1177 #if defined(ABI_ELFv2)
1178   address return_pc = call_c(entry_point, relocInfo::none);
1179 #else
1180   address return_pc = call_c((FunctionDescriptor*)entry_point, relocInfo::none);
1181 #endif
1182 
1183   reset_last_Java_frame();
1184 
1185   // Check for pending exceptions.
1186   if (check_exceptions) {
1187     // We don't check for exceptions here.
1188     ShouldNotReachHere();
1189   }
1190 
1191   // Get oop result if there is one and reset the value in the thread.
1192   if (oop_result->is_valid()) {
1193     get_vm_result(oop_result);
1194   }
1195 
1196   _last_calls_return_pc = return_pc;
1197   BLOCK_COMMENT("} call_VM");
1198 }
1199 
1200 void MacroAssembler::call_VM_leaf_base(address entry_point) {
1201   BLOCK_COMMENT("call_VM_leaf {");
1202 #if defined(ABI_ELFv2)
1203   call_c(entry_point, relocInfo::none);
1204 #else
1205   call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, entry_point), relocInfo::none);
1206 #endif
1207   BLOCK_COMMENT("} call_VM_leaf");
1208 }
1209 
1210 void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions) {
1211   call_VM_base(oop_result, noreg, entry_point, check_exceptions);
1212 }
1213 
1214 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1,
1215                              bool check_exceptions) {
1216   // R3_ARG1 is reserved for the thread.
1217   mr_if_needed(R4_ARG2, arg_1);
1218   call_VM(oop_result, entry_point, check_exceptions);
1219 }
1220 
1221 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2,
1222                              bool check_exceptions) {
1223   // R3_ARG1 is reserved for the thread
1224   mr_if_needed(R4_ARG2, arg_1);
1225   assert(arg_2 != R4_ARG2, "smashed argument");
1226   mr_if_needed(R5_ARG3, arg_2);
1227   call_VM(oop_result, entry_point, check_exceptions);
1228 }
1229 
1230 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3,
1231                              bool check_exceptions) {
1232   // R3_ARG1 is reserved for the thread
1233   mr_if_needed(R4_ARG2, arg_1);
1234   assert(arg_2 != R4_ARG2, "smashed argument");
1235   mr_if_needed(R5_ARG3, arg_2);
1236   mr_if_needed(R6_ARG4, arg_3);
1237   call_VM(oop_result, entry_point, check_exceptions);
1238 }
1239 
1240 void MacroAssembler::call_VM_leaf(address entry_point) {
1241   call_VM_leaf_base(entry_point);
1242 }
1243 
1244 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) {
1245   mr_if_needed(R3_ARG1, arg_1);
1246   call_VM_leaf(entry_point);
1247 }
1248 
1249 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) {
1250   mr_if_needed(R3_ARG1, arg_1);
1251   assert(arg_2 != R3_ARG1, "smashed argument");
1252   mr_if_needed(R4_ARG2, arg_2);
1253   call_VM_leaf(entry_point);
1254 }
1255 
1256 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) {
1257   mr_if_needed(R3_ARG1, arg_1);
1258   assert(arg_2 != R3_ARG1, "smashed argument");
1259   mr_if_needed(R4_ARG2, arg_2);
1260   assert(arg_3 != R3_ARG1 && arg_3 != R4_ARG2, "smashed argument");
1261   mr_if_needed(R5_ARG3, arg_3);
1262   call_VM_leaf(entry_point);
1263 }
1264 
1265 // Check whether instruction is a read access to the polling page
1266 // which was emitted by load_from_polling_page(..).
1267 bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext,
1268                                                address* polling_address_ptr) {
1269   if (!is_ld(instruction))
1270     return false; // It's not a ld. Fail.
1271 
1272   int rt = inv_rt_field(instruction);
1273   int ra = inv_ra_field(instruction);
1274   int ds = inv_ds_field(instruction);
1275   if (!(ds == 0 && ra != 0 && rt == 0)) {
1276     return false; // It's not a ld(r0, X, ra). Fail.
1277   }
1278 
1279   if (!ucontext) {
1280     // Set polling address.
1281     if (polling_address_ptr != NULL) {
1282       *polling_address_ptr = NULL;
1283     }
1284     return true; // No ucontext given. Can't check value of ra. Assume true.
1285   }
1286 
1287 #ifdef LINUX
1288   // Ucontext given. Check that register ra contains the address of
  // the safepoint polling page.
1290   ucontext_t* uc = (ucontext_t*) ucontext;
1291   // Set polling address.
1292   address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds;
1293   if (polling_address_ptr != NULL) {
1294     *polling_address_ptr = addr;
1295   }
1296   return os::is_poll_address(addr);
1297 #else
1298   // Not on Linux, ucontext must be NULL.
1299   ShouldNotReachHere();
1300   return false;
1301 #endif
1302 }
1303 
1304 bool MacroAssembler::is_memory_serialization(int instruction, JavaThread* thread, void* ucontext) {
1305 #ifdef LINUX
1306   ucontext_t* uc = (ucontext_t*) ucontext;
1307 
1308   if (is_stwx(instruction) || is_stwux(instruction)) {
1309     int ra = inv_ra_field(instruction);
1310     int rb = inv_rb_field(instruction);
1311 
1312     // look up content of ra and rb in ucontext
    address ra_val = (address)uc->uc_mcontext.regs->gpr[ra];
    long rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
1315     return os::is_memory_serialize_page(thread, ra_val+rb_val);
1316   } else if (is_stw(instruction) || is_stwu(instruction)) {
1317     int ra = inv_ra_field(instruction);
1318     int d1 = inv_d1_field(instruction);
1319 
1320     // look up content of ra in ucontext
    address ra_val = (address)uc->uc_mcontext.regs->gpr[ra];
1322     return os::is_memory_serialize_page(thread, ra_val+d1);
1323   } else {
1324     return false;
1325   }
1326 #else
1327   // workaround not needed on !LINUX :-)
1328   ShouldNotCallThis();
1329   return false;
1330 #endif
1331 }
1332 
1333 void MacroAssembler::bang_stack_with_offset(int offset) {
1334   // When increasing the stack, the old stack pointer will be written
1335   // to the new top of stack according to the PPC64 abi.
1336   // Therefore, stack banging is not necessary when increasing
1337   // the stack by <= os::vm_page_size() bytes.
1338   // When increasing the stack by a larger amount, this method is
1339   // called repeatedly to bang the intermediate pages.
1340 
1341   // Stack grows down, caller passes positive offset.
1342   assert(offset > 0, "must bang with positive offset");
1343 
1344   long stdoffset = -offset;
1345 
1346   if (is_simm(stdoffset, 16)) {
1347     // Signed 16 bit offset, a simple std is ok.
1348     if (UseLoadInstructionsForStackBangingPPC64) {
1349       ld(R0, (int)(signed short)stdoffset, R1_SP);
1350     } else {
      std(R0, (int)(signed short)stdoffset, R1_SP);
1352     }
1353   } else if (is_simm(stdoffset, 31)) {
1354     const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset);
1355     const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset);
1356 
1357     Register tmp = R11;
1358     addis(tmp, R1_SP, hi);
1359     if (UseLoadInstructionsForStackBangingPPC64) {
1360       ld(R0,  lo, tmp);
1361     } else {
1362       std(R0, lo, tmp);
1363     }
1364   } else {
1365     ShouldNotReachHere();
1366   }
1367 }
1368 
1369 // If instruction is a stack bang of the form
1370 //    std    R0,    x(Ry),       (see bang_stack_with_offset())
1371 //    stdu   R1_SP, x(R1_SP),    (see push_frame(), resize_frame())
1372 // or stdux  R1_SP, Rx, R1_SP    (see push_frame(), resize_frame())
1373 // return the banged address. Otherwise, return 0.
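// For example (hypothetical encodings): for 'stdu R1_SP, -0x1000(R1_SP)' the
// banged address is the context's R1 value minus 0x1000; for the stdux form
// the (negative) offset is taken from register Rx in the given context.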
1374 address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) {
1375 #ifdef LINUX
1376   ucontext_t* uc = (ucontext_t*) ucontext;
1377   int rs = inv_rs_field(instruction);
1378   int ra = inv_ra_field(instruction);
1379   if (   (is_ld(instruction)   && rs == 0 &&  UseLoadInstructionsForStackBangingPPC64)
1380       || (is_std(instruction)  && rs == 0 && !UseLoadInstructionsForStackBangingPPC64)
1381       || (is_stdu(instruction) && rs == 1)) {
1382     int ds = inv_ds_field(instruction);
1383     // return banged address
1384     return ds+(address)uc->uc_mcontext.regs->gpr[ra];
1385   } else if (is_stdux(instruction) && rs == 1) {
1386     int rb = inv_rb_field(instruction);
1387     address sp = (address)uc->uc_mcontext.regs->gpr[1];
1388     long rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
1389     return ra != 1 || rb_val >= 0 ? NULL         // not a stack bang
1390                                   : sp + rb_val; // banged address
1391   }
1392   return NULL; // not a stack bang
1393 #else
1394   // workaround not needed on !LINUX :-)
1395   ShouldNotCallThis();
1396   return NULL;
1397 #endif
1398 }
1399 
1400 // CmpxchgX sets condition register to cmpX(current, compare).
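// A minimal usage sketch (hypothetical register names; the trailing result
// register and contention-hint parameters are assumed to keep their defaults):
//   cmpxchgw(CCR0, R_current, R_compare, R_new, R_addr,
//            MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
//            MacroAssembler::cmpxchgx_hint_atomic_update());
// Afterwards the flag register is 'eq' iff the exchange happened (see below).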
1401 void MacroAssembler::cmpxchgw(ConditionRegister flag, Register dest_current_value,
1402                               Register compare_value, Register exchange_value,
1403                               Register addr_base, int semantics, bool cmpxchgx_hint,
1404                               Register int_flag_success, bool contention_hint) {
1405   Label retry;
1406   Label failed;
1407   Label done;
1408 
1409   // Save one branch if result is returned via register and
1410   // result register is different from the other ones.
1411   bool use_result_reg    = (int_flag_success != noreg);
1412   bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value &&
1413                             int_flag_success != exchange_value && int_flag_success != addr_base);
1414 
1415   if (use_result_reg && preset_result_reg) {
1416     li(int_flag_success, 0); // preset (assume cas failed)
1417   }
1418 
1419   // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1420   if (contention_hint) { // Don't try to reserve if cmp fails.
1421     lwz(dest_current_value, 0, addr_base);
1422     cmpw(flag, dest_current_value, compare_value);
1423     bne(flag, failed);
1424   }
1425 
1426   // release/fence semantics
1427   if (semantics & MemBarRel) {
1428     release();
1429   }
1430 
1431   // atomic emulation loop
1432   bind(retry);
1433 
1434   lwarx(dest_current_value, addr_base, cmpxchgx_hint);
1435   cmpw(flag, dest_current_value, compare_value);
1436   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1437     bne_predict_not_taken(flag, failed);
1438   } else {
1439     bne(                  flag, failed);
1440   }
  // branch to failed => (flag == ne), (dest_current_value != compare_value)
  // fall through     => (flag == eq), (dest_current_value == compare_value)
1443 
1444   stwcx_(exchange_value, addr_base);
1445   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1446     bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1447   } else {
1448     bne(                  CCR0, retry); // StXcx_ sets CCR0.
1449   }
1450   // fall through    => (flag == eq), (dest_current_value == compare_value), (swapped)
1451 
1452   // Result in register (must do this at the end because int_flag_success can be the
1453   // same register as one above).
1454   if (use_result_reg) {
1455     li(int_flag_success, 1);
1456   }
1457 
1458   if (semantics & MemBarFenceAfter) {
1459     fence();
1460   } else if (semantics & MemBarAcq) {
1461     isync();
1462   }
1463 
1464   if (use_result_reg && !preset_result_reg) {
1465     b(done);
1466   }
1467 
1468   bind(failed);
1469   if (use_result_reg && !preset_result_reg) {
1470     li(int_flag_success, 0);
1471   }
1472 
1473   bind(done);
1474   // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1475   // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1476 }
1477 
// Performs atomic compare exchange:
1479 //   if (compare_value == *addr_base)
1480 //     *addr_base = exchange_value
1481 //     int_flag_success = 1;
1482 //   else
1483 //     int_flag_success = 0;
1484 //
1485 // ConditionRegister flag       = cmp(compare_value, *addr_base)
1486 // Register dest_current_value  = *addr_base
1487 // Register compare_value       Used to compare with value in memory
1488 // Register exchange_value      Written to memory if compare_value == *addr_base
1489 // Register addr_base           The memory location to compareXChange
1490 // Register int_flag_success    Set to 1 if exchange_value was written to *addr_base
1491 //
// To avoid the costly compare-and-exchange, the value can be tested beforehand
// (contention_hint). Several special cases exist to avoid generating unnecessary code.
1494 //
1495 void MacroAssembler::cmpxchgd(ConditionRegister flag,
1496                               Register dest_current_value, RegisterOrConstant compare_value, Register exchange_value,
1497                               Register addr_base, int semantics, bool cmpxchgx_hint,
1498                               Register int_flag_success, Label* failed_ext, bool contention_hint) {
1499   Label retry;
1500   Label failed_int;
1501   Label& failed = (failed_ext != NULL) ? *failed_ext : failed_int;
1502   Label done;
1503 
1504   // Save one branch if result is returned via register and result register is different from the other ones.
  bool use_result_reg    = (int_flag_success != noreg);
  bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value.register_or_noreg() &&
                            int_flag_success != exchange_value && int_flag_success != addr_base);
1508   assert(int_flag_success == noreg || failed_ext == NULL, "cannot have both");
1509 
1510   if (use_result_reg && preset_result_reg) {
1511     li(int_flag_success, 0); // preset (assume cas failed)
1512   }
1513 
1514   // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1515   if (contention_hint) { // Don't try to reserve if cmp fails.
1516     ld(dest_current_value, 0, addr_base);
1517     cmpd(flag, compare_value, dest_current_value);
1518     bne(flag, failed);
1519   }
1520 
1521   // release/fence semantics
1522   if (semantics & MemBarRel) {
1523     release();
1524   }
1525 
1526   // atomic emulation loop
1527   bind(retry);
1528 
1529   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1530   cmpd(flag, compare_value, dest_current_value);
1531   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1532     bne_predict_not_taken(flag, failed);
1533   } else {
1534     bne(                  flag, failed);
1535   }
1536 
1537   stdcx_(exchange_value, addr_base);
1538   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1539     bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
1540   } else {
1541     bne(                  CCR0, retry); // stXcx_ sets CCR0
1542   }
1543 
1544   // result in register (must do this at the end because int_flag_success can be the same register as one above)
1545   if (use_result_reg) {
1546     li(int_flag_success, 1);
1547   }
1548 
1549   if (semantics & MemBarFenceAfter) {
1550     fence();
1551   } else if (semantics & MemBarAcq) {
1552     isync();
1553   }
1554 
1555   if (use_result_reg && !preset_result_reg) {
1556     b(done);
1557   }
1558 
1559   bind(failed_int);
1560   if (use_result_reg && !preset_result_reg) {
1561     li(int_flag_success, 0);
1562   }
1563 
1564   bind(done);
1565   // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1566   // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1567 }
1568 
1569 // Look up the method for a megamorphic invokeinterface call.
1570 // The target method is determined by <intf_klass, itable_index>.
1571 // The receiver klass is in recv_klass.
1572 // On success, the result will be in method_result, and execution falls through.
1573 // On failure, execution transfers to the given label.
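// The itable starts with a table of itableOffsetEntry records (interface klass,
// offset of that interface's method block), terminated by a NULL interface.
// The scan below walks these entries until it finds intf_klass, or reaches the
// NULL terminator, in which case the interface is not implemented and we jump
// to L_no_such_interface.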
1574 void MacroAssembler::lookup_interface_method(Register recv_klass,
1575                                              Register intf_klass,
1576                                              RegisterOrConstant itable_index,
1577                                              Register method_result,
1578                                              Register scan_temp,
1579                                              Register sethi_temp,
1580                                              Label& L_no_such_interface) {
1581   assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);
1582   assert(itable_index.is_constant() || itable_index.as_register() == method_result,
1583          "caller must use same register for non-constant itable index as for method");
1584 
1585   // Compute start of first itableOffsetEntry (which is at the end of the vtable).
1586   int vtable_base = in_bytes(InstanceKlass::vtable_start_offset());
1587   int itentry_off = itableMethodEntry::method_offset_in_bytes();
1588   int logMEsize   = exact_log2(itableMethodEntry::size() * wordSize);
1589   int scan_step   = itableOffsetEntry::size() * wordSize;
1590   int log_vte_size= exact_log2(vtableEntry::size_in_bytes());
1591 
1592   lwz(scan_temp, in_bytes(InstanceKlass::vtable_length_offset()), recv_klass);
1593   // %%% We should store the aligned, prescaled offset in the klassoop.
1594   // Then the next several instructions would fold away.
1595 
1596   sldi(scan_temp, scan_temp, log_vte_size);
1597   addi(scan_temp, scan_temp, vtable_base);
1598   add(scan_temp, recv_klass, scan_temp);
1599 
1600   // Adjust recv_klass by scaled itable_index, so we can free itable_index.
1601   if (itable_index.is_register()) {
1602     Register itable_offset = itable_index.as_register();
1603     sldi(itable_offset, itable_offset, logMEsize);
1604     if (itentry_off) addi(itable_offset, itable_offset, itentry_off);
1605     add(recv_klass, itable_offset, recv_klass);
1606   } else {
1607     long itable_offset = (long)itable_index.as_constant();
1608     load_const_optimized(sethi_temp, (itable_offset<<logMEsize)+itentry_off); // static address, no relocation
1609     add(recv_klass, sethi_temp, recv_klass);
1610   }
1611 
1612   // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
1613   //   if (scan->interface() == intf) {
1614   //     result = (klass + scan->offset() + itable_index);
1615   //   }
1616   // }
1617   Label search, found_method;
1618 
1619   for (int peel = 1; peel >= 0; peel--) {
1620     // %%%% Could load both offset and interface in one ldx, if they were
1621     // in the opposite order. This would save a load.
1622     ld(method_result, itableOffsetEntry::interface_offset_in_bytes(), scan_temp);
1623 
1624     // Check that this entry is non-null. A null entry means that
1625     // the receiver class doesn't implement the interface, and wasn't the
1626     // same as when the caller was compiled.
1627     cmpd(CCR0, method_result, intf_klass);
1628 
1629     if (peel) {
1630       beq(CCR0, found_method);
1631     } else {
1632       bne(CCR0, search);
1633       // (invert the test to fall through to found_method...)
1634     }
1635 
1636     if (!peel) break;
1637 
1638     bind(search);
1639 
1640     cmpdi(CCR0, method_result, 0);
1641     beq(CCR0, L_no_such_interface);
1642     addi(scan_temp, scan_temp, scan_step);
1643   }
1644 
1645   bind(found_method);
1646 
1647   // Got a hit.
1648   int ito_offset = itableOffsetEntry::offset_offset_in_bytes();
1649   lwz(scan_temp, ito_offset, scan_temp);
1650   ldx(method_result, scan_temp, recv_klass);
1651 }
1652 
1653 // virtual method calling
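// In pseudo code: method_result = recv_klass->vtable()[vtable_index].method().
// Note: the load below targets R19_method, so callers are expected to pass
// R19_method as method_result.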
1654 void MacroAssembler::lookup_virtual_method(Register recv_klass,
1655                                            RegisterOrConstant vtable_index,
1656                                            Register method_result) {
1657 
1658   assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg());
1659 
1660   const int base = in_bytes(InstanceKlass::vtable_start_offset());
1661   assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
1662 
1663   if (vtable_index.is_register()) {
1664     sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord);
1665     add(recv_klass, vtable_index.as_register(), recv_klass);
1666   } else {
1667     addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord);
1668   }
1669   ld(R19_method, base + vtableEntry::method_offset_in_bytes(), recv_klass);
1670 }
1671 
1672 /////////////////////////////////////////// subtype checking ////////////////////////////////////////////
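// Fast path: super_klass S is a supertype of sub_klass if the word at
// sub_klass + S->super_check_offset equals S (the "supertype display").
// If that offset happens to be the secondary_super_cache slot, a mismatch is
// inconclusive and the slow path (a linear scan of the secondary supers) must
// decide.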
1673 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
1674                                                    Register super_klass,
1675                                                    Register temp1_reg,
1676                                                    Register temp2_reg,
1677                                                    Label* L_success,
1678                                                    Label* L_failure,
1679                                                    Label* L_slow_path,
1680                                                    RegisterOrConstant super_check_offset) {
1681 
1682   const Register check_cache_offset = temp1_reg;
1683   const Register cached_super       = temp2_reg;
1684 
1685   assert_different_registers(sub_klass, super_klass, check_cache_offset, cached_super);
1686 
1687   int sco_offset = in_bytes(Klass::super_check_offset_offset());
1688   int sc_offset  = in_bytes(Klass::secondary_super_cache_offset());
1689 
1690   bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
1691   bool need_slow_path = (must_load_sco || super_check_offset.constant_or_zero() == sco_offset);
1692 
1693   Label L_fallthrough;
1694   int label_nulls = 0;
1695   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
1696   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
1697   if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
1698   assert(label_nulls <= 1 ||
1699          (L_slow_path == &L_fallthrough && label_nulls <= 2 && !need_slow_path),
1700          "at most one NULL in the batch, usually");
1701 
1702   // If the pointers are equal, we are done (e.g., String[] elements).
1703   // This self-check enables sharing of secondary supertype arrays among
1704   // non-primary types such as array-of-interface. Otherwise, each such
1705   // type would need its own customized SSA.
1706   // We move this check to the front of the fast path because many
1707   // type checks are in fact trivially successful in this manner,
1708   // so we get a nicely predicted branch right at the start of the check.
1709   cmpd(CCR0, sub_klass, super_klass);
1710   beq(CCR0, *L_success);
1711 
1712   // Check the supertype display:
1713   if (must_load_sco) {
1714     // The super check offset is always positive...
    lwz(check_cache_offset, sco_offset, super_klass);
    super_check_offset = RegisterOrConstant(check_cache_offset);
    // super_check_offset is now a register.
1718     assert_different_registers(sub_klass, super_klass, cached_super, super_check_offset.as_register());
1719   }
1720   // The loaded value is the offset from KlassOopDesc.
1721 
1722   ld(cached_super, super_check_offset, sub_klass);
1723   cmpd(CCR0, cached_super, super_klass);
1724 
1725   // This check has worked decisively for primary supers.
1726   // Secondary supers are sought in the super_cache ('super_cache_addr').
1727   // (Secondary supers are interfaces and very deeply nested subtypes.)
1728   // This works in the same check above because of a tricky aliasing
1729   // between the super_cache and the primary super display elements.
1730   // (The 'super_check_addr' can address either, as the case requires.)
1731   // Note that the cache is updated below if it does not help us find
1732   // what we need immediately.
1733   // So if it was a primary super, we can just fail immediately.
1734   // Otherwise, it's the slow path for us (no success at this point).
1735 
1736 #define FINAL_JUMP(label) if (&(label) != &L_fallthrough) { b(label); }
1737 
1738   if (super_check_offset.is_register()) {
1739     beq(CCR0, *L_success);
1740     cmpwi(CCR0, super_check_offset.as_register(), sc_offset);
1741     if (L_failure == &L_fallthrough) {
1742       beq(CCR0, *L_slow_path);
1743     } else {
1744       bne(CCR0, *L_failure);
1745       FINAL_JUMP(*L_slow_path);
1746     }
1747   } else {
1748     if (super_check_offset.as_constant() == sc_offset) {
1749       // Need a slow path; fast failure is impossible.
1750       if (L_slow_path == &L_fallthrough) {
1751         beq(CCR0, *L_success);
1752       } else {
1753         bne(CCR0, *L_slow_path);
1754         FINAL_JUMP(*L_success);
1755       }
1756     } else {
1757       // No slow path; it's a fast decision.
1758       if (L_failure == &L_fallthrough) {
1759         beq(CCR0, *L_success);
1760       } else {
1761         bne(CCR0, *L_failure);
1762         FINAL_JUMP(*L_success);
1763       }
1764     }
1765   }
1766 
1767   bind(L_fallthrough);
1768 #undef FINAL_JUMP
1769 }
1770 
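// Slow path of the subtype check: linearly scan the secondary supers array of
// sub_klass for super_klass. On a hit the result is also stored into the
// secondary_super_cache so that the fast path can succeed next time.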
1771 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
1772                                                    Register super_klass,
1773                                                    Register temp1_reg,
1774                                                    Register temp2_reg,
1775                                                    Label* L_success,
1776                                                    Register result_reg) {
1777   const Register array_ptr = temp1_reg; // current value from cache array
1778   const Register temp      = temp2_reg;
1779 
1780   assert_different_registers(sub_klass, super_klass, array_ptr, temp);
1781 
1782   int source_offset = in_bytes(Klass::secondary_supers_offset());
1783   int target_offset = in_bytes(Klass::secondary_super_cache_offset());
1784 
1785   int length_offset = Array<Klass*>::length_offset_in_bytes();
1786   int base_offset   = Array<Klass*>::base_offset_in_bytes();
1787 
1788   Label hit, loop, failure, fallthru;
1789 
1790   ld(array_ptr, source_offset, sub_klass);
1791 
1792   // TODO: PPC port: assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated.");
1793   lwz(temp, length_offset, array_ptr);
1794   cmpwi(CCR0, temp, 0);
1795   beq(CCR0, result_reg!=noreg ? failure : fallthru); // length 0
1796 
1797   mtctr(temp); // load ctr
1798 
1799   bind(loop);
  // Entries in the table are no longer compressed; they are raw Klass* values.
1801   ld(temp, base_offset, array_ptr);
1802   cmpd(CCR0, temp, super_klass);
1803   beq(CCR0, hit);
1804   addi(array_ptr, array_ptr, BytesPerWord);
1805   bdnz(loop);
1806 
1807   bind(failure);
1808   if (result_reg!=noreg) li(result_reg, 1); // load non-zero result (indicates a miss)
1809   b(fallthru);
1810 
1811   bind(hit);
1812   std(super_klass, target_offset, sub_klass); // save result to cache
1813   if (result_reg != noreg) { li(result_reg, 0); } // load zero result (indicates a hit)
1814   if (L_success != NULL) { b(*L_success); }
1815   else if (result_reg == noreg) { blr(); } // return with CR0.eq if neither label nor result reg provided
1816 
1817   bind(fallthru);
1818 }
1819 
1820 // Try fast path, then go to slow one if not successful
1821 void MacroAssembler::check_klass_subtype(Register sub_klass,
1822                          Register super_klass,
1823                          Register temp1_reg,
1824                          Register temp2_reg,
1825                          Label& L_success) {
1826   Label L_failure;
1827   check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success, &L_failure);
1828   check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success);
1829   bind(L_failure); // Fallthru if not successful.
1830 }
1831 
1832 void MacroAssembler::check_method_handle_type(Register mtype_reg, Register mh_reg,
1833                                               Register temp_reg,
1834                                               Label& wrong_method_type) {
1835   assert_different_registers(mtype_reg, mh_reg, temp_reg);
1836   // Compare method type against that of the receiver.
1837   load_heap_oop_not_null(temp_reg, delayed_value(java_lang_invoke_MethodHandle::type_offset_in_bytes, temp_reg), mh_reg);
1838   cmpd(CCR0, temp_reg, mtype_reg);
1839   bne(CCR0, wrong_method_type);
1840 }
1841 
1842 RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot,
1843                                                    Register temp_reg,
1844                                                    int extra_slot_offset) {
1845   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
1846   int stackElementSize = Interpreter::stackElementSize;
1847   int offset = extra_slot_offset * stackElementSize;
1848   if (arg_slot.is_constant()) {
1849     offset += arg_slot.as_constant() * stackElementSize;
1850     return offset;
1851   } else {
1852     assert(temp_reg != noreg, "must specify");
1853     sldi(temp_reg, arg_slot.as_register(), exact_log2(stackElementSize));
1854     if (offset != 0)
1855       addi(temp_reg, temp_reg, offset);
1856     return temp_reg;
1857   }
1858 }
1859 
1860 // Supports temp2_reg = R0.
1861 void MacroAssembler::biased_locking_enter(ConditionRegister cr_reg, Register obj_reg,
1862                                           Register mark_reg, Register temp_reg,
1863                                           Register temp2_reg, Label& done, Label* slow_case) {
1864   assert(UseBiasedLocking, "why call this otherwise?");
1865 
1866 #ifdef ASSERT
1867   assert_different_registers(obj_reg, mark_reg, temp_reg, temp2_reg);
1868 #endif
1869 
1870   Label cas_label;
1871 
1872   // Branch to done if fast path fails and no slow_case provided.
1873   Label *slow_case_int = (slow_case != NULL) ? slow_case : &done;
1874 
1875   // Biased locking
1876   // See whether the lock is currently biased toward our thread and
1877   // whether the epoch is still valid
1878   // Note that the runtime guarantees sufficient alignment of JavaThread
1879   // pointers to allow age to be placed into low bits
1880   assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits,
1881          "biased locking makes assumptions about bit layout");
1882 
1883   if (PrintBiasedLockingStatistics) {
1884     load_const(temp2_reg, (address) BiasedLocking::total_entry_count_addr(), temp_reg);
1885     lwzx(temp_reg, temp2_reg);
1886     addi(temp_reg, temp_reg, 1);
1887     stwx(temp_reg, temp2_reg);
1888   }
1889 
1890   andi(temp_reg, mark_reg, markOopDesc::biased_lock_mask_in_place);
1891   cmpwi(cr_reg, temp_reg, markOopDesc::biased_lock_pattern);
1892   bne(cr_reg, cas_label);
1893 
1894   load_klass(temp_reg, obj_reg);
1895 
1896   load_const_optimized(temp2_reg, ~((int) markOopDesc::age_mask_in_place));
1897   ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
1898   orr(temp_reg, R16_thread, temp_reg);
1899   xorr(temp_reg, mark_reg, temp_reg);
1900   andr(temp_reg, temp_reg, temp2_reg);
1901   cmpdi(cr_reg, temp_reg, 0);
1902   if (PrintBiasedLockingStatistics) {
1903     Label l;
1904     bne(cr_reg, l);
1905     load_const(temp2_reg, (address) BiasedLocking::biased_lock_entry_count_addr());
1906     lwzx(mark_reg, temp2_reg);
1907     addi(mark_reg, mark_reg, 1);
1908     stwx(mark_reg, temp2_reg);
1909     // restore mark_reg
1910     ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg);
1911     bind(l);
1912   }
1913   beq(cr_reg, done);
1914 
1915   Label try_revoke_bias;
1916   Label try_rebias;
1917 
1918   // At this point we know that the header has the bias pattern and
1919   // that we are not the bias owner in the current epoch. We need to
1920   // figure out more details about the state of the header in order to
1921   // know what operations can be legally performed on the object's
1922   // header.
1923 
1924   // If the low three bits in the xor result aren't clear, that means
1925   // the prototype header is no longer biased and we have to revoke
1926   // the bias on this object.
1927   andi(temp2_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
1928   cmpwi(cr_reg, temp2_reg, 0);
1929   bne(cr_reg, try_revoke_bias);
1930 
1931   // Biasing is still enabled for this data type. See whether the
1932   // epoch of the current bias is still valid, meaning that the epoch
1933   // bits of the mark word are equal to the epoch bits of the
1934   // prototype header. (Note that the prototype header's epoch bits
1935   // only change at a safepoint.) If not, attempt to rebias the object
1936   // toward the current thread. Note that we must be absolutely sure
1937   // that the current epoch is invalid in order to do this because
1938   // otherwise the manipulations it performs on the mark word are
1939   // illegal.
1940 
1941   int shift_amount = 64 - markOopDesc::epoch_shift;
1942   // rotate epoch bits to right (little) end and set other bits to 0
1943   // [ big part | epoch | little part ] -> [ 0..0 | epoch ]
1944   rldicl_(temp2_reg, temp_reg, shift_amount, 64 - markOopDesc::epoch_bits);
1945   // branch if epoch bits are != 0, i.e. they differ, because the epoch has been incremented
1946   bne(CCR0, try_rebias);
1947 
1948   // The epoch of the current bias is still valid but we know nothing
1949   // about the owner; it might be set or it might be clear. Try to
1950   // acquire the bias of the object using an atomic operation. If this
1951   // fails we will go in to the runtime to revoke the object's bias.
1952   // Note that we first construct the presumed unbiased header so we
1953   // don't accidentally blow away another thread's valid bias.
1954   andi(mark_reg, mark_reg, (markOopDesc::biased_lock_mask_in_place |
1955                                 markOopDesc::age_mask_in_place |
1956                                 markOopDesc::epoch_mask_in_place));
1957   orr(temp_reg, R16_thread, mark_reg);
1958 
1959   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
1960 
1961   // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg).
1962   cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
1963            /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
1964            /*where=*/obj_reg,
1965            MacroAssembler::MemBarAcq,
1966            MacroAssembler::cmpxchgx_hint_acquire_lock(),
1967            noreg, slow_case_int); // bail out if failed
1968 
1969   // If the biasing toward our thread failed, this means that
1970   // another thread succeeded in biasing it toward itself and we
1971   // need to revoke that bias. The revocation will occur in the
1972   // interpreter runtime in the slow case.
1973   if (PrintBiasedLockingStatistics) {
1974     load_const(temp2_reg, (address) BiasedLocking::anonymously_biased_lock_entry_count_addr(), temp_reg);
1975     lwzx(temp_reg, temp2_reg);
1976     addi(temp_reg, temp_reg, 1);
1977     stwx(temp_reg, temp2_reg);
1978   }
1979   b(done);
1980 
1981   bind(try_rebias);
1982   // At this point we know the epoch has expired, meaning that the
1983   // current "bias owner", if any, is actually invalid. Under these
1984   // circumstances _only_, we are allowed to use the current header's
1985   // value as the comparison value when doing the cas to acquire the
1986   // bias in the current epoch. In other words, we allow transfer of
1987   // the bias from one thread to another directly in this situation.
1988   load_klass(temp_reg, obj_reg);
1989   andi(temp2_reg, mark_reg, markOopDesc::age_mask_in_place);
1990   orr(temp2_reg, R16_thread, temp2_reg);
1991   ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
1992   orr(temp_reg, temp2_reg, temp_reg);
1993 
1994   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
1995 
1996   cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
1997                  /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
1998                  /*where=*/obj_reg,
1999                  MacroAssembler::MemBarAcq,
2000                  MacroAssembler::cmpxchgx_hint_acquire_lock(),
2001                  noreg, slow_case_int); // bail out if failed
2002 
2003   // If the biasing toward our thread failed, this means that
2004   // another thread succeeded in biasing it toward itself and we
2005   // need to revoke that bias. The revocation will occur in the
2006   // interpreter runtime in the slow case.
2007   if (PrintBiasedLockingStatistics) {
2008     load_const(temp2_reg, (address) BiasedLocking::rebiased_lock_entry_count_addr(), temp_reg);
2009     lwzx(temp_reg, temp2_reg);
2010     addi(temp_reg, temp_reg, 1);
2011     stwx(temp_reg, temp2_reg);
2012   }
2013   b(done);
2014 
2015   bind(try_revoke_bias);
2016   // The prototype mark in the klass doesn't have the bias bit set any
2017   // more, indicating that objects of this data type are not supposed
2018   // to be biased any more. We are going to try to reset the mark of
2019   // this object to the prototype value and fall through to the
2020   // CAS-based locking scheme. Note that if our CAS fails, it means
2021   // that another thread raced us for the privilege of revoking the
2022   // bias of this particular object, so it's okay to continue in the
2023   // normal locking code.
2024   load_klass(temp_reg, obj_reg);
2025   ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
2026   andi(temp2_reg, mark_reg, markOopDesc::age_mask_in_place);
2027   orr(temp_reg, temp_reg, temp2_reg);
2028 
2029   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2030 
2031   // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg).
2032   cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
2033                  /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
2034                  /*where=*/obj_reg,
2035                  MacroAssembler::MemBarAcq,
2036                  MacroAssembler::cmpxchgx_hint_acquire_lock());
2037 
2038   // reload markOop in mark_reg before continuing with lightweight locking
2039   ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg);
2040 
2041   // Fall through to the normal CAS-based lock, because no matter what
2042   // the result of the above CAS, some thread must have succeeded in
2043   // removing the bias bit from the object's header.
2044   if (PrintBiasedLockingStatistics) {
2045     Label l;
2046     bne(cr_reg, l);
2047     load_const(temp2_reg, (address) BiasedLocking::revoked_lock_entry_count_addr(), temp_reg);
2048     lwzx(temp_reg, temp2_reg);
2049     addi(temp_reg, temp_reg, 1);
2050     stwx(temp_reg, temp2_reg);
2051     bind(l);
2052   }
2053 
2054   bind(cas_label);
2055 }
2056 
2057 void MacroAssembler::biased_locking_exit (ConditionRegister cr_reg, Register mark_addr, Register temp_reg, Label& done) {
2058   // Check for biased locking unlock case, which is a no-op
2059   // Note: we do not have to check the thread ID for two reasons.
2060   // First, the interpreter checks for IllegalMonitorStateException at
2061   // a higher level. Second, if the bias was revoked while we held the
2062   // lock, the object could not be rebiased toward another thread, so
2063   // the bias bit would be clear.
2064 
2065   ld(temp_reg, 0, mark_addr);
2066   andi(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
2067 
2068   cmpwi(cr_reg, temp_reg, markOopDesc::biased_lock_pattern);
2069   beq(cr_reg, done);
2070 }
2071 
2072 // allocation (for C1)
2073 void MacroAssembler::eden_allocate(
2074   Register obj,                      // result: pointer to object after successful allocation
2075   Register var_size_in_bytes,        // object size in bytes if unknown at compile time; invalid otherwise
2076   int      con_size_in_bytes,        // object size in bytes if   known at compile time
2077   Register t1,                       // temp register
2078   Register t2,                       // temp register
2079   Label&   slow_case                 // continuation point if fast allocation fails
2080 ) {
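  // Inline eden allocation is not implemented on this platform; always take
  // the slow path (allocation via the runtime).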
2081   b(slow_case);
2082 }
2083 
2084 void MacroAssembler::tlab_allocate(
2085   Register obj,                      // result: pointer to object after successful allocation
2086   Register var_size_in_bytes,        // object size in bytes if unknown at compile time; invalid otherwise
2087   int      con_size_in_bytes,        // object size in bytes if   known at compile time
2088   Register t1,                       // temp register
2089   Label&   slow_case                 // continuation point if fast allocation fails
2090 ) {
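  // Bump-pointer allocation from the thread-local allocation buffer, in pseudo code:
  //   obj = thread->tlab_top;
  //   new_top = obj + size;
  //   if (new_top > thread->tlab_end) goto slow_case;
  //   thread->tlab_top = new_top;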
2091   // make sure arguments make sense
2092   assert_different_registers(obj, var_size_in_bytes, t1);
2093   assert(0 <= con_size_in_bytes && is_simm13(con_size_in_bytes), "illegal object size");
2094   assert((con_size_in_bytes & MinObjAlignmentInBytesMask) == 0, "object size is not multiple of alignment");
2095 
2096   const Register new_top = t1;
2097   //verify_tlab(); not implemented
2098 
2099   ld(obj, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
2100   ld(R0, in_bytes(JavaThread::tlab_end_offset()), R16_thread);
2101   if (var_size_in_bytes == noreg) {
2102     addi(new_top, obj, con_size_in_bytes);
2103   } else {
2104     add(new_top, obj, var_size_in_bytes);
2105   }
2106   cmpld(CCR0, new_top, R0);
2107   bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_case);
2108 
2109 #ifdef ASSERT
2110   // make sure new free pointer is properly aligned
2111   {
2112     Label L;
2113     andi_(R0, new_top, MinObjAlignmentInBytesMask);
2114     beq(CCR0, L);
2115     stop("updated TLAB free is not properly aligned", 0x934);
2116     bind(L);
2117   }
2118 #endif // ASSERT
2119 
2120   // update the tlab top pointer
2121   std(new_top, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
2122   //verify_tlab(); not implemented
2123 }
2124 void MacroAssembler::tlab_refill(Label& retry_tlab, Label& try_eden, Label& slow_case) {
2125   unimplemented("tlab_refill");
2126 }
2127 void MacroAssembler::incr_allocated_bytes(RegisterOrConstant size_in_bytes, Register t1, Register t2) {
2128   unimplemented("incr_allocated_bytes");
2129 }
2130 
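// Emit a trampoline stub for a far call: the stub loads the actual call target
// from the constant pool (TOC) and branches to it via CTR. The near call at
// insts_call_instruction_offset can then be patched to target this stub when
// its destination is out of range of a direct branch.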
2131 address MacroAssembler::emit_trampoline_stub(int destination_toc_offset,
2132                                              int insts_call_instruction_offset, Register Rtoc) {
2133   // Start the stub.
2134   address stub = start_a_stub(64);
2135   if (stub == NULL) { return NULL; } // CodeCache full: bail out
2136 
2137   // Create a trampoline stub relocation which relates this trampoline stub
2138   // with the call instruction at insts_call_instruction_offset in the
2139   // instructions code-section.
2140   relocate(trampoline_stub_Relocation::spec(code()->insts()->start() + insts_call_instruction_offset));
2141   const int stub_start_offset = offset();
2142 
2143   // For java_to_interp stubs we use R11_scratch1 as scratch register
2144   // and in call trampoline stubs we use R12_scratch2. This way we
2145   // can distinguish them (see is_NativeCallTrampolineStub_at()).
2146   Register reg_scratch = R12_scratch2;
2147 
2148   // Now, create the trampoline stub's code:
2149   // - load the TOC
2150   // - load the call target from the constant pool
2151   // - call
2152   if (Rtoc == noreg) {
2153     calculate_address_from_global_toc(reg_scratch, method_toc());
2154     Rtoc = reg_scratch;
2155   }
2156 
2157   ld_largeoffset_unchecked(reg_scratch, destination_toc_offset, Rtoc, false);
2158   mtctr(reg_scratch);
2159   bctr();
2160 
2161   const address stub_start_addr = addr_at(stub_start_offset);
2162 
2163   // Assert that the encoded destination_toc_offset can be identified and that it is correct.
2164   assert(destination_toc_offset == NativeCallTrampolineStub_at(stub_start_addr)->destination_toc_offset(),
2165          "encoded offset into the constant pool must match");
2166   // Trampoline_stub_size should be good.
2167   assert((uint)(offset() - stub_start_offset) <= trampoline_stub_size, "should be good size");
2168   assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");
2169 
2170   // End the stub.
2171   end_a_stub();
2172   return stub;
2173 }
2174 
2175 // TM on PPC64.
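// Atomically increment *addr by simm16 and return the new value in 'result',
// using a ldarx/stdcx_ retry loop.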
2176 void MacroAssembler::atomic_inc_ptr(Register addr, Register result, int simm16) {
2177   Label retry;
2178   bind(retry);
2179   ldarx(result, addr, /*hint*/ false);
2180   addi(result, result, simm16);
2181   stdcx_(result, addr);
2182   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
2183     bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
2184   } else {
2185     bne(                  CCR0, retry); // stXcx_ sets CCR0
2186   }
2187 }
2188 
2189 void MacroAssembler::atomic_ori_int(Register addr, Register result, int uimm16) {
2190   Label retry;
2191   bind(retry);
2192   lwarx(result, addr, /*hint*/ false);
2193   ori(result, result, uimm16);
2194   stwcx_(result, addr);
2195   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
2196     bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
2197   } else {
2198     bne(                  CCR0, retry); // stXcx_ sets CCR0
2199   }
2200 }
2201 
2202 #if INCLUDE_RTM_OPT
2203 
2204 // Update rtm_counters based on abort status
2205 // input: abort_status
2206 //        rtm_counters (RTMLockingCounters*)
2207 void MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters_Reg) {
2208   // Mapping to keep PreciseRTMLockingStatistics similar to x86.
2209   // x86 ppc (! means inverted, ? means not the same)
2210   //  0   31  Set if abort caused by XABORT instruction.
2211   //  1  ! 7  If set, the transaction may succeed on a retry. This bit is always clear if bit 0 is set.
2212   //  2   13  Set if another logical processor conflicted with a memory address that was part of the transaction that aborted.
2213   //  3   10  Set if an internal buffer overflowed.
2214   //  4  ?12  Set if a debug breakpoint was hit.
2215   //  5  ?32  Set if an abort occurred during execution of a nested transaction.
2216   const  int tm_failure_bit[] = {Assembler::tm_tabort, // Note: Seems like signal handler sets this, too.
2217                                  Assembler::tm_failure_persistent, // inverted: transient
2218                                  Assembler::tm_trans_cf,
2219                                  Assembler::tm_footprint_of,
2220                                  Assembler::tm_non_trans_cf,
2221                                  Assembler::tm_suspended};
2222   const bool tm_failure_inv[] = {false, true, false, false, false, false};
2223   assert(sizeof(tm_failure_bit)/sizeof(int) == RTMLockingCounters::ABORT_STATUS_LIMIT, "adapt mapping!");
2224 
2225   const Register addr_Reg = R0;
2226   // Keep track of offset to where rtm_counters_Reg had pointed to.
2227   int counters_offs = RTMLockingCounters::abort_count_offset();
2228   addi(addr_Reg, rtm_counters_Reg, counters_offs);
2229   const Register temp_Reg = rtm_counters_Reg;
2230 
2231   //atomic_inc_ptr(addr_Reg, temp_Reg); We don't increment atomically
2232   ldx(temp_Reg, addr_Reg);
2233   addi(temp_Reg, temp_Reg, 1);
2234   stdx(temp_Reg, addr_Reg);
2235 
2236   if (PrintPreciseRTMLockingStatistics) {
2237     int counters_offs_delta = RTMLockingCounters::abortX_count_offset() - counters_offs;
2238 
2239     //mftexasr(abort_status); done by caller
2240     for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
2241       counters_offs += counters_offs_delta;
2242       li(temp_Reg, counters_offs_delta); // can't use addi with R0
2243       add(addr_Reg, addr_Reg, temp_Reg); // point to next counter
2244       counters_offs_delta = sizeof(uintx);
2245 
2246       Label check_abort;
2247       rldicr_(temp_Reg, abort_status, tm_failure_bit[i], 0);
2248       if (tm_failure_inv[i]) {
2249         bne(CCR0, check_abort);
2250       } else {
2251         beq(CCR0, check_abort);
2252       }
2253       //atomic_inc_ptr(addr_Reg, temp_Reg); We don't increment atomically
2254       ldx(temp_Reg, addr_Reg);
2255       addi(temp_Reg, temp_Reg, 1);
2256       stdx(temp_Reg, addr_Reg);
2257       bind(check_abort);
2258     }
2259   }
2260   li(temp_Reg, -counters_offs); // can't use addi with R0
2261   add(rtm_counters_Reg, addr_Reg, temp_Reg); // restore
2262 }
2263 
// Branch if ((random & (count-1)) != 0); count must be a power of two (2^n).
2265 // tmp and CR0 are killed
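// mftb reads the time base register; its low-order bits serve as a cheap
// pseudo-random source, so the caller's counter update is taken roughly once
// every 'count' invocations.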
2266 void MacroAssembler::branch_on_random_using_tb(Register tmp, int count, Label& brLabel) {
2267   mftb(tmp);
2268   andi_(tmp, tmp, count-1);
2269   bne(CCR0, brLabel);
2270 }
2271 
2272 // Perform abort ratio calculation, set no_rtm bit if high ratio.
2273 // input:  rtm_counters_Reg (RTMLockingCounters* address) - KILLED
2274 void MacroAssembler::rtm_abort_ratio_calculation(Register rtm_counters_Reg,
2275                                                  RTMLockingCounters* rtm_counters,
2276                                                  Metadata* method_data) {
2277   Label L_done, L_check_always_rtm1, L_check_always_rtm2;
2278 
2279   if (RTMLockingCalculationDelay > 0) {
2280     // Delay calculation.
2281     ld(rtm_counters_Reg, (RegisterOrConstant)(intptr_t)RTMLockingCounters::rtm_calculation_flag_addr());
2282     cmpdi(CCR0, rtm_counters_Reg, 0);
2283     beq(CCR0, L_done);
2284     load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload
2285   }
2286   // Abort ratio calculation only if abort_count > RTMAbortThreshold.
2287   //   Aborted transactions = abort_count * 100
2288   //   All transactions = total_count *  RTMTotalCountIncrRate
2289   //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
2290   ld(R0, RTMLockingCounters::abort_count_offset(), rtm_counters_Reg);
2291   cmpdi(CCR0, R0, RTMAbortThreshold);
2292   blt(CCR0, L_check_always_rtm2);
2293   mulli(R0, R0, 100);
2294 
2295   const Register tmpReg = rtm_counters_Reg;
2296   ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg);
2297   mulli(tmpReg, tmpReg, RTMTotalCountIncrRate);
2298   mulli(tmpReg, tmpReg, RTMAbortRatio);
2299   cmpd(CCR0, R0, tmpReg);
2300   blt(CCR0, L_check_always_rtm1); // jump to reload
2301   if (method_data != NULL) {
2302     // Set rtm_state to "no rtm" in MDO.
2303     // Not using a metadata relocation. Method and Class Loader are kept alive anyway.
2304     // (See nmethod::metadata_do and CodeBuffer::finalize_oop_references.)
2305     load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg);
2306     atomic_ori_int(R0, tmpReg, NoRTM);
2307   }
2308   b(L_done);
2309 
2310   bind(L_check_always_rtm1);
2311   load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload
2312   bind(L_check_always_rtm2);
2313   ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg);
2314   cmpdi(CCR0, tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
2315   blt(CCR0, L_done);
2316   if (method_data != NULL) {
2317     // Set rtm_state to "always rtm" in MDO.
2318     // Not using a metadata relocation. See above.
2319     load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg);
2320     atomic_ori_int(R0, tmpReg, UseRTM);
2321   }
2322   bind(L_done);
2323 }
2324 
2325 // Update counters and perform abort ratio calculation.
2326 // input: abort_status_Reg
2327 void MacroAssembler::rtm_profiling(Register abort_status_Reg, Register temp_Reg,
2328                                    RTMLockingCounters* rtm_counters,
2329                                    Metadata* method_data,
2330                                    bool profile_rtm) {
2331 
2332   assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2333   // Update rtm counters based on state at abort.
2334   // Reads abort_status_Reg, updates flags.
2335   assert_different_registers(abort_status_Reg, temp_Reg);
2336   load_const_optimized(temp_Reg, (address)rtm_counters, R0);
2337   rtm_counters_update(abort_status_Reg, temp_Reg);
2338   if (profile_rtm) {
2339     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2340     rtm_abort_ratio_calculation(temp_Reg, rtm_counters, method_data);
2341   }
2342 }
2343 
2344 // Retry on abort if abort's status indicates non-persistent failure.
2345 // inputs: retry_count_Reg
2346 //       : abort_status_Reg
2347 // output: retry_count_Reg decremented by 1
2348 void MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg,
2349                                              Label& retryLabel, Label* checkRetry) {
2350   Label doneRetry;
2351   rldicr_(R0, abort_status_Reg, tm_failure_persistent, 0);
2352   bne(CCR0, doneRetry);
2353   if (checkRetry) { bind(*checkRetry); }
2354   addic_(retry_count_Reg, retry_count_Reg, -1);
2355   blt(CCR0, doneRetry);
2356   smt_yield(); // Can't use wait(). No permission (SIGILL).
2357   b(retryLabel);
2358   bind(doneRetry);
2359 }
2360 
2361 // Spin and retry if lock is busy.
// inputs: owner_addr_Reg (address of the monitor's _owner field)
2363 //       : retry_count_Reg
2364 // output: retry_count_Reg decremented by 1
2365 // CTR is killed
2366 void MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register owner_addr_Reg, Label& retryLabel) {
2367   Label SpinLoop, doneRetry;
2368   addic_(retry_count_Reg, retry_count_Reg, -1);
2369   blt(CCR0, doneRetry);
2370   li(R0, RTMSpinLoopCount);
2371   mtctr(R0);
2372 
2373   bind(SpinLoop);
2374   smt_yield(); // Can't use waitrsv(). No permission (SIGILL).
2375   bdz(retryLabel);
2376   ld(R0, 0, owner_addr_Reg);
2377   cmpdi(CCR0, R0, 0);
2378   bne(CCR0, SpinLoop);
2379   b(retryLabel);
2380 
2381   bind(doneRetry);
2382 }
2383 
2384 // Use RTM for normal stack locks.
// Input: obj (object to lock)
2386 void MacroAssembler::rtm_stack_locking(ConditionRegister flag,
2387                                        Register obj, Register mark_word, Register tmp,
2388                                        Register retry_on_abort_count_Reg,
2389                                        RTMLockingCounters* stack_rtm_counters,
2390                                        Metadata* method_data, bool profile_rtm,
2391                                        Label& DONE_LABEL, Label& IsInflated) {
2392   assert(UseRTMForStackLocks, "why call this otherwise?");
2393   assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
2394   Label L_rtm_retry, L_decrement_retry, L_on_abort;
2395 
2396   if (RTMRetryCount > 0) {
2397     load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
2398     bind(L_rtm_retry);
2399   }
2400   andi_(R0, mark_word, markOopDesc::monitor_value);  // inflated vs stack-locked|neutral|biased
2401   bne(CCR0, IsInflated);
2402 
2403   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2404     Label L_noincrement;
2405     if (RTMTotalCountIncrRate > 1) {
2406       branch_on_random_using_tb(tmp, (int)RTMTotalCountIncrRate, L_noincrement);
2407     }
2408     assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
2409     load_const_optimized(tmp, (address)stack_rtm_counters->total_count_addr(), R0);
2410     //atomic_inc_ptr(tmp, /*temp, will be reloaded*/mark_word); We don't increment atomically
2411     ldx(mark_word, tmp);
2412     addi(mark_word, mark_word, 1);
2413     stdx(mark_word, tmp);
2414     bind(L_noincrement);
2415   }
2416   tbegin_();
2417   beq(CCR0, L_on_abort);
2418   ld(mark_word, oopDesc::mark_offset_in_bytes(), obj);         // Reload in transaction, conflicts need to be tracked.
2419   andi(R0, mark_word, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
2420   cmpwi(flag, R0, markOopDesc::unlocked_value);                // bits = 001 unlocked
2421   beq(flag, DONE_LABEL);                                       // all done if unlocked
2422 
2423   if (UseRTMXendForLockBusy) {
2424     tend_();
2425     b(L_decrement_retry);
2426   } else {
2427     tabort_();
2428   }
2429   bind(L_on_abort);
2430   const Register abort_status_Reg = tmp;
2431   mftexasr(abort_status_Reg);
2432   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2433     rtm_profiling(abort_status_Reg, /*temp*/mark_word, stack_rtm_counters, method_data, profile_rtm);
2434   }
2435   ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // reload
2436   if (RTMRetryCount > 0) {
2437     // Retry on lock abort if abort status is not permanent.
2438     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry, &L_decrement_retry);
2439   } else {
2440     bind(L_decrement_retry);
2441   }
2442 }
2443 
// Use RTM for inflated locks.
2445 // inputs: obj       (object to lock)
2446 //         mark_word (current header - KILLED)
2447 //         boxReg    (on-stack box address (displaced header location) - KILLED)
2448 void MacroAssembler::rtm_inflated_locking(ConditionRegister flag,
2449                                           Register obj, Register mark_word, Register boxReg,
2450                                           Register retry_on_busy_count_Reg, Register retry_on_abort_count_Reg,
2451                                           RTMLockingCounters* rtm_counters,
2452                                           Metadata* method_data, bool profile_rtm,
2453                                           Label& DONE_LABEL) {
2454   assert(UseRTMLocking, "why call this otherwise?");
2455   Label L_rtm_retry, L_decrement_retry, L_on_abort;
2456   // Clean monitor_value bit to get valid pointer.
2457   int owner_offset = ObjectMonitor::owner_offset_in_bytes() - markOopDesc::monitor_value;
2458 
2459   // Store non-null, using boxReg instead of (intptr_t)markOopDesc::unused_mark().
2460   std(boxReg, BasicLock::displaced_header_offset_in_bytes(), boxReg);
2461   const Register tmpReg = boxReg;
2462   const Register owner_addr_Reg = mark_word;
2463   addi(owner_addr_Reg, mark_word, owner_offset);
2464 
2465   if (RTMRetryCount > 0) {
2466     load_const_optimized(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy.
2467     load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort.
2468     bind(L_rtm_retry);
2469   }
2470   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2471     Label L_noincrement;
2472     if (RTMTotalCountIncrRate > 1) {
2473       branch_on_random_using_tb(R0, (int)RTMTotalCountIncrRate, L_noincrement);
2474     }
2475     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2476     load_const(R0, (address)rtm_counters->total_count_addr(), tmpReg);
2477     //atomic_inc_ptr(R0, tmpReg); We don't increment atomically
2478     ldx(tmpReg, R0);
2479     addi(tmpReg, tmpReg, 1);
2480     stdx(tmpReg, R0);
2481     bind(L_noincrement);
2482   }
2483   tbegin_();
2484   beq(CCR0, L_on_abort);
2485   // We don't reload mark word. Will only be reset at safepoint.
2486   ld(R0, 0, owner_addr_Reg); // Load in transaction, conflicts need to be tracked.
2487   cmpdi(flag, R0, 0);
2488   beq(flag, DONE_LABEL);
2489 
2490   if (UseRTMXendForLockBusy) {
2491     tend_();
2492     b(L_decrement_retry);
2493   } else {
2494     tabort_();
2495   }
2496   bind(L_on_abort);
2497   const Register abort_status_Reg = tmpReg;
2498   mftexasr(abort_status_Reg);
2499   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2500     rtm_profiling(abort_status_Reg, /*temp*/ owner_addr_Reg, rtm_counters, method_data, profile_rtm);
2501     // Restore owner_addr_Reg
2502     ld(mark_word, oopDesc::mark_offset_in_bytes(), obj);
2503 #ifdef ASSERT
2504     andi_(R0, mark_word, markOopDesc::monitor_value);
2505     asm_assert_ne("must be inflated", 0xa754); // Deflating only allowed at safepoint.
2506 #endif
2507     addi(owner_addr_Reg, mark_word, owner_offset);
2508   }
2509   if (RTMRetryCount > 0) {
2510     // Retry on lock abort if abort status is not permanent.
2511     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
2512   }
2513 
2514   // Appears unlocked - try to swing _owner from null to non-null.
2515   cmpxchgd(flag, /*current val*/ R0, (intptr_t)0, /*new val*/ R16_thread, owner_addr_Reg,
2516            MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2517            MacroAssembler::cmpxchgx_hint_acquire_lock(), noreg, &L_decrement_retry, true);
2518 
2519   if (RTMRetryCount > 0) {
    // On success we are done; otherwise (via L_decrement_retry) spin and retry.
2521     b(DONE_LABEL);
2522     bind(L_decrement_retry);
2523     // Spin and retry if lock is busy.
2524     rtm_retry_lock_on_busy(retry_on_busy_count_Reg, owner_addr_Reg, L_rtm_retry);
2525   } else {
2526     bind(L_decrement_retry);
2527   }
2528 }
2529 
2530 #endif //  INCLUDE_RTM_OPT
2531 
2532 // "The box" is the space on the stack where we copy the object mark.
2533 void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register oop, Register box,
2534                                                Register temp, Register displaced_header, Register current_header,
2535                                                bool try_bias,
2536                                                RTMLockingCounters* rtm_counters,
2537                                                RTMLockingCounters* stack_rtm_counters,
2538                                                Metadata* method_data,
2539                                                bool use_rtm, bool profile_rtm) {
2540   assert_different_registers(oop, box, temp, displaced_header, current_header);
2541   assert(flag != CCR0, "bad condition register");
2542   Label cont;
2543   Label object_has_monitor;
2544   Label cas_failed;
2545 
2546   // Load markOop from object into displaced_header.
2547   ld(displaced_header, oopDesc::mark_offset_in_bytes(), oop);
2548 
2549 
2550   // Always do locking in runtime.
2551   if (EmitSync & 0x01) {
2552     cmpdi(flag, oop, 0); // Oop can't be 0 here => always false.
2553     return;
2554   }
2555 
2556   if (try_bias) {
2557     biased_locking_enter(flag, oop, displaced_header, temp, current_header, cont);
2558   }
2559 
2560 #if INCLUDE_RTM_OPT
2561   if (UseRTMForStackLocks && use_rtm) {
2562     rtm_stack_locking(flag, oop, displaced_header, temp, /*temp*/ current_header,
2563                       stack_rtm_counters, method_data, profile_rtm,
2564                       cont, object_has_monitor);
2565   }
2566 #endif // INCLUDE_RTM_OPT
2567 
2568   // Handle existing monitor.
2569   if ((EmitSync & 0x02) == 0) {
2570     // The object has an existing monitor iff (mark & monitor_value) != 0.
2571     andi_(temp, displaced_header, markOopDesc::monitor_value);
2572     bne(CCR0, object_has_monitor);
2573   }
2574 
2575   // Set displaced_header to be (markOop of object | UNLOCK_VALUE).
2576   ori(displaced_header, displaced_header, markOopDesc::unlocked_value);
2577 
2578   // Load Compare Value application register.
2579 
2580   // Initialize the box. (Must happen before we update the object mark!)
2581   std(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2582 
  // Must fence, otherwise preceding store(s) may float below the cmpxchg.
  // Compare the object markOop with displaced_header; if equal, exchange it with box.
2585   cmpxchgd(/*flag=*/flag,
2586            /*current_value=*/current_header,
2587            /*compare_value=*/displaced_header,
2588            /*exchange_value=*/box,
2589            /*where=*/oop,
2590            MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2591            MacroAssembler::cmpxchgx_hint_acquire_lock(),
2592            noreg,
2593            &cas_failed,
2594            /*check without membar and ldarx first*/true);
2595   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2596 
2597   // If the compare-and-exchange succeeded, then we found an unlocked
2598   // object and we have now locked it.
2599   b(cont);
2600 
2601   bind(cas_failed);
2602   // We did not see an unlocked object so try the fast recursive case.
2603 
2604   // Check if the owner is self by comparing the value in the markOop of object
2605   // (current_header) with the stack pointer.
2606   sub(current_header, current_header, R1_SP);
2607   load_const_optimized(temp, ~(os::vm_page_size()-1) | markOopDesc::lock_mask_in_place);
2608 
2609   and_(R0/*==0?*/, current_header, temp);
  // If the condition is true the lock is a recursive stack lock and we can
  // store 0 as the displaced header in the box, which indicates a recursive lock.
  mcrf(flag, CCR0);
2613   std(R0/*==0, perhaps*/, BasicLock::displaced_header_offset_in_bytes(), box);
2614 
2615   // Handle existing monitor.
2616   if ((EmitSync & 0x02) == 0) {
2617     b(cont);
2618 
2619     bind(object_has_monitor);
2620     // The object's monitor m is unlocked iff m->owner == NULL,
2621     // otherwise m->owner may contain a thread or a stack address.
2622 
2623 #if INCLUDE_RTM_OPT
2624     // Use the same RTM locking code in 32- and 64-bit VM.
2625     if (use_rtm) {
2626       rtm_inflated_locking(flag, oop, displaced_header, box, temp, /*temp*/ current_header,
2627                            rtm_counters, method_data, profile_rtm, cont);
2628     } else {
2629 #endif // INCLUDE_RTM_OPT
2630 
2631     // Try to CAS m->owner from NULL to current thread.
2632     addi(temp, displaced_header, ObjectMonitor::owner_offset_in_bytes()-markOopDesc::monitor_value);
2633     cmpxchgd(/*flag=*/flag,
2634              /*current_value=*/current_header,
2635              /*compare_value=*/(intptr_t)0,
2636              /*exchange_value=*/R16_thread,
2637              /*where=*/temp,
2638              MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2639              MacroAssembler::cmpxchgx_hint_acquire_lock());
2640 
2641     // Store a non-null value into the box.
2642     std(box, BasicLock::displaced_header_offset_in_bytes(), box);
2643 
2644 #   ifdef ASSERT
2645     bne(flag, cont);
2646     // We have acquired the monitor, check some invariants.
2647     addi(/*monitor=*/temp, temp, -ObjectMonitor::owner_offset_in_bytes());
2648     // Invariant 1: _recursions should be 0.
2649     //assert(ObjectMonitor::recursions_size_in_bytes() == 8, "unexpected size");
2650     asm_assert_mem8_is_zero(ObjectMonitor::recursions_offset_in_bytes(), temp,
2651                             "monitor->_recursions should be 0", -1);
2652     // Invariant 2: OwnerIsThread shouldn't be 0.
2653     //assert(ObjectMonitor::OwnerIsThread_size_in_bytes() == 4, "unexpected size");
2654     //asm_assert_mem4_isnot_zero(ObjectMonitor::OwnerIsThread_offset_in_bytes(), temp,
2655     //                           "monitor->OwnerIsThread shouldn't be 0", -1);
2656 #   endif
2657 
2658 #if INCLUDE_RTM_OPT
2659     } // use_rtm()
2660 #endif
2661   }
2662 
2663   bind(cont);
2664   // flag == EQ indicates success
2665   // flag == NE indicates failure
2666 }
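
// Illustrative sketch (comment only, not emitted code; accessors simplified): the
// stack-locking fast path generated above corresponds roughly to the following
// pseudo-C, where 'mark' is the object's markOop and 'box' is the on-stack BasicLock:
//
//   markOop disp = obj->mark() | markOopDesc::unlocked_value;  // candidate displaced header
//   box->set_displaced_header(disp);                           // initialize the box first
//   if (CAS(&obj->mark, /*compare*/disp, /*exchange*/box)) {
//     // Object was unlocked and is now stack-locked by us.
//   } else if (((observed_mark - SP) & (~(os::vm_page_size()-1) | lock_mask_in_place)) == 0) {
//     box->set_displaced_header(0);                            // we already own it: recursive lock
//   } else {
//     // Inflated or contended: handled by the monitor path above or by the runtime.
//   }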
2667 
2668 void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register oop, Register box,
2669                                                  Register temp, Register displaced_header, Register current_header,
2670                                                  bool try_bias, bool use_rtm) {
2671   assert_different_registers(oop, box, temp, displaced_header, current_header);
2672   assert(flag != CCR0, "bad condition register");
2673   Label cont;
2674   Label object_has_monitor;
2675 
2676   // Always do locking in runtime.
2677   if (EmitSync & 0x01) {
2678     cmpdi(flag, oop, 0); // Oop can't be 0 here => always false.
2679     return;
2680   }
2681 
2682   if (try_bias) {
2683     biased_locking_exit(flag, oop, current_header, cont);
2684   }
2685 
2686 #if INCLUDE_RTM_OPT
2687   if (UseRTMForStackLocks && use_rtm) {
2688     assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
2689     Label L_regular_unlock;
2690     ld(current_header, oopDesc::mark_offset_in_bytes(), oop);         // fetch markword
2691     andi(R0, current_header, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
2692     cmpwi(flag, R0, markOopDesc::unlocked_value);                     // bits = 001 unlocked
2693     bne(flag, L_regular_unlock);                                      // else RegularLock
2694     tend_();                                                          // otherwise end...
2695     b(cont);                                                          // ... and we're done
2696     bind(L_regular_unlock);
2697   }
2698 #endif
2699 
2700   // Find the lock address and load the displaced header from the stack.
2701   ld(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2702 
2703   // If the displaced header is 0, we have a recursive unlock.
2704   cmpdi(flag, displaced_header, 0);
2705   beq(flag, cont);
2706 
2707   // Handle existing monitor.
2708   if ((EmitSync & 0x02) == 0) {
2709     // The object has an existing monitor iff (mark & monitor_value) != 0.
2710     RTM_OPT_ONLY( if (!(UseRTMForStackLocks && use_rtm)) ) // skip load if already done
2711     ld(current_header, oopDesc::mark_offset_in_bytes(), oop);
2712     andi_(R0, current_header, markOopDesc::monitor_value);
2713     bne(CCR0, object_has_monitor);
2714   }
2715 
2716   // Check if it is still a lightweight lock; this is true if we see
2717   // the stack address of the basicLock in the markOop of the object.
2718   // Cmpxchg sets flag to cmpd(current_header, box).
2719   cmpxchgd(/*flag=*/flag,
2720            /*current_value=*/current_header,
2721            /*compare_value=*/box,
2722            /*exchange_value=*/displaced_header,
2723            /*where=*/oop,
2724            MacroAssembler::MemBarRel,
2725            MacroAssembler::cmpxchgx_hint_release_lock(),
2726            noreg,
2727            &cont);
2728 
2729   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2730 
2731   // Handle existing monitor.
2732   if ((EmitSync & 0x02) == 0) {
2733     b(cont);
2734 
2735     bind(object_has_monitor);
2736     addi(current_header, current_header, -markOopDesc::monitor_value); // monitor
2737     ld(temp,             ObjectMonitor::owner_offset_in_bytes(), current_header);
2738 
2739     // It's inflated.
2740 #if INCLUDE_RTM_OPT
2741     if (use_rtm) {
2742       Label L_regular_inflated_unlock;
2743       // Clean monitor_value bit to get valid pointer
2744       cmpdi(flag, temp, 0);
2745       bne(flag, L_regular_inflated_unlock);
2746       tend_();
2747       b(cont);
2748       bind(L_regular_inflated_unlock);
2749     }
2750 #endif
2751 
2752     ld(displaced_header, ObjectMonitor::recursions_offset_in_bytes(), current_header);
2753     xorr(temp, R16_thread, temp);      // Will be 0 if we are the owner.
2754     orr(temp, temp, displaced_header); // Will be 0 if there are 0 recursions.
2755     cmpdi(flag, temp, 0);
2756     bne(flag, cont);
2757 
2758     ld(temp,             ObjectMonitor::EntryList_offset_in_bytes(), current_header);
2759     ld(displaced_header, ObjectMonitor::cxq_offset_in_bytes(), current_header);
2760     orr(temp, temp, displaced_header); // Will be 0 if both are 0.
2761     cmpdi(flag, temp, 0);
2762     bne(flag, cont);
2763     release();
2764     std(temp, ObjectMonitor::owner_offset_in_bytes(), current_header);
2765   }
2766 
2767   bind(cont);
2768   // flag == EQ indicates success
2769   // flag == NE indicates failure
2770 }
2771 
2772 // Write the serialization page so the VM thread can do a pseudo remote membar.
2773 // We use the current thread pointer to calculate a thread-specific
2774 // offset to write to within the page. This minimizes bus traffic
2775 // due to cache line collisions.
2776 void MacroAssembler::serialize_memory(Register thread, Register tmp1, Register tmp2) {
2777   srdi(tmp2, thread, os::get_serialize_page_shift_count());
2778 
2779   int mask = os::vm_page_size() - sizeof(int);
2780   if (Assembler::is_simm(mask, 16)) {
2781     andi(tmp2, tmp2, mask);
2782   } else {
2783     lis(tmp1, (int)((signed short) (mask >> 16)));
2784     ori(tmp1, tmp1, mask & 0x0000ffff);
2785     andr(tmp2, tmp2, tmp1);
2786   }
2787 
2788   load_const(tmp1, (long) os::get_memory_serialize_page());
2789   release();
2790   stwx(R0, tmp1, tmp2);
2791 }
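
// Illustrative sketch (comment only): the thread-specific offset computed above is,
// in plain C++ terms,
//
//   uintptr_t offset = ((uintptr_t)thread >> os::get_serialize_page_shift_count())
//                      & (os::vm_page_size() - sizeof(int));
//   // ...followed by a release() and a word store to get_memory_serialize_page() + offset.
//
// Different threads thereby tend to write different words (and cache lines) of the page.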
2792 
2793 
2794 // GC barrier helper macros
2795 
2796 // Write the card table byte if needed.
2797 void MacroAssembler::card_write_barrier_post(Register Rstore_addr, Register Rnew_val, Register Rtmp) {
2798   CardTableModRefBS* bs =
2799     barrier_set_cast<CardTableModRefBS>(Universe::heap()->barrier_set());
2800   assert(bs->kind() == BarrierSet::CardTableForRS ||
2801          bs->kind() == BarrierSet::CardTableExtension, "wrong barrier");
2802 #ifdef ASSERT
2803   cmpdi(CCR0, Rnew_val, 0);
2804   asm_assert_ne("null oop not allowed", 0x321);
2805 #endif
2806   card_table_write(bs->byte_map_base, Rtmp, Rstore_addr);
2807 }
2808 
2809 // Write the card table byte.
2810 void MacroAssembler::card_table_write(jbyte* byte_map_base, Register Rtmp, Register Robj) {
2811   assert_different_registers(Robj, Rtmp, R0);
2812   load_const_optimized(Rtmp, (address)byte_map_base, R0);
2813   srdi(Robj, Robj, CardTableModRefBS::card_shift);
2814   li(R0, 0); // dirty
2815   if (UseConcMarkSweepGC) membar(Assembler::StoreStore);
2816   stbx(R0, Rtmp, Robj);
2817 }
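
// Illustrative sketch (comment only): the card mark emitted above is equivalent to
//
//   byte_map_base[(uintptr_t)obj >> CardTableModRefBS::card_shift] = 0;  // 0 == dirty
//
// preceded by a StoreStore barrier when UseConcMarkSweepGC is set.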
2818 
2819 #if INCLUDE_ALL_GCS
2820 // General G1 pre-barrier generator.
2821 // Goal: record the previous value if it is not null.
2822 void MacroAssembler::g1_write_barrier_pre(Register Robj, RegisterOrConstant offset, Register Rpre_val,
2823                                           Register Rtmp1, Register Rtmp2, bool needs_frame) {
2824   Label runtime, filtered;
2825 
2826   // Is marking active?
2827   if (in_bytes(SATBMarkQueue::byte_width_of_active()) == 4) {
2828     lwz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_active()), R16_thread);
2829   } else {
2830     guarantee(in_bytes(SATBMarkQueue::byte_width_of_active()) == 1, "Assumption");
2831     lbz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_active()), R16_thread);
2832   }
2833   cmpdi(CCR0, Rtmp1, 0);
2834   beq(CCR0, filtered);
2835 
2836   // Do we need to load the previous value?
2837   if (Robj != noreg) {
2838     // Load the previous value...
2839     if (UseCompressedOops) {
2840       lwz(Rpre_val, offset, Robj);
2841     } else {
2842       ld(Rpre_val, offset, Robj);
2843     }
2844     // Previous value has been loaded into Rpre_val.
2845   }
2846   assert(Rpre_val != noreg, "must have a real register");
2847 
2848   // Is the previous value null?
2849   cmpdi(CCR0, Rpre_val, 0);
2850   beq(CCR0, filtered);
2851 
2852   if (Robj != noreg && UseCompressedOops) {
2853     decode_heap_oop_not_null(Rpre_val);
2854   }
2855 
2856   // OK, it's not filtered, so we'll need to call enqueue. In the normal
2857   // case, pre_val will be a scratch G-reg, but there are some cases in
2858   // which it's an O-reg. In the first case, do a normal call. In the
2859   // latter, do a save here and call the frameless version.
2860 
2861   // Can we store original value in the thread's buffer?
2862   // Is index == 0?
2863   // (The index field is typed as size_t.)
2864   const Register Rbuffer = Rtmp1, Rindex = Rtmp2;
2865 
2866   ld(Rindex, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_index()), R16_thread);
2867   cmpdi(CCR0, Rindex, 0);
2868   beq(CCR0, runtime); // If index == 0, goto runtime.
2869   ld(Rbuffer, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_buf()), R16_thread);
2870 
2871   addi(Rindex, Rindex, -wordSize); // Decrement index.
2872   std(Rindex, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_index()), R16_thread);
2873 
2874   // Record the previous value.
2875   stdx(Rpre_val, Rbuffer, Rindex);
2876   b(filtered);
2877 
2878   bind(runtime);
2879 
2880   // The VM call needs a frame to access (write) the O register.
2881   if (needs_frame) {
2882     save_LR_CR(Rtmp1);
2883     push_frame_reg_args(0, Rtmp2);
2884   }
2885 
2886   if (Rpre_val->is_volatile() && Robj == noreg) mr(R31, Rpre_val); // Save pre_val across C call if it was preloaded.
2887   call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), Rpre_val, R16_thread);
2888   if (Rpre_val->is_volatile() && Robj == noreg) mr(Rpre_val, R31); // restore
2889 
2890   if (needs_frame) {
2891     pop_frame();
2892     restore_LR_CR(Rtmp1);
2893   }
2894 
2895   bind(filtered);
2896 }
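
// Illustrative sketch (comment only; queue fields shown as plain members): the SATB
// pre-barrier emitted above implements roughly the following per-thread logic:
//
//   if (!satb_queue.active) return;                              // marking not in progress
//   oop pre_val = (Robj != noreg) ? *(obj + offset) : Rpre_val;  // previous value
//   if (pre_val == NULL) return;                                 // nothing to record
//   if (satb_queue.index == 0) {
//     SharedRuntime::g1_wb_pre(pre_val, thread);                 // slow path: runtime call
//   } else {
//     satb_queue.index -= wordSize;                              // index is a byte offset
//     *(oop*)(satb_queue.buf + satb_queue.index) = pre_val;      // fast path: store into buffer
//   }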
2897 
2898 // General G1 post-barrier generator
2899 // Store cross-region card.
2900 void MacroAssembler::g1_write_barrier_post(Register Rstore_addr, Register Rnew_val, Register Rtmp1, Register Rtmp2, Register Rtmp3, Label *filtered_ext) {
2901   Label runtime, filtered_int;
2902   Label& filtered = (filtered_ext != NULL) ? *filtered_ext : filtered_int;
2903   assert_different_registers(Rstore_addr, Rnew_val, Rtmp1, Rtmp2);
2904 
2905   G1SATBCardTableLoggingModRefBS* bs =
2906     barrier_set_cast<G1SATBCardTableLoggingModRefBS>(Universe::heap()->barrier_set());
2907 
2908   // Does store cross heap regions?
2909   if (G1RSBarrierRegionFilter) {
2910     xorr(Rtmp1, Rstore_addr, Rnew_val);
2911     srdi_(Rtmp1, Rtmp1, HeapRegion::LogOfHRGrainBytes);
2912     beq(CCR0, filtered);
2913   }
2914 
2915   // Crosses regions, storing NULL?
2916 #ifdef ASSERT
2917   cmpdi(CCR0, Rnew_val, 0);
2918   asm_assert_ne("null oop not allowed (G1)", 0x322); // Checked by caller on PPC64, so following branch is obsolete:
2919   //beq(CCR0, filtered);
2920 #endif
2921 
2922   // Storing region crossing non-NULL, is card already dirty?
2923   assert(sizeof(*bs->byte_map_base) == sizeof(jbyte), "adjust this code");
2924   const Register Rcard_addr = Rtmp1;
2925   Register Rbase = Rtmp2;
2926   load_const_optimized(Rbase, (address)bs->byte_map_base, /*temp*/ Rtmp3);
2927 
2928   srdi(Rcard_addr, Rstore_addr, CardTableModRefBS::card_shift);
2929 
2930   // Get the address of the card.
2931   lbzx(/*card value*/ Rtmp3, Rbase, Rcard_addr);
2932   cmpwi(CCR0, Rtmp3, (int)G1SATBCardTableModRefBS::g1_young_card_val());
2933   beq(CCR0, filtered);
2934 
2935   membar(Assembler::StoreLoad);
2936   lbzx(/*card value*/ Rtmp3, Rbase, Rcard_addr);  // Reload after membar.
2937   cmpwi(CCR0, Rtmp3 /* card value */, CardTableModRefBS::dirty_card_val());
2938   beq(CCR0, filtered);
2939 
2940   // Storing a region crossing, non-NULL oop, card is clean.
2941   // Dirty card and log.
2942   li(Rtmp3, CardTableModRefBS::dirty_card_val());
2943   //release(); // G1: oops are allowed to get visible after dirty marking.
2944   stbx(Rtmp3, Rbase, Rcard_addr);
2945 
2946   add(Rcard_addr, Rbase, Rcard_addr); // This is the address which needs to get enqueued.
2947   Rbase = noreg; // end of lifetime
2948 
2949   const Register Rqueue_index = Rtmp2,
2950                  Rqueue_buf   = Rtmp3;
2951   ld(Rqueue_index, in_bytes(JavaThread::dirty_card_queue_offset() + DirtyCardQueue::byte_offset_of_index()), R16_thread);
2952   cmpdi(CCR0, Rqueue_index, 0);
2953   beq(CCR0, runtime); // If index == 0, goto runtime.
2954   ld(Rqueue_buf, in_bytes(JavaThread::dirty_card_queue_offset() + DirtyCardQueue::byte_offset_of_buf()), R16_thread);
2955 
2956   addi(Rqueue_index, Rqueue_index, -wordSize); // decrement index
2957   std(Rqueue_index, in_bytes(JavaThread::dirty_card_queue_offset() + DirtyCardQueue::byte_offset_of_index()), R16_thread);
2958 
2959   stdx(Rcard_addr, Rqueue_buf, Rqueue_index); // store card
2960   b(filtered);
2961 
2962   bind(runtime);
2963 
2964   // Save the live input values.
2965   call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), Rcard_addr, R16_thread);
2966 
2967   bind(filtered_int);
2968 }
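
// Illustrative sketch (comment only; queue fields shown as plain members): the G1
// post-barrier emitted above corresponds roughly to
//
//   if (((store_addr ^ new_val) >> HeapRegion::LogOfHRGrainBytes) == 0) return;  // same region
//   volatile jbyte* card = byte_map_base + ((uintptr_t)store_addr >> card_shift);
//   if (*card == g1_young_card_val()) return;
//   StoreLoad();                                       // membar, then re-read the card
//   if (*card == dirty_card_val()) return;
//   *card = dirty_card_val();                          // dirty the card, then log it:
//   if (dcq.index == 0) { SharedRuntime::g1_wb_post((void*)card, thread); }
//   else { dcq.index -= wordSize; *(void**)(dcq.buf + dcq.index) = (void*)card; }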
2969 #endif // INCLUDE_ALL_GCS
2970 
2971 // Values for last_Java_pc, and last_Java_sp must comply to the rules
2972 // in frame_ppc.hpp.
2973 void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) {
2974   // Always set last_Java_pc and flags first because once last_Java_sp
2975   // is visible, has_last_Java_frame is true and users will look at the
2976   // rest of the fields. (Note: the flags should always be zero before we
2977   // get here, so they don't need to be set.)
2978 
2979   // Verify that last_Java_pc was zeroed on return to Java
2980   asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread,
2981                           "last_Java_pc not zeroed before leaving Java", 0x200);
2982 
2983   // When returning from calling out from Java mode the frame anchor's
2984   // last_Java_pc will always be set to NULL. It is set here so that
2985   // if we are doing a call to native (not VM) that we capture the
2986   // known pc and don't have to rely on the native call having a
2987   // standard frame linkage where we can find the pc.
2988   if (last_Java_pc != noreg)
2989     std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
2990 
2991   // Set last_Java_sp last.
2992   std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
2993 }
2994 
2995 void MacroAssembler::reset_last_Java_frame(void) {
2996   asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()),
2997                              R16_thread, "SP was not set, still zero", 0x202);
2998 
2999   BLOCK_COMMENT("reset_last_Java_frame {");
3000   li(R0, 0);
3001 
3002   // _last_Java_sp = 0
3003   std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
3004 
3005   // _last_Java_pc = 0
3006   std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
3007   BLOCK_COMMENT("} reset_last_Java_frame");
3008 }
3009 
3010 void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1) {
3011   assert_different_registers(sp, tmp1);
3012 
3013   // sp points to a TOP_IJAVA_FRAME, retrieve frame's PC via
3014   // TOP_IJAVA_FRAME_ABI.
3015   // FIXME: assert that we really have a TOP_IJAVA_FRAME here!
3016   address entry = pc();
3017   load_const_optimized(tmp1, entry);
3018 
3019   set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1);
3020 }
3021 
3022 void MacroAssembler::get_vm_result(Register oop_result) {
3023   // Read:
3024   //   R16_thread
3025   //   R16_thread->in_bytes(JavaThread::vm_result_offset())
3026   //
3027   // Updated:
3028   //   oop_result
3029   //   R16_thread->in_bytes(JavaThread::vm_result_offset())
3030 
3031   verify_thread();
3032 
3033   ld(oop_result, in_bytes(JavaThread::vm_result_offset()), R16_thread);
3034   li(R0, 0);
3035   std(R0, in_bytes(JavaThread::vm_result_offset()), R16_thread);
3036 
3037   verify_oop(oop_result);
3038 }
3039 
3040 void MacroAssembler::get_vm_result_2(Register metadata_result) {
3041   // Read:
3042   //   R16_thread
3043   //   R16_thread->in_bytes(JavaThread::vm_result_2_offset())
3044   //
3045   // Updated:
3046   //   metadata_result
3047   //   R16_thread->in_bytes(JavaThread::vm_result_2_offset())
3048 
3049   ld(metadata_result, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
3050   li(R0, 0);
3051   std(R0, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
3052 }
3053 
3054 Register MacroAssembler::encode_klass_not_null(Register dst, Register src) {
3055   Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided.
3056   if (Universe::narrow_klass_base() != 0) {
3057     // Use dst as temp if it is free.
3058     sub_const_optimized(dst, current, Universe::narrow_klass_base(), R0);
3059     current = dst;
3060   }
3061   if (Universe::narrow_klass_shift() != 0) {
3062     srdi(dst, current, Universe::narrow_klass_shift());
3063     current = dst;
3064   }
3065   return current;
3066 }
3067 
3068 void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) {
3069   if (UseCompressedClassPointers) {
3070     Register compressedKlass = encode_klass_not_null(ck, klass);
3071     stw(compressedKlass, oopDesc::klass_offset_in_bytes(), dst_oop);
3072   } else {
3073     std(klass, oopDesc::klass_offset_in_bytes(), dst_oop);
3074   }
3075 }
3076 
3077 void MacroAssembler::store_klass_gap(Register dst_oop, Register val) {
3078   if (UseCompressedClassPointers) {
3079     if (val == noreg) {
3080       val = R0;
3081       li(val, 0);
3082     }
3083     stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop); // klass gap if compressed
3084   }
3085 }
3086 
3087 int MacroAssembler::instr_size_for_decode_klass_not_null() {
3088   if (!UseCompressedClassPointers) return 0;
3089   int num_instrs = 1;  // shift or move
3090   if (Universe::narrow_klass_base() != 0) num_instrs = 7;  // shift + load const + add
3091   return num_instrs * BytesPerInstWord;
3092 }
3093 
3094 void MacroAssembler::decode_klass_not_null(Register dst, Register src) {
3095   assert(dst != R0, "Dst reg may not be R0, as R0 is used here.");
3096   if (src == noreg) src = dst;
3097   Register shifted_src = src;
3098   if (Universe::narrow_klass_shift() != 0 ||
3099       (Universe::narrow_klass_base() == 0 && src != dst)) {  // Move required.
3100     shifted_src = dst;
3101     sldi(shifted_src, src, Universe::narrow_klass_shift());
3102   }
3103   if (Universe::narrow_klass_base() != 0) {
3104     add_const_optimized(dst, shifted_src, Universe::narrow_klass_base(), R0);
3105   }
3106 }
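
// Illustrative sketch (comment only): with compressed class pointers, the encode/decode
// helpers above implement
//
//   narrowKlass encode(Klass* k)      { return (narrowKlass)(((uintptr_t)k - base) >> shift); }
//   Klass*      decode(narrowKlass n) { return (Klass*)(((uintptr_t)n << shift) + base); }
//
// with base = Universe::narrow_klass_base() and shift = Universe::narrow_klass_shift();
// the subtraction/addition is skipped when base == 0 and the shift when shift == 0.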
3107 
3108 void MacroAssembler::load_klass(Register dst, Register src) {
3109   if (UseCompressedClassPointers) {
3110     lwz(dst, oopDesc::klass_offset_in_bytes(), src);
3111     // Attention: no null check here!
3112     decode_klass_not_null(dst, dst);
3113   } else {
3114     ld(dst, oopDesc::klass_offset_in_bytes(), src);
3115   }
3116 }
3117 
3118 // Clear Array
3119 // Kills both input registers. tmp == R0 is allowed.
3120 void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp) {
3121   // Procedure for large arrays (uses data cache block zero instruction).
3122     Label startloop, fast, fastloop, small_rest, restloop, done;
3123     const int cl_size         = VM_Version::L1_data_cache_line_size(),
3124               cl_dwords       = cl_size>>3,
3125               cl_dw_addr_bits = exact_log2(cl_dwords),
3126               dcbz_min        = 1;                     // Min count of dcbz executions, needs to be >0.
3127 
3128 //2:
3129     cmpdi(CCR1, cnt_dwords, ((dcbz_min+1)<<cl_dw_addr_bits)-1); // Big enough? (ensure >=dcbz_min lines included).
3130     blt(CCR1, small_rest);                                      // Too small.
3131     rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits);           // Extract dword offset within first cache line.
3132     beq(CCR0, fast);                                            // Already 128byte aligned.
3133 
3134     subfic(tmp, tmp, cl_dwords);
3135     mtctr(tmp);                        // Set ctr to hit 128byte boundary (0<ctr<cl_dwords).
3136     subf(cnt_dwords, tmp, cnt_dwords); // rest.
3137     li(tmp, 0);
3138 //10:
3139   bind(startloop);                     // Clear at the beginning to reach 128byte boundary.
3140     std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
3141     addi(base_ptr, base_ptr, 8);
3142     bdnz(startloop);
3143 //13:
3144   bind(fast);                                  // Clear 128byte blocks.
3145     srdi(tmp, cnt_dwords, cl_dw_addr_bits);    // Loop count for 128byte loop (>0).
3146     andi(cnt_dwords, cnt_dwords, cl_dwords-1); // Rest in dwords.
3147     mtctr(tmp);                                // Load counter.
3148 //16:
3149   bind(fastloop);
3150     dcbz(base_ptr);                    // Clear 128byte aligned block.
3151     addi(base_ptr, base_ptr, cl_size);
3152     bdnz(fastloop);
3153     if (InsertEndGroupPPC64) { endgroup(); } else { nop(); }
3154 //20:
3155   bind(small_rest);
3156     cmpdi(CCR0, cnt_dwords, 0);        // size 0?
3157     beq(CCR0, done);                   // rest == 0
3158     li(tmp, 0);
3159     mtctr(cnt_dwords);                 // Load counter.
3160 //24:
3161   bind(restloop);                      // Clear rest.
3162     std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
3163     addi(base_ptr, base_ptr, 8);
3164     bdnz(restloop);
3165 //27:
3166   bind(done);
3167 }
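
// Illustrative sketch (comment only): the clearing strategy above, with counts in
// 8-byte doublewords and cl_size the L1 data cache line size (e.g. 128):
//
//   if (cnt_dwords is too small for dcbz) goto small_rest;           // plain store loop only
//   while (base_ptr is not cache-line aligned) { *base_ptr++ = 0; cnt_dwords--; }
//   while (cnt_dwords >= cl_dwords) { dcbz(base_ptr); base_ptr += cl_dwords; cnt_dwords -= cl_dwords; }
//   small_rest:
//   while (cnt_dwords-- > 0) { *base_ptr++ = 0; }                    // trailing doublewords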
3168 
3169 /////////////////////////////////////////// String intrinsics ////////////////////////////////////////////
3170 
3171 // Search for a single jchar in a jchar[].
3172 //
3173 // Assumes that result differs from all other registers.
3174 //
3175 // Haystack, needle are the addresses of jchar-arrays.
3176 // NeedleChar is needle[0] if it is known at compile time.
3177 // Haycnt is the length of the haystack. We assume haycnt >=1.
3178 //
3179 // Preserves haystack, haycnt, kills all other registers.
3180 //
3181 // If needle == R0, we search for the constant needleChar.
3182 void MacroAssembler::string_indexof_1(Register result, Register haystack, Register haycnt,
3183                                       Register needle, jchar needleChar,
3184                                       Register tmp1, Register tmp2) {
3185 
3186   assert_different_registers(result, haystack, haycnt, needle, tmp1, tmp2);
3187 
3188   Label L_InnerLoop, L_FinalCheck, L_Found1, L_Found2, L_Found3, L_NotFound, L_End;
3189   Register needle0 = needle, // Contains needle[0].
3190            addr = tmp1,
3191            ch1 = tmp2,
3192            ch2 = R0;
3193 
3194 //2 (variable) or 3 (const):
3195    if (needle != R0) lhz(needle0, 0, needle); // Preload needle character, needle has len==1.
3196    dcbtct(haystack, 0x00);                        // Indicate R/O access to haystack.
3197 
3198    srwi_(tmp2, haycnt, 1);   // Shift right by exact_log2(UNROLL_FACTOR).
3199    mr(addr, haystack);
3200    beq(CCR0, L_FinalCheck);
3201    mtctr(tmp2);              // Move to count register.
3202 //8:
3203   bind(L_InnerLoop);             // Main work horse (2x unrolled search loop).
3204    lhz(ch1, 0, addr);        // Load characters from haystack.
3205    lhz(ch2, 2, addr);
3206    (needle != R0) ? cmpw(CCR0, ch1, needle0) : cmplwi(CCR0, ch1, needleChar);
3207    (needle != R0) ? cmpw(CCR1, ch2, needle0) : cmplwi(CCR1, ch2, needleChar);
3208    beq(CCR0, L_Found1);   // Did we find the needle?
3209    beq(CCR1, L_Found2);
3210    addi(addr, addr, 4);
3211    bdnz(L_InnerLoop);
3212 //16:
3213   bind(L_FinalCheck);
3214    andi_(R0, haycnt, 1);
3215    beq(CCR0, L_NotFound);
3216    lhz(ch1, 0, addr);        // One position left at which we have to compare.
3217    (needle != R0) ? cmpw(CCR1, ch1, needle0) : cmplwi(CCR1, ch1, needleChar);
3218    beq(CCR1, L_Found3);
3219 //21:
3220   bind(L_NotFound);
3221    li(result, -1);           // Not found.
3222    b(L_End);
3223 
3224   bind(L_Found2);
3225    addi(addr, addr, 2);
3226 //24:
3227   bind(L_Found1);
3228   bind(L_Found3);                  // Return index ...
3229    subf(addr, haystack, addr); // relative to haystack,
3230    srdi(result, addr, 1);      // in characters.
3231   bind(L_End);
3232 }
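
// Illustrative sketch (comment only): ignoring the 2x unrolling, the search above is
//
//   for (int i = 0; i < haycnt; i++) {
//     if (haystack[i] == ch) return i;   // ch = needle[0], or the constant needleChar
//   }
//   return -1;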
3233 
3234 
3235 // Implementation of IndexOf for jchar arrays.
3236 //
3237 // The length of haystack and needle are not constant, i.e. passed in a register.
3238 //
3239 // Preserves registers haystack, needle.
3240 // Kills registers haycnt, needlecnt.
3241 // Assumes that result differs from all other registers.
3242 // Haystack, needle are the addresses of jchar-arrays.
3243 // Haycnt, needlecnt are the lengths of them, respectively.
3244 //
3245 // Needlecntval must be zero or a 15-bit unsigned immediate that is > 1.
3246 void MacroAssembler::string_indexof(Register result, Register haystack, Register haycnt,
3247                                     Register needle, ciTypeArray* needle_values, Register needlecnt, int needlecntval,
3248                                     Register tmp1, Register tmp2, Register tmp3, Register tmp4) {
3249 
3250   // Ensure 0<needlecnt<=haycnt in ideal graph as prerequisite!
3251   Label L_TooShort, L_Found, L_NotFound, L_End;
3252   Register last_addr = haycnt, // Kill haycnt at the beginning.
3253            addr      = tmp1,
3254            n_start   = tmp2,
3255            ch1       = tmp3,
3256            ch2       = R0;
3257 
3258   // **************************************************************************************************
3259   // Prepare for main loop: optimized for needle count >=2, bail out otherwise.
3260   // **************************************************************************************************
3261 
3262 //1 (variable) or 3 (const):
3263    dcbtct(needle, 0x00);    // Indicate R/O access to str1.
3264    dcbtct(haystack, 0x00);  // Indicate R/O access to str2.
3265 
3266   // Compute last haystack addr to use if no match gets found.
3267   if (needlecntval == 0) { // variable needlecnt
3268 //3:
3269    subf(ch1, needlecnt, haycnt);      // Last character index to compare is haycnt-needlecnt.
3270    addi(addr, haystack, -2);          // Accesses use pre-increment.
3271    cmpwi(CCR6, needlecnt, 2);
3272    blt(CCR6, L_TooShort);          // Variable needlecnt: handle short needle separately.
3273    slwi(ch1, ch1, 1);                 // Scale to number of bytes.
3274    lwz(n_start, 0, needle);           // Load first 2 characters of needle.
3275    add(last_addr, haystack, ch1);     // Point to last address to compare (haystack+2*(haycnt-needlecnt)).
3276    addi(needlecnt, needlecnt, -2);    // Rest of needle.
3277   } else { // constant needlecnt
3278   guarantee(needlecntval != 1, "IndexOf with single-character needle must be handled separately");
3279   assert((needlecntval & 0x7fff) == needlecntval, "wrong immediate");
3280 //5:
3281    addi(ch1, haycnt, -needlecntval);  // Last character index to compare is haycnt-needlecnt.
3282    lwz(n_start, 0, needle);           // Load first 2 characters of needle.
3283    addi(addr, haystack, -2);          // Accesses use pre-increment.
3284    slwi(ch1, ch1, 1);                 // Scale to number of bytes.
3285    add(last_addr, haystack, ch1);     // Point to last address to compare (haystack+2*(haycnt-needlecnt)).
3286    li(needlecnt, needlecntval-2);     // Rest of needle.
3287   }
3288 
3289   // Main Loop (now we have at least 3 characters).
3290 //11:
3291   Label L_OuterLoop, L_InnerLoop, L_FinalCheck, L_Comp1, L_Comp2, L_Comp3;
3292   bind(L_OuterLoop); // Search for 1st 2 characters.
3293   Register addr_diff = tmp4;
3294    subf(addr_diff, addr, last_addr); // Difference between already checked address and last address to check.
3295    addi(addr, addr, 2);              // This is the new address we want to use for comparing.
3296    srdi_(ch2, addr_diff, 2);
3297    beq(CCR0, L_FinalCheck);       // 2 characters left?
3298    mtctr(ch2);                       // addr_diff/4
3299 //16:
3300   bind(L_InnerLoop);                // Main work horse (2x unrolled search loop)
3301    lwz(ch1, 0, addr);           // Load 2 characters of haystack (ignore alignment).
3302    lwz(ch2, 2, addr);
3303    cmpw(CCR0, ch1, n_start); // Compare 2 characters (1 would be sufficient but try to reduce branches to CompLoop).
3304    cmpw(CCR1, ch2, n_start);
3305    beq(CCR0, L_Comp1);       // Did we find the needle start?
3306    beq(CCR1, L_Comp2);
3307    addi(addr, addr, 4);
3308    bdnz(L_InnerLoop);
3309 //24:
3310   bind(L_FinalCheck);
3311    rldicl_(addr_diff, addr_diff, 64-1, 63); // Remaining characters not covered by InnerLoop: (addr_diff>>1)&1.
3312    beq(CCR0, L_NotFound);
3313    lwz(ch1, 0, addr);                       // One position left at which we have to compare.
3314    cmpw(CCR1, ch1, n_start);
3315    beq(CCR1, L_Comp3);
3316 //29:
3317   bind(L_NotFound);
3318    li(result, -1); // not found
3319    b(L_End);
3320 
3321 
3322    // **************************************************************************************************
3323    // Special Case: unfortunately, the variable needle case can be called with needlecnt<2
3324    // **************************************************************************************************
3325 //31:
3326  if ((needlecntval>>1) !=1 ) { // Const needlecnt is 2 or 3? Reduce code size.
3327   int nopcnt = 5;
3328   if (needlecntval !=0 ) ++nopcnt; // Balance alignment (other case: see below).
3329   if (needlecntval == 0) {         // We have to handle these cases separately.
3330   Label L_OneCharLoop;
3331   bind(L_TooShort);
3332    mtctr(haycnt);
3333    lhz(n_start, 0, needle);    // First character of needle
3334   bind(L_OneCharLoop);
3335    lhzu(ch1, 2, addr);
3336    cmpw(CCR1, ch1, n_start);
3337    beq(CCR1, L_Found);      // Did we find the one character needle?
3338    bdnz(L_OneCharLoop);
3339    li(result, -1);             // Not found.
3340    b(L_End);
3341   } // 8 instructions, so no impact on alignment.
3342   for (int x = 0; x < nopcnt; ++x) nop();
3343  }
3344 
3345   // **************************************************************************************************
3346   // Regular Case Part II: compare rest of needle (first 2 characters have been compared already)
3347   // **************************************************************************************************
3348 
3349   // Compare the rest
3350 //36 if needlecntval==0, else 37:
3351   bind(L_Comp2);
3352    addi(addr, addr, 2); // First comparison has failed, 2nd one hit.
3353   bind(L_Comp1);            // Addr points to possible needle start.
3354   bind(L_Comp3);            // Could have created a copy and used a different return address, but we save code size here.
3355   if (needlecntval != 2) {  // Const needlecnt==2?
3356    if (needlecntval != 3) {
3357     if (needlecntval == 0) beq(CCR6, L_Found); // Variable needlecnt==2?
3358     Register ind_reg = tmp4;
3359     li(ind_reg, 2*2);   // First 2 characters are already compared, use index 2.
3360     mtctr(needlecnt);   // Decremented by 2, still > 0.
3361 //40:
3362    Label L_CompLoop;
3363    bind(L_CompLoop);
3364     lhzx(ch2, needle, ind_reg);
3365     lhzx(ch1, addr, ind_reg);
3366     cmpw(CCR1, ch1, ch2);
3367     bne(CCR1, L_OuterLoop);
3368     addi(ind_reg, ind_reg, 2);
3369     bdnz(L_CompLoop);
3370    } else { // No loop required if there's only one needle character left.
3371     lhz(ch2, 2*2, needle);
3372     lhz(ch1, 2*2, addr);
3373     cmpw(CCR1, ch1, ch2);
3374     bne(CCR1, L_OuterLoop);
3375    }
3376   }
3377   // Return index ...
3378 //46:
3379   bind(L_Found);
3380    subf(addr, haystack, addr); // relative to haystack, ...
3381    srdi(result, addr, 1);      // in characters.
3382 //48:
3383   bind(L_End);
3384 }
3385 
3386 // Implementation of Compare for jchar arrays.
3387 //
3388 // Kills the registers str1, str2, cnt1, cnt2.
3389 // Kills cr0, ctr.
3390 // Assumes that result differs from the input registers.
3391 void MacroAssembler::string_compare(Register str1_reg, Register str2_reg, Register cnt1_reg, Register cnt2_reg,
3392                                     Register result_reg, Register tmp_reg) {
3393    assert_different_registers(result_reg, str1_reg, str2_reg, cnt1_reg, cnt2_reg, tmp_reg);
3394 
3395    Label Ldone, Lslow_case, Lslow_loop, Lfast_loop;
3396    Register cnt_diff = R0,
3397             limit_reg = cnt1_reg,
3398             chr1_reg = result_reg,
3399             chr2_reg = cnt2_reg,
3400             addr_diff = str2_reg;
3401 
3402    // Offset 0 should be 32 byte aligned.
3403 //-4:
3404     dcbtct(str1_reg, 0x00);  // Indicate R/O access to str1.
3405     dcbtct(str2_reg, 0x00);  // Indicate R/O access to str2.
3406 //-2:
3407    // Compute min(cnt1, cnt2) and check if 0 (bail out if we don't need to compare characters).
3408     subf(result_reg, cnt2_reg, cnt1_reg);  // difference between cnt1/2
3409     subf_(addr_diff, str1_reg, str2_reg);  // alias?
3410     beq(CCR0, Ldone);                   // Return cnt difference if both point to the same array.
3411     srawi(limit_reg, result_reg, 31);      // generate signmask (cnt1/2 must be non-negative so cnt_diff can't overflow)
3412     mr(cnt_diff, result_reg);
3413     andr(limit_reg, result_reg, limit_reg); // difference or zero (negative): cnt1<cnt2 ? cnt1-cnt2 : 0
3414     add_(limit_reg, cnt2_reg, limit_reg);  // min(cnt1, cnt2)==0?
3415     beq(CCR0, Ldone);                   // return cnt difference if one has 0 length
3416 
3417     lhz(chr1_reg, 0, str1_reg);            // optional: early out if first characters mismatch
3418     lhzx(chr2_reg, str1_reg, addr_diff);   // optional: early out if first characters mismatch
3419     addi(tmp_reg, limit_reg, -1);          // min(cnt1, cnt2)-1
3420     subf_(result_reg, chr2_reg, chr1_reg); // optional: early out if first characters mismatch
3421     bne(CCR0, Ldone);                   // optional: early out if first characters mismatch
3422 
3423    // Set loop counter by scaling down tmp_reg
3424     srawi_(chr2_reg, tmp_reg, exact_log2(4)); // (min(cnt1, cnt2)-1)/4
3425     ble(CCR0, Lslow_case);                 // need >4 characters for fast loop
3426     andi(limit_reg, tmp_reg, 4-1);            // remaining characters
3427 
3428    // Adapt str1_reg str2_reg for the first loop iteration
3429     mtctr(chr2_reg);                 // (min(cnt1, cnt2)-1)/4
3430     addi(limit_reg, limit_reg, 4+1); // compare last 5-8 characters in slow_case if mismatch found in fast_loop
3431 //16:
3432    // Compare the rest of the characters
3433    bind(Lfast_loop);
3434     ld(chr1_reg, 0, str1_reg);
3435     ldx(chr2_reg, str1_reg, addr_diff);
3436     cmpd(CCR0, chr2_reg, chr1_reg);
3437     bne(CCR0, Lslow_case); // return chr1_reg
3438     addi(str1_reg, str1_reg, 4*2);
3439     bdnz(Lfast_loop);
3440     addi(limit_reg, limit_reg, -4); // no mismatch found in fast_loop, only 1-4 characters missing
3441 //23:
3442    bind(Lslow_case);
3443     mtctr(limit_reg);
3444 //24:
3445    bind(Lslow_loop);
3446     lhz(chr1_reg, 0, str1_reg);
3447     lhzx(chr2_reg, str1_reg, addr_diff);
3448     subf_(result_reg, chr2_reg, chr1_reg);
3449     bne(CCR0, Ldone); // return chr1_reg
3450     addi(str1_reg, str1_reg, 1*2);
3451     bdnz(Lslow_loop);
3452 //30:
3453    // If strings are equal up to min length, return the length difference.
3454     mr(result_reg, cnt_diff);
3455     nop(); // alignment
3456 //32:
3457    // Otherwise, return the difference between the first mismatched chars.
3458    bind(Ldone);
3459 }
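
// Illustrative sketch (comment only): the semantics implemented above match
// String.compareTo on jchar arrays:
//
//   int lim = min(cnt1, cnt2);
//   for (int i = 0; i < lim; i++) {
//     if (str1[i] != str2[i]) return str1[i] - str2[i];  // first mismatching characters
//   }
//   return cnt1 - cnt2;                                  // otherwise the length difference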
3460 
3461 
3462 // Compare char[] arrays.
3463 //
3464 // str1_reg   USE only
3465 // str2_reg   USE only
3466 // cnt_reg    USE_DEF, due to tmp reg shortage
3467 // result_reg DEF only, might compromise USE only registers
3468 void MacroAssembler::char_arrays_equals(Register str1_reg, Register str2_reg, Register cnt_reg, Register result_reg,
3469                                         Register tmp1_reg, Register tmp2_reg, Register tmp3_reg, Register tmp4_reg,
3470                                         Register tmp5_reg) {
3471 
3472   // Str1 may be the same register as str2 which can occur e.g. after scalar replacement.
3473   assert_different_registers(result_reg, str1_reg, cnt_reg, tmp1_reg, tmp2_reg, tmp3_reg, tmp4_reg, tmp5_reg);
3474   assert_different_registers(result_reg, str2_reg, cnt_reg, tmp1_reg, tmp2_reg, tmp3_reg, tmp4_reg, tmp5_reg);
3475 
3476   // Offset 0 should be 32 byte aligned.
3477   Label Linit_cbc, Lcbc, Lloop, Ldone_true, Ldone_false;
3478   Register index_reg = tmp5_reg;
3479   Register cbc_iter  = tmp4_reg;
3480 
3481 //-1:
3482   dcbtct(str1_reg, 0x00);  // Indicate R/O access to str1.
3483   dcbtct(str2_reg, 0x00);  // Indicate R/O access to str2.
3484 //1:
3485   andi(cbc_iter, cnt_reg, 4-1);            // Remaining iterations after 4 java characters per iteration loop.
3486   li(index_reg, 0); // init
3487   li(result_reg, 0); // assume false
3488   srwi_(tmp2_reg, cnt_reg, exact_log2(4)); // Div: 4 java characters per iteration (main loop).
3489 
3490   cmpwi(CCR1, cbc_iter, 0);             // CCR1 = (cbc_iter==0)
3491   beq(CCR0, Linit_cbc);                 // too short
3492     mtctr(tmp2_reg);
3493 //8:
3494     bind(Lloop);
3495       ldx(tmp1_reg, str1_reg, index_reg);
3496       ldx(tmp2_reg, str2_reg, index_reg);
3497       cmpd(CCR0, tmp1_reg, tmp2_reg);
3498       bne(CCR0, Ldone_false);  // Unequal char pair found -> done.
3499       addi(index_reg, index_reg, 4*sizeof(jchar));
3500       bdnz(Lloop);
3501 //14:
3502   bind(Linit_cbc);
3503   beq(CCR1, Ldone_true);
3504     mtctr(cbc_iter);
3505 //16:
3506     bind(Lcbc);
3507       lhzx(tmp1_reg, str1_reg, index_reg);
3508       lhzx(tmp2_reg, str2_reg, index_reg);
3509       cmpw(CCR0, tmp1_reg, tmp2_reg);
3510       bne(CCR0, Ldone_false);  // Unequal char pair found -> done.
3511       addi(index_reg, index_reg, 1*sizeof(jchar));
3512       bdnz(Lcbc);
3513     nop();
3514   bind(Ldone_true);
3515   li(result_reg, 1);
3516 //24:
3517   bind(Ldone_false);
3518 }
3519 
3520 
3521 void MacroAssembler::char_arrays_equalsImm(Register str1_reg, Register str2_reg, int cntval, Register result_reg,
3522                                            Register tmp1_reg, Register tmp2_reg) {
3523   // Str1 may be the same register as str2 which can occur e.g. after scalar replacement.
3524   assert_different_registers(result_reg, str1_reg, tmp1_reg, tmp2_reg);
3525   assert_different_registers(result_reg, str2_reg, tmp1_reg, tmp2_reg);
3526   assert(sizeof(jchar) == 2, "must be");
3527   assert(cntval >= 0 && ((cntval & 0x7fff) == cntval), "wrong immediate");
3528 
3529   Label Ldone_false;
3530 
3531   if (cntval < 16) { // short case
3532     if (cntval != 0) li(result_reg, 0); // assume false
3533 
3534     const int num_bytes = cntval*sizeof(jchar);
3535     int index = 0;
3536     for (int next_index; (next_index = index + 8) <= num_bytes; index = next_index) {
3537       ld(tmp1_reg, index, str1_reg);
3538       ld(tmp2_reg, index, str2_reg);
3539       cmpd(CCR0, tmp1_reg, tmp2_reg);
3540       bne(CCR0, Ldone_false);
3541     }
3542     if (cntval & 2) {
3543       lwz(tmp1_reg, index, str1_reg);
3544       lwz(tmp2_reg, index, str2_reg);
3545       cmpw(CCR0, tmp1_reg, tmp2_reg);
3546       bne(CCR0, Ldone_false);
3547       index += 4;
3548     }
3549     if (cntval & 1) {
3550       lhz(tmp1_reg, index, str1_reg);
3551       lhz(tmp2_reg, index, str2_reg);
3552       cmpw(CCR0, tmp1_reg, tmp2_reg);
3553       bne(CCR0, Ldone_false);
3554     }
3555     // fallthrough: true
3556   } else {
3557     Label Lloop;
3558     Register index_reg = tmp1_reg;
3559     const int loopcnt = cntval/4;
3560     assert(loopcnt > 0, "must be");
3561     // Offset 0 should be 32 byte aligned.
3562     //2:
3563     dcbtct(str1_reg, 0x00);  // Indicate R/O access to str1.
3564     dcbtct(str2_reg, 0x00);  // Indicate R/O access to str2.
3565     li(tmp2_reg, loopcnt);
3566     li(index_reg, 0); // init
3567     li(result_reg, 0); // assume false
3568     mtctr(tmp2_reg);
3569     //8:
3570     bind(Lloop);
3571     ldx(R0, str1_reg, index_reg);
3572     ldx(tmp2_reg, str2_reg, index_reg);
3573     cmpd(CCR0, R0, tmp2_reg);
3574     bne(CCR0, Ldone_false);  // Unequal char pair found -> done.
3575     addi(index_reg, index_reg, 4*sizeof(jchar));
3576     bdnz(Lloop);
3577     //14:
3578     if (cntval & 2) {
3579       lwzx(R0, str1_reg, index_reg);
3580       lwzx(tmp2_reg, str2_reg, index_reg);
3581       cmpw(CCR0, R0, tmp2_reg);
3582       bne(CCR0, Ldone_false);
3583       if (cntval & 1) addi(index_reg, index_reg, 2*sizeof(jchar));
3584     }
3585     if (cntval & 1) {
3586       lhzx(R0, str1_reg, index_reg);
3587       lhzx(tmp2_reg, str2_reg, index_reg);
3588       cmpw(CCR0, R0, tmp2_reg);
3589       bne(CCR0, Ldone_false);
3590     }
3591     // fallthru: true
3592   }
3593   li(result_reg, 1);
3594   bind(Ldone_false);
3595 }
3596 
3597 // Helpers for Intrinsic Emitters
3598 //
3599 // Revert the byte order of a 32bit value in a register
3600 //   src: 0x44556677
3601 //   dst: 0x77665544
3602 // Three steps to obtain the result:
3603 //  1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word
3604 //     into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared.
3605 //     This value initializes dst.
3606 //  2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost
3607 //     byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go.
3608 //     This value is mask inserted into dst with a [0..23] mask of 1s.
3609 //  3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position.
3610 //     This value is mask inserted into dst with a [8..15] mask of 1s.
3611 void MacroAssembler::load_reverse_32(Register dst, Register src) {
3612   assert_different_registers(dst, src);
3613 
3614   rldicl(dst, src, (4+1)*8, 56);       // Rotate byte 4 into position 7 (rightmost), clear all to the left.
3615   rlwimi(dst, src,     3*8,  0, 23);   // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone.
3616   rlwimi(dst, src,     1*8,  8, 15);   // Insert byte 6 into position 5, leave the rest alone.
3617 }
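
// Illustrative sketch (comment only): the three rotate-and-insert steps above compute,
// in plain C++,
//
//   uint32_t reverse32(uint32_t src) {
//     return ((src & 0x000000ff) << 24) | ((src & 0x0000ff00) << 8) |
//            ((src & 0x00ff0000) >>  8) | ((src & 0xff000000) >> 24);
//   }
//
// e.g. reverse32(0x44556677) == 0x77665544, matching the example given above.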
3618 
3619 // Calculate the column addresses of the crc32 lookup table into distinct registers.
3620 // This loop-invariant calculation is moved out of the loop body, reducing the loop
3621 // body size from 20 to 16 instructions.
3622 // Returns the offset that was used to calculate the address of column tc3.
3623 // Due to register shortage, setting tc3 may overwrite table. With the return offset
3624 // at hand, the original table address can be easily reconstructed.
3625 int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) {
3626 
3627 #ifdef VM_LITTLE_ENDIAN
3628   // This is what we implement (the DOLIT4 part):
3629   // =========================================================================
3630   // #define DOLIT4 c ^= *buf4++; \
3631   //         c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^ \
3632   //             crc_table[1][(c >> 16) & 0xff] ^ crc_table[0][c >> 24]
3633   // #define DOLIT32 DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4
3634   // =========================================================================
3635   const int ix0 = 3*(4*CRC32_COLUMN_SIZE);
3636   const int ix1 = 2*(4*CRC32_COLUMN_SIZE);
3637   const int ix2 = 1*(4*CRC32_COLUMN_SIZE);
3638   const int ix3 = 0*(4*CRC32_COLUMN_SIZE);
3639 #else
3640   // This is what we implement (the DOBIG4 part):
3641   // =========================================================================
3642   // #define DOBIG4 c ^= *++buf4; \
3643   //         c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \
3644   //             crc_table[6][(c >> 16) & 0xff] ^ crc_table[7][c >> 24]
3645   // #define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4
3646   // =========================================================================
3647   const int ix0 = 4*(4*CRC32_COLUMN_SIZE);
3648   const int ix1 = 5*(4*CRC32_COLUMN_SIZE);
3649   const int ix2 = 6*(4*CRC32_COLUMN_SIZE);
3650   const int ix3 = 7*(4*CRC32_COLUMN_SIZE);
3651 #endif
3652   assert_different_registers(table, tc0, tc1, tc2);
3653   assert(table == tc3, "must be!");
3654 
3655   if (ix0 != 0) addi(tc0, table, ix0);
3656   if (ix1 != 0) addi(tc1, table, ix1);
3657   if (ix2 != 0) addi(tc2, table, ix2);
3658   if (ix3 != 0) addi(tc3, table, ix3);
3659 
3660   return ix3;
3661 }
3662 
3663 /**
3664  * uint32_t crc;
3665  * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
3666  */
3667 void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) {
3668   assert_different_registers(crc, table, tmp);
3669   assert_different_registers(val, table);
3670 
3671   if (crc == val) {                   // Must rotate first to use the unmodified value.
3672     rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
3673                                       // As we use a word (4-byte) instruction, we have to adapt the mask bit positions.
3674     srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
3675   } else {
3676     srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
3677     rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
3678   }
3679   lwzx(tmp, table, tmp);
3680   xorr(crc, crc, tmp);
3681 }
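
// Illustrative sketch (comment only): the fold above computes, with 'table' viewed as a
// 256-entry uint32_t column,
//
//   crc = table[val & 0xff] ^ (crc >> 8);   // the <<2 in rlwinm is the byte-offset scaling
//
// For fold_8bit_crc32 (val == crc) this is the classic byte-at-a-time CRC-32 step.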
3682 
3683 /**
3684  * uint32_t crc;
3685  * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
3686  */
3687 void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
3688   fold_byte_crc32(crc, crc, table, tmp);
3689 }
3690 
3691 /**
3692  * Emits code to update CRC-32 with a byte value according to constants in table.
3693  *
3694  * @param [in,out]crc   Register containing the crc.
3695  * @param [in]val       Register containing the byte to fold into the CRC.
3696  * @param [in]table     Register containing the table of crc constants.
3697  *
3698  * uint32_t crc;
3699  * val = crc_table[(val ^ crc) & 0xFF];
3700  * crc = val ^ (crc >> 8);
3701  */
3702 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
3703   BLOCK_COMMENT("update_byte_crc32:");
3704   xorr(val, val, crc);
3705   fold_byte_crc32(crc, val, table, val);
3706 }
3707 
3708 /**
3709  * @param crc   register containing existing CRC (32-bit)
3710  * @param buf   register pointing to input byte buffer (byte*)
3711  * @param len   register containing number of bytes
3712  * @param table register pointing to CRC table
3713  */
3714 void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table,
3715                                            Register data, bool loopAlignment, bool invertCRC) {
3716   assert_different_registers(crc, buf, len, table, data);
3717 
3718   Label L_mainLoop, L_done;
3719   const int mainLoop_stepping  = 1;
3720   const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4;
3721 
3722   // Process all bytes in a single-byte loop.
3723   cmpdi(CCR0, len, 0);                           // Anything to do?
3724   mtctr(len);
3725   beq(CCR0, L_done);
3726 
3727   if (invertCRC) {
3728     nand(crc, crc, crc);                         // ~c
3729   }
3730 
3731   align(mainLoop_alignment);
3732   BIND(L_mainLoop);
3733     lbz(data, 0, buf);                           // Byte from buffer, zero-extended.
3734     addi(buf, buf, mainLoop_stepping);           // Advance buffer position.
3735     update_byte_crc32(crc, data, table);
3736     bdnz(L_mainLoop);                            // Iterate.
3737 
3738   if (invertCRC) {
3739     nand(crc, crc, crc);                         // ~c
3740   }
3741 
3742   bind(L_done);
3743 }
3744 
3745 /**
3746  * Emits code to update CRC-32 with a 4-byte value according to constants in table
3747  * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c
3748  */
3749 // A note on the lookup table address(es):
3750 // The lookup table consists of two sets of four columns each.
3751 // The columns {0..3} are used for little-endian machines.
3752 // The columns {4..7} are used for big-endian machines.
3753 // To save the effort of adding the column offset to the table address each time
3754 // a table element is looked up, it is possible to pass the pre-calculated
3755 // column addresses.
3756 // Uses R9..R12 as work register. Must be saved/restored by caller, if necessary.
3757 void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc,
3758                                         Register t0,  Register t1,  Register t2,  Register t3,
3759                                         Register tc0, Register tc1, Register tc2, Register tc3) {
3760   assert_different_registers(crc, t3);
3761 
3762   // XOR crc with next four bytes of buffer.
3763   lwz(t3, bufDisp, buf);
3764   if (bufInc != 0) {
3765     addi(buf, buf, bufInc);
3766   }
3767   xorr(t3, t3, crc);
3768 
3769   // Chop crc into 4 single-byte pieces, shifted left 2 bits, to form the table indices.
3770   rlwinm(t0, t3,  2,         24-2, 31-2);  // ((t3 >>  0) & 0xff) << 2
3771   rlwinm(t1, t3,  32+(2- 8), 24-2, 31-2);  // ((t3 >>  8) & 0xff) << 2
3772   rlwinm(t2, t3,  32+(2-16), 24-2, 31-2);  // ((t3 >> 16) & 0xff) << 2
3773   rlwinm(t3, t3,  32+(2-24), 24-2, 31-2);  // ((t3 >> 24) & 0xff) << 2
3774 
3775   // Use the pre-calculated column addresses.
3776   // Load pre-calculated table values.
3777   lwzx(t0, tc0, t0);
3778   lwzx(t1, tc1, t1);
3779   lwzx(t2, tc2, t2);
3780   lwzx(t3, tc3, t3);
3781 
3782   // Calculate new crc from table values.
3783   xorr(t0,  t0, t1);
3784   xorr(t2,  t2, t3);
3785   xorr(crc, t0, t2);  // Now crc contains the final checksum value.
3786 }
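
// Illustrative sketch (comment only): one word step of the slicing-by-4 scheme above,
// written with the pre-computed column pointers tc0..tc3 viewed as uint32_t arrays
// (cf. the DOLIT4/DOBIG4 references in crc32_table_columns):
//
//   uint32_t c = crc ^ *(const uint32_t*)(buf + bufDisp);
//   crc = tc0[(c >>  0) & 0xff] ^ tc1[(c >>  8) & 0xff]
//       ^ tc2[(c >> 16) & 0xff] ^ tc3[(c >> 24) & 0xff];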
3787 
3788 /**
3789  * @param crc   register containing existing CRC (32-bit)
3790  * @param buf   register pointing to input byte buffer (byte*)
3791  * @param len   register containing number of bytes
3792  * @param table register pointing to CRC table
3793  *
3794  * Uses R9..R12 as work register. Must be saved/restored by caller!
3795  */
3796 void MacroAssembler::kernel_crc32_2word(Register crc, Register buf, Register len, Register table,
3797                                         Register t0,  Register t1,  Register t2,  Register t3,
3798                                         Register tc0, Register tc1, Register tc2, Register tc3) {
3799   assert_different_registers(crc, buf, len, table);
3800 
3801   Label L_mainLoop, L_tail;
3802   Register  tmp  = t0;
3803   Register  data = t0;
3804   Register  tmp2 = t1;
3805   const int mainLoop_stepping  = 8;
3806   const int tailLoop_stepping  = 1;
3807   const int log_stepping       = exact_log2(mainLoop_stepping);
3808   const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
3809   const int complexThreshold   = 2*mainLoop_stepping;
3810 
3811   // Don't test for len <= 0 here. This pathological case should not occur anyway.
3812   // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles.
3813   // The situation itself is detected and handled correctly by the conditional branches
3814   // following the adjustments of len by -stepping and +stepping.
3815   assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
3816 
3817   BLOCK_COMMENT("kernel_crc32_2word {");
3818 
3819   nand(crc, crc, crc);                           // ~c
3820 
3821   // Check for a short (<complexThreshold) buffer.
3822   cmpdi(CCR0, len, complexThreshold);
3823   blt(CCR0, L_tail);
3824 
3825   // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
3826   // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
3827   {
3828     // Align buf addr to mainLoop_stepping boundary.
3829     neg(tmp2, buf);                           // Calculate # preLoop iterations for alignment.
3830     rldicl(tmp2, tmp2, 0, 64-log_stepping);   // Rotate tmp2 by 0 bits; and with a mask keeping only the low log_stepping bits (61..63).
3831 
3832     if (complexThreshold > mainLoop_stepping) {
3833       sub(len, len, tmp2);                       // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
3834     } else {
3835       sub(tmp, len, tmp2);                       // Remaining bytes for main loop.
3836       cmpdi(CCR0, tmp, mainLoop_stepping);
3837       blt(CCR0, L_tail);                         // For less than one mainloop_stepping left, do only tail processing
3838       mr(len, tmp);                              // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
3839     }
3840     update_byteLoop_crc32(crc, buf, tmp2, table, data, false, false);
3841   }
3842 
3843   srdi(tmp2, len, log_stepping);                 // #iterations for mainLoop
3844   andi(len, len, mainLoop_stepping-1);           // remaining bytes for tailLoop
3845   mtctr(tmp2);
3846 
3847 #ifdef VM_LITTLE_ENDIAN
3848   Register crc_rv = crc;
3849 #else
3850   Register crc_rv = tmp;                         // Load_reverse needs separate registers to work on.
3851                                                  // Occupies tmp, but frees up crc.
3852   load_reverse_32(crc_rv, crc);                  // Revert byte order because we are dealing with big-endian data.
3853   tmp = crc;
3854 #endif
3855 
3856   int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
3857 
3858   align(mainLoop_alignment);                     // Octoword-aligned loop address. Shows 2% improvement.
3859   BIND(L_mainLoop);
3860     update_1word_crc32(crc_rv, buf, table, 0, 0, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
3861     update_1word_crc32(crc_rv, buf, table, 4, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
3862     bdnz(L_mainLoop);
3863 
3864 #ifndef VM_LITTLE_ENDIAN
3865   load_reverse_32(crc, crc_rv);                  // Revert byte order because we are dealing with big-endian data.
3866   tmp = crc_rv;                                  // Tmp uses its original register again.
3867 #endif
3868 
3869   // Restore original table address for tailLoop.
3870   if (reconstructTableOffset != 0) {
3871     addi(table, table, -reconstructTableOffset);
3872   }
3873 
3874   // Process last few (<complexThreshold) bytes of buffer.
3875   BIND(L_tail);
3876   update_byteLoop_crc32(crc, buf, len, table, data, false, false);
3877 
3878   nand(crc, crc, crc);                           // ~c
3879   BLOCK_COMMENT("} kernel_crc32_2word");
3880 }
3881 
3882 /**
3883  * @param crc   register containing existing CRC (32-bit)
3884  * @param buf   register pointing to input byte buffer (byte*)
3885  * @param len   register containing number of bytes
3886  * @param table register pointing to CRC table
3887  *
3888  * uses R9..R12 as work register. Must be saved/restored by caller!
3889  */
3890 void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table,
3891                                         Register t0,  Register t1,  Register t2,  Register t3,
3892                                         Register tc0, Register tc1, Register tc2, Register tc3) {
3893   assert_different_registers(crc, buf, len, table);
3894 
3895   Label L_mainLoop, L_tail;
3896   Register  tmp          = t0;
3897   Register  data         = t0;
3898   Register  tmp2         = t1;
3899   const int mainLoop_stepping  = 4;
3900   const int tailLoop_stepping  = 1;
3901   const int log_stepping       = exact_log2(mainLoop_stepping);
3902   const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
3903   const int complexThreshold   = 2*mainLoop_stepping;
3904 
3905   // Don't test for len <= 0 here. This pathological case should not occur anyway.
3906   // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles.
3907   // The situation itself is detected and handled correctly by the conditional branches
3908   // following the length comparisons and adjustments below.
3909   assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
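       // Illustrative C sketch of the scheme implemented below. The tc0..tc3
       // column assignment and the little-endian word view are illustrative
       // only; the actual column layout is whatever crc32_table_columns sets up.
       //
       //   crc = ~crc;
       //   /* preLoop: byte steps until buf is mainLoop_stepping-aligned */
       //   /* mainLoop: */
       //   while (len >= 4) {
       //     uint32_t w = crc ^ load_le32(buf);
       //     crc = tc3[w & 0xff] ^ tc2[(w >> 8) & 0xff]
       //         ^ tc1[(w >> 16) & 0xff] ^ tc0[w >> 24];
       //     buf += 4; len -= 4;
       //   }
       //   /* tailLoop: */
       //   while (len-- > 0) crc = (crc >> 8) ^ table[(crc ^ *buf++) & 0xff];
       //   crc = ~crc;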
3910 
3911   BLOCK_COMMENT("kernel_crc32_1word {");
3912 
3913   nand(crc, crc, crc);                           // ~c
3914 
3915   // Check for short (<mainLoop_stepping) buffer.
3916   cmpdi(CCR0, len, complexThreshold);
3917   blt(CCR0, L_tail);
3918 
3919   // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
3920   // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
3921   {
3922     // Align buf addr to mainLoop_stepping boundary.
3923     neg(tmp2, buf);                              // Calculate # preLoop iterations for alignment.
3924     rldicl(tmp2, tmp2, 0, 64-log_stepping);      // No rotation; AND with a mask keeping only the low log_stepping bits (# bytes to the next alignment boundary).
3925 
3926     if (complexThreshold > mainLoop_stepping) {
3927       sub(len, len, tmp2);                       // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
3928     } else {
3929       sub(tmp, len, tmp2);                       // Remaining bytes for main loop.
3930       cmpdi(CCR0, tmp, mainLoop_stepping);
3931       blt(CCR0, L_tail);                         // For less than one mainLoop_stepping left, do only tail processing.
3932       mr(len, tmp);                              // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
3933     }
3934     update_byteLoop_crc32(crc, buf, tmp2, table, data, false, false);
3935   }
3936 
3937   srdi(tmp2, len, log_stepping);                 // #iterations for mainLoop
3938   andi(len, len, mainLoop_stepping-1);           // remaining bytes for tailLoop
3939   mtctr(tmp2);
3940 
3941 #ifdef VM_LITTLE_ENDIAN
3942   Register crc_rv = crc;
3943 #else
3944   Register crc_rv = tmp;                         // Load_reverse needs separate registers to work on.
3945                                                  // Occupies tmp, but frees up crc.
3946   load_reverse_32(crc_rv, crc);                  // Reverse byte order because we are dealing with big-endian data.
3947   tmp = crc;
3948 #endif
3949 
3950   int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
3951 
3952   align(mainLoop_alignment);                     // Octoword-aligned loop address. Shows 2% improvement.
3953   BIND(L_mainLoop);
3954     update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
3955     bdnz(L_mainLoop);
3956 
3957 #ifndef VM_LITTLE_ENDIAN
3958   load_reverse_32(crc, crc_rv);                  // Reverse byte order because we are dealing with big-endian data.
3959   tmp = crc_rv;                                  // tmp uses its original register again.
3960 #endif
3961 
3962   // Restore original table address for tailLoop.
3963   if (reconstructTableOffset != 0) {
3964     addi(table, table, -reconstructTableOffset);
3965   }
3966 
3967   // Process last few (<complexThreshold) bytes of buffer.
3968   BIND(L_tail);
3969   update_byteLoop_crc32(crc, buf, len, table, data, false, false);
3970 
3971   nand(crc, crc, crc);                           // ~c
3972   BLOCK_COMMENT("} kernel_crc32_1word");
3973 }
3974 
3975 /**
3976  * @param crc   register containing existing CRC (32-bit)
3977  * @param buf   register pointing to input byte buffer (byte*)
3978  * @param len   register containing number of bytes
3979  * @param table register pointing to CRC table
3980  *
3981  * Uses R7_ARG5, R8_ARG6 as work registers.
3982  */
3983 void MacroAssembler::kernel_crc32_1byte(Register crc, Register buf, Register len, Register table,
3984                                         Register t0,  Register t1,  Register t2,  Register t3) {
3985   assert_different_registers(crc, buf, len, table);
3986 
3987   Register  data = t0;                   // Holds the current byte to be folded into crc.
3988 
3989   BLOCK_COMMENT("kernel_crc32_1byte {");
3990 
3991   // Process all bytes in a single-byte loop.
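       // The standard table-driven per-byte step applied by update_byteLoop_crc32
       // (shown here for reference only):
       //   crc = (crc >> 8) ^ table[(crc ^ *buf++) & 0xff];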
3992   update_byteLoop_crc32(crc, buf, len, table, data, true, true);
3993 
3994   BLOCK_COMMENT("} kernel_crc32_1byte");
3995 }
3996 
3997 void MacroAssembler::kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp) {
3998   assert_different_registers(crc, buf, /* len,  not used!! */ table, tmp);
3999 
4000   BLOCK_COMMENT("kernel_crc32_singleByte:");
4001   nand(crc, crc, crc);       // ~c
4002 
4003   lbz(tmp, 0, buf);          // Byte from buffer, zero-extended.
4004   update_byte_crc32(crc, tmp, table);
4005 
4006   nand(crc, crc, crc);       // ~c
4007 }
4008 
4009 // 128-bit accumulation: (dest_hi:dest_lo) += src1; (dest_hi:dest_lo) += src2.
4010 // The carries out of dest_lo are propagated into dest_hi.
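     // Equivalent C sketch (the 128-bit helper type is used purely for illustration):
     //   unsigned __int128 acc = ((unsigned __int128)dest_hi << 64) | dest_lo;
     //   acc += src1;
     //   acc += src2;
     //   dest_lo = (uint64_t)acc;
     //   dest_hi = (uint64_t)(acc >> 64);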
4011 void MacroAssembler::add2_with_carry(Register dest_hi,
4012                                      Register dest_lo,
4013                                      Register src1, Register src2) {
4014   li(R0, 0);
4015   addc(dest_lo, dest_lo, src1);
4016   adde(dest_hi, dest_hi, R0);
4017   addc(dest_lo, dest_lo, src2);
4018   adde(dest_hi, dest_hi, R0);
4019 }
4020 
4021 // Multiply 64 bit by 64 bit first loop.
4022 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart,
4023                                            Register x_xstart,
4024                                            Register y, Register y_idx,
4025                                            Register z,
4026                                            Register carry,
4027                                            Register product_high, Register product,
4028                                            Register idx, Register kdx,
4029                                            Register tmp) {
4030   //  jlong carry, x[], y[], z[];
4031   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
4032   //    huge_128 product = y[idx] * x[xstart] + carry;
4033   //    z[kdx] = (jlong)product;
4034   //    carry  = (jlong)(product >>> 64);
4035   //  }
4036   //  z[xstart] = carry;
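       //  Data layout note (applies to the ldx/stdx of limb pairs below): the
       //  arrays hold 32-bit limbs with the most significant limb first, as in
       //  BigInteger's magnitude arrays. Two adjacent limbs are moved with one
       //  64-bit access; on little-endian hosts the two 32-bit halves of the
       //  loaded doubleword are swapped back via rldicl(reg, reg, 32, 0),
       //  a rotate left by 32 bits.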
4037 
4038   Label L_first_loop, L_first_loop_exit;
4039   Label L_one_x, L_one_y, L_multiply;
4040 
4041   addic_(xstart, xstart, -1);
4042   blt(CCR0, L_one_x);   // Special case: length of x is 1.
4043 
4044   // Load next two integers of x.
4045   sldi(tmp, xstart, LogBytesPerInt);
4046   ldx(x_xstart, x, tmp);
4047 #ifdef VM_LITTLE_ENDIAN
4048   rldicl(x_xstart, x_xstart, 32, 0);
4049 #endif
4050 
4051   align(32, 16);
4052   bind(L_first_loop);
4053 
4054   cmpdi(CCR0, idx, 1);
4055   blt(CCR0, L_first_loop_exit);
4056   addi(idx, idx, -2);
4057   beq(CCR0, L_one_y);
4058 
4059   // Load next two integers of y.
4060   sldi(tmp, idx, LogBytesPerInt);
4061   ldx(y_idx, y, tmp);
4062 #ifdef VM_LITTLE_ENDIAN
4063   rldicl(y_idx, y_idx, 32, 0);
4064 #endif
4065 
4066 
4067   bind(L_multiply);
4068   multiply64(product_high, product, x_xstart, y_idx);
4069 
4070   li(tmp, 0);
4071   addc(product, product, carry);         // Add carry to result.
4072   adde(product_high, product_high, tmp); // Add carry of the last addition.
4073   addi(kdx, kdx, -2);
4074 
4075   // Store result.
4076 #ifdef VM_LITTLE_ENDIAN
4077   rldicl(product, product, 32, 0);
4078 #endif
4079   sldi(tmp, kdx, LogBytesPerInt);
4080   stdx(product, z, tmp);
4081   mr_if_needed(carry, product_high);
4082   b(L_first_loop);
4083 
4084 
4085   bind(L_one_y); // Load one 32 bit portion of y as (0,value).
4086 
4087   lwz(y_idx, 0, y);
4088   b(L_multiply);
4089 
4090 
4091   bind(L_one_x); // Load one 32 bit portion of x as (0,value).
4092 
4093   lwz(x_xstart, 0, x);
4094   b(L_first_loop);
4095 
4096   bind(L_first_loop_exit);
4097 }
4098 
4099 // Multiply 64 bit by 64 bit and add 128 bit.
4100 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y,
4101                                             Register z, Register yz_idx,
4102                                             Register idx, Register carry,
4103                                             Register product_high, Register product,
4104                                             Register tmp, int offset) {
4105 
4106   //  huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
4107   //  z[kdx] = (jlong)product;
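       //  The upper 64 bits of the 128-bit sum are left in product_high;
       //  the caller picks them up as the carry for the next step.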
4108 
4109   sldi(tmp, idx, LogBytesPerInt);
4110   if (offset) {
4111     addi(tmp, tmp, offset);
4112   }
4113   ldx(yz_idx, y, tmp);
4114 #ifdef VM_LITTLE_ENDIAN
4115   rldicl(yz_idx, yz_idx, 32, 0);
4116 #endif
4117 
4118   multiply64(product_high, product, x_xstart, yz_idx);
4119   ldx(yz_idx, z, tmp);
4120 #ifdef VM_LITTLE_ENDIAN
4121   rldicl(yz_idx, yz_idx, 32, 0);
4122 #endif
4123 
4124   add2_with_carry(product_high, product, carry, yz_idx);
4125 
4126   sldi(tmp, idx, LogBytesPerInt);
4127   if (offset) {
4128     addi(tmp, tmp, offset);
4129   }
4130 #ifdef VM_LITTLE_ENDIAN
4131   rldicl(product, product, 32, 0);
4132 #endif
4133   stdx(product, z, tmp);
4134 }
4135 
4136 // Multiply 128 bit by 128 bit. Unrolled inner loop.
4137 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart,
4138                                              Register y, Register z,
4139                                              Register yz_idx, Register idx, Register carry,
4140                                              Register product_high, Register product,
4141                                              Register carry2, Register tmp) {
4142 
4143   //  jlong carry, x[], y[], z[];
4144   //  int kdx = ystart+1;
4145   //  for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
4146   //    huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
4147   //    z[kdx+idx+1] = (jlong)product;
4148   //    jlong carry2 = (jlong)(product >>> 64);
4149   //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
4150   //    z[kdx+idx] = (jlong)product;
4151   //    carry = (jlong)(product >>> 64);
4152   //  }
4153   //  idx += 2;
4154   //  if (idx > 0) {
4155   //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
4156   //    z[kdx+idx] = (jlong)product;
4157   //    carry = (jlong)(product >>> 64);
4158   //  }
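       //  The code below is the loop above unrolled by two: each iteration
       //  consumes four 32-bit limbs (two 64-bit multiply-accumulate steps),
       //  and CTR is preloaded with idx / 4.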
4159 
4160   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
4161   const Register jdx = R0;
4162 
4163   // Scale the index.
4164   srdi_(jdx, idx, 2);
4165   beq(CCR0, L_third_loop_exit);
4166   mtctr(jdx);
4167 
4168   align(32, 16);
4169   bind(L_third_loop);
4170 
4171   addi(idx, idx, -4);
4172 
4173   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8);
4174   mr_if_needed(carry2, product_high);
4175 
4176   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0);
4177   mr_if_needed(carry, product_high);
4178   bdnz(L_third_loop);
4179 
4180   bind(L_third_loop_exit);  // Handle any left-over operand parts.
4181 
4182   andi_(idx, idx, 0x3);
4183   beq(CCR0, L_post_third_loop_done);
4184 
4185   Label L_check_1;
4186 
4187   addic_(idx, idx, -2);
4188   blt(CCR0, L_check_1);
4189 
4190   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0);
4191   mr_if_needed(carry, product_high);
4192 
4193   bind(L_check_1);
4194 
4195   addi(idx, idx, 0x2);
4196   andi_(idx, idx, 0x1);
4197   addic_(idx, idx, -1);
4198   blt(CCR0, L_post_third_loop_done);
4199 
4200   sldi(tmp, idx, LogBytesPerInt);
4201   lwzx(yz_idx, y, tmp);
4202   multiply64(product_high, product, x_xstart, yz_idx);
4203   lwzx(yz_idx, z, tmp);
4204 
4205   add2_with_carry(product_high, product, yz_idx, carry);
4206 
4207   sldi(tmp, idx, LogBytesPerInt);
4208   stwx(product, z, tmp);
4209   srdi(product, product, 32);
4210 
4211   sldi(product_high, product_high, 32);
4212   orr(product, product, product_high);
4213   mr_if_needed(carry, product);
4214 
4215   bind(L_post_third_loop_done);
4216 }   // multiply_128_x_128_loop
4217 
4218 void MacroAssembler::multiply_to_len(Register x, Register xlen,
4219                                      Register y, Register ylen,
4220                                      Register z, Register zlen,
4221                                      Register tmp1, Register tmp2,
4222                                      Register tmp3, Register tmp4,
4223                                      Register tmp5, Register tmp6,
4224                                      Register tmp7, Register tmp8,
4225                                      Register tmp9, Register tmp10,
4226                                      Register tmp11, Register tmp12,
4227                                      Register tmp13) {
4228 
4229   ShortBranchVerifier sbv(this);
4230 
4231   assert_different_registers(x, xlen, y, ylen, z, zlen,
4232                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
4233   assert_different_registers(x, xlen, y, ylen, z, zlen,
4234                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp7);
4235   assert_different_registers(x, xlen, y, ylen, z, zlen,
4236                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp8);
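       // Computes z = x * y, where x, y and z are arrays of 32-bit limbs stored
       // most significant limb first and zlen == xlen + ylen; this mirrors the
       // Java library routine backing BigInteger multiplication (multiplyToLen).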
4237 
4238   const Register idx = tmp1;
4239   const Register kdx = tmp2;
4240   const Register xstart = tmp3;
4241 
4242   const Register y_idx = tmp4;
4243   const Register carry = tmp5;
4244   const Register product = tmp6;
4245   const Register product_high = tmp7;
4246   const Register x_xstart = tmp8;
4247   const Register tmp = tmp9;
4248 
4249   // First Loop.
4250   //
4251   //  final static long LONG_MASK = 0xffffffffL;
4252   //  int xstart = xlen - 1;
4253   //  int ystart = ylen - 1;
4254   //  long carry = 0;
4255   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
4256   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
4257   //    z[kdx] = (int)product;
4258   //    carry = product >>> 32;
4259   //  }
4260   //  z[xstart] = (int)carry;
4261 
4262   mr_if_needed(idx, ylen);        // idx = ylen
4263   mr_if_needed(kdx, zlen);        // kdx = xlen + ylen
4264   li(carry, 0);                   // carry = 0
4265 
4266   Label L_done;
4267 
4268   addic_(xstart, xlen, -1);
4269   blt(CCR0, L_done);
4270 
4271   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z,
4272                         carry, product_high, product, idx, kdx, tmp);
4273 
4274   Label L_second_loop;
4275 
4276   cmpdi(CCR0, kdx, 0);
4277   beq(CCR0, L_second_loop);
4278 
4279   Label L_carry;
4280 
4281   addic_(kdx, kdx, -1);
4282   beq(CCR0, L_carry);
4283 
4284   // Store lower 32 bits of carry.
4285   sldi(tmp, kdx, LogBytesPerInt);
4286   stwx(carry, z, tmp);
4287   srdi(carry, carry, 32);
4288   addi(kdx, kdx, -1);
4289 
4290 
4291   bind(L_carry);
4292 
4293   // Store upper 32 bits of carry.
4294   sldi(tmp, kdx, LogBytesPerInt);
4295   stwx(carry, z, tmp);
4296 
4297   // Second and third (nested) loops.
4298   //
4299   //  for (int i = xstart-1; i >= 0; i--) { // Second loop
4300   //    carry = 0;
4301   //    for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
4302   //      long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
4303   //                     (z[k] & LONG_MASK) + carry;
4304   //      z[k] = (int)product;
4305   //      carry = product >>> 32;
4306   //    }
4307   //    z[i] = (int)carry;
4308   //  }
4309   //
4310   //  Register mapping: i = xlen, j = tmp1, k = tmp2, carry = tmp5
4311 
4312   bind(L_second_loop);
4313 
4314   li(carry, 0);                   // carry = 0;
4315 
4316   addic_(xstart, xstart, -1);     // i = xstart-1;
4317   blt(CCR0, L_done);
4318 
4319   Register zsave = tmp10;
4320 
4321   mr(zsave, z);
4322 
4323 
4324   Label L_last_x;
4325 
4326   sldi(tmp, xstart, LogBytesPerInt);
4327   add(z, z, tmp);                 // z = z + k - j
4328   addi(z, z, 4);
4329   addic_(xstart, xstart, -1);     // i = xstart-1;
4330   blt(CCR0, L_last_x);
4331 
4332   sldi(tmp, xstart, LogBytesPerInt);
4333   ldx(x_xstart, x, tmp);
4334 #ifdef VM_LITTLE_ENDIAN
4335   rldicl(x_xstart, x_xstart, 32, 0);
4336 #endif
4337 
4338 
4339   Label L_third_loop_prologue;
4340 
4341   bind(L_third_loop_prologue);
4342 
4343   Register xsave = tmp11;
4344   Register xlensave = tmp12;
4345   Register ylensave = tmp13;
4346 
4347   mr(xsave, x);
4348   mr(xlensave, xstart);
4349   mr(ylensave, ylen);
4350 
4351 
4352   multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen,
4353                           carry, product_high, product, x, tmp);
4354 
4355   mr(z, zsave);
4356   mr(x, xsave);
4357   mr(xlen, xlensave);   // Restore xlen from the saved (already decremented) xstart: this is the loop counter decrement!
4358   mr(ylen, ylensave);
4359 
4360   addi(tmp3, xlen, 1);
4361   sldi(tmp, tmp3, LogBytesPerInt);
4362   stwx(carry, z, tmp);
4363   addic_(tmp3, tmp3, -1);
4364   blt(CCR0, L_done);
4365 
4366   srdi(carry, carry, 32);
4367   sldi(tmp, tmp3, LogBytesPerInt);
4368   stwx(carry, z, tmp);
4369   b(L_second_loop);
4370 
4371   // Infrequently executed code is moved out of the loops.
4372   bind(L_last_x);
4373 
4374   lwz(x_xstart, 0, x);
4375   b(L_third_loop_prologue);
4376 
4377   bind(L_done);
4378 }   // multiply_to_len
4379 
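     // Expects CCR0 to have been set by a preceding compare; in ASSERT builds
     // the generated code stops with 'msg' if the expected condition
     // (equal / not equal) does not hold.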
4380 void MacroAssembler::asm_assert(bool check_equal, const char *msg, int id) {
4381 #ifdef ASSERT
4382   Label ok;
4383   if (check_equal) {
4384     beq(CCR0, ok);
4385   } else {
4386     bne(CCR0, ok);
4387   }
4388   stop(msg, id);
4389   bind(ok);
4390 #endif
4391 }
4392 
4393 void MacroAssembler::asm_assert_mems_zero(bool check_equal, int size, int mem_offset,
4394                                           Register mem_base, const char* msg, int id) {
4395 #ifdef ASSERT
4396   switch (size) {
4397     case 4:
4398       lwz(R0, mem_offset, mem_base);
4399       cmpwi(CCR0, R0, 0);
4400       break;
4401     case 8:
4402       ld(R0, mem_offset, mem_base);
4403       cmpdi(CCR0, R0, 0);
4404       break;
4405     default:
4406       ShouldNotReachHere();
4407   }
4408   asm_assert(check_equal, msg, id);
4409 #endif // ASSERT
4410 }
4411 
4412 void MacroAssembler::verify_thread() {
4413   if (VerifyThread) {
4414     unimplemented("'VerifyThread' currently not implemented on PPC");
4415   }
4416 }
4417 
4418 // READ: oop. KILL: R0. Volatile float registers may also be killed.
4419 void MacroAssembler::verify_oop(Register oop, const char* msg) {
4420   if (!VerifyOops) {
4421     return;
4422   }
4423 
4424   address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
4425   const Register tmp = R11; // Will be preserved.
4426   const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
4427   save_volatile_gprs(R1_SP, -nbytes_save); // except R0
4428 
4429   mr_if_needed(R4_ARG2, oop);
4430   save_LR_CR(tmp); // save in old frame
4431   push_frame_reg_args(nbytes_save, tmp);
4432   // load FunctionDescriptor** / entry_address *
4433   load_const_optimized(tmp, fd, R0);
4434   // load FunctionDescriptor* / entry_address
4435   ld(tmp, 0, tmp);
4436   load_const_optimized(R3_ARG1, (address)msg, R0);
4437   // Call destination for its side effect.
4438   call_c(tmp);
4439 
4440   pop_frame();
4441   restore_LR_CR(tmp);
4442   restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
4443 }
4444 
4445 void MacroAssembler::verify_oop_addr(RegisterOrConstant offs, Register base, const char* msg) {
4446   if (!VerifyOops) {
4447     return;
4448   }
4449 
4450   address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
4451   const Register tmp = R11; // Will be preserved.
4452   const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
4453   save_volatile_gprs(R1_SP, -nbytes_save); // except R0
4454 
4455   ld(R4_ARG2, offs, base);
4456   save_LR_CR(tmp); // save in old frame
4457   push_frame_reg_args(nbytes_save, tmp);
4458   // load FunctionDescriptor** / entry_address *
4459   load_const_optimized(tmp, fd, R0);
4460   // load FunctionDescriptor* / entry_address
4461   ld(tmp, 0, tmp);
4462   load_const_optimized(R3_ARG1, (address)msg, R0);
4463   // Call destination for its side effect.
4464   call_c(tmp);
4465 
4466   pop_frame();
4467   restore_LR_CR(tmp);
4468   restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
4469 }
4470 
4471 const char* stop_types[] = {
4472   "stop",
4473   "untested",
4474   "unimplemented",
4475   "shouldnotreachhere"
4476 };
4477 
4478 static void stop_on_request(int tp, const char* msg) {
4479   tty->print("PPC assembly code requires stop: (%s) %s\n", stop_types[tp%/*stop_end*/4], msg);
4480   guarantee(false, "PPC assembly code requires stop: %s", msg);
4481 }
4482 
4483 // Call a C-function that prints output.
4484 void MacroAssembler::stop(int type, const char* msg, int id) {
4485 #ifndef PRODUCT
4486   block_comment(err_msg("stop: %s %s {", stop_types[type%stop_end], msg));
4487 #else
4488   block_comment("stop {");
4489 #endif
4490 
4491   // setup arguments
4492   load_const_optimized(R3_ARG1, type);
4493   load_const_optimized(R4_ARG2, (void *)msg, /*tmp=*/R0);
4494   call_VM_leaf(CAST_FROM_FN_PTR(address, stop_on_request), R3_ARG1, R4_ARG2);
4495   illtrap();                     // Safety net: stop_on_request does not return.
4496   emit_int32(id);                // Embed the stop id right after the trap instruction.
4497   block_comment("} stop;");
4498 }
4499 
4500 #ifndef PRODUCT
4501 // Write pattern 0x0101010101010101 in memory region [low-before, high+after].
4502 // Val, addr are temp registers.
4503 // If low == addr, addr is killed.
4504 // High is preserved.
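     // Hypothetical usage sketch (register choice is illustrative only): with
     // ZapMemory enabled,
     //   zap_from_to(R3, 0, R4, 0, R11, R12);
     // fills the doublewords from R3 up to and including R4 with the pattern.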
4505 void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) {
4506   if (!ZapMemory) return;
4507 
4508   assert_different_registers(low, val);
4509 
4510   BLOCK_COMMENT("zap memory region {");
4511   load_const_optimized(val, 0x0101010101010101);
4512   int size = before + after;
4513   if (low == high && size < 5 && size > 0) {
4514     int offset = -before*BytesPerWord;
4515     for (int i = 0; i < size; ++i) {
4516       std(val, offset, low);
4517       offset += (1*BytesPerWord);
4518     }
4519   } else {
4520     addi(addr, low, -before*BytesPerWord);
4521     assert_different_registers(high, val);
4522     if (after) addi(high, high, after * BytesPerWord);
4523     Label loop;
4524     bind(loop);
4525     std(val, 0, addr);
4526     addi(addr, addr, 8);
4527     cmpd(CCR6, addr, high);
4528     ble(CCR6, loop);
4529     if (after) addi(high, high, -after * BytesPerWord);  // Correct back to old value.
4530   }
4531   BLOCK_COMMENT("} zap memory region");
4532 }
4533 
4534 #endif // !PRODUCT
4535 
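     // RAII helper: the constructor emits a load of *flag_addr and a branch
     // that skips the subsequently generated code when the flag is zero; the
     // destructor binds the branch target, closing the skipped region.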
4536 SkipIfEqualZero::SkipIfEqualZero(MacroAssembler* masm, Register temp, const bool* flag_addr) : _masm(masm), _label() {
4537   int simm16_offset = masm->load_const_optimized(temp, (address)flag_addr, R0, true);
4538   assert(sizeof(bool) == 1, "PowerPC ABI");
4539   masm->lbz(temp, simm16_offset, temp);
4540   masm->cmpwi(CCR0, temp, 0);
4541   masm->beq(CCR0, _label);
4542 }
4543 
4544 SkipIfEqualZero::~SkipIfEqualZero() {
4545   _masm->bind(_label);
4546 }