1 /*
   2  * Copyright (c) 1997, 2016, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2012, 2016 SAP SE. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.inline.hpp"
  28 #include "compiler/disassembler.hpp"
  29 #include "gc/shared/cardTableModRefBS.hpp"
  30 #include "gc/shared/collectedHeap.inline.hpp"
  31 #include "interpreter/interpreter.hpp"
  32 #include "memory/resourceArea.hpp"
  33 #include "nativeInst_ppc.hpp"
  34 #include "prims/methodHandles.hpp"
  35 #include "runtime/biasedLocking.hpp"
  36 #include "runtime/icache.hpp"
  37 #include "runtime/interfaceSupport.hpp"
  38 #include "runtime/objectMonitor.hpp"
  39 #include "runtime/os.hpp"
  40 #include "runtime/sharedRuntime.hpp"
  41 #include "runtime/stubRoutines.hpp"
  42 #include "utilities/macros.hpp"
  43 #if INCLUDE_ALL_GCS
  44 #include "gc/g1/g1CollectedHeap.inline.hpp"
  45 #include "gc/g1/g1SATBCardTableModRefBS.hpp"
  46 #include "gc/g1/heapRegion.hpp"
  47 #endif // INCLUDE_ALL_GCS
  48 
  49 #ifdef PRODUCT
  50 #define BLOCK_COMMENT(str) // nothing
  51 #else
  52 #define BLOCK_COMMENT(str) block_comment(str)
  53 #endif
  54 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  55 
  56 #ifdef ASSERT
  57 // On RISC, there's no benefit to verifying instruction boundaries.
  58 bool AbstractAssembler::pd_check_instruction_mark() { return false; }
  59 #endif
  60 
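// Load a doubleword from a non-negative offset of up to 31 bits: a single ld
// if the offset fits into 16 bits (optionally followed by a filler nop to keep
// the sequence size fixed), otherwise an addis/ld pair using d as scratch.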
  61 void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) {
  62   assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range");
  63   if (Assembler::is_simm(si31, 16)) {
  64     ld(d, si31, a);
  65     if (emit_filler_nop) nop();
  66   } else {
  67     const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31);
  68     const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31);
  69     addis(d, a, hi);
  70     ld(d, lo, d);
  71   }
  72 }
  73 
  74 void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) {
  75   assert_different_registers(d, a);
  76   ld_largeoffset_unchecked(d, si31, a, emit_filler_nop);
  77 }
  78 
  79 void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base,
  80                                       size_t size_in_bytes, bool is_signed) {
  81   switch (size_in_bytes) {
  82   case  8:              ld(dst, offs, base);                         break;
  83   case  4:  is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break;
  84   case  2:  is_signed ? lha(dst, offs, base) : lhz(dst, offs, base); break;
  85   case  1:  lbz(dst, offs, base); if (is_signed) extsb(dst, dst);    break; // lba doesn't exist :(
  86   default:  ShouldNotReachHere();
  87   }
  88 }
  89 
  90 void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base,
  91                                        size_t size_in_bytes) {
  92   switch (size_in_bytes) {
  93   case  8:  std(dst, offs, base); break;
  94   case  4:  stw(dst, offs, base); break;
  95   case  2:  sth(dst, offs, base); break;
  96   case  1:  stb(dst, offs, base); break;
  97   default:  ShouldNotReachHere();
  98   }
  99 }
 100 
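// Pad with nops until offset() % modulus == rem; emit nothing if more than
// max bytes of padding would be required.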
 101 void MacroAssembler::align(int modulus, int max, int rem) {
 102   int padding = (rem + modulus - (offset() % modulus)) % modulus;
 103   if (padding > max) return;
 104   for (int c = (padding >> 2); c > 0; --c) { nop(); }
 105 }
 106 
 107 // Issue instructions that calculate given TOC from global TOC.
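// The emitted sequence is, as a sketch (both halves requested):
//   addis dst, R29_TOC, largeoffset_si16_si16_hi(addr - global TOC)
//   addi  dst, dst,     largeoffset_si16_si16_lo(addr - global TOC)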
 108 void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16,
 109                                                        bool add_relocation, bool emit_dummy_addr) {
 110   int offset = -1;
 111   if (emit_dummy_addr) {
 112     offset = -128; // dummy address
 113   } else if (addr != (address)(intptr_t)-1) {
 114     offset = MacroAssembler::offset_to_global_toc(addr);
 115   }
 116 
 117   if (hi16) {
 118     addis(dst, R29_TOC, MacroAssembler::largeoffset_si16_si16_hi(offset));
 119   }
 120   if (lo16) {
 121     if (add_relocation) {
 122       // Relocate at the addi to avoid confusion with a load from the method's TOC.
 123       relocate(internal_word_Relocation::spec(addr));
 124     }
 125     addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset));
 126   }
 127 }
 128 
 129 int MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) {
 130   const int offset = MacroAssembler::offset_to_global_toc(addr);
 131 
 132   const address inst2_addr = a;
 133   const int inst2 = *(int *)inst2_addr;
 134 
 135   // The relocation points to the second instruction, the addi,
 136   // and the addi reads and writes the same register dst.
 137   const int dst = inv_rt_field(inst2);
 138   assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");
 139 
 140   // Now, find the preceding addis which writes to dst.
 141   int inst1 = 0;
 142   address inst1_addr = inst2_addr - BytesPerInstWord;
 143   while (inst1_addr >= bound) {
 144     inst1 = *(int *) inst1_addr;
 145     if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
 146       // Stop, found the addis which writes dst.
 147       break;
 148     }
 149     inst1_addr -= BytesPerInstWord;
 150   }
 151 
 152   assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
 153   set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset));
 154   set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset));
 155   return (int)((intptr_t)addr - (intptr_t)inst1_addr);
 156 }
 157 
 158 address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) {
 159   const address inst2_addr = a;
 160   const int inst2 = *(int *)inst2_addr;
 161 
 162   // The relocation points to the second instruction, the addi,
 163   // and the addi reads and writes the same register dst.
 164   const int dst = inv_rt_field(inst2);
 165   assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");
 166 
 167   // Now, find the preceding addis which writes to dst.
 168   int inst1 = 0;
 169   address inst1_addr = inst2_addr - BytesPerInstWord;
 170   while (inst1_addr >= bound) {
 171     inst1 = *(int *) inst1_addr;
 172     if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
 173       // stop, found the addis which writes dst
 174       break;
 175     }
 176     inst1_addr -= BytesPerInstWord;
 177   }
 178 
 179   assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
 180 
 181   int offset = (get_imm(inst1_addr, 0) << 16) + get_imm(inst2_addr, 0);
 182   // -1 is a special case
 183   if (offset == -1) {
 184     return (address)(intptr_t)-1;
 185   } else {
 186     return global_toc() + offset;
 187   }
 188 }
 189 
 190 #ifdef _LP64
 191 // Patch compressed oops or klass constants.
 192 // Assembler sequence is
 193 // 1) compressed oops:
 194 //    lis  rx = const.hi
 195 //    ori rx = rx | const.lo
 196 // 2) compressed klass:
 197 //    lis  rx = const.hi
 198 //    clrldi rx = rx & 0xFFFFffff // clearMS32b, optional
 199 //    ori rx = rx | const.lo
  200 // The clrldi, if present, is skipped over when patching.
 201 int MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) {
 202   assert(UseCompressedOops, "Should only patch compressed oops");
 203 
 204   const address inst2_addr = a;
 205   const int inst2 = *(int *)inst2_addr;
 206 
 207   // The relocation points to the second instruction, the ori,
 208   // and the ori reads and writes the same register dst.
 209   const int dst = inv_rta_field(inst2);
 210   assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
 211   // Now, find the preceding addis which writes to dst.
 212   int inst1 = 0;
 213   address inst1_addr = inst2_addr - BytesPerInstWord;
 214   bool inst1_found = false;
 215   while (inst1_addr >= bound) {
 216     inst1 = *(int *)inst1_addr;
 217     if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
 218     inst1_addr -= BytesPerInstWord;
 219   }
 220   assert(inst1_found, "inst is not lis");
 221 
 222   int xc = (data >> 16) & 0xffff;
 223   int xd = (data >>  0) & 0xffff;
 224 
 225   set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo
 226   set_imm((int *)inst2_addr,        (xd)); // unsigned int
 227   return (int)((intptr_t)inst2_addr - (intptr_t)inst1_addr);
 228 }
 229 
 230 // Get compressed oop or klass constant.
 231 narrowOop MacroAssembler::get_narrow_oop(address a, address bound) {
 232   assert(UseCompressedOops, "Should only patch compressed oops");
 233 
 234   const address inst2_addr = a;
 235   const int inst2 = *(int *)inst2_addr;
 236 
 237   // The relocation points to the second instruction, the ori,
 238   // and the ori reads and writes the same register dst.
 239   const int dst = inv_rta_field(inst2);
 240   assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
 241   // Now, find the preceding lis which writes to dst.
 242   int inst1 = 0;
 243   address inst1_addr = inst2_addr - BytesPerInstWord;
 244   bool inst1_found = false;
 245 
 246   while (inst1_addr >= bound) {
 247     inst1 = *(int *) inst1_addr;
 248     if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break;}
 249     inst1_addr -= BytesPerInstWord;
 250   }
 251   assert(inst1_found, "inst is not lis");
 252 
 253   uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff));
 254   uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16);
 255 
 256   return (int) (xl | xh);
 257 }
 258 #endif // _LP64
 259 
 260 // Returns true if successful.
 261 bool MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a,
 262                                                 Register toc, bool fixed_size) {
 263   int toc_offset = 0;
 264   // Use RelocationHolder::none for the constant pool entry, otherwise
 265   // we will end up with a failing NativeCall::verify(x) where x is
 266   // the address of the constant pool entry.
 267   // FIXME: We should insert relocation information for oops at the constant
 268   // pool entries instead of inserting it at the loads; patching of a constant
 269   // pool entry should be less expensive.
 270   address const_address = address_constant((address)a.value(), RelocationHolder::none);
 271   if (const_address == NULL) { return false; } // allocation failure
 272   // Relocate at the pc of the load.
 273   relocate(a.rspec());
 274   toc_offset = (int)(const_address - code()->consts()->start());
 275   ld_largeoffset_unchecked(dst, toc_offset, toc, fixed_size);
 276   return true;
 277 }
 278 
 279 bool MacroAssembler::is_load_const_from_method_toc_at(address a) {
 280   const address inst1_addr = a;
 281   const int inst1 = *(int *)inst1_addr;
 282 
  283   // The relocation points to the ld or the addis.
  284   return (is_ld(inst1)) ||
  285          (is_addis(inst1) && inv_ra_field(inst1) != 0);
 286 }
 287 
 288 int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) {
 289   assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc");
 290 
 291   const address inst1_addr = a;
 292   const int inst1 = *(int *)inst1_addr;
 293 
 294   if (is_ld(inst1)) {
 295     return inv_d1_field(inst1);
 296   } else if (is_addis(inst1)) {
 297     const int dst = inv_rt_field(inst1);
 298 
 299     // Now, find the succeeding ld which reads and writes to dst.
 300     address inst2_addr = inst1_addr + BytesPerInstWord;
 301     int inst2 = 0;
 302     while (true) {
 303       inst2 = *(int *) inst2_addr;
 304       if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) {
 305         // Stop, found the ld which reads and writes dst.
 306         break;
 307       }
 308       inst2_addr += BytesPerInstWord;
 309     }
 310     return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2);
 311   }
 312   ShouldNotReachHere();
 313   return 0;
 314 }
 315 
 316 // Get the constant from a `load_const' sequence.
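// The second instruction discriminates the two load_const shapes: if it is an
// ori, the four halfwords live (from most to least significant) in instructions
// 0, 1, 3 and 4; if it is a lis (two-register form), they live in 0, 2, 1 and 3.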
 317 long MacroAssembler::get_const(address a) {
 318   assert(is_load_const_at(a), "not a load of a constant");
 319   const int *p = (const int*) a;
 320   unsigned long x = (((unsigned long) (get_imm(a,0) & 0xffff)) << 48);
 321   if (is_ori(*(p+1))) {
 322     x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 32);
 323     x |= (((unsigned long) (get_imm(a,3) & 0xffff)) << 16);
 324     x |= (((unsigned long) (get_imm(a,4) & 0xffff)));
 325   } else if (is_lis(*(p+1))) {
 326     x |= (((unsigned long) (get_imm(a,2) & 0xffff)) << 32);
 327     x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 16);
 328     x |= (((unsigned long) (get_imm(a,3) & 0xffff)));
 329   } else {
 330     ShouldNotReachHere();
 331     return (long) 0;
 332   }
 333   return (long) x;
 334 }
 335 
  336 // Patch the 64-bit constant of a `load_const' sequence. This is a
  337 // low-level procedure; it neither flushes the instruction cache nor
  338 // is it MT-safe.
 339 void MacroAssembler::patch_const(address a, long x) {
 340   assert(is_load_const_at(a), "not a load of a constant");
 341   int *p = (int*) a;
 342   if (is_ori(*(p+1))) {
 343     set_imm(0 + p, (x >> 48) & 0xffff);
 344     set_imm(1 + p, (x >> 32) & 0xffff);
 345     set_imm(3 + p, (x >> 16) & 0xffff);
 346     set_imm(4 + p, x & 0xffff);
 347   } else if (is_lis(*(p+1))) {
 348     set_imm(0 + p, (x >> 48) & 0xffff);
 349     set_imm(2 + p, (x >> 32) & 0xffff);
 350     set_imm(1 + p, (x >> 16) & 0xffff);
 351     set_imm(3 + p, x & 0xffff);
 352   } else {
 353     ShouldNotReachHere();
 354   }
 355 }
 356 
 357 AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) {
 358   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
 359   int index = oop_recorder()->allocate_metadata_index(obj);
 360   RelocationHolder rspec = metadata_Relocation::spec(index);
 361   return AddressLiteral((address)obj, rspec);
 362 }
 363 
 364 AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) {
 365   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
 366   int index = oop_recorder()->find_index(obj);
 367   RelocationHolder rspec = metadata_Relocation::spec(index);
 368   return AddressLiteral((address)obj, rspec);
 369 }
 370 
 371 AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) {
 372   assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
 373   int oop_index = oop_recorder()->allocate_oop_index(obj);
 374   return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
 375 }
 376 
 377 AddressLiteral MacroAssembler::constant_oop_address(jobject obj) {
 378   assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
 379   int oop_index = oop_recorder()->find_index(obj);
 380   return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
 381 }
 382 
 383 RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
 384                                                       Register tmp, int offset) {
 385   intptr_t value = *delayed_value_addr;
 386   if (value != 0) {
 387     return RegisterOrConstant(value + offset);
 388   }
 389 
 390   // Load indirectly to solve generation ordering problem.
 391   // static address, no relocation
 392   int simm16_offset = load_const_optimized(tmp, delayed_value_addr, noreg, true);
 393   ld(tmp, simm16_offset, tmp); // must be aligned ((xa & 3) == 0)
 394 
 395   if (offset != 0) {
 396     addi(tmp, tmp, offset);
 397   }
 398 
 399   return RegisterOrConstant(tmp);
 400 }
 401 
 402 #ifndef PRODUCT
 403 void MacroAssembler::pd_print_patched_instruction(address branch) {
 404   Unimplemented(); // TODO: PPC port
 405 }
 406 #endif // ndef PRODUCT
 407 
 408 // Conditional far branch for destinations encodable in 24+2 bits.
 409 void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) {
 410 
 411   // If requested by flag optimize, relocate the bc_far as a
 412   // runtime_call and prepare for optimizing it when the code gets
 413   // relocated.
 414   if (optimize == bc_far_optimize_on_relocate) {
 415     relocate(relocInfo::runtime_call_type);
 416   }
 417 
 418   // variant 2:
 419   //
 420   //    b!cxx SKIP
 421   //    bxx   DEST
 422   //  SKIP:
 423   //
 424 
 425   const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
 426                                                 opposite_bcond(inv_boint_bcond(boint)));
 427 
 428   // We emit two branches.
 429   // First, a conditional branch which jumps around the far branch.
 430   const address not_taken_pc = pc() + 2 * BytesPerInstWord;
 431   const address bc_pc        = pc();
 432   bc(opposite_boint, biint, not_taken_pc);
 433 
 434   const int bc_instr = *(int*)bc_pc;
 435   assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition");
 436   assert(opposite_boint == inv_bo_field(bc_instr), "postcondition");
 437   assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))),
 438                                      opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))),
 439          "postcondition");
 440   assert(biint == inv_bi_field(bc_instr), "postcondition");
 441 
 442   // Second, an unconditional far branch which jumps to dest.
 443   // Note: target(dest) remembers the current pc (see CodeSection::target)
 444   //       and returns the current pc if the label is not bound yet; when
 445   //       the label gets bound, the unconditional far branch will be patched.
 446   const address target_pc = target(dest);
 447   const address b_pc  = pc();
 448   b(target_pc);
 449 
 450   assert(not_taken_pc == pc(),                     "postcondition");
 451   assert(dest.is_bound() || target_pc == b_pc, "postcondition");
 452 }
 453 
 454 // 1 or 2 instructions
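// Emits a single bcxx if the destination is already bound and within bcxx
// range, otherwise the two-instruction bc_far prepared for optimization on
// relocation.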
 455 void MacroAssembler::bc_far_optimized(int boint, int biint, Label& dest) {
 456   if (dest.is_bound() && is_within_range_of_bcxx(target(dest), pc())) {
 457     bc(boint, biint, dest);
 458   } else {
 459     bc_far(boint, biint, dest, MacroAssembler::bc_far_optimize_on_relocate);
 460   }
 461 }
 462 
 463 bool MacroAssembler::is_bc_far_at(address instruction_addr) {
 464   return is_bc_far_variant1_at(instruction_addr) ||
 465          is_bc_far_variant2_at(instruction_addr) ||
 466          is_bc_far_variant3_at(instruction_addr);
 467 }
 468 
 469 address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) {
 470   if (is_bc_far_variant1_at(instruction_addr)) {
 471     const address instruction_1_addr = instruction_addr;
 472     const int instruction_1 = *(int*)instruction_1_addr;
 473     return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr);
 474   } else if (is_bc_far_variant2_at(instruction_addr)) {
 475     const address instruction_2_addr = instruction_addr + 4;
 476     return bxx_destination(instruction_2_addr);
 477   } else if (is_bc_far_variant3_at(instruction_addr)) {
 478     return instruction_addr + 8;
 479   }
 480   // variant 4 ???
 481   ShouldNotReachHere();
 482   return NULL;
  483 }
  484 
  485 void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) {
 486   if (is_bc_far_variant3_at(instruction_addr)) {
 487     // variant 3, far cond branch to the next instruction, already patched to nops:
 488     //
 489     //    nop
 490     //    endgroup
 491     //  SKIP/DEST:
 492     //
 493     return;
 494   }
 495 
 496   // first, extract boint and biint from the current branch
 497   int boint = 0;
 498   int biint = 0;
 499 
 500   ResourceMark rm;
 501   const int code_size = 2 * BytesPerInstWord;
 502   CodeBuffer buf(instruction_addr, code_size);
 503   MacroAssembler masm(&buf);
 504   if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) {
 505     // Far branch to next instruction: Optimize it by patching nops (produce variant 3).
 506     masm.nop();
 507     masm.endgroup();
 508   } else {
 509     if (is_bc_far_variant1_at(instruction_addr)) {
 510       // variant 1, the 1st instruction contains the destination address:
 511       //
 512       //    bcxx  DEST
 513       //    nop
 514       //
 515       const int instruction_1 = *(int*)(instruction_addr);
 516       boint = inv_bo_field(instruction_1);
 517       biint = inv_bi_field(instruction_1);
 518     } else if (is_bc_far_variant2_at(instruction_addr)) {
 519       // variant 2, the 2nd instruction contains the destination address:
 520       //
 521       //    b!cxx SKIP
 522       //    bxx   DEST
 523       //  SKIP:
 524       //
 525       const int instruction_1 = *(int*)(instruction_addr);
 526       boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))),
 527           opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1))));
 528       biint = inv_bi_field(instruction_1);
 529     } else {
 530       // variant 4???
 531       ShouldNotReachHere();
 532     }
 533 
 534     // second, set the new branch destination and optimize the code
 535     if (dest != instruction_addr + 4 && // the bc_far is still unbound!
 536         masm.is_within_range_of_bcxx(dest, instruction_addr)) {
 537       // variant 1:
 538       //
 539       //    bcxx  DEST
 540       //    nop
 541       //
 542       masm.bc(boint, biint, dest);
 543       masm.nop();
 544     } else {
 545       // variant 2:
 546       //
 547       //    b!cxx SKIP
 548       //    bxx   DEST
 549       //  SKIP:
 550       //
 551       const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
 552                                                     opposite_bcond(inv_boint_bcond(boint)));
 553       const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord;
 554       masm.bc(opposite_boint, biint, not_taken_pc);
 555       masm.b(dest);
 556     }
 557   }
 558   ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
 559 }
 560 
  561 // Emit a patchable (not MT-safe) 64-bit absolute call/jump.
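// Both variants occupy bxx64_patchable_size bytes (7 instructions). As a sketch:
//   variant 2 (pc-relative):   six nops then 'bl dest' (link), or 'b dest' then six nops
//   variant 1b (toc-relative): mr R0,R11; addis/addi R11 from the global TOC; mtctr R11;
//                              mr R11,R0; nop; bctr or bctrl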
 562 void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) {
 563   // get current pc
 564   uint64_t start_pc = (uint64_t) pc();
 565 
 566   const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last
 567   const address pc_of_b  = (address) (start_pc + (0*BytesPerInstWord)); // b is first
 568 
 569   // relocate here
 570   if (rt != relocInfo::none) {
 571     relocate(rt);
 572   }
 573 
 574   if ( ReoptimizeCallSequences &&
 575        (( link && is_within_range_of_b(dest, pc_of_bl)) ||
 576         (!link && is_within_range_of_b(dest, pc_of_b)))) {
 577     // variant 2:
 578     // Emit an optimized, pc-relative call/jump.
 579 
 580     if (link) {
 581       // some padding
 582       nop();
 583       nop();
 584       nop();
 585       nop();
 586       nop();
 587       nop();
 588 
 589       // do the call
 590       assert(pc() == pc_of_bl, "just checking");
 591       bl(dest, relocInfo::none);
 592     } else {
 593       // do the jump
 594       assert(pc() == pc_of_b, "just checking");
 595       b(dest, relocInfo::none);
 596 
 597       // some padding
 598       nop();
 599       nop();
 600       nop();
 601       nop();
 602       nop();
 603       nop();
 604     }
 605 
 606     // Assert that we can identify the emitted call/jump.
 607     assert(is_bxx64_patchable_variant2_at((address)start_pc, link),
 608            "can't identify emitted call");
 609   } else {
 610     // variant 1:
 611     mr(R0, R11);  // spill R11 -> R0.
 612 
 613     // Load the destination address into CTR,
 614     // calculate destination relative to global toc.
 615     calculate_address_from_global_toc(R11, dest, true, true, false);
 616 
 617     mtctr(R11);
 618     mr(R11, R0);  // spill R11 <- R0.
 619     nop();
 620 
 621     // do the call/jump
 622     if (link) {
 623       bctrl();
  624     } else {
 625       bctr();
 626     }
 627     // Assert that we can identify the emitted call/jump.
 628     assert(is_bxx64_patchable_variant1b_at((address)start_pc, link),
 629            "can't identify emitted call");
 630   }
 631 
 632   // Assert that we can identify the emitted call/jump.
 633   assert(is_bxx64_patchable_at((address)start_pc, link),
 634          "can't identify emitted call");
 635   assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest,
 636          "wrong encoding of dest address");
 637 }
 638 
 639 // Identify a bxx64_patchable instruction.
 640 bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) {
 641   return is_bxx64_patchable_variant1b_at(instruction_addr, link)
 642     //|| is_bxx64_patchable_variant1_at(instruction_addr, link)
 643       || is_bxx64_patchable_variant2_at(instruction_addr, link);
 644 }
 645 
  646 // Does the bxx64_patchable instruction use a pc-relative encoding of
  647 // the call destination?
 648 bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) {
 649   // variant 2 is pc-relative
 650   return is_bxx64_patchable_variant2_at(instruction_addr, link);
 651 }
 652 
 653 // Identify variant 1.
 654 bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) {
 655   unsigned int* instr = (unsigned int*) instruction_addr;
 656   return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
 657       && is_mtctr(instr[5]) // mtctr
 658     && is_load_const_at(instruction_addr);
 659 }
 660 
 661 // Identify variant 1b: load destination relative to global toc.
 662 bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) {
 663   unsigned int* instr = (unsigned int*) instruction_addr;
 664   return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
 665     && is_mtctr(instr[3]) // mtctr
 666     && is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr);
 667 }
 668 
 669 // Identify variant 2.
 670 bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) {
 671   unsigned int* instr = (unsigned int*) instruction_addr;
 672   if (link) {
 673     return is_bl (instr[6])  // bl dest is last
 674       && is_nop(instr[0])  // nop
 675       && is_nop(instr[1])  // nop
 676       && is_nop(instr[2])  // nop
 677       && is_nop(instr[3])  // nop
 678       && is_nop(instr[4])  // nop
 679       && is_nop(instr[5]); // nop
 680   } else {
 681     return is_b  (instr[0])  // b  dest is first
 682       && is_nop(instr[1])  // nop
 683       && is_nop(instr[2])  // nop
 684       && is_nop(instr[3])  // nop
 685       && is_nop(instr[4])  // nop
 686       && is_nop(instr[5])  // nop
 687       && is_nop(instr[6]); // nop
 688   }
 689 }
 690 
 691 // Set dest address of a bxx64_patchable instruction.
 692 void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) {
 693   ResourceMark rm;
 694   int code_size = MacroAssembler::bxx64_patchable_size;
 695   CodeBuffer buf(instruction_addr, code_size);
 696   MacroAssembler masm(&buf);
 697   masm.bxx64_patchable(dest, relocInfo::none, link);
 698   ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
 699 }
 700 
 701 // Get dest address of a bxx64_patchable instruction.
 702 address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) {
 703   if (is_bxx64_patchable_variant1_at(instruction_addr, link)) {
 704     return (address) (unsigned long) get_const(instruction_addr);
 705   } else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) {
 706     unsigned int* instr = (unsigned int*) instruction_addr;
 707     if (link) {
 708       const int instr_idx = 6; // bl is last
 709       int branchoffset = branch_destination(instr[instr_idx], 0);
 710       return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
 711     } else {
 712       const int instr_idx = 0; // b is first
 713       int branchoffset = branch_destination(instr[instr_idx], 0);
 714       return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
 715     }
 716   // Load dest relative to global toc.
 717   } else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) {
 718     return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord,
 719                                                                instruction_addr);
 720   } else {
 721     ShouldNotReachHere();
 722     return NULL;
 723   }
 724 }
 725 
 726 // Uses ordering which corresponds to ABI:
 727 //    _savegpr0_14:  std  r14,-144(r1)
 728 //    _savegpr0_15:  std  r15,-136(r1)
 729 //    _savegpr0_16:  std  r16,-128(r1)
 730 void MacroAssembler::save_nonvolatile_gprs(Register dst, int offset) {
 731   std(R14, offset, dst);   offset += 8;
 732   std(R15, offset, dst);   offset += 8;
 733   std(R16, offset, dst);   offset += 8;
 734   std(R17, offset, dst);   offset += 8;
 735   std(R18, offset, dst);   offset += 8;
 736   std(R19, offset, dst);   offset += 8;
 737   std(R20, offset, dst);   offset += 8;
 738   std(R21, offset, dst);   offset += 8;
 739   std(R22, offset, dst);   offset += 8;
 740   std(R23, offset, dst);   offset += 8;
 741   std(R24, offset, dst);   offset += 8;
 742   std(R25, offset, dst);   offset += 8;
 743   std(R26, offset, dst);   offset += 8;
 744   std(R27, offset, dst);   offset += 8;
 745   std(R28, offset, dst);   offset += 8;
 746   std(R29, offset, dst);   offset += 8;
 747   std(R30, offset, dst);   offset += 8;
 748   std(R31, offset, dst);   offset += 8;
 749 
 750   stfd(F14, offset, dst);   offset += 8;
 751   stfd(F15, offset, dst);   offset += 8;
 752   stfd(F16, offset, dst);   offset += 8;
 753   stfd(F17, offset, dst);   offset += 8;
 754   stfd(F18, offset, dst);   offset += 8;
 755   stfd(F19, offset, dst);   offset += 8;
 756   stfd(F20, offset, dst);   offset += 8;
 757   stfd(F21, offset, dst);   offset += 8;
 758   stfd(F22, offset, dst);   offset += 8;
 759   stfd(F23, offset, dst);   offset += 8;
 760   stfd(F24, offset, dst);   offset += 8;
 761   stfd(F25, offset, dst);   offset += 8;
 762   stfd(F26, offset, dst);   offset += 8;
 763   stfd(F27, offset, dst);   offset += 8;
 764   stfd(F28, offset, dst);   offset += 8;
 765   stfd(F29, offset, dst);   offset += 8;
 766   stfd(F30, offset, dst);   offset += 8;
 767   stfd(F31, offset, dst);
 768 }
 769 
 770 // Uses ordering which corresponds to ABI:
 771 //    _restgpr0_14:  ld   r14,-144(r1)
 772 //    _restgpr0_15:  ld   r15,-136(r1)
 773 //    _restgpr0_16:  ld   r16,-128(r1)
 774 void MacroAssembler::restore_nonvolatile_gprs(Register src, int offset) {
 775   ld(R14, offset, src);   offset += 8;
 776   ld(R15, offset, src);   offset += 8;
 777   ld(R16, offset, src);   offset += 8;
 778   ld(R17, offset, src);   offset += 8;
 779   ld(R18, offset, src);   offset += 8;
 780   ld(R19, offset, src);   offset += 8;
 781   ld(R20, offset, src);   offset += 8;
 782   ld(R21, offset, src);   offset += 8;
 783   ld(R22, offset, src);   offset += 8;
 784   ld(R23, offset, src);   offset += 8;
 785   ld(R24, offset, src);   offset += 8;
 786   ld(R25, offset, src);   offset += 8;
 787   ld(R26, offset, src);   offset += 8;
 788   ld(R27, offset, src);   offset += 8;
 789   ld(R28, offset, src);   offset += 8;
 790   ld(R29, offset, src);   offset += 8;
 791   ld(R30, offset, src);   offset += 8;
 792   ld(R31, offset, src);   offset += 8;
 793 
 794   // FP registers
 795   lfd(F14, offset, src);   offset += 8;
 796   lfd(F15, offset, src);   offset += 8;
 797   lfd(F16, offset, src);   offset += 8;
 798   lfd(F17, offset, src);   offset += 8;
 799   lfd(F18, offset, src);   offset += 8;
 800   lfd(F19, offset, src);   offset += 8;
 801   lfd(F20, offset, src);   offset += 8;
 802   lfd(F21, offset, src);   offset += 8;
 803   lfd(F22, offset, src);   offset += 8;
 804   lfd(F23, offset, src);   offset += 8;
 805   lfd(F24, offset, src);   offset += 8;
 806   lfd(F25, offset, src);   offset += 8;
 807   lfd(F26, offset, src);   offset += 8;
 808   lfd(F27, offset, src);   offset += 8;
 809   lfd(F28, offset, src);   offset += 8;
 810   lfd(F29, offset, src);   offset += 8;
 811   lfd(F30, offset, src);   offset += 8;
 812   lfd(F31, offset, src);
 813 }
 814 
 815 // For verify_oops.
 816 void MacroAssembler::save_volatile_gprs(Register dst, int offset) {
 817   std(R2,  offset, dst);   offset += 8;
 818   std(R3,  offset, dst);   offset += 8;
 819   std(R4,  offset, dst);   offset += 8;
 820   std(R5,  offset, dst);   offset += 8;
 821   std(R6,  offset, dst);   offset += 8;
 822   std(R7,  offset, dst);   offset += 8;
 823   std(R8,  offset, dst);   offset += 8;
 824   std(R9,  offset, dst);   offset += 8;
 825   std(R10, offset, dst);   offset += 8;
 826   std(R11, offset, dst);   offset += 8;
 827   std(R12, offset, dst);   offset += 8;
 828 
 829   stfd(F0, offset, dst);   offset += 8;
 830   stfd(F1, offset, dst);   offset += 8;
 831   stfd(F2, offset, dst);   offset += 8;
 832   stfd(F3, offset, dst);   offset += 8;
 833   stfd(F4, offset, dst);   offset += 8;
 834   stfd(F5, offset, dst);   offset += 8;
 835   stfd(F6, offset, dst);   offset += 8;
 836   stfd(F7, offset, dst);   offset += 8;
 837   stfd(F8, offset, dst);   offset += 8;
 838   stfd(F9, offset, dst);   offset += 8;
 839   stfd(F10, offset, dst);  offset += 8;
 840   stfd(F11, offset, dst);  offset += 8;
 841   stfd(F12, offset, dst);  offset += 8;
 842   stfd(F13, offset, dst);
 843 }
 844 
 845 // For verify_oops.
 846 void MacroAssembler::restore_volatile_gprs(Register src, int offset) {
 847   ld(R2,  offset, src);   offset += 8;
 848   ld(R3,  offset, src);   offset += 8;
 849   ld(R4,  offset, src);   offset += 8;
 850   ld(R5,  offset, src);   offset += 8;
 851   ld(R6,  offset, src);   offset += 8;
 852   ld(R7,  offset, src);   offset += 8;
 853   ld(R8,  offset, src);   offset += 8;
 854   ld(R9,  offset, src);   offset += 8;
 855   ld(R10, offset, src);   offset += 8;
 856   ld(R11, offset, src);   offset += 8;
 857   ld(R12, offset, src);   offset += 8;
 858 
 859   lfd(F0, offset, src);   offset += 8;
 860   lfd(F1, offset, src);   offset += 8;
 861   lfd(F2, offset, src);   offset += 8;
 862   lfd(F3, offset, src);   offset += 8;
 863   lfd(F4, offset, src);   offset += 8;
 864   lfd(F5, offset, src);   offset += 8;
 865   lfd(F6, offset, src);   offset += 8;
 866   lfd(F7, offset, src);   offset += 8;
 867   lfd(F8, offset, src);   offset += 8;
 868   lfd(F9, offset, src);   offset += 8;
 869   lfd(F10, offset, src);  offset += 8;
 870   lfd(F11, offset, src);  offset += 8;
 871   lfd(F12, offset, src);  offset += 8;
 872   lfd(F13, offset, src);
 873 }
 874 
 875 void MacroAssembler::save_LR_CR(Register tmp) {
 876   mfcr(tmp);
 877   std(tmp, _abi(cr), R1_SP);
 878   mflr(tmp);
 879   std(tmp, _abi(lr), R1_SP);
 880   // Tmp must contain lr on exit! (see return_addr and prolog in ppc64.ad)
 881 }
 882 
 883 void MacroAssembler::restore_LR_CR(Register tmp) {
 884   assert(tmp != R1_SP, "must be distinct");
 885   ld(tmp, _abi(lr), R1_SP);
 886   mtlr(tmp);
 887   ld(tmp, _abi(cr), R1_SP);
 888   mtcr(tmp);
 889 }
 890 
 891 address MacroAssembler::get_PC_trash_LR(Register result) {
 892   Label L;
 893   bl(L);
 894   bind(L);
 895   address lr_pc = pc();
 896   mflr(result);
 897   return lr_pc;
 898 }
 899 
 900 void MacroAssembler::resize_frame(Register offset, Register tmp) {
 901 #ifdef ASSERT
 902   assert_different_registers(offset, tmp, R1_SP);
 903   andi_(tmp, offset, frame::alignment_in_bytes-1);
 904   asm_assert_eq("resize_frame: unaligned", 0x204);
 905 #endif
 906 
 907   // tmp <- *(SP)
 908   ld(tmp, _abi(callers_sp), R1_SP);
 909   // addr <- SP + offset;
 910   // *(addr) <- tmp;
 911   // SP <- addr
 912   stdux(tmp, R1_SP, offset);
 913 }
 914 
 915 void MacroAssembler::resize_frame(int offset, Register tmp) {
 916   assert(is_simm(offset, 16), "too big an offset");
 917   assert_different_registers(tmp, R1_SP);
 918   assert((offset & (frame::alignment_in_bytes-1))==0, "resize_frame: unaligned");
 919   // tmp <- *(SP)
 920   ld(tmp, _abi(callers_sp), R1_SP);
 921   // addr <- SP + offset;
 922   // *(addr) <- tmp;
 923   // SP <- addr
 924   stdu(tmp, offset, R1_SP);
 925 }
 926 
 927 void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) {
 928   // (addr == tmp1) || (addr == tmp2) is allowed here!
 929   assert(tmp1 != tmp2, "must be distinct");
 930 
 931   // compute offset w.r.t. current stack pointer
 932   // tmp_1 <- addr - SP (!)
 933   subf(tmp1, R1_SP, addr);
 934 
 935   // atomically update SP keeping back link.
 936   resize_frame(tmp1/* offset */, tmp2/* tmp */);
 937 }
 938 
 939 void MacroAssembler::push_frame(Register bytes, Register tmp) {
 940 #ifdef ASSERT
 941   assert(bytes != R0, "r0 not allowed here");
 942   andi_(R0, bytes, frame::alignment_in_bytes-1);
 943   asm_assert_eq("push_frame(Reg, Reg): unaligned", 0x203);
 944 #endif
 945   neg(tmp, bytes);
 946   stdux(R1_SP, R1_SP, tmp);
 947 }
 948 
 949 // Push a frame of size `bytes'.
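// The requested size is aligned up to frame::alignment_in_bytes; the back link
// (old SP) is stored together with the SP update via stdu (16-bit offset) or
// stdux (larger frames).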
 950 void MacroAssembler::push_frame(unsigned int bytes, Register tmp) {
 951   long offset = align_addr(bytes, frame::alignment_in_bytes);
 952   if (is_simm(-offset, 16)) {
 953     stdu(R1_SP, -offset, R1_SP);
 954   } else {
 955     load_const_optimized(tmp, -offset);
 956     stdux(R1_SP, R1_SP, tmp);
 957   }
 958 }
 959 
 960 // Push a frame of size `bytes' plus abi_reg_args on top.
 961 void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) {
 962   push_frame(bytes + frame::abi_reg_args_size, tmp);
 963 }
 964 
  965 // Set up a new C frame with a spill area for non-volatile GPRs and
  966 // additional space for local variables.
 967 void MacroAssembler::push_frame_reg_args_nonvolatiles(unsigned int bytes,
 968                                                       Register tmp) {
 969   push_frame(bytes + frame::abi_reg_args_size + frame::spill_nonvolatiles_size, tmp);
 970 }
 971 
 972 // Pop current C frame.
 973 void MacroAssembler::pop_frame() {
 974   ld(R1_SP, _abi(callers_sp), R1_SP);
 975 }
 976 
 977 #if defined(ABI_ELFv2)
 978 address MacroAssembler::branch_to(Register r_function_entry, bool and_link) {
 979   // TODO(asmundak): make sure the caller uses R12 as function descriptor
  980   // most of the time.
 981   if (R12 != r_function_entry) {
 982     mr(R12, r_function_entry);
 983   }
 984   mtctr(R12);
 985   // Do a call or a branch.
 986   if (and_link) {
 987     bctrl();
 988   } else {
 989     bctr();
 990   }
 991   _last_calls_return_pc = pc();
 992 
 993   return _last_calls_return_pc;
 994 }
 995 
 996 // Call a C function via a function descriptor and use full C
 997 // calling conventions. Updates and returns _last_calls_return_pc.
 998 address MacroAssembler::call_c(Register r_function_entry) {
 999   return branch_to(r_function_entry, /*and_link=*/true);
1000 }
1001 
1002 // For tail calls: only branch, don't link, so callee returns to caller of this function.
1003 address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) {
1004   return branch_to(r_function_entry, /*and_link=*/false);
1005 }
1006 
1007 address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) {
1008   load_const(R12, function_entry, R0);
1009   return branch_to(R12,  /*and_link=*/true);
1010 }
1011 
1012 #else
1013 // Generic version of a call to C function via a function descriptor
1014 // with variable support for C calling conventions (TOC, ENV, etc.).
1015 // Updates and returns _last_calls_return_pc.
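// The descriptor is read field by field: entry goes to CTR and, if requested,
// toc to R2_TOC and env to R11 (R11 is zeroed when only the TOC is loaded).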
1016 address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call,
1017                                   bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) {
 1018   // We emit standard ptrgl glue code here.
1019   assert((function_descriptor != R0), "function_descriptor cannot be R0");
1020 
1021   // retrieve necessary entries from the function descriptor
1022   ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor);
1023   mtctr(R0);
1024 
1025   if (load_toc_of_callee) {
1026     ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor);
1027   }
1028   if (load_env_of_callee) {
1029     ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor);
1030   } else if (load_toc_of_callee) {
1031     li(R11, 0);
1032   }
1033 
1034   // do a call or a branch
1035   if (and_link) {
1036     bctrl();
1037   } else {
1038     bctr();
1039   }
1040   _last_calls_return_pc = pc();
1041 
1042   return _last_calls_return_pc;
1043 }
1044 
1045 // Call a C function via a function descriptor and use full C calling
1046 // conventions.
1047 // We don't use the TOC in generated code, so there is no need to save
1048 // and restore its value.
1049 address MacroAssembler::call_c(Register fd) {
1050   return branch_to(fd, /*and_link=*/true,
1051                        /*save toc=*/false,
1052                        /*restore toc=*/false,
1053                        /*load toc=*/true,
1054                        /*load env=*/true);
1055 }
1056 
1057 address MacroAssembler::call_c_and_return_to_caller(Register fd) {
1058   return branch_to(fd, /*and_link=*/false,
1059                        /*save toc=*/false,
1060                        /*restore toc=*/false,
1061                        /*load toc=*/true,
1062                        /*load env=*/true);
1063 }
1064 
1065 address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) {
1066   if (rt != relocInfo::none) {
1067     // this call needs to be relocatable
1068     if (!ReoptimizeCallSequences
1069         || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
1070         || fd == NULL   // support code-size estimation
1071         || !fd->is_friend_function()
1072         || fd->entry() == NULL) {
1073       // it's not a friend function as defined by class FunctionDescriptor,
1074       // so do a full call-c here.
1075       load_const(R11, (address)fd, R0);
1076 
1077       bool has_env = (fd != NULL && fd->env() != NULL);
1078       return branch_to(R11, /*and_link=*/true,
1079                             /*save toc=*/false,
1080                             /*restore toc=*/false,
1081                             /*load toc=*/true,
1082                             /*load env=*/has_env);
1083     } else {
1084       // It's a friend function. Load the entry point and don't care about
1085       // toc and env. Use an optimizable call instruction, but ensure the
1086       // same code-size as in the case of a non-friend function.
1087       nop();
1088       nop();
1089       nop();
1090       bl64_patchable(fd->entry(), rt);
1091       _last_calls_return_pc = pc();
1092       return _last_calls_return_pc;
1093     }
1094   } else {
1095     // This call does not need to be relocatable, do more aggressive
1096     // optimizations.
1097     if (!ReoptimizeCallSequences
1098       || !fd->is_friend_function()) {
1099       // It's not a friend function as defined by class FunctionDescriptor,
1100       // so do a full call-c here.
1101       load_const(R11, (address)fd, R0);
1102       return branch_to(R11, /*and_link=*/true,
1103                             /*save toc=*/false,
1104                             /*restore toc=*/false,
1105                             /*load toc=*/true,
1106                             /*load env=*/true);
1107     } else {
1108       // it's a friend function, load the entry point and don't care about
1109       // toc and env.
1110       address dest = fd->entry();
1111       if (is_within_range_of_b(dest, pc())) {
1112         bl(dest);
1113       } else {
1114         bl64_patchable(dest, rt);
1115       }
1116       _last_calls_return_pc = pc();
1117       return _last_calls_return_pc;
1118     }
1119   }
1120 }
1121 
1122 // Call a C function.  All constants needed reside in TOC.
1123 //
1124 // Read the address to call from the TOC.
1125 // Read env from TOC, if fd specifies an env.
1126 // Read new TOC from TOC.
1127 address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd,
1128                                          relocInfo::relocType rt, Register toc) {
1129   if (!ReoptimizeCallSequences
1130     || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
1131     || !fd->is_friend_function()) {
1132     // It's not a friend function as defined by class FunctionDescriptor,
1133     // so do a full call-c here.
1134     assert(fd->entry() != NULL, "function must be linked");
1135 
1136     AddressLiteral fd_entry(fd->entry());
1137     bool success = load_const_from_method_toc(R11, fd_entry, toc, /*fixed_size*/ true);
1138     mtctr(R11);
1139     if (fd->env() == NULL) {
1140       li(R11, 0);
1141       nop();
1142     } else {
1143       AddressLiteral fd_env(fd->env());
1144       success = success && load_const_from_method_toc(R11, fd_env, toc, /*fixed_size*/ true);
1145     }
1146     AddressLiteral fd_toc(fd->toc());
1147     // Set R2_TOC (load from toc)
1148     success = success && load_const_from_method_toc(R2_TOC, fd_toc, toc, /*fixed_size*/ true);
1149     bctrl();
1150     _last_calls_return_pc = pc();
1151     if (!success) { return NULL; }
1152   } else {
1153     // It's a friend function, load the entry point and don't care about
1154     // toc and env. Use an optimizable call instruction, but ensure the
1155     // same code-size as in the case of a non-friend function.
1156     nop();
1157     bl64_patchable(fd->entry(), rt);
1158     _last_calls_return_pc = pc();
1159   }
1160   return _last_calls_return_pc;
1161 }
1162 #endif // ABI_ELFv2
1163 
1164 void MacroAssembler::call_VM_base(Register oop_result,
1165                                   Register last_java_sp,
1166                                   address  entry_point,
1167                                   bool     check_exceptions) {
1168   BLOCK_COMMENT("call_VM {");
1169   // Determine last_java_sp register.
1170   if (!last_java_sp->is_valid()) {
1171     last_java_sp = R1_SP;
1172   }
1173   set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1);
1174 
1175   // ARG1 must hold thread address.
1176   mr(R3_ARG1, R16_thread);
1177 #if defined(ABI_ELFv2)
1178   address return_pc = call_c(entry_point, relocInfo::none);
1179 #else
1180   address return_pc = call_c((FunctionDescriptor*)entry_point, relocInfo::none);
1181 #endif
1182 
1183   reset_last_Java_frame();
1184 
1185   // Check for pending exceptions.
1186   if (check_exceptions) {
1187     // We don't check for exceptions here.
1188     ShouldNotReachHere();
1189   }
1190 
1191   // Get oop result if there is one and reset the value in the thread.
1192   if (oop_result->is_valid()) {
1193     get_vm_result(oop_result);
1194   }
1195 
1196   _last_calls_return_pc = return_pc;
1197   BLOCK_COMMENT("} call_VM");
1198 }
1199 
1200 void MacroAssembler::call_VM_leaf_base(address entry_point) {
1201   BLOCK_COMMENT("call_VM_leaf {");
1202 #if defined(ABI_ELFv2)
1203   call_c(entry_point, relocInfo::none);
1204 #else
1205   call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, entry_point), relocInfo::none);
1206 #endif
1207   BLOCK_COMMENT("} call_VM_leaf");
1208 }
1209 
1210 void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions) {
1211   call_VM_base(oop_result, noreg, entry_point, check_exceptions);
1212 }
1213 
1214 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1,
1215                              bool check_exceptions) {
1216   // R3_ARG1 is reserved for the thread.
1217   mr_if_needed(R4_ARG2, arg_1);
1218   call_VM(oop_result, entry_point, check_exceptions);
1219 }
1220 
1221 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2,
1222                              bool check_exceptions) {
1223   // R3_ARG1 is reserved for the thread
1224   mr_if_needed(R4_ARG2, arg_1);
1225   assert(arg_2 != R4_ARG2, "smashed argument");
1226   mr_if_needed(R5_ARG3, arg_2);
1227   call_VM(oop_result, entry_point, check_exceptions);
1228 }
1229 
1230 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3,
1231                              bool check_exceptions) {
1232   // R3_ARG1 is reserved for the thread
1233   mr_if_needed(R4_ARG2, arg_1);
1234   assert(arg_2 != R4_ARG2, "smashed argument");
1235   mr_if_needed(R5_ARG3, arg_2);
1236   mr_if_needed(R6_ARG4, arg_3);
1237   call_VM(oop_result, entry_point, check_exceptions);
1238 }
1239 
1240 void MacroAssembler::call_VM_leaf(address entry_point) {
1241   call_VM_leaf_base(entry_point);
1242 }
1243 
1244 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) {
1245   mr_if_needed(R3_ARG1, arg_1);
1246   call_VM_leaf(entry_point);
1247 }
1248 
1249 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) {
1250   mr_if_needed(R3_ARG1, arg_1);
1251   assert(arg_2 != R3_ARG1, "smashed argument");
1252   mr_if_needed(R4_ARG2, arg_2);
1253   call_VM_leaf(entry_point);
1254 }
1255 
1256 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) {
1257   mr_if_needed(R3_ARG1, arg_1);
1258   assert(arg_2 != R3_ARG1, "smashed argument");
1259   mr_if_needed(R4_ARG2, arg_2);
1260   assert(arg_3 != R3_ARG1 && arg_3 != R4_ARG2, "smashed argument");
1261   mr_if_needed(R5_ARG3, arg_3);
1262   call_VM_leaf(entry_point);
1263 }
1264 
1265 // Check whether instruction is a read access to the polling page
1266 // which was emitted by load_from_polling_page(..).
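// Such a read has the shape 'ld R0, 0(Rx)': rt must be 0, the displacement 0,
// and ra non-zero; if a ucontext is given, ra is additionally checked to hold
// the polling page address.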
1267 bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext,
1268                                                address* polling_address_ptr) {
1269   if (!is_ld(instruction))
1270     return false; // It's not a ld. Fail.
1271 
1272   int rt = inv_rt_field(instruction);
1273   int ra = inv_ra_field(instruction);
1274   int ds = inv_ds_field(instruction);
1275   if (!(ds == 0 && ra != 0 && rt == 0)) {
1276     return false; // It's not a ld(r0, X, ra). Fail.
1277   }
1278 
1279   if (!ucontext) {
1280     // Set polling address.
1281     if (polling_address_ptr != NULL) {
1282       *polling_address_ptr = NULL;
1283     }
1284     return true; // No ucontext given. Can't check value of ra. Assume true.
1285   }
1286 
1287 #ifdef LINUX
1288   // Ucontext given. Check that register ra contains the address of
 1289   // the safepoint polling page.
1290   ucontext_t* uc = (ucontext_t*) ucontext;
1291   // Set polling address.
1292   address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds;
1293   if (polling_address_ptr != NULL) {
1294     *polling_address_ptr = addr;
1295   }
1296   return os::is_poll_address(addr);
1297 #else
1298   // Not on Linux, ucontext must be NULL.
1299   ShouldNotReachHere();
1300   return false;
1301 #endif
1302 }
1303 
1304 bool MacroAssembler::is_memory_serialization(int instruction, JavaThread* thread, void* ucontext) {
1305 #ifdef LINUX
1306   ucontext_t* uc = (ucontext_t*) ucontext;
1307 
1308   if (is_stwx(instruction) || is_stwux(instruction)) {
1309     int ra = inv_ra_field(instruction);
1310     int rb = inv_rb_field(instruction);
1311 
1312     // look up content of ra and rb in ucontext
1313     address ra_val=(address)uc->uc_mcontext.regs->gpr[ra];
1314     long rb_val=(long)uc->uc_mcontext.regs->gpr[rb];
1315     return os::is_memory_serialize_page(thread, ra_val+rb_val);
1316   } else if (is_stw(instruction) || is_stwu(instruction)) {
1317     int ra = inv_ra_field(instruction);
1318     int d1 = inv_d1_field(instruction);
1319 
1320     // look up content of ra in ucontext
1321     address ra_val=(address)uc->uc_mcontext.regs->gpr[ra];
1322     return os::is_memory_serialize_page(thread, ra_val+d1);
1323   } else {
1324     return false;
1325   }
1326 #else
1327   // workaround not needed on !LINUX :-)
1328   ShouldNotCallThis();
1329   return false;
1330 #endif
1331 }
1332 
1333 void MacroAssembler::bang_stack_with_offset(int offset) {
1334   // When increasing the stack, the old stack pointer will be written
 1335   // to the new top of stack according to the PPC64 ABI.
1336   // Therefore, stack banging is not necessary when increasing
1337   // the stack by <= os::vm_page_size() bytes.
1338   // When increasing the stack by a larger amount, this method is
1339   // called repeatedly to bang the intermediate pages.
1340 
1341   // Stack grows down, caller passes positive offset.
1342   assert(offset > 0, "must bang with positive offset");
1343 
1344   long stdoffset = -offset;
1345 
1346   if (is_simm(stdoffset, 16)) {
1347     // Signed 16 bit offset, a simple std is ok.
1348     if (UseLoadInstructionsForStackBangingPPC64) {
1349       ld(R0, (int)(signed short)stdoffset, R1_SP);
1350     } else {
1351       std(R0,(int)(signed short)stdoffset, R1_SP);
1352     }
1353   } else if (is_simm(stdoffset, 31)) {
1354     const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset);
1355     const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset);
1356 
1357     Register tmp = R11;
1358     addis(tmp, R1_SP, hi);
1359     if (UseLoadInstructionsForStackBangingPPC64) {
1360       ld(R0,  lo, tmp);
1361     } else {
1362       std(R0, lo, tmp);
1363     }
1364   } else {
1365     ShouldNotReachHere();
1366   }
1367 }
1368 
1369 // If instruction is a stack bang of the form
1370 //    std    R0,    x(Ry),       (see bang_stack_with_offset())
1371 //    stdu   R1_SP, x(R1_SP),    (see push_frame(), resize_frame())
1372 // or stdux  R1_SP, Rx, R1_SP    (see push_frame(), resize_frame())
 1373 // return the banged address. Otherwise, return NULL.
1374 address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) {
1375 #ifdef LINUX
1376   ucontext_t* uc = (ucontext_t*) ucontext;
1377   int rs = inv_rs_field(instruction);
1378   int ra = inv_ra_field(instruction);
1379   if (   (is_ld(instruction)   && rs == 0 &&  UseLoadInstructionsForStackBangingPPC64)
1380       || (is_std(instruction)  && rs == 0 && !UseLoadInstructionsForStackBangingPPC64)
1381       || (is_stdu(instruction) && rs == 1)) {
1382     int ds = inv_ds_field(instruction);
1383     // return banged address
1384     return ds+(address)uc->uc_mcontext.regs->gpr[ra];
1385   } else if (is_stdux(instruction) && rs == 1) {
1386     int rb = inv_rb_field(instruction);
1387     address sp = (address)uc->uc_mcontext.regs->gpr[1];
1388     long rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
1389     return ra != 1 || rb_val >= 0 ? NULL         // not a stack bang
1390                                   : sp + rb_val; // banged address
1391   }
1392   return NULL; // not a stack bang
1393 #else
1394   // workaround not needed on !LINUX :-)
1395   ShouldNotCallThis();
1396   return NULL;
1397 #endif
1398 }
1399 
1400 // CmpxchgX sets condition register to cmpX(current, compare).
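// The core is the usual lwarx/stwcx. retry loop, roughly:
//   retry: lwarx   dest_current_value, addr_base
//          cmpw    flag, dest_current_value, compare_value
//          bne     flag, failed
//          stwcx.  exchange_value, addr_base
//          bne     CCR0, retry
// wrapped in the requested release/acquire/fence semantics.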
1401 void MacroAssembler::cmpxchgw(ConditionRegister flag, Register dest_current_value,
1402                               Register compare_value, Register exchange_value,
1403                               Register addr_base, int semantics, bool cmpxchgx_hint,
1404                               Register int_flag_success, bool contention_hint) {
1405   Label retry;
1406   Label failed;
1407   Label done;
1408 
1409   // Save one branch if result is returned via register and
1410   // result register is different from the other ones.
1411   bool use_result_reg    = (int_flag_success != noreg);
1412   bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value &&
1413                             int_flag_success != exchange_value && int_flag_success != addr_base);
1414 
1415   if (use_result_reg && preset_result_reg) {
1416     li(int_flag_success, 0); // preset (assume cas failed)
1417   }
1418 
1419   // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1420   if (contention_hint) { // Don't try to reserve if cmp fails.
1421     lwz(dest_current_value, 0, addr_base);
1422     cmpw(flag, dest_current_value, compare_value);
1423     bne(flag, failed);
1424   }
1425 
1426   // release/fence semantics
1427   if (semantics & MemBarRel) {
1428     release();
1429   }
1430 
1431   // atomic emulation loop
1432   bind(retry);
1433 
1434   lwarx(dest_current_value, addr_base, cmpxchgx_hint);
1435   cmpw(flag, dest_current_value, compare_value);
1436   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1437     bne_predict_not_taken(flag, failed);
1438   } else {
1439     bne(                  flag, failed);
1440   }
1441   // branch to failed => (flag == ne), (dest_current_value != compare_value)
1442   // fall through     => (flag == eq), (dest_current_value == compare_value)
1443 
1444   stwcx_(exchange_value, addr_base);
1445   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1446     bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1447   } else {
1448     bne(                  CCR0, retry); // StXcx_ sets CCR0.
1449   }
1450   // fall through    => (flag == eq), (dest_current_value == compare_value), (swapped)
1451 
1452   // Result in register (must do this at the end because int_flag_success can be the
1453   // same register as one above).
1454   if (use_result_reg) {
1455     li(int_flag_success, 1);
1456   }
1457 
1458   if (semantics & MemBarFenceAfter) {
1459     fence();
1460   } else if (semantics & MemBarAcq) {
1461     isync();
1462   }
1463 
1464   if (use_result_reg && !preset_result_reg) {
1465     b(done);
1466   }
1467 
1468   bind(failed);
1469   if (use_result_reg && !preset_result_reg) {
1470     li(int_flag_success, 0);
1471   }
1472 
1473   bind(done);
1474   // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1475   // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1476 }
1477 
1478 // Performs an atomic compare-and-exchange:
1479 //   if (compare_value == *addr_base)
1480 //     *addr_base = exchange_value
1481 //     int_flag_success = 1;
1482 //   else
1483 //     int_flag_success = 0;
1484 //
1485 // ConditionRegister flag       = cmp(compare_value, *addr_base)
1486 // Register dest_current_value  = *addr_base
1487 // Register compare_value       Used to compare with value in memory
1488 // Register exchange_value      Written to memory if compare_value == *addr_base
1489 // Register addr_base           The memory location to compareXChange
1490 // Register int_flag_success    Set to 1 if exchange_value was written to *addr_base
1491 //
1492 // To avoid the costly compare-and-exchange, the value can be tested beforehand (contention_hint).
1493 // Several special cases are handled to avoid generating unnecessary code.
1494 //
1495 void MacroAssembler::cmpxchgd(ConditionRegister flag,
1496                               Register dest_current_value, RegisterOrConstant compare_value, Register exchange_value,
1497                               Register addr_base, int semantics, bool cmpxchgx_hint,
1498                               Register int_flag_success, Label* failed_ext, bool contention_hint) {
1499   Label retry;
1500   Label failed_int;
1501   Label& failed = (failed_ext != NULL) ? *failed_ext : failed_int;
1502   Label done;
1503 
1504   // Save one branch if result is returned via register and result register is different from the other ones.
1505   bool use_result_reg    = (int_flag_success != noreg);
1506   bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value.register_or_noreg() &&
1507                             int_flag_success != exchange_value && int_flag_success != addr_base);
1508   assert(int_flag_success == noreg || failed_ext == NULL, "cannot have both");
1509 
1510   if (use_result_reg && preset_result_reg) {
1511     li(int_flag_success, 0); // preset (assume cas failed)
1512   }
1513 
1514   // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1515   if (contention_hint) { // Don't try to reserve if cmp fails.
1516     ld(dest_current_value, 0, addr_base);
1517     cmpd(flag, compare_value, dest_current_value);
1518     bne(flag, failed);
1519   }
1520 
1521   // release/fence semantics
1522   if (semantics & MemBarRel) {
1523     release();
1524   }
1525 
1526   // atomic emulation loop
1527   bind(retry);
1528 
1529   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1530   cmpd(flag, compare_value, dest_current_value);
1531   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1532     bne_predict_not_taken(flag, failed);
1533   } else {
1534     bne(                  flag, failed);
1535   }
1536 
1537   stdcx_(exchange_value, addr_base);
1538   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1539     bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
1540   } else {
1541     bne(                  CCR0, retry); // stXcx_ sets CCR0
1542   }
1543 
1544   // result in register (must do this at the end because int_flag_success can be the same register as one above)
1545   if (use_result_reg) {
1546     li(int_flag_success, 1);
1547   }
1548 
1549   if (semantics & MemBarFenceAfter) {
1550     fence();
1551   } else if (semantics & MemBarAcq) {
1552     isync();
1553   }
1554 
1555   if (use_result_reg && !preset_result_reg) {
1556     b(done);
1557   }
1558 
1559   bind(failed_int);
1560   if (use_result_reg && !preset_result_reg) {
1561     li(int_flag_success, 0);
1562   }
1563 
1564   bind(done);
1565   // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1566   // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1567 }
1568 
1569 // Look up the method for a megamorphic invokeinterface call.
1570 // The target method is determined by <intf_klass, itable_index>.
1571 // The receiver klass is in recv_klass.
1572 // On success, the result will be in method_result, and execution falls through.
1573 // On failure, execution transfers to the given label.
1574 void MacroAssembler::lookup_interface_method(Register recv_klass,
1575                                              Register intf_klass,
1576                                              RegisterOrConstant itable_index,
1577                                              Register method_result,
1578                                              Register scan_temp,
1579                                              Register sethi_temp,
1580                                              Label& L_no_such_interface) {
1581   assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);
1582   assert(itable_index.is_constant() || itable_index.as_register() == method_result,
1583          "caller must use same register for non-constant itable index as for method");
1584 
1585   // Compute start of first itableOffsetEntry (which is at the end of the vtable).
1586   int vtable_base = InstanceKlass::vtable_start_offset() * wordSize;
1587   int itentry_off = itableMethodEntry::method_offset_in_bytes();
1588   int logMEsize   = exact_log2(itableMethodEntry::size() * wordSize);
1589   int scan_step   = itableOffsetEntry::size() * wordSize;
1590   int log_vte_size= exact_log2(vtableEntry::size() * wordSize);
1591 
1592   lwz(scan_temp, InstanceKlass::vtable_length_offset() * wordSize, recv_klass);
1593   // %%% We should store the aligned, prescaled offset in the klassoop.
1594   // Then the next several instructions would fold away.
1595 
1596   sldi(scan_temp, scan_temp, log_vte_size);
1597   addi(scan_temp, scan_temp, vtable_base);
1598   add(scan_temp, recv_klass, scan_temp);
1599 
1600   // Adjust recv_klass by scaled itable_index, so we can free itable_index.
1601   if (itable_index.is_register()) {
1602     Register itable_offset = itable_index.as_register();
1603     sldi(itable_offset, itable_offset, logMEsize);
1604     if (itentry_off) addi(itable_offset, itable_offset, itentry_off);
1605     add(recv_klass, itable_offset, recv_klass);
1606   } else {
1607     long itable_offset = (long)itable_index.as_constant();
1608     load_const_optimized(sethi_temp, (itable_offset<<logMEsize)+itentry_off); // static address, no relocation
1609     add(recv_klass, sethi_temp, recv_klass);
1610   }
1611 
1612   // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
1613   //   if (scan->interface() == intf) {
1614   //     result = (klass + scan->offset() + itable_index);
1615   //   }
1616   // }
1617   Label search, found_method;
1618 
1619   for (int peel = 1; peel >= 0; peel--) {
1620     // %%%% Could load both offset and interface in one ldx, if they were
1621     // in the opposite order. This would save a load.
1622     ld(method_result, itableOffsetEntry::interface_offset_in_bytes(), scan_temp);
1623 
1624     // Check that this entry is non-null. A null entry means that
1625     // the receiver class doesn't implement the interface, and wasn't the
1626     // same as when the caller was compiled.
1627     cmpd(CCR0, method_result, intf_klass);
1628 
1629     if (peel) {
1630       beq(CCR0, found_method);
1631     } else {
1632       bne(CCR0, search);
1633       // (invert the test to fall through to found_method...)
1634     }
1635 
1636     if (!peel) break;
1637 
1638     bind(search);
1639 
1640     cmpdi(CCR0, method_result, 0);
1641     beq(CCR0, L_no_such_interface);
1642     addi(scan_temp, scan_temp, scan_step);
1643   }
1644 
1645   bind(found_method);
1646 
1647   // Got a hit.
1648   int ito_offset = itableOffsetEntry::offset_offset_in_bytes();
1649   lwz(scan_temp, ito_offset, scan_temp);
1650   ldx(method_result, scan_temp, recv_klass);
1651 }
1652 
1653 // virtual method calling
1654 void MacroAssembler::lookup_virtual_method(Register recv_klass,
1655                                            RegisterOrConstant vtable_index,
1656                                            Register method_result) {
1657 
1658   assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg());
1659 
1660   const int base = InstanceKlass::vtable_start_offset() * wordSize;
1661   assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
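  // In effect (illustrative description of the code below), the method is loaded as
  //   *(Method**)(recv_klass + vtable_start_offset
  //               + vtable_index * wordSize
  //               + vtableEntry::method_offset_in_bytes())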
1662 
1663   if (vtable_index.is_register()) {
1664     sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord);
1665     add(recv_klass, vtable_index.as_register(), recv_klass);
1666   } else {
1667     addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord);
1668   }
1669   ld(R19_method, base + vtableEntry::method_offset_in_bytes(), recv_klass);
1670 }
1671 
1672 /////////////////////////////////////////// subtype checking ////////////////////////////////////////////
1673 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
1674                                                    Register super_klass,
1675                                                    Register temp1_reg,
1676                                                    Register temp2_reg,
1677                                                    Label* L_success,
1678                                                    Label* L_failure,
1679                                                    Label* L_slow_path,
1680                                                    RegisterOrConstant super_check_offset) {
1681 
1682   const Register check_cache_offset = temp1_reg;
1683   const Register cached_super       = temp2_reg;
1684 
1685   assert_different_registers(sub_klass, super_klass, check_cache_offset, cached_super);
1686 
1687   int sco_offset = in_bytes(Klass::super_check_offset_offset());
1688   int sc_offset  = in_bytes(Klass::secondary_super_cache_offset());
1689 
1690   bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
1691   bool need_slow_path = (must_load_sco || super_check_offset.constant_or_zero() == sco_offset);
1692 
1693   Label L_fallthrough;
1694   int label_nulls = 0;
1695   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
1696   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
1697   if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
1698   assert(label_nulls <= 1 ||
1699          (L_slow_path == &L_fallthrough && label_nulls <= 2 && !need_slow_path),
1700          "at most one NULL in the batch, usually");
1701 
1702   // If the pointers are equal, we are done (e.g., String[] elements).
1703   // This self-check enables sharing of secondary supertype arrays among
1704   // non-primary types such as array-of-interface. Otherwise, each such
1705   // type would need its own customized SSA.
1706   // We move this check to the front of the fast path because many
1707   // type checks are in fact trivially successful in this manner,
1708   // so we get a nicely predicted branch right at the start of the check.
1709   cmpd(CCR0, sub_klass, super_klass);
1710   beq(CCR0, *L_success);
1711 
1712   // Check the supertype display:
1713   if (must_load_sco) {
1714     // The super check offset is always positive...
1715     lwz(check_cache_offset, sco_offset, super_klass);
1716     super_check_offset = RegisterOrConstant(check_cache_offset);
1717     // super_check_offset is register.
1718     assert_different_registers(sub_klass, super_klass, cached_super, super_check_offset.as_register());
1719   }
1720   // The loaded value is the offset from KlassOopDesc.
1721 
1722   ld(cached_super, super_check_offset, sub_klass);
1723   cmpd(CCR0, cached_super, super_klass);
1724 
1725   // This check has worked decisively for primary supers.
1726   // Secondary supers are sought in the super_cache ('super_cache_addr').
1727   // (Secondary supers are interfaces and very deeply nested subtypes.)
1728   // This works in the same check above because of a tricky aliasing
1729   // between the super_cache and the primary super display elements.
1730   // (The 'super_check_addr' can address either, as the case requires.)
1731   // Note that the cache is updated below if it does not help us find
1732   // what we need immediately.
1733   // So if it was a primary super, we can just fail immediately.
1734   // Otherwise, it's the slow path for us (no success at this point).
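  //
  // Roughly, the branch structure emitted below is (illustrative sketch):
  //   if (*(sub_klass + super_check_offset) == super_klass)  goto L_success;
  //   else if (super_check_offset != sc_offset)              goto L_failure;    // definite miss
  //   else                                                   goto L_slow_path;  // scan secondary supers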
1735 
1736 #define FINAL_JUMP(label) if (&(label) != &L_fallthrough) { b(label); }
1737 
1738   if (super_check_offset.is_register()) {
1739     beq(CCR0, *L_success);
1740     cmpwi(CCR0, super_check_offset.as_register(), sc_offset);
1741     if (L_failure == &L_fallthrough) {
1742       beq(CCR0, *L_slow_path);
1743     } else {
1744       bne(CCR0, *L_failure);
1745       FINAL_JUMP(*L_slow_path);
1746     }
1747   } else {
1748     if (super_check_offset.as_constant() == sc_offset) {
1749       // Need a slow path; fast failure is impossible.
1750       if (L_slow_path == &L_fallthrough) {
1751         beq(CCR0, *L_success);
1752       } else {
1753         bne(CCR0, *L_slow_path);
1754         FINAL_JUMP(*L_success);
1755       }
1756     } else {
1757       // No slow path; it's a fast decision.
1758       if (L_failure == &L_fallthrough) {
1759         beq(CCR0, *L_success);
1760       } else {
1761         bne(CCR0, *L_failure);
1762         FINAL_JUMP(*L_success);
1763       }
1764     }
1765   }
1766 
1767   bind(L_fallthrough);
1768 #undef FINAL_JUMP
1769 }
1770 
1771 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
1772                                                    Register super_klass,
1773                                                    Register temp1_reg,
1774                                                    Register temp2_reg,
1775                                                    Label* L_success,
1776                                                    Register result_reg) {
1777   const Register array_ptr = temp1_reg; // current value from cache array
1778   const Register temp      = temp2_reg;
1779 
1780   assert_different_registers(sub_klass, super_klass, array_ptr, temp);
1781 
1782   int source_offset = in_bytes(Klass::secondary_supers_offset());
1783   int target_offset = in_bytes(Klass::secondary_super_cache_offset());
1784 
1785   int length_offset = Array<Klass*>::length_offset_in_bytes();
1786   int base_offset   = Array<Klass*>::base_offset_in_bytes();
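  // Illustrative C-like sketch of the scan emitted below (accessor names are
  // illustrative, not necessarily the exact Klass API):
  //   Array<Klass*>* ss = sub_klass->secondary_supers();
  //   for (int i = 0; i < ss->length(); i++) {
  //     if (ss->at(i) == super_klass) {
  //       sub_klass->set_secondary_super_cache(super_klass);   // hit: update cache
  //       result = 0; goto done;
  //     }
  //   }
  //   result = 1;                                              // miss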
1787 
1788   Label hit, loop, failure, fallthru;
1789 
1790   ld(array_ptr, source_offset, sub_klass);
1791 
1792   // TODO: PPC port: assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated.");
1793   lwz(temp, length_offset, array_ptr);
1794   cmpwi(CCR0, temp, 0);
1795   beq(CCR0, result_reg != noreg ? failure : fallthru); // length 0
1796 
1797   mtctr(temp); // load ctr
1798 
1799   bind(loop);
1800   // Entries in the table are no longer compressed.
1801   ld(temp, base_offset, array_ptr);
1802   cmpd(CCR0, temp, super_klass);
1803   beq(CCR0, hit);
1804   addi(array_ptr, array_ptr, BytesPerWord);
1805   bdnz(loop);
1806 
1807   bind(failure);
1808   if (result_reg != noreg) { li(result_reg, 1); } // load non-zero result (indicates a miss)
1809   b(fallthru);
1810 
1811   bind(hit);
1812   std(super_klass, target_offset, sub_klass); // save result to cache
1813   if (result_reg != noreg) { li(result_reg, 0); } // load zero result (indicates a hit)
1814   if (L_success != NULL) { b(*L_success); }
1815   else if (result_reg == noreg) { blr(); } // return with CR0.eq if neither label nor result reg provided
1816 
1817   bind(fallthru);
1818 }
1819 
1820 // Try fast path, then go to slow one if not successful
1821 void MacroAssembler::check_klass_subtype(Register sub_klass,
1822                          Register super_klass,
1823                          Register temp1_reg,
1824                          Register temp2_reg,
1825                          Label& L_success) {
1826   Label L_failure;
1827   check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success, &L_failure);
1828   check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success);
1829   bind(L_failure); // Fallthru if not successful.
1830 }
1831 
1832 void MacroAssembler::check_method_handle_type(Register mtype_reg, Register mh_reg,
1833                                               Register temp_reg,
1834                                               Label& wrong_method_type) {
1835   assert_different_registers(mtype_reg, mh_reg, temp_reg);
1836   // Compare method type against that of the receiver.
1837   load_heap_oop_not_null(temp_reg, delayed_value(java_lang_invoke_MethodHandle::type_offset_in_bytes, temp_reg), mh_reg);
1838   cmpd(CCR0, temp_reg, mtype_reg);
1839   bne(CCR0, wrong_method_type);
1840 }
1841 
1842 RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot,
1843                                                    Register temp_reg,
1844                                                    int extra_slot_offset) {
1845   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
1846   int stackElementSize = Interpreter::stackElementSize;
1847   int offset = extra_slot_offset * stackElementSize;
1848   if (arg_slot.is_constant()) {
1849     offset += arg_slot.as_constant() * stackElementSize;
1850     return offset;
1851   } else {
1852     assert(temp_reg != noreg, "must specify");
1853     sldi(temp_reg, arg_slot.as_register(), exact_log2(stackElementSize));
1854     if (offset != 0)
1855       addi(temp_reg, temp_reg, offset);
1856     return temp_reg;
1857   }
1858 }
1859 
1860 // Supports temp2_reg = R0.
1861 void MacroAssembler::biased_locking_enter(ConditionRegister cr_reg, Register obj_reg,
1862                                           Register mark_reg, Register temp_reg,
1863                                           Register temp2_reg, Label& done, Label* slow_case) {
1864   assert(UseBiasedLocking, "why call this otherwise?");
1865 
1866 #ifdef ASSERT
1867   assert_different_registers(obj_reg, mark_reg, temp_reg, temp2_reg);
1868 #endif
1869 
1870   Label cas_label;
1871 
1872   // Branch to done if fast path fails and no slow_case provided.
1873   Label *slow_case_int = (slow_case != NULL) ? slow_case : &done;
1874 
1875   // Biased locking
1876   // See whether the lock is currently biased toward our thread and
1877   // whether the epoch is still valid
1878   // Note that the runtime guarantees sufficient alignment of JavaThread
1879   // pointers to allow age to be placed into low bits
1880   assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits,
1881          "biased locking makes assumptions about bit layout");
1882 
1883   if (PrintBiasedLockingStatistics) {
1884     load_const(temp2_reg, (address) BiasedLocking::total_entry_count_addr(), temp_reg);
1885     lwzx(temp_reg, temp2_reg);
1886     addi(temp_reg, temp_reg, 1);
1887     stwx(temp_reg, temp2_reg);
1888   }
1889 
1890   andi(temp_reg, mark_reg, markOopDesc::biased_lock_mask_in_place);
1891   cmpwi(cr_reg, temp_reg, markOopDesc::biased_lock_pattern);
1892   bne(cr_reg, cas_label);
1893 
1894   load_klass(temp_reg, obj_reg);
1895 
1896   load_const_optimized(temp2_reg, ~((int) markOopDesc::age_mask_in_place));
1897   ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
1898   orr(temp_reg, R16_thread, temp_reg);
1899   xorr(temp_reg, mark_reg, temp_reg);
1900   andr(temp_reg, temp_reg, temp2_reg);
1901   cmpdi(cr_reg, temp_reg, 0);
1902   if (PrintBiasedLockingStatistics) {
1903     Label l;
1904     bne(cr_reg, l);
1905     load_const(temp2_reg, (address) BiasedLocking::biased_lock_entry_count_addr());
1906     lwzx(mark_reg, temp2_reg);
1907     addi(mark_reg, mark_reg, 1);
1908     stwx(mark_reg, temp2_reg);
1909     // restore mark_reg
1910     ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg);
1911     bind(l);
1912   }
1913   beq(cr_reg, done);
1914 
1915   Label try_revoke_bias;
1916   Label try_rebias;
1917 
1918   // At this point we know that the header has the bias pattern and
1919   // that we are not the bias owner in the current epoch. We need to
1920   // figure out more details about the state of the header in order to
1921   // know what operations can be legally performed on the object's
1922   // header.
1923 
1924   // If the low three bits in the xor result aren't clear, that means
1925   // the prototype header is no longer biased and we have to revoke
1926   // the bias on this object.
1927   andi(temp2_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
1928   cmpwi(cr_reg, temp2_reg, 0);
1929   bne(cr_reg, try_revoke_bias);
1930 
1931   // Biasing is still enabled for this data type. See whether the
1932   // epoch of the current bias is still valid, meaning that the epoch
1933   // bits of the mark word are equal to the epoch bits of the
1934   // prototype header. (Note that the prototype header's epoch bits
1935   // only change at a safepoint.) If not, attempt to rebias the object
1936   // toward the current thread. Note that we must be absolutely sure
1937   // that the current epoch is invalid in order to do this because
1938   // otherwise the manipulations it performs on the mark word are
1939   // illegal.
1940 
1941   int shift_amount = 64 - markOopDesc::epoch_shift;
1942   // rotate epoch bits to right (little) end and set other bits to 0
1943   // [ big part | epoch | little part ] -> [ 0..0 | epoch ]
1944   rldicl_(temp2_reg, temp_reg, shift_amount, 64 - markOopDesc::epoch_bits);
1945   // branch if epoch bits are != 0, i.e. they differ, because the epoch has been incremented
1946   bne(CCR0, try_rebias);
1947 
1948   // The epoch of the current bias is still valid but we know nothing
1949   // about the owner; it might be set or it might be clear. Try to
1950   // acquire the bias of the object using an atomic operation. If this
1951   // fails we will go in to the runtime to revoke the object's bias.
1952   // Note that we first construct the presumed unbiased header so we
1953   // don't accidentally blow away another thread's valid bias.
1954   andi(mark_reg, mark_reg, (markOopDesc::biased_lock_mask_in_place |
1955                                 markOopDesc::age_mask_in_place |
1956                                 markOopDesc::epoch_mask_in_place));
1957   orr(temp_reg, R16_thread, mark_reg);
1958 
1959   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
1960 
1961   // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg).
1962   cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
1963            /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
1964            /*where=*/obj_reg,
1965            MacroAssembler::MemBarAcq,
1966            MacroAssembler::cmpxchgx_hint_acquire_lock(),
1967            noreg, slow_case_int); // bail out if failed
1968 
1969   // If the biasing toward our thread failed, this means that
1970   // another thread succeeded in biasing it toward itself and we
1971   // need to revoke that bias. The revocation will occur in the
1972   // interpreter runtime in the slow case.
1973   if (PrintBiasedLockingStatistics) {
1974     load_const(temp2_reg, (address) BiasedLocking::anonymously_biased_lock_entry_count_addr(), temp_reg);
1975     lwzx(temp_reg, temp2_reg);
1976     addi(temp_reg, temp_reg, 1);
1977     stwx(temp_reg, temp2_reg);
1978   }
1979   b(done);
1980 
1981   bind(try_rebias);
1982   // At this point we know the epoch has expired, meaning that the
1983   // current "bias owner", if any, is actually invalid. Under these
1984   // circumstances _only_, we are allowed to use the current header's
1985   // value as the comparison value when doing the cas to acquire the
1986   // bias in the current epoch. In other words, we allow transfer of
1987   // the bias from one thread to another directly in this situation.
1988   load_klass(temp_reg, obj_reg);
1989   andi(temp2_reg, mark_reg, markOopDesc::age_mask_in_place);
1990   orr(temp2_reg, R16_thread, temp2_reg);
1991   ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
1992   orr(temp_reg, temp2_reg, temp_reg);
1993 
1994   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
1995 
1996   cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
1997                  /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
1998                  /*where=*/obj_reg,
1999                  MacroAssembler::MemBarAcq,
2000                  MacroAssembler::cmpxchgx_hint_acquire_lock(),
2001                  noreg, slow_case_int); // bail out if failed
2002 
2003   // If the biasing toward our thread failed, this means that
2004   // another thread succeeded in biasing it toward itself and we
2005   // need to revoke that bias. The revocation will occur in the
2006   // interpreter runtime in the slow case.
2007   if (PrintBiasedLockingStatistics) {
2008     load_const(temp2_reg, (address) BiasedLocking::rebiased_lock_entry_count_addr(), temp_reg);
2009     lwzx(temp_reg, temp2_reg);
2010     addi(temp_reg, temp_reg, 1);
2011     stwx(temp_reg, temp2_reg);
2012   }
2013   b(done);
2014 
2015   bind(try_revoke_bias);
2016   // The prototype mark in the klass doesn't have the bias bit set any
2017   // more, indicating that objects of this data type are not supposed
2018   // to be biased any more. We are going to try to reset the mark of
2019   // this object to the prototype value and fall through to the
2020   // CAS-based locking scheme. Note that if our CAS fails, it means
2021   // that another thread raced us for the privilege of revoking the
2022   // bias of this particular object, so it's okay to continue in the
2023   // normal locking code.
2024   load_klass(temp_reg, obj_reg);
2025   ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
2026   andi(temp2_reg, mark_reg, markOopDesc::age_mask_in_place);
2027   orr(temp_reg, temp_reg, temp2_reg);
2028 
2029   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2030 
2031   // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg).
2032   cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
2033                  /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
2034                  /*where=*/obj_reg,
2035                  MacroAssembler::MemBarAcq,
2036                  MacroAssembler::cmpxchgx_hint_acquire_lock());
2037 
2038   // reload markOop in mark_reg before continuing with lightweight locking
2039   ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg);
2040 
2041   // Fall through to the normal CAS-based lock, because no matter what
2042   // the result of the above CAS, some thread must have succeeded in
2043   // removing the bias bit from the object's header.
2044   if (PrintBiasedLockingStatistics) {
2045     Label l;
2046     bne(cr_reg, l);
2047     load_const(temp2_reg, (address) BiasedLocking::revoked_lock_entry_count_addr(), temp_reg);
2048     lwzx(temp_reg, temp2_reg);
2049     addi(temp_reg, temp_reg, 1);
2050     stwx(temp_reg, temp2_reg);
2051     bind(l);
2052   }
2053 
2054   bind(cas_label);
2055 }
2056 
2057 void MacroAssembler::biased_locking_exit(ConditionRegister cr_reg, Register mark_addr, Register temp_reg, Label& done) {
2058   // Check for biased locking unlock case, which is a no-op
2059   // Note: we do not have to check the thread ID for two reasons.
2060   // First, the interpreter checks for IllegalMonitorStateException at
2061   // a higher level. Second, if the bias was revoked while we held the
2062   // lock, the object could not be rebiased toward another thread, so
2063   // the bias bit would be clear.
2064 
2065   ld(temp_reg, 0, mark_addr);
2066   andi(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
2067 
2068   cmpwi(cr_reg, temp_reg, markOopDesc::biased_lock_pattern);
2069   beq(cr_reg, done);
2070 }
2071 
2072 // allocation (for C1)
2073 void MacroAssembler::eden_allocate(
2074   Register obj,                      // result: pointer to object after successful allocation
2075   Register var_size_in_bytes,        // object size in bytes if unknown at compile time; invalid otherwise
2076   int      con_size_in_bytes,        // object size in bytes if   known at compile time
2077   Register t1,                       // temp register
2078   Register t2,                       // temp register
2079   Label&   slow_case                 // continuation point if fast allocation fails
2080 ) {
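  // The eden fast path is not implemented in this port; always take the slow case.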
2081   b(slow_case);
2082 }
2083 
2084 void MacroAssembler::tlab_allocate(
2085   Register obj,                      // result: pointer to object after successful allocation
2086   Register var_size_in_bytes,        // object size in bytes if unknown at compile time; invalid otherwise
2087   int      con_size_in_bytes,        // object size in bytes if   known at compile time
2088   Register t1,                       // temp register
2089   Label&   slow_case                 // continuation point if fast allocation fails
2090 ) {
2091   // make sure arguments make sense
2092   assert_different_registers(obj, var_size_in_bytes, t1);
2093   assert(0 <= con_size_in_bytes && is_simm13(con_size_in_bytes), "illegal object size");
2094   assert((con_size_in_bytes & MinObjAlignmentInBytesMask) == 0, "object size is not multiple of alignment");
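  // Illustrative sketch of the bump-pointer allocation emitted below
  // (accessor names are illustrative):
  //   obj     = thread->tlab_top();
  //   new_top = obj + size;
  //   if (new_top > thread->tlab_end()) goto slow_case;
  //   thread->set_tlab_top(new_top);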
2095 
2096   const Register new_top = t1;
2097   //verify_tlab(); not implemented
2098 
2099   ld(obj, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
2100   ld(R0, in_bytes(JavaThread::tlab_end_offset()), R16_thread);
2101   if (var_size_in_bytes == noreg) {
2102     addi(new_top, obj, con_size_in_bytes);
2103   } else {
2104     add(new_top, obj, var_size_in_bytes);
2105   }
2106   cmpld(CCR0, new_top, R0);
2107   bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_case);
2108 
2109 #ifdef ASSERT
2110   // make sure new free pointer is properly aligned
2111   {
2112     Label L;
2113     andi_(R0, new_top, MinObjAlignmentInBytesMask);
2114     beq(CCR0, L);
2115     stop("updated TLAB free is not properly aligned", 0x934);
2116     bind(L);
2117   }
2118 #endif // ASSERT
2119 
2120   // update the tlab top pointer
2121   std(new_top, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
2122   //verify_tlab(); not implemented
2123 }
2124 void MacroAssembler::tlab_refill(Label& retry_tlab, Label& try_eden, Label& slow_case) {
2125   unimplemented("tlab_refill");
2126 }
2127 void MacroAssembler::incr_allocated_bytes(RegisterOrConstant size_in_bytes, Register t1, Register t2) {
2128   unimplemented("incr_allocated_bytes");
2129 }
2130 
2131 address MacroAssembler::emit_trampoline_stub(int destination_toc_offset,
2132                                              int insts_call_instruction_offset, Register Rtoc) {
2133   // Start the stub.
2134   address stub = start_a_stub(64);
2135   if (stub == NULL) { return NULL; } // CodeCache full: bail out
2136 
2137   // Create a trampoline stub relocation which relates this trampoline stub
2138   // with the call instruction at insts_call_instruction_offset in the
2139   // instructions code-section.
2140   relocate(trampoline_stub_Relocation::spec(code()->insts()->start() + insts_call_instruction_offset));
2141   const int stub_start_offset = offset();
2142 
2143   // For java_to_interp stubs we use R11_scratch1 as scratch register
2144   // and in call trampoline stubs we use R12_scratch2. This way we
2145   // can distinguish them (see is_NativeCallTrampolineStub_at()).
2146   Register reg_scratch = R12_scratch2;
2147 
2148   // Now, create the trampoline stub's code:
2149   // - load the TOC
2150   // - load the call target from the constant pool
2151   // - call
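  //
  // i.e. the emitted stub body is essentially (the ld may be split into
  // addis + ld for large TOC offsets, see ld_largeoffset_unchecked):
  //   <calculate_address_from_global_toc reg_scratch>   // only if Rtoc == noreg
  //   ld    reg_scratch, destination_toc_offset(Rtoc)
  //   mtctr reg_scratch
  //   bctr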
2152   if (Rtoc == noreg) {
2153     calculate_address_from_global_toc(reg_scratch, method_toc());
2154     Rtoc = reg_scratch;
2155   }
2156 
2157   ld_largeoffset_unchecked(reg_scratch, destination_toc_offset, Rtoc, false);
2158   mtctr(reg_scratch);
2159   bctr();
2160 
2161   const address stub_start_addr = addr_at(stub_start_offset);
2162 
2163   // Assert that the encoded destination_toc_offset can be identified and that it is correct.
2164   assert(destination_toc_offset == NativeCallTrampolineStub_at(stub_start_addr)->destination_toc_offset(),
2165          "encoded offset into the constant pool must match");
2166   // Trampoline_stub_size should be good.
2167   assert((uint)(offset() - stub_start_offset) <= trampoline_stub_size, "should be good size");
2168   assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");
2169 
2170   // End the stub.
2171   end_a_stub();
2172   return stub;
2173 }
2174 
2175 // TM on PPC64.
2176 void MacroAssembler::atomic_inc_ptr(Register addr, Register result, int simm16) {
2177   Label retry;
2178   bind(retry);
2179   ldarx(result, addr, /*hint*/ false);
2180   addi(result, result, simm16);
2181   stdcx_(result, addr);
2182   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
2183     bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
2184   } else {
2185     bne(                  CCR0, retry); // stXcx_ sets CCR0
2186   }
2187 }
2188 
2189 void MacroAssembler::atomic_ori_int(Register addr, Register result, int uimm16) {
2190   Label retry;
2191   bind(retry);
2192   lwarx(result, addr, /*hint*/ false);
2193   ori(result, result, uimm16);
2194   stwcx_(result, addr);
2195   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
2196     bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
2197   } else {
2198     bne(                  CCR0, retry); // stXcx_ sets CCR0
2199   }
2200 }
2201 
2202 #if INCLUDE_RTM_OPT
2203 
2204 // Update rtm_counters based on abort status
2205 // input: abort_status
2206 //        rtm_counters (RTMLockingCounters*)
2207 void MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters_Reg) {
2208   // Mapping to keep PreciseRTMLockingStatistics similar to x86.
2209   // x86 ppc (! means inverted, ? means not the same)
2210   //  0   31  Set if abort caused by XABORT instruction.
2211   //  1  ! 7  If set, the transaction may succeed on a retry. This bit is always clear if bit 0 is set.
2212   //  2   13  Set if another logical processor conflicted with a memory address that was part of the transaction that aborted.
2213   //  3   10  Set if an internal buffer overflowed.
2214   //  4  ?12  Set if a debug breakpoint was hit.
2215   //  5  ?32  Set if an abort occurred during execution of a nested transaction.
2216   const  int tm_failure_bit[] = {Assembler::tm_tabort, // Note: Seems like signal handler sets this, too.
2217                                  Assembler::tm_failure_persistent, // inverted: transient
2218                                  Assembler::tm_trans_cf,
2219                                  Assembler::tm_footprint_of,
2220                                  Assembler::tm_non_trans_cf,
2221                                  Assembler::tm_suspended};
2222   const bool tm_failure_inv[] = {false, true, false, false, false, false};
2223   assert(sizeof(tm_failure_bit)/sizeof(int) == RTMLockingCounters::ABORT_STATUS_LIMIT, "adapt mapping!");
2224 
2225   const Register addr_Reg = R0;
2226   // Keep track of offset to where rtm_counters_Reg had pointed to.
2227   int counters_offs = RTMLockingCounters::abort_count_offset();
2228   addi(addr_Reg, rtm_counters_Reg, counters_offs);
2229   const Register temp_Reg = rtm_counters_Reg;
2230 
2231   //atomic_inc_ptr(addr_Reg, temp_Reg); We don't increment atomically
2232   ldx(temp_Reg, addr_Reg);
2233   addi(temp_Reg, temp_Reg, 1);
2234   stdx(temp_Reg, addr_Reg);
2235 
2236   if (PrintPreciseRTMLockingStatistics) {
2237     int counters_offs_delta = RTMLockingCounters::abortX_count_offset() - counters_offs;
2238 
2239     //mftexasr(abort_status); done by caller
2240     for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
2241       counters_offs += counters_offs_delta;
2242       li(temp_Reg, counters_offs_delta); // can't use addi with R0
2243       add(addr_Reg, addr_Reg, temp_Reg); // point to next counter
2244       counters_offs_delta = sizeof(uintx);
2245 
2246       Label check_abort;
2247       rldicr_(temp_Reg, abort_status, tm_failure_bit[i], 0);
2248       if (tm_failure_inv[i]) {
2249         bne(CCR0, check_abort);
2250       } else {
2251         beq(CCR0, check_abort);
2252       }
2253       //atomic_inc_ptr(addr_Reg, temp_Reg); We don't increment atomically
2254       ldx(temp_Reg, addr_Reg);
2255       addi(temp_Reg, temp_Reg, 1);
2256       stdx(temp_Reg, addr_Reg);
2257       bind(check_abort);
2258     }
2259   }
2260   li(temp_Reg, -counters_offs); // can't use addi with R0
2261   add(rtm_counters_Reg, addr_Reg, temp_Reg); // restore
2262 }
2263 
2264 // Branch if (random & (count-1) != 0), count is 2^n
2265 // tmp and CR0 are killed
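// E.g. with count == 64 the branch is taken about 63 out of 64 times, so the
// fall-through path (typically a counter update) runs roughly once per 64 calls.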
2266 void MacroAssembler::branch_on_random_using_tb(Register tmp, int count, Label& brLabel) {
2267   mftb(tmp);
2268   andi_(tmp, tmp, count-1);
2269   bne(CCR0, brLabel);
2270 }
2271 
2272 // Perform abort ratio calculation, set no_rtm bit if high ratio.
2273 // input:  rtm_counters_Reg (RTMLockingCounters* address) - KILLED
2274 void MacroAssembler::rtm_abort_ratio_calculation(Register rtm_counters_Reg,
2275                                                  RTMLockingCounters* rtm_counters,
2276                                                  Metadata* method_data) {
2277   Label L_done, L_check_always_rtm1, L_check_always_rtm2;
2278 
2279   if (RTMLockingCalculationDelay > 0) {
2280     // Delay calculation.
2281     ld(rtm_counters_Reg, (RegisterOrConstant)(intptr_t)RTMLockingCounters::rtm_calculation_flag_addr());
2282     cmpdi(CCR0, rtm_counters_Reg, 0);
2283     beq(CCR0, L_done);
2284     load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload
2285   }
2286   // Abort ratio calculation only if abort_count > RTMAbortThreshold.
2287   //   Aborted transactions = abort_count * 100
2288   //   All transactions = total_count *  RTMTotalCountIncrRate
2289   //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
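  //   i.e. no_rtm is set when
  //     abort_count * 100 >= total_count * RTMTotalCountIncrRate * RTMAbortRatio
  //   (and only once abort_count >= RTMAbortThreshold).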
2290   ld(R0, RTMLockingCounters::abort_count_offset(), rtm_counters_Reg);
2291   cmpdi(CCR0, R0, RTMAbortThreshold);
2292   blt(CCR0, L_check_always_rtm2);
2293   mulli(R0, R0, 100);
2294 
2295   const Register tmpReg = rtm_counters_Reg;
2296   ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg);
2297   mulli(tmpReg, tmpReg, RTMTotalCountIncrRate);
2298   mulli(tmpReg, tmpReg, RTMAbortRatio);
2299   cmpd(CCR0, R0, tmpReg);
2300   blt(CCR0, L_check_always_rtm1); // jump to reload
2301   if (method_data != NULL) {
2302     // Set rtm_state to "no rtm" in MDO.
2303     // Not using a metadata relocation. Method and Class Loader are kept alive anyway.
2304     // (See nmethod::metadata_do and CodeBuffer::finalize_oop_references.)
2305     load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg);
2306     atomic_ori_int(R0, tmpReg, NoRTM);
2307   }
2308   b(L_done);
2309 
2310   bind(L_check_always_rtm1);
2311   load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload
2312   bind(L_check_always_rtm2);
2313   ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg);
2314   cmpdi(CCR0, tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
2315   blt(CCR0, L_done);
2316   if (method_data != NULL) {
2317     // Set rtm_state to "always rtm" in MDO.
2318     // Not using a metadata relocation. See above.
2319     load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg);
2320     atomic_ori_int(R0, tmpReg, UseRTM);
2321   }
2322   bind(L_done);
2323 }
2324 
2325 // Update counters and perform abort ratio calculation.
2326 // input: abort_status_Reg
2327 void MacroAssembler::rtm_profiling(Register abort_status_Reg, Register temp_Reg,
2328                                    RTMLockingCounters* rtm_counters,
2329                                    Metadata* method_data,
2330                                    bool profile_rtm) {
2331 
2332   assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2333   // Update rtm counters based on state at abort.
2334   // Reads abort_status_Reg, updates flags.
2335   assert_different_registers(abort_status_Reg, temp_Reg);
2336   load_const_optimized(temp_Reg, (address)rtm_counters, R0);
2337   rtm_counters_update(abort_status_Reg, temp_Reg);
2338   if (profile_rtm) {
2339     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2340     rtm_abort_ratio_calculation(temp_Reg, rtm_counters, method_data);
2341   }
2342 }
2343 
2344 // Retry on abort if abort's status indicates non-persistent failure.
2345 // inputs: retry_count_Reg
2346 //       : abort_status_Reg
2347 // output: retry_count_Reg decremented by 1
2348 void MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg,
2349                                              Label& retryLabel, Label* checkRetry) {
2350   Label doneRetry;
2351   rldicr_(R0, abort_status_Reg, tm_failure_persistent, 0);
2352   bne(CCR0, doneRetry);
2353   if (checkRetry) { bind(*checkRetry); }
2354   addic_(retry_count_Reg, retry_count_Reg, -1);
2355   blt(CCR0, doneRetry);
2356   smt_yield(); // Can't use wait(). No permission (SIGILL).
2357   b(retryLabel);
2358   bind(doneRetry);
2359 }
2360 
2361 // Spin and retry if lock is busy.
2362 // inputs: box_Reg (monitor address)
2363 //       : retry_count_Reg
2364 // output: retry_count_Reg decremented by 1
2365 // CTR is killed
2366 void MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register owner_addr_Reg, Label& retryLabel) {
2367   Label SpinLoop, doneRetry;
2368   addic_(retry_count_Reg, retry_count_Reg, -1);
2369   blt(CCR0, doneRetry);
2370   li(R0, RTMSpinLoopCount);
2371   mtctr(R0);
2372 
2373   bind(SpinLoop);
2374   smt_yield(); // Can't use waitrsv(). No permission (SIGILL).
2375   bdz(retryLabel);
2376   ld(R0, 0, owner_addr_Reg);
2377   cmpdi(CCR0, R0, 0);
2378   bne(CCR0, SpinLoop);
2379   b(retryLabel);
2380 
2381   bind(doneRetry);
2382 }
2383 
2384 // Use RTM for normal stack locks.
2385 // Input: objReg (object to lock)
2386 void MacroAssembler::rtm_stack_locking(ConditionRegister flag,
2387                                        Register obj, Register mark_word, Register tmp,
2388                                        Register retry_on_abort_count_Reg,
2389                                        RTMLockingCounters* stack_rtm_counters,
2390                                        Metadata* method_data, bool profile_rtm,
2391                                        Label& DONE_LABEL, Label& IsInflated) {
2392   assert(UseRTMForStackLocks, "why call this otherwise?");
2393   assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
2394   Label L_rtm_retry, L_decrement_retry, L_on_abort;
2395 
2396   if (RTMRetryCount > 0) {
2397     load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
2398     bind(L_rtm_retry);
2399   }
2400   andi_(R0, mark_word, markOopDesc::monitor_value);  // inflated vs stack-locked|neutral|biased
2401   bne(CCR0, IsInflated);
2402 
2403   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2404     Label L_noincrement;
2405     if (RTMTotalCountIncrRate > 1) {
2406       branch_on_random_using_tb(tmp, (int)RTMTotalCountIncrRate, L_noincrement);
2407     }
2408     assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
2409     load_const_optimized(tmp, (address)stack_rtm_counters->total_count_addr(), R0);
2410     //atomic_inc_ptr(tmp, /*temp, will be reloaded*/mark_word); We don't increment atomically
2411     ldx(mark_word, tmp);
2412     addi(mark_word, mark_word, 1);
2413     stdx(mark_word, tmp);
2414     bind(L_noincrement);
2415   }
2416   tbegin_();
2417   beq(CCR0, L_on_abort);
2418   ld(mark_word, oopDesc::mark_offset_in_bytes(), obj);         // Reload in transaction, conflicts need to be tracked.
2419   andi(R0, mark_word, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
2420   cmpwi(flag, R0, markOopDesc::unlocked_value);                // bits = 001 unlocked
2421   beq(flag, DONE_LABEL);                                       // all done if unlocked
2422 
2423   if (UseRTMXendForLockBusy) {
2424     tend_();
2425     b(L_decrement_retry);
2426   } else {
2427     tabort_();
2428   }
2429   bind(L_on_abort);
2430   const Register abort_status_Reg = tmp;
2431   mftexasr(abort_status_Reg);
2432   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2433     rtm_profiling(abort_status_Reg, /*temp*/mark_word, stack_rtm_counters, method_data, profile_rtm);
2434   }
2435   ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // reload
2436   if (RTMRetryCount > 0) {
2437     // Retry on lock abort if abort status is not permanent.
2438     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry, &L_decrement_retry);
2439   } else {
2440     bind(L_decrement_retry);
2441   }
2442 }
2443 
2444 // Use RTM for inflating locks
2445 // inputs: obj       (object to lock)
2446 //         mark_word (current header - KILLED)
2447 //         boxReg    (on-stack box address (displaced header location) - KILLED)
2448 void MacroAssembler::rtm_inflated_locking(ConditionRegister flag,
2449                                           Register obj, Register mark_word, Register boxReg,
2450                                           Register retry_on_busy_count_Reg, Register retry_on_abort_count_Reg,
2451                                           RTMLockingCounters* rtm_counters,
2452                                           Metadata* method_data, bool profile_rtm,
2453                                           Label& DONE_LABEL) {
2454   assert(UseRTMLocking, "why call this otherwise?");
2455   Label L_rtm_retry, L_decrement_retry, L_on_abort;
2456   // Clean monitor_value bit to get valid pointer.
2457   int owner_offset = ObjectMonitor::owner_offset_in_bytes() - markOopDesc::monitor_value;
2458 
2459   // Store non-null, using boxReg instead of (intptr_t)markOopDesc::unused_mark().
2460   std(boxReg, BasicLock::displaced_header_offset_in_bytes(), boxReg);
2461   const Register tmpReg = boxReg;
2462   const Register owner_addr_Reg = mark_word;
2463   addi(owner_addr_Reg, mark_word, owner_offset);
2464 
2465   if (RTMRetryCount > 0) {
2466     load_const_optimized(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy.
2467     load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort.
2468     bind(L_rtm_retry);
2469   }
2470   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2471     Label L_noincrement;
2472     if (RTMTotalCountIncrRate > 1) {
2473       branch_on_random_using_tb(R0, (int)RTMTotalCountIncrRate, L_noincrement);
2474     }
2475     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2476     load_const(R0, (address)rtm_counters->total_count_addr(), tmpReg);
2477     //atomic_inc_ptr(R0, tmpReg); We don't increment atomically
2478     ldx(tmpReg, R0);
2479     addi(tmpReg, tmpReg, 1);
2480     stdx(tmpReg, R0);
2481     bind(L_noincrement);
2482   }
2483   tbegin_();
2484   beq(CCR0, L_on_abort);
2485   // We don't reload mark word. Will only be reset at safepoint.
2486   ld(R0, 0, owner_addr_Reg); // Load in transaction, conflicts need to be tracked.
2487   cmpdi(flag, R0, 0);
2488   beq(flag, DONE_LABEL);
2489 
2490   if (UseRTMXendForLockBusy) {
2491     tend_();
2492     b(L_decrement_retry);
2493   } else {
2494     tabort_();
2495   }
2496   bind(L_on_abort);
2497   const Register abort_status_Reg = tmpReg;
2498   mftexasr(abort_status_Reg);
2499   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2500     rtm_profiling(abort_status_Reg, /*temp*/ owner_addr_Reg, rtm_counters, method_data, profile_rtm);
2501     // Restore owner_addr_Reg
2502     ld(mark_word, oopDesc::mark_offset_in_bytes(), obj);
2503 #ifdef ASSERT
2504     andi_(R0, mark_word, markOopDesc::monitor_value);
2505     asm_assert_ne("must be inflated", 0xa754); // Deflating only allowed at safepoint.
2506 #endif
2507     addi(owner_addr_Reg, mark_word, owner_offset);
2508   }
2509   if (RTMRetryCount > 0) {
2510     // Retry on lock abort if abort status is not permanent.
2511     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
2512   }
2513 
2514   // Appears unlocked - try to swing _owner from null to non-null.
2515   cmpxchgd(flag, /*current val*/ R0, (intptr_t)0, /*new val*/ R16_thread, owner_addr_Reg,
2516            MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2517            MacroAssembler::cmpxchgx_hint_acquire_lock(), noreg, &L_decrement_retry, true);
2518 
2519   if (RTMRetryCount > 0) {
2520     // Success: done. Otherwise spin and retry below.
2521     b(DONE_LABEL);
2522     bind(L_decrement_retry);
2523     // Spin and retry if lock is busy.
2524     rtm_retry_lock_on_busy(retry_on_busy_count_Reg, owner_addr_Reg, L_rtm_retry);
2525   } else {
2526     bind(L_decrement_retry);
2527   }
2528 }
2529 
2530 #endif //  INCLUDE_RTM_OPT
2531 
2532 // "The box" is the space on the stack where we copy the object mark.
2533 void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register oop, Register box,
2534                                                Register temp, Register displaced_header, Register current_header,
2535                                                bool try_bias,
2536                                                RTMLockingCounters* rtm_counters,
2537                                                RTMLockingCounters* stack_rtm_counters,
2538                                                Metadata* method_data,
2539                                                bool use_rtm, bool profile_rtm) {
2540   assert_different_registers(oop, box, temp, displaced_header, current_header);
2541   assert(flag != CCR0, "bad condition register");
2542   Label cont;
2543   Label object_has_monitor;
2544   Label cas_failed;
2545 
2546   // Load markOop from object into displaced_header.
2547   ld(displaced_header, oopDesc::mark_offset_in_bytes(), oop);
2548 
2549 
2550   // Always do locking in runtime.
2551   if (EmitSync & 0x01) {
2552     cmpdi(flag, oop, 0); // Oop can't be 0 here => always false.
2553     return;
2554   }
2555 
2556   if (try_bias) {
2557     biased_locking_enter(flag, oop, displaced_header, temp, current_header, cont);
2558   }
2559 
2560 #if INCLUDE_RTM_OPT
2561   if (UseRTMForStackLocks && use_rtm) {
2562     rtm_stack_locking(flag, oop, displaced_header, temp, /*temp*/ current_header,
2563                       stack_rtm_counters, method_data, profile_rtm,
2564                       cont, object_has_monitor);
2565   }
2566 #endif // INCLUDE_RTM_OPT
2567 
2568   // Handle existing monitor.
2569   if ((EmitSync & 0x02) == 0) {
2570     // The object has an existing monitor iff (mark & monitor_value) != 0.
2571     andi_(temp, displaced_header, markOopDesc::monitor_value);
2572     bne(CCR0, object_has_monitor);
2573   }
2574 
2575   // Set displaced_header to be (markOop of object | UNLOCK_VALUE).
2576   ori(displaced_header, displaced_header, markOopDesc::unlocked_value);
2577 
2578   // Load Compare Value application register.
2579 
2580   // Initialize the box. (Must happen before we update the object mark!)
2581   std(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2582 
2583   // Must fence, otherwise, preceding store(s) may float below cmpxchg.
2584   // Compare object markOop with mark and if equal exchange scratch1 with object markOop.
2585   cmpxchgd(/*flag=*/flag,
2586            /*current_value=*/current_header,
2587            /*compare_value=*/displaced_header,
2588            /*exchange_value=*/box,
2589            /*where=*/oop,
2590            MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2591            MacroAssembler::cmpxchgx_hint_acquire_lock(),
2592            noreg,
2593            &cas_failed,
2594            /*check without membar and ldarx first*/true);
2595   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2596 
2597   // If the compare-and-exchange succeeded, then we found an unlocked
2598   // object and we have now locked it.
2599   b(cont);
2600 
2601   bind(cas_failed);
2602   // We did not see an unlocked object so try the fast recursive case.
2603 
2604   // Check if the owner is self by comparing the value in the markOop of object
2605   // (current_header) with the stack pointer.
2606   sub(current_header, current_header, R1_SP);
2607   load_const_optimized(temp, ~(os::vm_page_size()-1) | markOopDesc::lock_mask_in_place);
2608 
2609   and_(R0/*==0?*/, current_header, temp);
2610   // If the condition is true (the owner is our own stack frame), we can store 0
2611   // as the displaced header in the box, which indicates a recursive lock.
2612   mcrf(flag, CCR0);
2613   std(R0/*==0, perhaps*/, BasicLock::displaced_header_offset_in_bytes(), box);
2614 
2615   // Handle existing monitor.
2616   if ((EmitSync & 0x02) == 0) {
2617     b(cont);
2618 
2619     bind(object_has_monitor);
2620     // The object's monitor m is unlocked iff m->owner == NULL,
2621     // otherwise m->owner may contain a thread or a stack address.
2622 
2623 #if INCLUDE_RTM_OPT
2624     // Use the same RTM locking code in 32- and 64-bit VM.
2625     if (use_rtm) {
2626       rtm_inflated_locking(flag, oop, displaced_header, box, temp, /*temp*/ current_header,
2627                            rtm_counters, method_data, profile_rtm, cont);
2628     } else {
2629 #endif // INCLUDE_RTM_OPT
2630 
2631     // Try to CAS m->owner from NULL to current thread.
2632     addi(temp, displaced_header, ObjectMonitor::owner_offset_in_bytes()-markOopDesc::monitor_value);
2633     cmpxchgd(/*flag=*/flag,
2634              /*current_value=*/current_header,
2635              /*compare_value=*/(intptr_t)0,
2636              /*exchange_value=*/R16_thread,
2637              /*where=*/temp,
2638              MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2639              MacroAssembler::cmpxchgx_hint_acquire_lock());
2640 
2641     // Store a non-null value into the box.
2642     std(box, BasicLock::displaced_header_offset_in_bytes(), box);
2643 
2644 #   ifdef ASSERT
2645     bne(flag, cont);
2646     // We have acquired the monitor, check some invariants.
2647     addi(/*monitor=*/temp, temp, -ObjectMonitor::owner_offset_in_bytes());
2648     // Invariant 1: _recursions should be 0.
2649     //assert(ObjectMonitor::recursions_size_in_bytes() == 8, "unexpected size");
2650     asm_assert_mem8_is_zero(ObjectMonitor::recursions_offset_in_bytes(), temp,
2651                             "monitor->_recursions should be 0", -1);
2652     // Invariant 2: OwnerIsThread shouldn't be 0.
2653     //assert(ObjectMonitor::OwnerIsThread_size_in_bytes() == 4, "unexpected size");
2654     //asm_assert_mem4_isnot_zero(ObjectMonitor::OwnerIsThread_offset_in_bytes(), temp,
2655     //                           "monitor->OwnerIsThread shouldn't be 0", -1);
2656 #   endif
2657 
2658 #if INCLUDE_RTM_OPT
2659     } // use_rtm()
2660 #endif
2661   }
2662 
2663   bind(cont);
2664   // flag == EQ indicates success
2665   // flag == NE indicates failure
2666 }
2667 
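// Rough sketch of the fast-path exit emitted below (illustrative only, not the
// authoritative protocol; see the code for the exact flag/register usage):
//   if (box->displaced_header == 0)              // recursive stack lock
//     succeed;
//   else if (obj->mark is not inflated)
//     succeed iff CAS(obj->mark: box -> displaced_header) succeeds;
//   else                                         // inflated ObjectMonitor
//     succeed iff owner == self && recursions == 0 && EntryList == 0 && cxq == 0,
//     then release the monitor by storing NULL into owner;
//   flag == EQ indicates success, flag == NE failure.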
2668 void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register oop, Register box,
2669                                                  Register temp, Register displaced_header, Register current_header,
2670                                                  bool try_bias, bool use_rtm) {
2671   assert_different_registers(oop, box, temp, displaced_header, current_header);
2672   assert(flag != CCR0, "bad condition register");
2673   Label cont;
2674   Label object_has_monitor;
2675 
2676   // Always do unlocking in runtime.
2677   if (EmitSync & 0x01) {
2678     cmpdi(flag, oop, 0); // Oop can't be 0 here => always false.
2679     return;
2680   }
2681 
2682   if (try_bias) {
2683     biased_locking_exit(flag, oop, current_header, cont);
2684   }
2685 
2686 #if INCLUDE_RTM_OPT
2687   if (UseRTMForStackLocks && use_rtm) {
2688     assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
2689     Label L_regular_unlock;
2690     ld(current_header, oopDesc::mark_offset_in_bytes(), oop);         // fetch markword
2691     andi(R0, current_header, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
2692     cmpwi(flag, R0, markOopDesc::unlocked_value);                     // bits = 001 unlocked
2693     bne(flag, L_regular_unlock);                                      // else RegularLock
2694     tend_();                                                          // otherwise end...
2695     b(cont);                                                          // ... and we're done
2696     bind(L_regular_unlock);
2697   }
2698 #endif
2699 
2700   // Find the lock address and load the displaced header from the stack.
2701   ld(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2702 
2703   // If the displaced header is 0, we have a recursive unlock.
2704   cmpdi(flag, displaced_header, 0);
2705   beq(flag, cont);
2706 
2707   // Handle existing monitor.
2708   if ((EmitSync & 0x02) == 0) {
2709     // The object has an existing monitor iff (mark & monitor_value) != 0.
2710     RTM_OPT_ONLY( if (!(UseRTMForStackLocks && use_rtm)) ) // skip load if already done
2711     ld(current_header, oopDesc::mark_offset_in_bytes(), oop);
2712     andi_(R0, current_header, markOopDesc::monitor_value);
2713     bne(CCR0, object_has_monitor);
2714   }
2715 
2716   // Check if it is still a lightweight lock. This is true if we see
2717   // the stack address of the basicLock in the markOop of the object.
2718   // Cmpxchg sets flag to cmpd(current_header, box).
2719   cmpxchgd(/*flag=*/flag,
2720            /*current_value=*/current_header,
2721            /*compare_value=*/box,
2722            /*exchange_value=*/displaced_header,
2723            /*where=*/oop,
2724            MacroAssembler::MemBarRel,
2725            MacroAssembler::cmpxchgx_hint_release_lock(),
2726            noreg,
2727            &cont);
2728 
2729   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2730 
2731   // Handle existing monitor.
2732   if ((EmitSync & 0x02) == 0) {
2733     b(cont);
2734 
2735     bind(object_has_monitor);
2736     addi(current_header, current_header, -markOopDesc::monitor_value); // monitor
2737     ld(temp,             ObjectMonitor::owner_offset_in_bytes(), current_header);
2738 
2739     // It's inflated.
2740 #if INCLUDE_RTM_OPT
2741     if (use_rtm) {
2742       Label L_regular_inflated_unlock;
2743       // Owner is NULL if the monitor was locked transactionally (RTM): just end the transaction.
2744       cmpdi(flag, temp, 0);
2745       bne(flag, L_regular_inflated_unlock);
2746       tend_();
2747       b(cont);
2748       bind(L_regular_inflated_unlock);
2749     }
2750 #endif
2751 
2752     ld(displaced_header, ObjectMonitor::recursions_offset_in_bytes(), current_header);
2753     xorr(temp, R16_thread, temp);      // Will be 0 if we are the owner.
2754     orr(temp, temp, displaced_header); // Will be 0 if there are 0 recursions.
2755     cmpdi(flag, temp, 0);
2756     bne(flag, cont);
2757 
2758     ld(temp,             ObjectMonitor::EntryList_offset_in_bytes(), current_header);
2759     ld(displaced_header, ObjectMonitor::cxq_offset_in_bytes(), current_header);
2760     orr(temp, temp, displaced_header); // Will be 0 if both are 0.
2761     cmpdi(flag, temp, 0);
2762     bne(flag, cont);
2763     release();
2764     std(temp, ObjectMonitor::owner_offset_in_bytes(), current_header);
2765   }
2766 
2767   bind(cont);
2768   // flag == EQ indicates success
2769   // flag == NE indicates failure
2770 }
2771 
2772 // Write serialization page so VM thread can do a pseudo remote membar.
2773 // We use the current thread pointer to calculate a thread specific
2774 // offset to write to within the page. This minimizes bus traffic
2775 // due to cache line collisions.
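// A rough C-style sketch of what is emitted below (illustrative only):
//   size_t off = ((uintptr_t)thread >> os::get_serialize_page_shift_count())
//                & (os::vm_page_size() - sizeof(int));
//   release();                                   // order preceding stores first
//   *(volatile int*)(os::get_memory_serialize_page() + off) = <any word>;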
2776 void MacroAssembler::serialize_memory(Register thread, Register tmp1, Register tmp2) {
2777   srdi(tmp2, thread, os::get_serialize_page_shift_count());
2778 
2779   int mask = os::vm_page_size() - sizeof(int);
2780   if (Assembler::is_simm(mask, 16)) {
2781     andi(tmp2, tmp2, mask);
2782   } else {
2783     lis(tmp1, (int)((signed short) (mask >> 16)));
2784     ori(tmp1, tmp1, mask & 0x0000ffff);
2785     andr(tmp2, tmp2, tmp1);
2786   }
2787 
2788   load_const(tmp1, (long) os::get_memory_serialize_page());
2789   release();
2790   stwx(R0, tmp1, tmp2);
2791 }
2792 
2793 
2794 // GC barrier helper macros
2795 
2796 // Write the card table byte if needed.
2797 void MacroAssembler::card_write_barrier_post(Register Rstore_addr, Register Rnew_val, Register Rtmp) {
2798   CardTableModRefBS* bs =
2799     barrier_set_cast<CardTableModRefBS>(Universe::heap()->barrier_set());
2800   assert(bs->kind() == BarrierSet::CardTableForRS ||
2801          bs->kind() == BarrierSet::CardTableExtension, "wrong barrier");
2802 #ifdef ASSERT
2803   cmpdi(CCR0, Rnew_val, 0);
2804   asm_assert_ne("null oop not allowed", 0x321);
2805 #endif
2806   card_table_write(bs->byte_map_base, Rtmp, Rstore_addr);
2807 }
2808 
2809 // Write the card table byte.
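// Equivalent C-style sketch (illustrative only):
//   byte_map_base[(uintptr_t)obj >> CardTableModRefBS::card_shift] = 0;  // 0 == dirty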
2810 void MacroAssembler::card_table_write(jbyte* byte_map_base, Register Rtmp, Register Robj) {
2811   assert_different_registers(Robj, Rtmp, R0);
2812   load_const_optimized(Rtmp, (address)byte_map_base, R0);
2813   srdi(Robj, Robj, CardTableModRefBS::card_shift);
2814   li(R0, 0); // dirty
2815   if (UseConcMarkSweepGC) membar(Assembler::StoreStore);
2816   stbx(R0, Rtmp, Robj);
2817 }
2818 
2819 #if INCLUDE_ALL_GCS
2820 // General G1 pre-barrier generator.
2821 // Goal: record the previous value if it is not null.
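// Rough C-style sketch of the SATB pre-barrier emitted below (illustrative only):
//   if (!thread->satb_mark_queue.active) return;
//   pre_val = (Robj != noreg) ? *(Robj + offset) : Rpre_val;   // previous value
//   if (pre_val == NULL) return;
//   if (queue.index == 0) SharedRuntime::g1_wb_pre(pre_val, thread);   // runtime slow path
//   else { queue.index -= wordSize; *(queue.buf + queue.index) = pre_val; }  // index is a byte offset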
2822 void MacroAssembler::g1_write_barrier_pre(Register Robj, RegisterOrConstant offset, Register Rpre_val,
2823                                           Register Rtmp1, Register Rtmp2, bool needs_frame) {
2824   Label runtime, filtered;
2825 
2826   // Is marking active?
2827   if (in_bytes(SATBMarkQueue::byte_width_of_active()) == 4) {
2828     lwz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_active()), R16_thread);
2829   } else {
2830     guarantee(in_bytes(SATBMarkQueue::byte_width_of_active()) == 1, "Assumption");
2831     lbz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_active()), R16_thread);
2832   }
2833   cmpdi(CCR0, Rtmp1, 0);
2834   beq(CCR0, filtered);
2835 
2836   // Do we need to load the previous value?
2837   if (Robj != noreg) {
2838     // Load the previous value...
2839     if (UseCompressedOops) {
2840       lwz(Rpre_val, offset, Robj);
2841     } else {
2842       ld(Rpre_val, offset, Robj);
2843     }
2844     // Previous value has been loaded into Rpre_val.
2845   }
2846   assert(Rpre_val != noreg, "must have a real register");
2847 
2848   // Is the previous value null?
2849   cmpdi(CCR0, Rpre_val, 0);
2850   beq(CCR0, filtered);
2851 
2852   if (Robj != noreg && UseCompressedOops) {
2853     decode_heap_oop_not_null(Rpre_val);
2854   }
2855 
2856   // OK, it's not filtered, so we will need to call the runtime to enqueue the
2857   // previous value. If Rpre_val is a volatile register and was preloaded
2858   // (Robj == noreg), it is saved across the C call in R31 below; a frame is
2859   // pushed first if needs_frame is set.
2860 
2861   // Can we store original value in the thread's buffer?
2862   // Is index == 0?
2863   // (The index field is typed as size_t.)
2864   const Register Rbuffer = Rtmp1, Rindex = Rtmp2;
2865 
2866   ld(Rindex, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_index()), R16_thread);
2867   cmpdi(CCR0, Rindex, 0);
2868   beq(CCR0, runtime); // If index == 0, goto runtime.
2869   ld(Rbuffer, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_buf()), R16_thread);
2870 
2871   addi(Rindex, Rindex, -wordSize); // Decrement index.
2872   std(Rindex, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_index()), R16_thread);
2873 
2874   // Record the previous value.
2875   stdx(Rpre_val, Rbuffer, Rindex);
2876   b(filtered);
2877 
2878   bind(runtime);
2879 
2880   // The VM call below needs a frame: save LR/CR and push an ABI frame.
2881   if (needs_frame) {
2882     save_LR_CR(Rtmp1);
2883     push_frame_reg_args(0, Rtmp2);
2884   }
2885 
2886   if (Rpre_val->is_volatile() && Robj == noreg) mr(R31, Rpre_val); // Save pre_val across C call if it was preloaded.
2887   call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), Rpre_val, R16_thread);
2888   if (Rpre_val->is_volatile() && Robj == noreg) mr(Rpre_val, R31); // restore
2889 
2890   if (needs_frame) {
2891     pop_frame();
2892     restore_LR_CR(Rtmp1);
2893   }
2894 
2895   bind(filtered);
2896 }
2897 
2898 // General G1 post-barrier generator
2899 // Store cross-region card.
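// Rough C-style sketch of the post-barrier emitted below (illustrative only):
//   if (((store_addr ^ new_val) >> HeapRegion::LogOfHRGrainBytes) == 0) return;  // same region
//   jbyte* card = byte_map_base + (store_addr >> card_shift);
//   if (*card == g1_young_card_val()) return;
//   StoreLoad; if (*card == dirty_card_val()) return;
//   *card = dirty_card_val();
//   if (queue.index == 0) SharedRuntime::g1_wb_post(card, thread);   // runtime slow path
//   else { queue.index -= wordSize; *(queue.buf + queue.index) = card; }  // index is a byte offset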
2900 void MacroAssembler::g1_write_barrier_post(Register Rstore_addr, Register Rnew_val, Register Rtmp1, Register Rtmp2, Register Rtmp3, Label *filtered_ext) {
2901   Label runtime, filtered_int;
2902   Label& filtered = (filtered_ext != NULL) ? *filtered_ext : filtered_int;
2903   assert_different_registers(Rstore_addr, Rnew_val, Rtmp1, Rtmp2);
2904 
2905   G1SATBCardTableLoggingModRefBS* bs =
2906     barrier_set_cast<G1SATBCardTableLoggingModRefBS>(Universe::heap()->barrier_set());
2907 
2908   // Does store cross heap regions?
2909   if (G1RSBarrierRegionFilter) {
2910     xorr(Rtmp1, Rstore_addr, Rnew_val);
2911     srdi_(Rtmp1, Rtmp1, HeapRegion::LogOfHRGrainBytes);
2912     beq(CCR0, filtered);
2913   }
2914 
2915   // Crosses regions, storing NULL?
2916 #ifdef ASSERT
2917   cmpdi(CCR0, Rnew_val, 0);
2918   asm_assert_ne("null oop not allowed (G1)", 0x322); // Checked by caller on PPC64, so following branch is obsolete:
2919   //beq(CCR0, filtered);
2920 #endif
2921 
2922   // Storing region crossing non-NULL, is card already dirty?
2923   assert(sizeof(*bs->byte_map_base) == sizeof(jbyte), "adjust this code");
2924   const Register Rcard_addr = Rtmp1;
2925   Register Rbase = Rtmp2;
2926   load_const_optimized(Rbase, (address)bs->byte_map_base, /*temp*/ Rtmp3);
2927 
2928   srdi(Rcard_addr, Rstore_addr, CardTableModRefBS::card_shift);
2929 
2930   // Load the card value at byte_map_base + card index.
2931   lbzx(/*card value*/ Rtmp3, Rbase, Rcard_addr);
2932   cmpwi(CCR0, Rtmp3, (int)G1SATBCardTableModRefBS::g1_young_card_val());
2933   beq(CCR0, filtered);
2934 
2935   membar(Assembler::StoreLoad);
2936   lbzx(/*card value*/ Rtmp3, Rbase, Rcard_addr);  // Reload after membar.
2937   cmpwi(CCR0, Rtmp3 /* card value */, CardTableModRefBS::dirty_card_val());
2938   beq(CCR0, filtered);
2939 
2940   // Storing a region crossing, non-NULL oop, card is clean.
2941   // Dirty card and log.
2942   li(Rtmp3, CardTableModRefBS::dirty_card_val());
2943   //release(); // G1: oops are allowed to get visible after dirty marking.
2944   stbx(Rtmp3, Rbase, Rcard_addr);
2945 
2946   add(Rcard_addr, Rbase, Rcard_addr); // This is the address which needs to get enqueued.
2947   Rbase = noreg; // end of lifetime
2948 
2949   const Register Rqueue_index = Rtmp2,
2950                  Rqueue_buf   = Rtmp3;
2951   ld(Rqueue_index, in_bytes(JavaThread::dirty_card_queue_offset() + DirtyCardQueue::byte_offset_of_index()), R16_thread);
2952   cmpdi(CCR0, Rqueue_index, 0);
2953   beq(CCR0, runtime); // index == 0 then jump to runtime
2954   ld(Rqueue_buf, in_bytes(JavaThread::dirty_card_queue_offset() + DirtyCardQueue::byte_offset_of_buf()), R16_thread);
2955 
2956   addi(Rqueue_index, Rqueue_index, -wordSize); // decrement index
2957   std(Rqueue_index, in_bytes(JavaThread::dirty_card_queue_offset() + DirtyCardQueue::byte_offset_of_index()), R16_thread);
2958 
2959   stdx(Rcard_addr, Rqueue_buf, Rqueue_index); // store card
2960   b(filtered);
2961 
2962   bind(runtime);
2963 
2964   // Save the live input values.
2965   call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), Rcard_addr, R16_thread);
2966 
2967   bind(filtered_int);
2968 }
2969 #endif // INCLUDE_ALL_GCS
2970 
2971 // Values for last_Java_pc and last_Java_sp must comply with the rules
2972 // in frame_ppc.hpp.
2973 void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) {
2974   // Always set last_Java_pc and flags first because once last_Java_sp
2975   // is visible, has_last_Java_frame is true and users will look at the
2976   // rest of the fields. (Note: flags should always be zero before we
2977   // get here, so they don't need to be set.)
2978 
2979   // Verify that last_Java_pc was zeroed on return to Java
2980   asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread,
2981                           "last_Java_pc not zeroed before leaving Java", 0x200);
2982 
2983   // When returning from calling out from Java mode, the frame anchor's
2984   // last_Java_pc will always be set to NULL. It is set here so that,
2985   // if we are doing a call to native (not VM) code, we capture the
2986   // known pc and don't have to rely on the native call having a
2987   // standard frame linkage where we can find the pc.
2988   if (last_Java_pc != noreg)
2989     std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
2990 
2991   // Set last_Java_sp last.
2992   std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
2993 }
2994 
2995 void MacroAssembler::reset_last_Java_frame(void) {
2996   asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()),
2997                              R16_thread, "SP was not set, still zero", 0x202);
2998 
2999   BLOCK_COMMENT("reset_last_Java_frame {");
3000   li(R0, 0);
3001 
3002   // _last_Java_sp = 0
3003   std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
3004 
3005   // _last_Java_pc = 0
3006   std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
3007   BLOCK_COMMENT("} reset_last_Java_frame");
3008 }
3009 
3010 void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1) {
3011   assert_different_registers(sp, tmp1);
3012 
3013   // sp points to a TOP_IJAVA_FRAME, retrieve frame's PC via
3014   // TOP_IJAVA_FRAME_ABI.
3015   // FIXME: assert that we really have a TOP_IJAVA_FRAME here!
3016   address entry = pc();
3017   load_const_optimized(tmp1, entry);
3018 
3019   set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1);
3020 }
3021 
3022 void MacroAssembler::get_vm_result(Register oop_result) {
3023   // Read:
3024   //   R16_thread
3025   //   R16_thread->in_bytes(JavaThread::vm_result_offset())
3026   //
3027   // Updated:
3028   //   oop_result
3029   //   R16_thread->in_bytes(JavaThread::vm_result_offset())
3030 
3031   verify_thread();
3032 
3033   ld(oop_result, in_bytes(JavaThread::vm_result_offset()), R16_thread);
3034   li(R0, 0);
3035   std(R0, in_bytes(JavaThread::vm_result_offset()), R16_thread);
3036 
3037   verify_oop(oop_result);
3038 }
3039 
3040 void MacroAssembler::get_vm_result_2(Register metadata_result) {
3041   // Read:
3042   //   R16_thread
3043   //   R16_thread->in_bytes(JavaThread::vm_result_2_offset())
3044   //
3045   // Updated:
3046   //   metadata_result
3047   //   R16_thread->in_bytes(JavaThread::vm_result_2_offset())
3048 
3049   ld(metadata_result, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
3050   li(R0, 0);
3051   std(R0, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
3052 }
3053 
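// Compressed class pointer arithmetic implemented by the helpers below
// (illustrative sketch only):
//   narrow_klass = (klass - narrow_klass_base) >> narrow_klass_shift;           // encode
//   klass        = (narrow_klass << narrow_klass_shift) + narrow_klass_base;    // decode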
3054 Register MacroAssembler::encode_klass_not_null(Register dst, Register src) {
3055   Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided.
3056   if (Universe::narrow_klass_base() != 0) {
3057     // Use dst as temp if it is free.
3058     sub_const_optimized(dst, current, Universe::narrow_klass_base(), R0);
3059     current = dst;
3060   }
3061   if (Universe::narrow_klass_shift() != 0) {
3062     srdi(dst, current, Universe::narrow_klass_shift());
3063     current = dst;
3064   }
3065   return current;
3066 }
3067 
3068 void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) {
3069   if (UseCompressedClassPointers) {
3070     Register compressedKlass = encode_klass_not_null(ck, klass);
3071     stw(compressedKlass, oopDesc::klass_offset_in_bytes(), dst_oop);
3072   } else {
3073     std(klass, oopDesc::klass_offset_in_bytes(), dst_oop);
3074   }
3075 }
3076 
3077 void MacroAssembler::store_klass_gap(Register dst_oop, Register val) {
3078   if (UseCompressedClassPointers) {
3079     if (val == noreg) {
3080       val = R0;
3081       li(val, 0);
3082     }
3083     stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop); // klass gap if compressed
3084   }
3085 }
3086 
3087 int MacroAssembler::instr_size_for_decode_klass_not_null() {
3088   if (!UseCompressedClassPointers) return 0;
3089   int num_instrs = 1;  // shift or move
3090   if (Universe::narrow_klass_base() != 0) num_instrs = 7;  // shift + load const + add
3091   return num_instrs * BytesPerInstWord;
3092 }
3093 
3094 void MacroAssembler::decode_klass_not_null(Register dst, Register src) {
3095   assert(dst != R0, "Dst reg may not be R0, as R0 is used here.");
3096   if (src == noreg) src = dst;
3097   Register shifted_src = src;
3098   if (Universe::narrow_klass_shift() != 0 ||
3099       Universe::narrow_klass_base() == 0 && src != dst) {  // Move required.
3100     shifted_src = dst;
3101     sldi(shifted_src, src, Universe::narrow_klass_shift());
3102   }
3103   if (Universe::narrow_klass_base() != 0) {
3104     add_const_optimized(dst, shifted_src, Universe::narrow_klass_base(), R0);
3105   }
3106 }
3107 
3108 void MacroAssembler::load_klass(Register dst, Register src) {
3109   if (UseCompressedClassPointers) {
3110     lwz(dst, oopDesc::klass_offset_in_bytes(), src);
3111     // Attention: no null check here!
3112     decode_klass_not_null(dst, dst);
3113   } else {
3114     ld(dst, oopDesc::klass_offset_in_bytes(), src);
3115   }
3116 }
3117 
3118 // Clear Array
3119 // Kills both input registers. tmp == R0 is allowed.
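// Rough sketch of the strategy below (illustrative only):
//   while (base_ptr not cache-line aligned)  *base_ptr++ = 0;            // dword stores
//   while (at least one full cache line)     dcbz(base_ptr), advance;    // zero whole lines
//   while (dwords remaining)                 *base_ptr++ = 0;            // dword stores
// Requests smaller than (dcbz_min+1) cache lines go straight to the dword loop.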
3120 void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp) {
3121   // Procedure for large arrays (uses data cache block zero instruction).
3122     Label startloop, fast, fastloop, small_rest, restloop, done;
3123     const int cl_size         = VM_Version::L1_data_cache_line_size(),
3124               cl_dwords       = cl_size>>3,
3125               cl_dw_addr_bits = exact_log2(cl_dwords),
3126               dcbz_min        = 1;                     // Min count of dcbz executions, needs to be >0.
3127 
3128 //2:
3129     cmpdi(CCR1, cnt_dwords, ((dcbz_min+1)<<cl_dw_addr_bits)-1); // Big enough? (ensure >=dcbz_min lines included).
3130     blt(CCR1, small_rest);                                      // Too small.
3131     rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits);           // Extract dword offset within first cache line.
3132     beq(CCR0, fast);                                            // Already 128byte aligned.
3133 
3134     subfic(tmp, tmp, cl_dwords);
3135     mtctr(tmp);                        // Set ctr to hit 128byte boundary (0<ctr<cl_dwords).
3136     subf(cnt_dwords, tmp, cnt_dwords); // rest.
3137     li(tmp, 0);
3138 //10:
3139   bind(startloop);                     // Clear at the beginning to reach 128byte boundary.
3140     std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
3141     addi(base_ptr, base_ptr, 8);
3142     bdnz(startloop);
3143 //13:
3144   bind(fast);                                  // Clear 128byte blocks.
3145     srdi(tmp, cnt_dwords, cl_dw_addr_bits);    // Loop count for 128byte loop (>0).
3146     andi(cnt_dwords, cnt_dwords, cl_dwords-1); // Rest in dwords.
3147     mtctr(tmp);                                // Load counter.
3148 //16:
3149   bind(fastloop);
3150     dcbz(base_ptr);                    // Clear 128byte aligned block.
3151     addi(base_ptr, base_ptr, cl_size);
3152     bdnz(fastloop);
3153     if (InsertEndGroupPPC64) { endgroup(); } else { nop(); }
3154 //20:
3155   bind(small_rest);
3156     cmpdi(CCR0, cnt_dwords, 0);        // size 0?
3157     beq(CCR0, done);                   // rest == 0
3158     li(tmp, 0);
3159     mtctr(cnt_dwords);                 // Load counter.
3160 //24:
3161   bind(restloop);                      // Clear rest.
3162     std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
3163     addi(base_ptr, base_ptr, 8);
3164     bdnz(restloop);
3165 //27:
3166   bind(done);
3167 }
3168 
3169 /////////////////////////////////////////// String intrinsics ////////////////////////////////////////////
3170 
3171 // Search for a single jchar in a jchar[].
3172 //
3173 // Assumes that result differs from all other registers.
3174 //
3175 // 'haystack' is the address of a jchar-array.
3176 // 'needle' is either the character to search for or R0.
3177 // 'needleChar' is the character to search for if 'needle' == R0.
3178 // 'haycnt' is the length of the haystack. We assume 'haycnt' >=1.
3179 //
3180 // Preserves haystack, haycnt, needle and kills all other registers.
3181 //
3182 // If needle == R0, we search for the constant needleChar.
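// Rough scalar equivalent of the code below (illustrative only):
//   jchar ch = (needle != R0) ? <contents of needle register> : needleChar;
//   for (int i = 0; i < haycnt; i++)
//     if (haystack[i] == ch) return i;
//   return -1;
// The emitted loop is 2x unrolled and uses the CTR register for the trip count.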
3183 void MacroAssembler::string_indexof_1(Register result, Register haystack, Register haycnt,
3184                                       Register needle, jchar needleChar,
3185                                       Register tmp1, Register tmp2) {
3186 
3187   assert_different_registers(result, haystack, haycnt, needle, tmp1, tmp2);
3188 
3189   Label L_InnerLoop, L_FinalCheck, L_Found1, L_Found2, L_Found3, L_NotFound, L_End;
3190   Register addr = tmp1,
3191            ch1 = tmp2,
3192            ch2 = R0;
3193 
3194 //3:
3195    dcbtct(haystack, 0x00);                        // Indicate R/O access to haystack.
3196 
3197    srwi_(tmp2, haycnt, 1);   // Shift right by exact_log2(UNROLL_FACTOR).
3198    mr(addr, haystack);
3199    beq(CCR0, L_FinalCheck);
3200    mtctr(tmp2);              // Move to count register.
3201 //8:
3202   bind(L_InnerLoop);             // Main work horse (2x unrolled search loop).
3203    lhz(ch1, 0, addr);        // Load characters from haystack.
3204    lhz(ch2, 2, addr);
3205    (needle != R0) ? cmpw(CCR0, ch1, needle) : cmplwi(CCR0, ch1, needleChar);
3206    (needle != R0) ? cmpw(CCR1, ch2, needle) : cmplwi(CCR1, ch2, needleChar);
3207    beq(CCR0, L_Found1);   // Did we find the needle?
3208    beq(CCR1, L_Found2);
3209    addi(addr, addr, 4);
3210    bdnz(L_InnerLoop);
3211 //16:
3212   bind(L_FinalCheck);
3213    andi_(R0, haycnt, 1);
3214    beq(CCR0, L_NotFound);
3215    lhz(ch1, 0, addr);        // One position left at which we have to compare.
3216    (needle != R0) ? cmpw(CCR1, ch1, needle) : cmplwi(CCR1, ch1, needleChar);
3217    beq(CCR1, L_Found3);
3218 //21:
3219   bind(L_NotFound);
3220    li(result, -1);           // Not found.
3221    b(L_End);
3222 
3223   bind(L_Found2);
3224    addi(addr, addr, 2);
3225 //24:
3226   bind(L_Found1);
3227   bind(L_Found3);                  // Return index ...
3228    subf(addr, haystack, addr); // relative to haystack,
3229    srdi(result, addr, 1);      // in characters.
3230   bind(L_End);
3231 }
3232 
3233 
3234 // Implementation of IndexOf for jchar arrays.
3235 //
3236 // The lengths of haystack and needle are not constant, i.e. they are passed in registers.
3237 //
3238 // Preserves registers haystack, needle.
3239 // Kills registers haycnt, needlecnt.
3240 // Assumes that result differs from all other registers.
3241 // Haystack, needle are the addresses of jchar-arrays.
3242 // Haycnt, needlecnt are the lengths of them, respectively.
3243 //
3244 // Needlecntval must be zero or a 15-bit unsigned immediate > 1.
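// Rough scalar equivalent of the code below (illustrative only):
//   for (int i = 0; i + needlecnt <= haycnt; i++) {
//     if (haystack[i] == needle[0] && haystack[i+1] == needle[1]) {   // match first 2 chars
//       int j = 2;
//       while (j < needlecnt && haystack[i+j] == needle[j]) j++;      // compare the rest
//       if (j == needlecnt) return i;
//     }
//   }
//   return -1;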
3245 void MacroAssembler::string_indexof(Register result, Register haystack, Register haycnt,
3246                                     Register needle, ciTypeArray* needle_values, Register needlecnt, int needlecntval,
3247                                     Register tmp1, Register tmp2, Register tmp3, Register tmp4) {
3248 
3249   // Ensure 0<needlecnt<=haycnt in ideal graph as prerequisite!
3250   Label L_TooShort, L_Found, L_NotFound, L_End;
3251   Register last_addr = haycnt, // Kill haycnt at the beginning.
3252            addr      = tmp1,
3253            n_start   = tmp2,
3254            ch1       = tmp3,
3255            ch2       = R0;
3256 
3257   // **************************************************************************************************
3258   // Prepare for main loop: optimized for needle count >=2, bail out otherwise.
3259   // **************************************************************************************************
3260 
3261 //1 (variable) or 3 (const):
3262    dcbtct(needle, 0x00);    // Indicate R/O access to str1.
3263    dcbtct(haystack, 0x00);  // Indicate R/O access to str2.
3264 
3265   // Compute last haystack addr to use if no match gets found.
3266   if (needlecntval == 0) { // variable needlecnt
3267 //3:
3268    subf(ch1, needlecnt, haycnt);      // Last character index to compare is haycnt-needlecnt.
3269    addi(addr, haystack, -2);          // Accesses use pre-increment.
3270    cmpwi(CCR6, needlecnt, 2);
3271    blt(CCR6, L_TooShort);          // Variable needlecnt: handle short needle separately.
3272    slwi(ch1, ch1, 1);                 // Scale to number of bytes.
3273    lwz(n_start, 0, needle);           // Load first 2 characters of needle.
3274    add(last_addr, haystack, ch1);     // Point to last address to compare (haystack+2*(haycnt-needlecnt)).
3275    addi(needlecnt, needlecnt, -2);    // Rest of needle.
3276   } else { // constant needlecnt
3277   guarantee(needlecntval != 1, "IndexOf with single-character needle must be handled separately");
3278   assert((needlecntval & 0x7fff) == needlecntval, "wrong immediate");
3279 //5:
3280    addi(ch1, haycnt, -needlecntval);  // Last character index to compare is haycnt-needlecnt.
3281    lwz(n_start, 0, needle);           // Load first 2 characters of needle.
3282    addi(addr, haystack, -2);          // Accesses use pre-increment.
3283    slwi(ch1, ch1, 1);                 // Scale to number of bytes.
3284    add(last_addr, haystack, ch1);     // Point to last address to compare (haystack+2*(haycnt-needlecnt)).
3285    li(needlecnt, needlecntval-2);     // Rest of needle.
3286   }
3287 
3288   // Main Loop (now we have at least 3 characters).
3289 //11:
3290   Label L_OuterLoop, L_InnerLoop, L_FinalCheck, L_Comp1, L_Comp2, L_Comp3;
3291   bind(L_OuterLoop); // Search for 1st 2 characters.
3292   Register addr_diff = tmp4;
3293    subf(addr_diff, addr, last_addr); // Difference between already checked address and last address to check.
3294    addi(addr, addr, 2);              // This is the new address we want to use for comparing.
3295    srdi_(ch2, addr_diff, 2);
3296    beq(CCR0, L_FinalCheck);       // 2 characters left?
3297    mtctr(ch2);                       // addr_diff/4
3298 //16:
3299   bind(L_InnerLoop);                // Main work horse (2x unrolled search loop)
3300    lwz(ch1, 0, addr);           // Load 2 characters of haystack (ignore alignment).
3301    lwz(ch2, 2, addr);
3302    cmpw(CCR0, ch1, n_start); // Compare 2 characters (1 would be sufficient but try to reduce branches to CompLoop).
3303    cmpw(CCR1, ch2, n_start);
3304    beq(CCR0, L_Comp1);       // Did we find the needle start?
3305    beq(CCR1, L_Comp2);
3306    addi(addr, addr, 4);
3307    bdnz(L_InnerLoop);
3308 //24:
3309   bind(L_FinalCheck);
3310    rldicl_(addr_diff, addr_diff, 64-1, 63); // Remaining characters not covered by InnerLoop: (addr_diff>>1)&1.
3311    beq(CCR0, L_NotFound);
3312    lwz(ch1, 0, addr);                       // One position left at which we have to compare.
3313    cmpw(CCR1, ch1, n_start);
3314    beq(CCR1, L_Comp3);
3315 //29:
3316   bind(L_NotFound);
3317    li(result, -1); // not found
3318    b(L_End);
3319 
3320 
3321    // **************************************************************************************************
3322    // Special Case: unfortunately, the variable needle case can be called with needlecnt<2
3323    // **************************************************************************************************
3324 //31:
3325  if ((needlecntval>>1) !=1 ) { // Const needlecnt is 2 or 3? Reduce code size.
3326   int nopcnt = 5;
3327   if (needlecntval !=0 ) ++nopcnt; // Balance alignment (other case: see below).
3328   if (needlecntval == 0) {         // We have to handle these cases separately.
3329   Label L_OneCharLoop;
3330   bind(L_TooShort);
3331    mtctr(haycnt);
3332    lhz(n_start, 0, needle);    // First character of needle
3333   bind(L_OneCharLoop);
3334    lhzu(ch1, 2, addr);
3335    cmpw(CCR1, ch1, n_start);
3336    beq(CCR1, L_Found);      // Did we find the one character needle?
3337    bdnz(L_OneCharLoop);
3338    li(result, -1);             // Not found.
3339    b(L_End);
3340   } // 8 instructions, so no impact on alignment.
3341   for (int x = 0; x < nopcnt; ++x) nop();
3342  }
3343 
3344   // **************************************************************************************************
3345   // Regular Case Part II: compare rest of needle (first 2 characters have been compared already)
3346   // **************************************************************************************************
3347 
3348   // Compare the rest
3349 //36 if needlecntval==0, else 37:
3350   bind(L_Comp2);
3351    addi(addr, addr, 2); // First comparison has failed, 2nd one hit.
3352   bind(L_Comp1);            // Addr points to possible needle start.
3353   bind(L_Comp3);            // Could have created a copy and use a different return address but saving code size here.
3354   if (needlecntval != 2) {  // Const needlecnt==2?
3355    if (needlecntval != 3) {
3356     if (needlecntval == 0) beq(CCR6, L_Found); // Variable needlecnt==2?
3357     Register ind_reg = tmp4;
3358     li(ind_reg, 2*2);   // First 2 characters are already compared, use index 2.
3359     mtctr(needlecnt);   // Decremented by 2, still > 0.
3360 //40:
3361    Label L_CompLoop;
3362    bind(L_CompLoop);
3363     lhzx(ch2, needle, ind_reg);
3364     lhzx(ch1, addr, ind_reg);
3365     cmpw(CCR1, ch1, ch2);
3366     bne(CCR1, L_OuterLoop);
3367     addi(ind_reg, ind_reg, 2);
3368     bdnz(L_CompLoop);
3369    } else { // No loop required if there's only one needle character left.
3370     lhz(ch2, 2*2, needle);
3371     lhz(ch1, 2*2, addr);
3372     cmpw(CCR1, ch1, ch2);
3373     bne(CCR1, L_OuterLoop);
3374    }
3375   }
3376   // Return index ...
3377 //46:
3378   bind(L_Found);
3379    subf(addr, haystack, addr); // relative to haystack, ...
3380    srdi(result, addr, 1);      // in characters.
3381 //48:
3382   bind(L_End);
3383 }
3384 
3385 // Implementation of Compare for jchar arrays.
3386 //
3387 // Kills the registers str1, str2, cnt1, cnt2.
3388 // Kills cr0, ctr.
3389 // Assumes that result differs from the input registers.
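// Rough scalar equivalent of the code below (illustrative only):
//   int n = min(cnt1, cnt2);
//   for (int i = 0; i < n; i++)
//     if (str1[i] != str2[i]) return str1[i] - str2[i];
//   return cnt1 - cnt2;
// The emitted code compares 4 chars (8 bytes) per fast-loop iteration and falls
// back to a char-by-char slow loop for the remainder or on a mismatch.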
3390 void MacroAssembler::string_compare(Register str1_reg, Register str2_reg, Register cnt1_reg, Register cnt2_reg,
3391                                     Register result_reg, Register tmp_reg) {
3392    assert_different_registers(result_reg, str1_reg, str2_reg, cnt1_reg, cnt2_reg, tmp_reg);
3393 
3394    Label Ldone, Lslow_case, Lslow_loop, Lfast_loop;
3395    Register cnt_diff = R0,
3396             limit_reg = cnt1_reg,
3397             chr1_reg = result_reg,
3398             chr2_reg = cnt2_reg,
3399             addr_diff = str2_reg;
3400 
3401    // 'cnt_reg' contains the number of characters in the string's character array for the
3402    // pre-CompactStrings strings implementation and the number of bytes in the string's
3403    // byte array for the CompactStrings strings implementation.
3404    const int HAS_COMPACT_STRING = java_lang_String::has_coder_field() ? 1 : 0; // '1' = byte array, '0' = char array
3405 
3406    // Offset 0 should be 32 byte aligned.
3407 //-6:
3408     srawi(cnt1_reg, cnt1_reg, HAS_COMPACT_STRING);
3409     srawi(cnt2_reg, cnt2_reg, HAS_COMPACT_STRING);
3410 //-4:
3411     dcbtct(str1_reg, 0x00);  // Indicate R/O access to str1.
3412     dcbtct(str2_reg, 0x00);  // Indicate R/O access to str2.
3413 //-2:
3414    // Compute min(cnt1, cnt2) and check if 0 (bail out if we don't need to compare characters).
3415     subf(result_reg, cnt2_reg, cnt1_reg);  // difference between cnt1/2
3416     subf_(addr_diff, str1_reg, str2_reg);  // alias?
3417     beq(CCR0, Ldone);                   // return cnt difference if both strings alias (same address)
3418     srawi(limit_reg, result_reg, 31);      // generate signmask (cnt1/2 must be non-negative so cnt_diff can't overflow)
3419     mr(cnt_diff, result_reg);
3420     andr(limit_reg, result_reg, limit_reg); // difference or zero (negative): cnt1<cnt2 ? cnt1-cnt2 : 0
3421     add_(limit_reg, cnt2_reg, limit_reg);  // min(cnt1, cnt2)==0?
3422     beq(CCR0, Ldone);                   // return cnt difference if one has 0 length
3423 
3424     lhz(chr1_reg, 0, str1_reg);            // optional: early out if first characters mismatch
3425     lhzx(chr2_reg, str1_reg, addr_diff);   // optional: early out if first characters mismatch
3426     addi(tmp_reg, limit_reg, -1);          // min(cnt1, cnt2)-1
3427     subf_(result_reg, chr2_reg, chr1_reg); // optional: early out if first characters mismatch
3428     bne(CCR0, Ldone);                   // optional: early out if first characters mismatch
3429 
3430    // Set loop counter by scaling down tmp_reg
3431     srawi_(chr2_reg, tmp_reg, exact_log2(4)); // (min(cnt1, cnt2)-1)/4
3432     ble(CCR0, Lslow_case);                 // need >4 characters for fast loop
3433     andi(limit_reg, tmp_reg, 4-1);            // remaining characters
3434 
3435    // Adapt str1_reg str2_reg for the first loop iteration
3436     mtctr(chr2_reg);                 // (min(cnt1, cnt2)-1)/4
3437     addi(limit_reg, limit_reg, 4+1); // compare last 5-8 characters in slow_case if mismatch found in fast_loop
3438 //16:
3439    // Compare the rest of the characters
3440    bind(Lfast_loop);
3441     ld(chr1_reg, 0, str1_reg);
3442     ldx(chr2_reg, str1_reg, addr_diff);
3443     cmpd(CCR0, chr2_reg, chr1_reg);
3444     bne(CCR0, Lslow_case); // return chr1_reg
3445     addi(str1_reg, str1_reg, 4*2);
3446     bdnz(Lfast_loop);
3447     addi(limit_reg, limit_reg, -4); // no mismatch found in fast_loop, only 1-4 characters missing
3448 //23:
3449    bind(Lslow_case);
3450     mtctr(limit_reg);
3451 //24:
3452    bind(Lslow_loop);
3453     lhz(chr1_reg, 0, str1_reg);
3454     lhzx(chr2_reg, str1_reg, addr_diff);
3455     subf_(result_reg, chr2_reg, chr1_reg);
3456     bne(CCR0, Ldone); // return chr1_reg
3457     addi(str1_reg, str1_reg, 1*2);
3458     bdnz(Lslow_loop);
3459 //30:
3460    // If strings are equal up to min length, return the length difference.
3461     mr(result_reg, cnt_diff);
3462     nop(); // alignment
3463 //32:
3464    // Otherwise, return the difference between the first mismatched chars.
3465    bind(Ldone);
3466 }
3467 
3468 
3469 // Compare char[] arrays.
3470 //
3471 // str1_reg   USE only
3472 // str2_reg   USE only
3473 // cnt_reg    USE_DEF, due to tmp reg shortage
3474 // result_reg DEF only, might compromise USE only registers
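// Rough scalar equivalent of the code below (illustrative only):
//   for (int i = 0; i < cnt; i++)
//     if (str1[i] != str2[i]) return 0;
//   return 1;
// The emitted code compares 4 chars (8 bytes) per main-loop iteration and handles
// the remaining 0..3 chars in a char-by-char loop.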
3475 void MacroAssembler::char_arrays_equals(Register str1_reg, Register str2_reg, Register cnt_reg, Register result_reg,
3476                                         Register tmp1_reg, Register tmp2_reg, Register tmp3_reg, Register tmp4_reg,
3477                                         Register tmp5_reg) {
3478 
3479   // Str1 may be the same register as str2 which can occur e.g. after scalar replacement.
3480   assert_different_registers(result_reg, str1_reg, cnt_reg, tmp1_reg, tmp2_reg, tmp3_reg, tmp4_reg, tmp5_reg);
3481   assert_different_registers(result_reg, str2_reg, cnt_reg, tmp1_reg, tmp2_reg, tmp3_reg, tmp4_reg, tmp5_reg);
3482 
3483   // Offset 0 should be 32 byte aligned.
3484   Label Linit_cbc, Lcbc, Lloop, Ldone_true, Ldone_false;
3485   Register index_reg = tmp5_reg;
3486   Register cbc_iter  = tmp4_reg;
3487 
3488   // 'cnt_reg' contains the number of characters in the string's character array for the
3489   // pre-CompactStrings strings implementation and the number of bytes in the string's
3490   // byte array for the CompactStrings strings implementation.
3491   const int HAS_COMPACT_STRING = java_lang_String::has_coder_field() ? 1 : 0; // '1' = byte array, '0' = char array
3492 
3493 //-1:
3494   dcbtct(str1_reg, 0x00);  // Indicate R/O access to str1.
3495   dcbtct(str2_reg, 0x00);  // Indicate R/O access to str2.
3496 //1:
3497   // cbc_iter: remaining characters after the '4 java characters per iteration' loop.
3498   rlwinm(cbc_iter, cnt_reg, 32 - HAS_COMPACT_STRING, 30, 31); // (cnt_reg % (HAS_COMPACT_STRING ? 8 : 4)) >> HAS_COMPACT_STRING
3499   li(index_reg, 0); // init
3500   li(result_reg, 0); // assume false
3501   // tmp2_reg: units of 4 java characters (i.e. 8 bytes) per iteration (main loop).
3502   srwi_(tmp2_reg, cnt_reg, exact_log2(4 << HAS_COMPACT_STRING)); // cnt_reg / (HAS_COMPACT_STRING ? 8 : 4)
3503 
3504   cmpwi(CCR1, cbc_iter, 0);             // CCR1 = (cbc_iter==0)
3505   beq(CCR0, Linit_cbc);                 // too short
3506     mtctr(tmp2_reg);
3507 //8:
3508     bind(Lloop);
3509       ldx(tmp1_reg, str1_reg, index_reg);
3510       ldx(tmp2_reg, str2_reg, index_reg);
3511       cmpd(CCR0, tmp1_reg, tmp2_reg);
3512       bne(CCR0, Ldone_false);  // Unequal char pair found -> done.
3513       addi(index_reg, index_reg, 4*sizeof(jchar));
3514       bdnz(Lloop);
3515 //14:
3516   bind(Linit_cbc);
3517   beq(CCR1, Ldone_true);
3518     mtctr(cbc_iter);
3519 //16:
3520     bind(Lcbc);
3521       lhzx(tmp1_reg, str1_reg, index_reg);
3522       lhzx(tmp2_reg, str2_reg, index_reg);
3523       cmpw(CCR0, tmp1_reg, tmp2_reg);
3524       bne(CCR0, Ldone_false);  // Unequal char pair found -> done.
3525       addi(index_reg, index_reg, 1*sizeof(jchar));
3526       bdnz(Lcbc);
3527     nop();
3528   bind(Ldone_true);
3529   li(result_reg, 1);
3530 //24:
3531   bind(Ldone_false);
3532 }
3533 
3534 
3535 void MacroAssembler::char_arrays_equalsImm(Register str1_reg, Register str2_reg, int cntval, Register result_reg,
3536                                            Register tmp1_reg, Register tmp2_reg) {
3537   // Str1 may be the same register as str2 which can occur e.g. after scalar replacement.
3538   assert_different_registers(result_reg, str1_reg, tmp1_reg, tmp2_reg);
3539   assert_different_registers(result_reg, str2_reg, tmp1_reg, tmp2_reg);
3540   assert(sizeof(jchar) == 2, "must be");
3541   assert(cntval >= 0 && ((cntval & 0x7fff) == cntval), "wrong immediate");
3542 
3543   // 'cntval' contains the number of characters in the string's character array for the
3544   // pre-CompactStrings strings implementation and the number of bytes in the string's
3545   // byte array for the CompactStrings strings implementation.
3546   cntval >>= (java_lang_String::has_coder_field() ? 1 : 0); // '1' = byte array strings, '0' = char array strings
3547 
3548   Label Ldone_false;
3549 
3550   if (cntval < 16) { // short case
3551     if (cntval != 0) li(result_reg, 0); // assume false
3552 
3553     const int num_bytes = cntval*sizeof(jchar);
3554     int index = 0;
3555     for (int next_index; (next_index = index + 8) <= num_bytes; index = next_index) {
3556       ld(tmp1_reg, index, str1_reg);
3557       ld(tmp2_reg, index, str2_reg);
3558       cmpd(CCR0, tmp1_reg, tmp2_reg);
3559       bne(CCR0, Ldone_false);
3560     }
3561     if (cntval & 2) {
3562       lwz(tmp1_reg, index, str1_reg);
3563       lwz(tmp2_reg, index, str2_reg);
3564       cmpw(CCR0, tmp1_reg, tmp2_reg);
3565       bne(CCR0, Ldone_false);
3566       index += 4;
3567     }
3568     if (cntval & 1) {
3569       lhz(tmp1_reg, index, str1_reg);
3570       lhz(tmp2_reg, index, str2_reg);
3571       cmpw(CCR0, tmp1_reg, tmp2_reg);
3572       bne(CCR0, Ldone_false);
3573     }
3574     // fallthrough: true
3575   } else {
3576     Label Lloop;
3577     Register index_reg = tmp1_reg;
3578     const int loopcnt = cntval/4;
3579     assert(loopcnt > 0, "must be");
3580     // Offset 0 should be 32 byte aligned.
3581     //2:
3582     dcbtct(str1_reg, 0x00);  // Indicate R/O access to str1.
3583     dcbtct(str2_reg, 0x00);  // Indicate R/O access to str2.
3584     li(tmp2_reg, loopcnt);
3585     li(index_reg, 0); // init
3586     li(result_reg, 0); // assume false
3587     mtctr(tmp2_reg);
3588     //8:
3589     bind(Lloop);
3590     ldx(R0, str1_reg, index_reg);
3591     ldx(tmp2_reg, str2_reg, index_reg);
3592     cmpd(CCR0, R0, tmp2_reg);
3593     bne(CCR0, Ldone_false);  // Unequal char pair found -> done.
3594     addi(index_reg, index_reg, 4*sizeof(jchar));
3595     bdnz(Lloop);
3596     //14:
3597     if (cntval & 2) {
3598       lwzx(R0, str1_reg, index_reg);
3599       lwzx(tmp2_reg, str2_reg, index_reg);
3600       cmpw(CCR0, R0, tmp2_reg);
3601       bne(CCR0, Ldone_false);
3602       if (cntval & 1) addi(index_reg, index_reg, 2*sizeof(jchar));
3603     }
3604     if (cntval & 1) {
3605       lhzx(R0, str1_reg, index_reg);
3606       lhzx(tmp2_reg, str2_reg, index_reg);
3607       cmpw(CCR0, R0, tmp2_reg);
3608       bne(CCR0, Ldone_false);
3609     }
3610     // fallthru: true
3611   }
3612   li(result_reg, 1);
3613   bind(Ldone_false);
3614 }
3615 
3616 // Helpers for Intrinsic Emitters
3617 //
3618 // Reverse the byte order of a 32-bit value in a register
3619 //   src: 0x44556677
3620 //   dst: 0x77665544
3621 // Three steps to obtain the result:
3622 //  1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word
3623 //     into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared.
3624 //     This value initializes dst.
3625 //  2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost
3626 //     byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go.
3627 //     This value is mask inserted into dst with a [0..23] mask of 1s.
3628 //  3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position.
3629 //     This value is mask inserted into dst with a [8..15] mask of 1s.
3630 void MacroAssembler::load_reverse_32(Register dst, Register src) {
3631   assert_different_registers(dst, src);
3632 
3633   rldicl(dst, src, (4+1)*8, 56);       // Rotate byte 4 into position 7 (rightmost), clear all to the left.
3634   rlwimi(dst, src,     3*8,  0, 23);   // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone.
3635   rlwimi(dst, src,     1*8,  8, 15);   // Insert byte 6 into position 5, leave the rest alone.
3636 }
3637 
3638 // Calculate the column addresses of the crc32 lookup table into distinct registers.
3639 // This loop-invariant calculation is moved out of the loop body, reducing the loop
3640 // body size from 20 to 16 instructions.
3641 // Returns the offset that was used to calculate the address of column tc3.
3642 // Due to register shortage, setting tc3 may overwrite table. With the return offset
3643 // at hand, the original table address can be easily reconstructed.
3644 int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) {
3645 
3646 #ifdef VM_LITTLE_ENDIAN
3647   // This is what we implement (the DOLIT4 part):
3648   // ========================================================================= */
3649   // #define DOLIT4 c ^= *buf4++; \
3650   //         c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^ \
3651   //             crc_table[1][(c >> 16) & 0xff] ^ crc_table[0][c >> 24]
3652   // #define DOLIT32 DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4
3653   // ========================================================================= */
3654   const int ix0 = 3*(4*CRC32_COLUMN_SIZE);
3655   const int ix1 = 2*(4*CRC32_COLUMN_SIZE);
3656   const int ix2 = 1*(4*CRC32_COLUMN_SIZE);
3657   const int ix3 = 0*(4*CRC32_COLUMN_SIZE);
3658 #else
3659   // This is what we implement (the DOBIG4 part):
3660   // =========================================================================
3661   // #define DOBIG4 c ^= *++buf4; \
3662   //         c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \
3663   //             crc_table[6][(c >> 16) & 0xff] ^ crc_table[7][c >> 24]
3664   // #define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4
3665   // =========================================================================
3666   const int ix0 = 4*(4*CRC32_COLUMN_SIZE);
3667   const int ix1 = 5*(4*CRC32_COLUMN_SIZE);
3668   const int ix2 = 6*(4*CRC32_COLUMN_SIZE);
3669   const int ix3 = 7*(4*CRC32_COLUMN_SIZE);
3670 #endif
3671   assert_different_registers(table, tc0, tc1, tc2);
3672   assert(table == tc3, "must be!");
3673 
3674   addi(tc0, table, ix0);
3675   addi(tc1, table, ix1);
3676   addi(tc2, table, ix2);
3677   if (ix3 != 0) addi(tc3, table, ix3);
3678 
3679   return ix3;
3680 }
3681 
3682 /**
3683  * uint32_t crc;
3684  * crc = timesXtoThe32[val & 0xFF] ^ (crc >> 8);
3685  */
3686 void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) {
3687   assert_different_registers(crc, table, tmp);
3688   assert_different_registers(val, table);
3689 
3690   if (crc == val) {                   // Must rotate first to use the unmodified value.
3691     rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
3692                                       // As we use a word (4-byte) instruction, we have to adapt the mask bit positions.
3693     srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
3694   } else {
3695     srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
3696     rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
3697   }
3698   lwzx(tmp, table, tmp);
3699   xorr(crc, crc, tmp);
3700 }
3701 
3702 /**
3703  * uint32_t crc;
3704  * crc = timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
3705  */
3706 void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
3707   fold_byte_crc32(crc, crc, table, tmp);
3708 }
3709 
3710 /**
3711  * Emits code to update CRC-32 with a byte value according to constants in table.
3712  *
3713  * @param [in,out]crc   Register containing the crc.
3714  * @param [in]val       Register containing the byte to fold into the CRC.
3715  * @param [in]table     Register containing the table of crc constants.
3716  *
3717  * uint32_t crc;
3718  * val = crc_table[(val ^ crc) & 0xFF];
3719  * crc = val ^ (crc >> 8);
3720  */
3721 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
3722   BLOCK_COMMENT("update_byte_crc32:");
3723   xorr(val, val, crc);
3724   fold_byte_crc32(crc, val, table, val);
3725 }
3726 
3727 /**
3728  * @param crc   register containing existing CRC (32-bit)
3729  * @param buf   register pointing to input byte buffer (byte*)
3730  * @param len   register containing number of bytes
3731  * @param table register pointing to CRC table
3732  */
3733 void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table,
3734                                            Register data, bool loopAlignment, bool invertCRC) {
3735   assert_different_registers(crc, buf, len, table, data);
3736 
3737   Label L_mainLoop, L_done;
3738   const int mainLoop_stepping  = 1;
3739   const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4;
3740 
3741   // Process all bytes in a single-byte loop.
3742   clrldi_(len, len, 32);                         // Enforce 32 bit. Anything to do?
3743   beq(CCR0, L_done);
3744 
3745   if (invertCRC) {
3746     nand(crc, crc, crc);                         // ~c
3747   }
3748 
3749   mtctr(len);
3750   align(mainLoop_alignment);
3751   BIND(L_mainLoop);
3752     lbz(data, 0, buf);                           // Byte from buffer, zero-extended.
3753     addi(buf, buf, mainLoop_stepping);           // Advance buffer position.
3754     update_byte_crc32(crc, data, table);
3755     bdnz(L_mainLoop);                            // Iterate.
3756 
3757   if (invertCRC) {
3758     nand(crc, crc, crc);                         // ~c
3759   }
3760 
3761   bind(L_done);
3762 }
3763 
3764 /**
3765  * Emits code to update CRC-32 with a 4-byte value according to constants in table
3766  * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c
3767  */
3768 // A note on the lookup table address(es):
3769 // The lookup table consists of two sets of four columns each.
3770 // The columns {0..3} are used for little-endian machines.
3771 // The columns {4..7} are used for big-endian machines.
3772 // To save the effort of adding the column offset to the table address each time
3773 // a table element is looked up, it is possible to pass the pre-calculated
3774 // column addresses.
3775 // Uses R9..R12 as work registers. Must be saved/restored by caller, if necessary.
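// Rough C equivalent of one word-sized update step (cf. the DOLIT4/DOBIG4 macros
// quoted in crc32_table_columns above; illustrative only, tc0..tc3 are the
// pre-computed column addresses):
//   c ^= *(uint32_t*)(buf + bufDisp);
//   c = tc0[c & 0xff] ^ tc1[(c >> 8) & 0xff] ^ tc2[(c >> 16) & 0xff] ^ tc3[c >> 24];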
3776 void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc,
3777                                         Register t0,  Register t1,  Register t2,  Register t3,
3778                                         Register tc0, Register tc1, Register tc2, Register tc3) {
3779   assert_different_registers(crc, t3);
3780 
3781   // XOR crc with next four bytes of buffer.
3782   lwz(t3, bufDisp, buf);
3783   if (bufInc != 0) {
3784     addi(buf, buf, bufInc);
3785   }
3786   xorr(t3, t3, crc);
3787 
3788   // Chop crc into 4 single-byte pieces, shifted left 2 bits, to form the table indices.
3789   rlwinm(t0, t3,  2,         24-2, 31-2);  // ((t1 >>  0) & 0xff) << 2
3790   rlwinm(t1, t3,  32+(2- 8), 24-2, 31-2);  // ((t1 >>  8) & 0xff) << 2
3791   rlwinm(t2, t3,  32+(2-16), 24-2, 31-2);  // ((t1 >> 16) & 0xff) << 2
3792   rlwinm(t3, t3,  32+(2-24), 24-2, 31-2);  // ((t1 >> 24) & 0xff) << 2
3793 
3794   // Use the pre-calculated column addresses.
3795   // Load pre-calculated table values.
3796   lwzx(t0, tc0, t0);
3797   lwzx(t1, tc1, t1);
3798   lwzx(t2, tc2, t2);
3799   lwzx(t3, tc3, t3);
3800 
3801   // Calculate new crc from table values.
3802   xorr(t0,  t0, t1);
3803   xorr(t2,  t2, t3);
3804   xorr(crc, t0, t2);  // Now crc contains the final checksum value.
3805 }
3806 
3807 /**
3808  * @param crc   register containing existing CRC (32-bit)
3809  * @param buf   register pointing to input byte buffer (byte*)
3810  * @param len   register containing number of bytes
3811  * @param table register pointing to CRC table
3812  *
3813  * Uses R9..R12 as work registers. Must be saved/restored by caller!
3814  */
3815 void MacroAssembler::kernel_crc32_2word(Register crc, Register buf, Register len, Register table,
3816                                         Register t0,  Register t1,  Register t2,  Register t3,
3817                                         Register tc0, Register tc1, Register tc2, Register tc3) {
3818   assert_different_registers(crc, buf, len, table);
3819 
3820   Label L_mainLoop, L_tail;
3821   Register  tmp  = t0;
3822   Register  data = t0;
3823   Register  tmp2 = t1;
3824   const int mainLoop_stepping  = 8;
3825   const int tailLoop_stepping  = 1;
3826   const int log_stepping       = exact_log2(mainLoop_stepping);
3827   const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
3828   const int complexThreshold   = 2*mainLoop_stepping;
3829 
3830   // Don't test for len <= 0 here. This pathological case should not occur anyway.
3831   // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles.
3832   // The situation itself is detected and handled correctly by the conditional branches
3833   // following the length adjustments by +/- mainLoop_stepping below.
3834   assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
3835 
3836   BLOCK_COMMENT("kernel_crc32_2word {");
3837 
3838   nand(crc, crc, crc);                           // ~c
3839 
3840   // Check for short (<mainLoop_stepping) buffer.
3841   cmpdi(CCR0, len, complexThreshold);
3842   blt(CCR0, L_tail);
3843 
3844   // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
3845   // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
3846   {
3847     // Align buf addr to mainLoop_stepping boundary.
3848     neg(tmp2, buf);                           // Calculate # preLoop iterations for alignment.
3849     rldicl(tmp2, tmp2, 0, 64-log_stepping);   // Keep only the low log_stepping bits (bits 61..63), i.e. tmp2 &= mainLoop_stepping-1.
3850 
3851     if (complexThreshold > mainLoop_stepping) {
3852       sub(len, len, tmp2);                       // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
3853     } else {
3854       sub(tmp, len, tmp2);                       // Remaining bytes for main loop.
3855       cmpdi(CCR0, tmp, mainLoop_stepping);
3856       blt(CCR0, L_tail);                         // For less than one mainloop_stepping left, do only tail processing
3857       mr(len, tmp);                              // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
3858     }
3859     update_byteLoop_crc32(crc, buf, tmp2, table, data, false, false);
3860   }
3861 
3862   srdi(tmp2, len, log_stepping);                 // #iterations for mainLoop
3863   andi(len, len, mainLoop_stepping-1);           // remaining bytes for tailLoop
3864   mtctr(tmp2);
3865 
3866 #ifdef VM_LITTLE_ENDIAN
3867   Register crc_rv = crc;
3868 #else
3869   Register crc_rv = tmp;                         // Load_reverse needs separate registers to work on.
3870                                                  // Occupies tmp, but frees up crc.
3871   load_reverse_32(crc_rv, crc);                  // Revert byte order because we are dealing with big-endian data.
3872   tmp = crc;
3873 #endif
3874 
3875   int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
3876 
3877   align(mainLoop_alignment);                     // Octoword-aligned loop address. Shows 2% improvement.
3878   BIND(L_mainLoop);
3879     update_1word_crc32(crc_rv, buf, table, 0, 0, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
3880     update_1word_crc32(crc_rv, buf, table, 4, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
3881     bdnz(L_mainLoop);
3882 
3883 #ifndef VM_LITTLE_ENDIAN
3884   load_reverse_32(crc, crc_rv);                  // Restore the original byte order of crc.
3885   tmp = crc_rv;                                  // Tmp uses its original register again.
3886 #endif
3887 
3888   // Restore original table address for tailLoop.
3889   if (reconstructTableOffset != 0) {
3890     addi(table, table, -reconstructTableOffset);
3891   }
3892 
3893   // Process last few (<complexThreshold) bytes of buffer.
3894   BIND(L_tail);
3895   update_byteLoop_crc32(crc, buf, len, table, data, false, false);
3896 
3897   nand(crc, crc, crc);                           // ~c
3898   BLOCK_COMMENT("} kernel_crc32_2word");
3899 }
3900 
3901 /**
3902  * @param crc   register containing existing CRC (32-bit)
3903  * @param buf   register pointing to input byte buffer (byte*)
3904  * @param len   register containing number of bytes
3905  * @param table register pointing to CRC table
3906  *
3907  * Uses R9..R12 as work registers. Must be saved/restored by caller!
3908  */
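     // Same structure as kernel_crc32_2word above, but folds only one 32-bit word
     // per main-loop iteration (mainLoop_stepping = 4 instead of 8).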
3909 void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table,
3910                                         Register t0,  Register t1,  Register t2,  Register t3,
3911                                         Register tc0, Register tc1, Register tc2, Register tc3) {
3912   assert_different_registers(crc, buf, len, table);
3913 
3914   Label L_mainLoop, L_tail;
3915   Register  tmp          = t0;
3916   Register  data         = t0;
3917   Register  tmp2         = t1;
3918   const int mainLoop_stepping  = 4;
3919   const int tailLoop_stepping  = 1;
3920   const int log_stepping       = exact_log2(mainLoop_stepping);
3921   const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
3922   const int complexThreshold   = 2*mainLoop_stepping;
3923 
3924   // Don't test for len <= 0 here. This pathological case should not occur anyway.
3925   // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles.
3926   // The situation itself is detected and handled correctly by the conditional branches
3927   // following the len adjustments (len -= stepping and len += stepping).
3928   assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
3929 
3930   BLOCK_COMMENT("kernel_crc32_1word {");
3931 
3932   nand(crc, crc, crc);                           // ~c
3933 
3934   // Check for short (<mainLoop_stepping) buffer.
3935   cmpdi(CCR0, len, complexThreshold);
3936   blt(CCR0, L_tail);
3937 
3938   // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
3939   // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
3940   {
3941     // Align buf addr to mainLoop_stepping boundary.
3942     neg(tmp2, buf);                              // Calculate # preLoop iterations for alignment.
3943     rldicl(tmp2, tmp2, 0, 64-log_stepping);      // Keep only the low log_stepping bits (here bits 62..63), i.e. tmp2 &= mainLoop_stepping-1.
3944 
3945     if (complexThreshold > mainLoop_stepping) {
3946       sub(len, len, tmp2);                       // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
3947     } else {
3948       sub(tmp, len, tmp2);                       // Remaining bytes for main loop.
3949       cmpdi(CCR0, tmp, mainLoop_stepping);
3950       blt(CCR0, L_tail);                         // If less than one mainLoop_stepping remains, do only tail processing.
3951       mr(len, tmp);                              // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
3952     }
3953     update_byteLoop_crc32(crc, buf, tmp2, table, data, false, false);
3954   }
3955 
3956   srdi(tmp2, len, log_stepping);                 // #iterations for mainLoop
3957   andi(len, len, mainLoop_stepping-1);           // remaining bytes for tailLoop
3958   mtctr(tmp2);
3959 
3960 #ifdef VM_LITTLE_ENDIAN
3961   Register crc_rv = crc;
3962 #else
3963   Register crc_rv = tmp;                         // Load_reverse needs separate registers to work on.
3964                                                  // Occupies tmp, but frees up crc.
3965   load_reverse_32(crc_rv, crc);                  // Reverse byte order because we are dealing with big-endian data.
3966   tmp = crc;
3967 #endif
3968 
3969   int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
3970 
3971   align(mainLoop_alignment);                     // Octoword-aligned loop address. Shows 2% improvement.
3972   BIND(L_mainLoop);
3973     update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
3974     bdnz(L_mainLoop);
3975 
3976 #ifndef VM_LITTLE_ENDIAN
3977   load_reverse_32(crc, crc_rv);                  // Restore the original byte order of crc.
3978   tmp = crc_rv;                                  // Tmp uses its original register again.
3979 #endif
3980 
3981   // Restore original table address for tailLoop.
3982   if (reconstructTableOffset != 0) {
3983     addi(table, table, -reconstructTableOffset);
3984   }
3985 
3986   // Process last few (<complexThreshold) bytes of buffer.
3987   BIND(L_tail);
3988   update_byteLoop_crc32(crc, buf, len, table, data, false, false);
3989 
3990   nand(crc, crc, crc);                           // ~c
3991   BLOCK_COMMENT("} kernel_crc32_1word");
3992 }
3993 
3994 /**
3995  * @param crc   register containing existing CRC (32-bit)
3996  * @param buf   register pointing to input byte buffer (byte*)
3997  * @param len   register containing number of bytes
3998  * @param table register pointing to CRC table
3999  *
4000  * Uses R7_ARG5, R8_ARG6 as work registers.
4001  */
4002 void MacroAssembler::kernel_crc32_1byte(Register crc, Register buf, Register len, Register table,
4003                                         Register t0,  Register t1,  Register t2,  Register t3) {
4004   assert_different_registers(crc, buf, len, table);
4005 
4006   Register  data = t0;                   // Holds the current byte to be folded into crc.
4007 
4008   BLOCK_COMMENT("kernel_crc32_1byte {");
4009 
4010   // Process all bytes in a single-byte loop.
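       // Per-byte update performed inside the loop (a sketch, assuming the usual
       // byte-reflected CRC-32 table): crc = (crc >> 8) ^ table[(crc ^ *buf++) & 0xff];
       // the crc inversion (nand) that the other kernels do explicitly is handled
       // inside the helper here.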
4011   update_byteLoop_crc32(crc, buf, len, table, data, true, true);
4012 
4013   BLOCK_COMMENT("} kernel_crc32_1byte");
4014 }
4015 
4016 void MacroAssembler::kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp) {
4017   assert_different_registers(crc, buf, /* len,  not used!! */ table, tmp);
4018 
4019   BLOCK_COMMENT("kernel_crc32_singleByte:");
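       // C sketch of this kernel (assuming the usual byte-reflected CRC-32 table):
       //   crc = ~crc;  crc = (crc >> 8) ^ table[(crc ^ buf[0]) & 0xff];  crc = ~crc;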
4020   nand(crc, crc, crc);       // ~c
4021 
4022   lbz(tmp, 0, buf);          // Byte from buffer, zero-extended.
4023   update_byte_crc32(crc, tmp, table);
4024 
4025   nand(crc, crc, crc);       // ~c
4026 }
4027 
4028 // dest_lo += src1 + src2
4029 // dest_hi += carry(dest_lo + src1) + carry(dest_lo + src1 + src2)
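     //
     // Equivalently, a C-level sketch using a 128-bit accumulator:
     //   unsigned __int128 t = ((unsigned __int128)dest_hi << 64) | dest_lo;
     //   t += src1;  t += src2;
     //   dest_lo = (uint64_t)t;  dest_hi = (uint64_t)(t >> 64);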
4030 void MacroAssembler::add2_with_carry(Register dest_hi,
4031                                      Register dest_lo,
4032                                      Register src1, Register src2) {
4033   li(R0, 0);
4034   addc(dest_lo, dest_lo, src1);
4035   adde(dest_hi, dest_hi, R0);
4036   addc(dest_lo, dest_lo, src2);
4037   adde(dest_hi, dest_hi, R0);
4038 }
4039 
4040 // Multiply 64 bit by 64 bit first loop.
4041 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart,
4042                                            Register x_xstart,
4043                                            Register y, Register y_idx,
4044                                            Register z,
4045                                            Register carry,
4046                                            Register product_high, Register product,
4047                                            Register idx, Register kdx,
4048                                            Register tmp) {
4049   //  jlong carry, x[], y[], z[];
4050   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
4051   //    huge_128 product = y[idx] * x[xstart] + carry;
4052   //    z[kdx] = (jlong)product;
4053   //    carry  = (jlong)(product >>> 64);
4054   //  }
4055   //  z[xstart] = carry;
4056 
4057   Label L_first_loop, L_first_loop_exit;
4058   Label L_one_x, L_one_y, L_multiply;
4059 
4060   addic_(xstart, xstart, -1);
4061   blt(CCR0, L_one_x);   // Special case: length of x is 1.
4062 
4063   // Load next two integers of x.
4064   sldi(tmp, xstart, LogBytesPerInt);
4065   ldx(x_xstart, x, tmp);
4066 #ifdef VM_LITTLE_ENDIAN
4067   rldicl(x_xstart, x_xstart, 32, 0);
4068 #endif
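       // On little endian, the 8-byte load returns the two adjacent 32-bit digits
       // with the more significant one in the low half; the rotate by 32 above swaps
       // the halves so the pair reads most-significant-digit-first.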
4069 
4070   align(32, 16);
4071   bind(L_first_loop);
4072 
4073   cmpdi(CCR0, idx, 1);
4074   blt(CCR0, L_first_loop_exit);
4075   addi(idx, idx, -2);
4076   beq(CCR0, L_one_y);
4077 
4078   // Load next two integers of y.
4079   sldi(tmp, idx, LogBytesPerInt);
4080   ldx(y_idx, y, tmp);
4081 #ifdef VM_LITTLE_ENDIAN
4082   rldicl(y_idx, y_idx, 32, 0);
4083 #endif
4084 
4085 
4086   bind(L_multiply);
4087   multiply64(product_high, product, x_xstart, y_idx);
4088 
4089   li(tmp, 0);
4090   addc(product, product, carry);         // Add carry to result.
4091   adde(product_high, product_high, tmp); // Add carry of the last addition.
4092   addi(kdx, kdx, -2);
4093 
4094   // Store result.
4095 #ifdef VM_LITTLE_ENDIAN
4096   rldicl(product, product, 32, 0);
4097 #endif
4098   sldi(tmp, kdx, LogBytesPerInt);
4099   stdx(product, z, tmp);
4100   mr_if_needed(carry, product_high);
4101   b(L_first_loop);
4102 
4103 
4104   bind(L_one_y); // Load one 32 bit portion of y as (0,value).
4105 
4106   lwz(y_idx, 0, y);
4107   b(L_multiply);
4108 
4109 
4110   bind(L_one_x); // Load one 32 bit portion of x as (0,value).
4111 
4112   lwz(x_xstart, 0, x);
4113   b(L_first_loop);
4114 
4115   bind(L_first_loop_exit);
4116 }
4117 
4118 // Multiply 64 bit by 64 bit and add 128 bit.
4119 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y,
4120                                             Register z, Register yz_idx,
4121                                             Register idx, Register carry,
4122                                             Register product_high, Register product,
4123                                             Register tmp, int offset) {
4124 
4125   //  huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
4126   //  z[kdx] = (jlong)product;
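       //  (the new carry, product >>> 64, is left in product_high for the caller)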
4127 
4128   sldi(tmp, idx, LogBytesPerInt);
4129   if (offset) {
4130     addi(tmp, tmp, offset);
4131   }
4132   ldx(yz_idx, y, tmp);
4133 #ifdef VM_LITTLE_ENDIAN
4134   rldicl(yz_idx, yz_idx, 32, 0);
4135 #endif
4136 
4137   multiply64(product_high, product, x_xstart, yz_idx);
4138   ldx(yz_idx, z, tmp);
4139 #ifdef VM_LITTLE_ENDIAN
4140   rldicl(yz_idx, yz_idx, 32, 0);
4141 #endif
4142 
4143   add2_with_carry(product_high, product, carry, yz_idx);
4144 
4145   sldi(tmp, idx, LogBytesPerInt);
4146   if (offset) {
4147     addi(tmp, tmp, offset);
4148   }
4149 #ifdef VM_LITTLE_ENDIAN
4150   rldicl(product, product, 32, 0);
4151 #endif
4152   stdx(product, z, tmp);
4153 }
4154 
4155 // Multiply 128 bit by 128 bit. Unrolled inner loop.
4156 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart,
4157                                              Register y, Register z,
4158                                              Register yz_idx, Register idx, Register carry,
4159                                              Register product_high, Register product,
4160                                              Register carry2, Register tmp) {
4161 
4162   //  jlong carry, x[], y[], z[];
4163   //  int kdx = ystart+1;
4164   //  for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
4165   //    huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
4166   //    z[kdx+idx+1] = (jlong)product;
4167   //    jlong carry2 = (jlong)(product >>> 64);
4168   //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
4169   //    z[kdx+idx] = (jlong)product;
4170   //    carry = (jlong)(product >>> 64);
4171   //  }
4172   //  idx += 2;
4173   //  if (idx > 0) {
4174   //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
4175   //    z[kdx+idx] = (jlong)product;
4176   //    carry = (jlong)(product >>> 64);
4177   //  }
4178 
4179   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
4180   const Register jdx = R0;
4181 
4182   // Scale the index.
4183   srdi_(jdx, idx, 2);
4184   beq(CCR0, L_third_loop_exit);
4185   mtctr(jdx);
4186 
4187   align(32, 16);
4188   bind(L_third_loop);
4189 
4190   addi(idx, idx, -4);
4191 
4192   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8);
4193   mr_if_needed(carry2, product_high);
4194 
4195   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0);
4196   mr_if_needed(carry, product_high);
4197   bdnz(L_third_loop);
4198 
4199   bind(L_third_loop_exit);  // Handle any left-over operand parts.
4200 
4201   andi_(idx, idx, 0x3);
4202   beq(CCR0, L_post_third_loop_done);
4203 
4204   Label L_check_1;
4205 
4206   addic_(idx, idx, -2);
4207   blt(CCR0, L_check_1);
4208 
4209   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0);
4210   mr_if_needed(carry, product_high);
4211 
4212   bind(L_check_1);
4213 
4214   addi(idx, idx, 0x2);
4215   andi_(idx, idx, 0x1);
4216   addic_(idx, idx, -1);
4217   blt(CCR0, L_post_third_loop_done);
4218 
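       // One 32-bit digit remains: multiply it, add the matching z word and the carry,
       // store the low 32 result bits, and rebuild the 64-bit carry as
       // (product_high << 32) | (product >> 32).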
4219   sldi(tmp, idx, LogBytesPerInt);
4220   lwzx(yz_idx, y, tmp);
4221   multiply64(product_high, product, x_xstart, yz_idx);
4222   lwzx(yz_idx, z, tmp);
4223 
4224   add2_with_carry(product_high, product, yz_idx, carry);
4225 
4226   sldi(tmp, idx, LogBytesPerInt);
4227   stwx(product, z, tmp);
4228   srdi(product, product, 32);
4229 
4230   sldi(product_high, product_high, 32);
4231   orr(product, product, product_high);
4232   mr_if_needed(carry, product);
4233 
4234   bind(L_post_third_loop_done);
4235 }   // multiply_128_x_128_loop
4236 
4237 void MacroAssembler::multiply_to_len(Register x, Register xlen,
4238                                      Register y, Register ylen,
4239                                      Register z, Register zlen,
4240                                      Register tmp1, Register tmp2,
4241                                      Register tmp3, Register tmp4,
4242                                      Register tmp5, Register tmp6,
4243                                      Register tmp7, Register tmp8,
4244                                      Register tmp9, Register tmp10,
4245                                      Register tmp11, Register tmp12,
4246                                      Register tmp13) {
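       // This is the BigInteger.multiplyToLen intrinsic: z = x * y, where x, y and z
       // are arrays of 32-bit digits stored most-significant-digit-first. The
       // Java-level loops implemented here are reproduced in the comments below.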
4247 
4248   ShortBranchVerifier sbv(this);
4249 
4250   assert_different_registers(x, xlen, y, ylen, z, zlen,
4251                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
4252   assert_different_registers(x, xlen, y, ylen, z, zlen,
4253                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp7);
4254   assert_different_registers(x, xlen, y, ylen, z, zlen,
4255                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp8);
4256 
4257   const Register idx = tmp1;
4258   const Register kdx = tmp2;
4259   const Register xstart = tmp3;
4260 
4261   const Register y_idx = tmp4;
4262   const Register carry = tmp5;
4263   const Register product = tmp6;
4264   const Register product_high = tmp7;
4265   const Register x_xstart = tmp8;
4266   const Register tmp = tmp9;
4267 
4268   // First Loop.
4269   //
4270   //  final static long LONG_MASK = 0xffffffffL;
4271   //  int xstart = xlen - 1;
4272   //  int ystart = ylen - 1;
4273   //  long carry = 0;
4274   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
4275   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
4276   //    z[kdx] = (int)product;
4277   //    carry = product >>> 32;
4278   //  }
4279   //  z[xstart] = (int)carry;
4280 
4281   mr_if_needed(idx, ylen);        // idx = ylen
4282   mr_if_needed(kdx, zlen);        // kdx = xlen + ylen
4283   li(carry, 0);                   // carry = 0
4284 
4285   Label L_done;
4286 
4287   addic_(xstart, xlen, -1);
4288   blt(CCR0, L_done);
4289 
4290   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z,
4291                         carry, product_high, product, idx, kdx, tmp);
4292 
4293   Label L_second_loop;
4294 
4295   cmpdi(CCR0, kdx, 0);
4296   beq(CCR0, L_second_loop);
4297 
4298   Label L_carry;
4299 
4300   addic_(kdx, kdx, -1);
4301   beq(CCR0, L_carry);
4302 
4303   // Store lower 32 bits of carry.
4304   sldi(tmp, kdx, LogBytesPerInt);
4305   stwx(carry, z, tmp);
4306   srdi(carry, carry, 32);
4307   addi(kdx, kdx, -1);
4308 
4309 
4310   bind(L_carry);
4311 
4312   // Store upper 32 bits of carry.
4313   sldi(tmp, kdx, LogBytesPerInt);
4314   stwx(carry, z, tmp);
4315 
4316   // Second and third (nested) loops.
4317   //
4318   //  for (int i = xstart-1; i >= 0; i--) { // Second loop
4319   //    carry = 0;
4320   //    for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
4321   //      long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
4322   //                     (z[k] & LONG_MASK) + carry;
4323   //      z[k] = (int)product;
4324   //      carry = product >>> 32;
4325   //    }
4326   //    z[i] = (int)carry;
4327   //  }
4328   //
4329   //  i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = x_xstart
4330 
4331   bind(L_second_loop);
4332 
4333   li(carry, 0);                   // carry = 0;
4334 
4335   addic_(xstart, xstart, -1);     // i = xstart-1;
4336   blt(CCR0, L_done);
4337 
4338   Register zsave = tmp10;
4339 
4340   mr(zsave, z);
4341 
4342 
4343   Label L_last_x;
4344 
4345   sldi(tmp, xstart, LogBytesPerInt);
4346   add(z, z, tmp);                 // z = z + k - j
4347   addi(z, z, 4);
4348   addic_(xstart, xstart, -1);     // i = xstart-1;
4349   blt(CCR0, L_last_x);
4350 
4351   sldi(tmp, xstart, LogBytesPerInt);
4352   ldx(x_xstart, x, tmp);
4353 #ifdef VM_LITTLE_ENDIAN
4354   rldicl(x_xstart, x_xstart, 32, 0);
4355 #endif
4356 
4357 
4358   Label L_third_loop_prologue;
4359 
4360   bind(L_third_loop_prologue);
4361 
4362   Register xsave = tmp11;
4363   Register xlensave = tmp12;
4364   Register ylensave = tmp13;
4365 
4366   mr(xsave, x);
4367   mr(xlensave, xstart);
4368   mr(ylensave, ylen);
4369 
4370 
4371   multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen,
4372                           carry, product_high, product, x, tmp);
4373 
4374   mr(z, zsave);
4375   mr(x, xsave);
4376   mr(xlen, xlensave);   // This is the decrement of the loop counter!
4377   mr(ylen, ylensave);
4378 
4379   addi(tmp3, xlen, 1);
4380   sldi(tmp, tmp3, LogBytesPerInt);
4381   stwx(carry, z, tmp);
4382   addic_(tmp3, tmp3, -1);
4383   blt(CCR0, L_done);
4384 
4385   srdi(carry, carry, 32);
4386   sldi(tmp, tmp3, LogBytesPerInt);
4387   stwx(carry, z, tmp);
4388   b(L_second_loop);
4389 
4390   // The following infrequently executed code is placed outside the loops.
4391   bind(L_last_x);
4392 
4393   lwz(x_xstart, 0, x);
4394   b(L_third_loop_prologue);
4395 
4396   bind(L_done);
4397 }   // multiply_to_len
4398 
4399 void MacroAssembler::asm_assert(bool check_equal, const char *msg, int id) {
4400 #ifdef ASSERT
4401   Label ok;
4402   if (check_equal) {
4403     beq(CCR0, ok);
4404   } else {
4405     bne(CCR0, ok);
4406   }
4407   stop(msg, id);
4408   bind(ok);
4409 #endif
4410 }
4411 
4412 void MacroAssembler::asm_assert_mems_zero(bool check_equal, int size, int mem_offset,
4413                                           Register mem_base, const char* msg, int id) {
4414 #ifdef ASSERT
4415   switch (size) {
4416     case 4:
4417       lwz(R0, mem_offset, mem_base);
4418       cmpwi(CCR0, R0, 0);
4419       break;
4420     case 8:
4421       ld(R0, mem_offset, mem_base);
4422       cmpdi(CCR0, R0, 0);
4423       break;
4424     default:
4425       ShouldNotReachHere();
4426   }
4427   asm_assert(check_equal, msg, id);
4428 #endif // ASSERT
4429 }
4430 
4431 void MacroAssembler::verify_thread() {
4432   if (VerifyThread) {
4433     unimplemented("'VerifyThread' currently not implemented on PPC");
4434   }
4435 }
4436 
4437 // Reads: oop. Kills: R0; possibly also volatile floating-point registers.
4438 void MacroAssembler::verify_oop(Register oop, const char* msg) {
4439   if (!VerifyOops) {
4440     return;
4441   }
4442 
4443   address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
4444   const Register tmp = R11; // Will be preserved.
4445   const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
4446   save_volatile_gprs(R1_SP, -nbytes_save); // except R0
4447 
4448   mr_if_needed(R4_ARG2, oop);
4449   save_LR_CR(tmp); // save in old frame
4450   push_frame_reg_args(nbytes_save, tmp);
4451   // load FunctionDescriptor** / entry_address *
4452   load_const_optimized(tmp, fd, R0);
4453   // load FunctionDescriptor* / entry_address
4454   ld(tmp, 0, tmp);
4455   load_const_optimized(R3_ARG1, (address)msg, R0);
4456   // Call destination for its side effect.
4457   call_c(tmp);
4458 
4459   pop_frame();
4460   restore_LR_CR(tmp);
4461   restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
4462 }
4463 
4464 void MacroAssembler::verify_oop_addr(RegisterOrConstant offs, Register base, const char* msg) {
4465   if (!VerifyOops) {
4466     return;
4467   }
4468 
4469   address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
4470   const Register tmp = R11; // Will be preserved.
4471   const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
4472   save_volatile_gprs(R1_SP, -nbytes_save); // except R0
4473 
4474   ld(R4_ARG2, offs, base);
4475   save_LR_CR(tmp); // save in old frame
4476   push_frame_reg_args(nbytes_save, tmp);
4477   // load FunctionDescriptor** / entry_address *
4478   load_const_optimized(tmp, fd, R0);
4479   // load FunctionDescriptor* / entry_address
4480   ld(tmp, 0, tmp);
4481   load_const_optimized(R3_ARG1, (address)msg, R0);
4482   // Call destination for its side effect.
4483   call_c(tmp);
4484 
4485   pop_frame();
4486   restore_LR_CR(tmp);
4487   restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
4488 }
4489 
4490 const char* stop_types[] = {
4491   "stop",
4492   "untested",
4493   "unimplemented",
4494   "shouldnotreachhere"
4495 };
4496 
4497 static void stop_on_request(int tp, const char* msg) {
4498   tty->print("PPC assembly code requires stop: (%s) %s\n", stop_types[tp%/*stop_end*/4], msg);
4499   guarantee(false, "PPC assembly code requires stop: %s", msg);
4500 }
4501 
4502 // Call a C-function that prints output.
4503 void MacroAssembler::stop(int type, const char* msg, int id) {
4504 #ifndef PRODUCT
4505   block_comment(err_msg("stop: %s %s {", stop_types[type%stop_end], msg));
4506 #else
4507   block_comment("stop {");
4508 #endif
4509 
4510   // setup arguments
4511   load_const_optimized(R3_ARG1, type);
4512   load_const_optimized(R4_ARG2, (void *)msg, /*tmp=*/R0);
4513   call_VM_leaf(CAST_FROM_FN_PTR(address, stop_on_request), R3_ARG1, R4_ARG2);
4514   illtrap();
4515   emit_int32(id);
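       // The id emitted right behind the trap presumably allows identifying the
       // stop() site once the illtrap is hit.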
4516   block_comment("} stop;");
4517 }
4518 
4519 #ifndef PRODUCT
4520 // Write the pattern 0x0101010101010101 to the memory region [low-before, high+after] (offsets in words).
4521 // Val, addr are temp registers.
4522 // If low == addr, addr is killed.
4523 // High is preserved.
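     // C sketch (offsets are in machine words):
     //   for (intptr_t* p = (intptr_t*)low - before; p <= (intptr_t*)high + after; p++)
     //     *p = 0x0101010101010101;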
4524 void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) {
4525   if (!ZapMemory) return;
4526 
4527   assert_different_registers(low, val);
4528 
4529   BLOCK_COMMENT("zap memory region {");
4530   load_const_optimized(val, 0x0101010101010101);
4531   int size = before + after;
4532   if (low == high && size < 5 && size > 0) {
4533     int offset = -before*BytesPerWord;
4534     for (int i = 0; i < size; ++i) {
4535       std(val, offset, low);
4536       offset += (1*BytesPerWord);
4537     }
4538   } else {
4539     addi(addr, low, -before*BytesPerWord);
4540     assert_different_registers(high, val);
4541     if (after) addi(high, high, after * BytesPerWord);
4542     Label loop;
4543     bind(loop);
4544     std(val, 0, addr);
4545     addi(addr, addr, 8);
4546     cmpd(CCR6, addr, high);
4547     ble(CCR6, loop);
4548     if (after) addi(high, high, -after * BytesPerWord);  // Correct back to old value.
4549   }
4550   BLOCK_COMMENT("} zap memory region");
4551 }
4552 
4553 #endif // !PRODUCT
4554 
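     // Typical usage (a sketch; 'SomeFlag' is a hypothetical bool flag):
     //   {
     //     SkipIfEqualZero skip_if(masm, temp, &SomeFlag);
     //     // ... code emitted here is branched over at runtime when SomeFlag == 0 ...
     //   } // the destructor binds the skip target here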
4555 SkipIfEqualZero::SkipIfEqualZero(MacroAssembler* masm, Register temp, const bool* flag_addr) : _masm(masm), _label() {
4556   int simm16_offset = masm->load_const_optimized(temp, (address)flag_addr, R0, true);
4557   assert(sizeof(bool) == 1, "PowerPC ABI");
4558   masm->lbz(temp, simm16_offset, temp);
4559   masm->cmpwi(CCR0, temp, 0);
4560   masm->beq(CCR0, _label);
4561 }
4562 
4563 SkipIfEqualZero::~SkipIfEqualZero() {
4564   _masm->bind(_label);
4565 }