1 /*
   2  * Copyright (c) 1997, 2016, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright 2012, 2016 SAP AG. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.inline.hpp"
  28 #include "compiler/disassembler.hpp"
  29 #include "gc/shared/cardTableModRefBS.hpp"
  30 #include "gc/shared/collectedHeap.inline.hpp"
  31 #include "interpreter/interpreter.hpp"
  32 #include "memory/resourceArea.hpp"
  33 #include "nativeInst_ppc.hpp"
  34 #include "prims/methodHandles.hpp"
  35 #include "runtime/biasedLocking.hpp"
  36 #include "runtime/icache.hpp"
  37 #include "runtime/interfaceSupport.hpp"
  38 #include "runtime/objectMonitor.hpp"
  39 #include "runtime/os.hpp"
  40 #include "runtime/sharedRuntime.hpp"
  41 #include "runtime/stubRoutines.hpp"
  42 #include "utilities/macros.hpp"
  43 #if INCLUDE_ALL_GCS
  44 #include "gc/g1/g1CollectedHeap.inline.hpp"
  45 #include "gc/g1/g1SATBCardTableModRefBS.hpp"
  46 #include "gc/g1/heapRegion.hpp"
  47 #endif // INCLUDE_ALL_GCS
  48 
// Emit named block comments into the disassembly in debug builds;
// compiled out entirely in PRODUCT builds.
#ifdef PRODUCT
#define BLOCK_COMMENT(str) // nothing
#else
#define BLOCK_COMMENT(str) block_comment(str)
#endif
// Bind a label and note its name in the disassembly.
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  55 
  56 #ifdef ASSERT
  57 // On RISC, there's no benefit to verifying instruction boundaries.
  58 bool AbstractAssembler::pd_check_instruction_mark() { return false; }
  59 #endif
  60 
  61 void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) {
  62   assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range");
  63   if (Assembler::is_simm(si31, 16)) {
  64     ld(d, si31, a);
  65     if (emit_filler_nop) nop();
  66   } else {
  67     const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31);
  68     const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31);
  69     addis(d, a, hi);
  70     ld(d, lo, d);
  71   }
  72 }
  73 
// Checked variant of ld_largeoffset_unchecked: d must differ from the base
// register a, since the long form clobbers d while building the address.
void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) {
  assert_different_registers(d, a);
  ld_largeoffset_unchecked(d, si31, a, emit_filler_nop);
}
  78 
  79 void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base,
  80                                       size_t size_in_bytes, bool is_signed) {
  81   switch (size_in_bytes) {
  82   case  8:              ld(dst, offs, base);                         break;
  83   case  4:  is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break;
  84   case  2:  is_signed ? lha(dst, offs, base) : lhz(dst, offs, base); break;
  85   case  1:  lbz(dst, offs, base); if (is_signed) extsb(dst, dst);    break; // lba doesn't exist :(
  86   default:  ShouldNotReachHere();
  87   }
  88 }
  89 
  90 void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base,
  91                                        size_t size_in_bytes) {
  92   switch (size_in_bytes) {
  93   case  8:  std(dst, offs, base); break;
  94   case  4:  stw(dst, offs, base); break;
  95   case  2:  sth(dst, offs, base); break;
  96   case  1:  stb(dst, offs, base); break;
  97   default:  ShouldNotReachHere();
  98   }
  99 }
 100 
 101 void MacroAssembler::align(int modulus, int max, int rem) {
 102   int padding = (rem + modulus - (offset() % modulus)) % modulus;
 103   if (padding > max) return;
 104   for (int c = (padding >> 2); c > 0; --c) { nop(); }
 105 }
 106 
 107 // Issue instructions that calculate given TOC from global TOC.
// Issue instructions that calculate given TOC from global TOC.
// Emits up to two instructions (addis for the high half, addi for the low
// half); the exact shape is pattern-matched and patched later by
// patch_calculate_address_from_global_toc_at, so do not change the sequence.
void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16,
                                                       bool add_relocation, bool emit_dummy_addr) {
  // Offset of addr from the global TOC; -1 encodes "address not known yet".
  int offset = -1;
  if (emit_dummy_addr) {
    offset = -128; // dummy address
  } else if (addr != (address)(intptr_t)-1) {
    offset = MacroAssembler::offset_to_global_toc(addr);
  }

  // High half: dst = R29_TOC + hi16(offset).
  if (hi16) {
    addis(dst, R29_TOC, MacroAssembler::largeoffset_si16_si16_hi(offset));
  }
  // Low half: dst = dst + lo16(offset).
  if (lo16) {
    if (add_relocation) {
      // Relocate at the addi to avoid confusion with a load from the method's TOC.
      relocate(internal_word_Relocation::spec(addr));
    }
    addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset));
  }
}
 128 
// Patch a calculate_address_from_global_toc sequence so it produces addr.
// a points at the trailing addi; the matching addis is found by scanning
// backwards, never crossing bound. Returns the distance from the addis to
// addr (used by callers for relocation bookkeeping).
int MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) {
  const int offset = MacroAssembler::offset_to_global_toc(addr);

  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the addi,
  // and the addi reads and writes the same register dst.
  const int dst = inv_rt_field(inst2);
  assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");

  // Now, find the preceding addis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  while (inst1_addr >= bound) {
    inst1 = *(int *) inst1_addr;
    if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
      // Stop, found the addis which writes dst.
      break;
    }
    inst1_addr -= BytesPerInstWord;
  }

  assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
  // Rewrite the two 16-bit immediates with the new offset's halves.
  set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset));
  set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset));
  return (int)((intptr_t)addr - (intptr_t)inst1_addr);
}
 157 
// Decode the address computed by a calculate_address_from_global_toc
// sequence. a points at the trailing addi; the matching addis is found by
// scanning backwards, never crossing bound. Returns (address)-1 when the
// sequence still encodes the "address not known yet" marker.
address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) {
  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the addi,
  // and the addi reads and writes the same register dst.
  const int dst = inv_rt_field(inst2);
  assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");

  // Now, find the preceding addis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  while (inst1_addr >= bound) {
    inst1 = *(int *) inst1_addr;
    if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
      // stop, found the addis which writes dst
      break;
    }
    inst1_addr -= BytesPerInstWord;
  }

  assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");

  // Reassemble the 32-bit offset from the two 16-bit immediates.
  int offset = (get_imm(inst1_addr, 0) << 16) + get_imm(inst2_addr, 0);
  // -1 is a special case
  if (offset == -1) {
    return (address)(intptr_t)-1;
  } else {
    return global_toc() + offset;
  }
}
 189 
 190 #ifdef _LP64
 191 // Patch compressed oops or klass constants.
 192 // Assembler sequence is
 193 // 1) compressed oops:
 194 //    lis  rx = const.hi
 195 //    ori rx = rx | const.lo
 196 // 2) compressed klass:
 197 //    lis  rx = const.hi
 198 //    clrldi rx = rx & 0xFFFFffff // clearMS32b, optional
 199 //    ori rx = rx | const.lo
 200 // Clrldi will be passed by.
// Patch the lis/ori pair of a set_narrow_oop/klass sequence with data.
// a points at the ori; the matching lis is found by scanning backwards,
// never crossing bound (any intervening clrldi is skipped over). Returns
// the byte distance between the two patched instructions.
int MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) {
  assert(UseCompressedOops, "Should only patch compressed oops");

  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the ori,
  // and the ori reads and writes the same register dst.
  const int dst = inv_rta_field(inst2);
  assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
  // Now, find the preceding addis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  bool inst1_found = false;
  while (inst1_addr >= bound) {
    inst1 = *(int *)inst1_addr;
    if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
    inst1_addr -= BytesPerInstWord;
  }
  assert(inst1_found, "inst is not lis");

  // Split the narrow oop into its high and low 16-bit halves.
  int xc = (data >> 16) & 0xffff;
  int xd = (data >>  0) & 0xffff;

  set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo
  set_imm((int *)inst2_addr,        (xd)); // unsigned int
  return (int)((intptr_t)inst2_addr - (intptr_t)inst1_addr);
}
 229 
// Get compressed oop or klass constant.
// Inverse of patch_set_narrow_oop: a points at the ori, bound limits the
// backwards search for the lis; the value is reassembled from the two
// 16-bit immediates.
narrowOop MacroAssembler::get_narrow_oop(address a, address bound) {
  assert(UseCompressedOops, "Should only patch compressed oops");

  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the ori,
  // and the ori reads and writes the same register dst.
  const int dst = inv_rta_field(inst2);
  assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
  // Now, find the preceding lis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  bool inst1_found = false;

  while (inst1_addr >= bound) {
    inst1 = *(int *) inst1_addr;
    if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break;}
    inst1_addr -= BytesPerInstWord;
  }
  assert(inst1_found, "inst is not lis");

  // Low half from the ori, high half from the lis.
  uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff));
  uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16);

  return (int) (xl | xh);
}
 258 #endif // _LP64
 259 
 260 // Returns true if successful.
// Allocate a constant-pool slot for a's value and emit a load of it from
// the method's TOC into dst. fixed_size forces the two-instruction
// (patchable) form of the load. Returns false on constant-pool allocation
// failure, true otherwise.
bool MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a,
                                                Register toc, bool fixed_size) {
  int toc_offset = 0;
  // Use RelocationHolder::none for the constant pool entry, otherwise
  // we will end up with a failing NativeCall::verify(x) where x is
  // the address of the constant pool entry.
  // FIXME: We should insert relocation information for oops at the constant
  // pool entries instead of inserting it at the loads; patching of a constant
  // pool entry should be less expensive.
  address const_address = address_constant((address)a.value(), RelocationHolder::none);
  if (const_address == NULL) { return false; } // allocation failure
  // Relocate at the pc of the load.
  relocate(a.rspec());
  toc_offset = (int)(const_address - code()->consts()->start());
  ld_largeoffset_unchecked(dst, toc_offset, toc, fixed_size);
  return true;
}
 278 
 279 bool MacroAssembler::is_load_const_from_method_toc_at(address a) {
 280   const address inst1_addr = a;
 281   const int inst1 = *(int *)inst1_addr;
 282 
 283    // The relocation points to the ld or the addis.
 284    return (is_ld(inst1)) ||
 285           (is_addis(inst1) && inv_ra_field(inst1) != 0);
 286 }
 287 
// Return the TOC offset encoded by the load_const_from_method_toc sequence
// at a: the ld displacement for the short form, or the combined addis/ld
// immediates for the long form.
int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) {
  assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc");

  const address inst1_addr = a;
  const int inst1 = *(int *)inst1_addr;

  if (is_ld(inst1)) {
    // Short form: the offset is the ld's 16-bit displacement.
    return inv_d1_field(inst1);
  } else if (is_addis(inst1)) {
    const int dst = inv_rt_field(inst1);

    // Now, find the succeeding ld which reads and writes to dst.
    // Note: the search is not bounded; the sequence is expected to contain
    // such an ld (guaranteed by the emitter).
    address inst2_addr = inst1_addr + BytesPerInstWord;
    int inst2 = 0;
    while (true) {
      inst2 = *(int *) inst2_addr;
      if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) {
        // Stop, found the ld which reads and writes dst.
        break;
      }
      inst2_addr += BytesPerInstWord;
    }
    // Combine high half (addis) and low half (ld displacement).
    return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2);
  }
  ShouldNotReachHere();
  return 0;
}
 315 
 316 // Get the constant from a `load_const' sequence.
 317 long MacroAssembler::get_const(address a) {
 318   assert(is_load_const_at(a), "not a load of a constant");
 319   const int *p = (const int*) a;
 320   unsigned long x = (((unsigned long) (get_imm(a,0) & 0xffff)) << 48);
 321   if (is_ori(*(p+1))) {
 322     x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 32);
 323     x |= (((unsigned long) (get_imm(a,3) & 0xffff)) << 16);
 324     x |= (((unsigned long) (get_imm(a,4) & 0xffff)));
 325   } else if (is_lis(*(p+1))) {
 326     x |= (((unsigned long) (get_imm(a,2) & 0xffff)) << 32);
 327     x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 16);
 328     x |= (((unsigned long) (get_imm(a,3) & 0xffff)));
 329   } else {
 330     ShouldNotReachHere();
 331     return (long) 0;
 332   }
 333   return (long) x;
 334 }
 335 
// Patch the 64 bit constant of a `load_const' sequence. This is a low
// level procedure. It neither flushes the instruction cache nor is it
// mt safe.
// Inverse of get_const: the immediate slot layout must match the emitted
// shape, distinguished by the second instruction (ori vs. lis).
void MacroAssembler::patch_const(address a, long x) {
  assert(is_load_const_at(a), "not a load of a constant");
  int *p = (int*) a;
  if (is_ori(*(p+1))) {
    // Serial shape: halves in instruction slots 0, 1, 3, 4.
    set_imm(0 + p, (x >> 48) & 0xffff);
    set_imm(1 + p, (x >> 32) & 0xffff);
    set_imm(3 + p, (x >> 16) & 0xffff);
    set_imm(4 + p, x & 0xffff);
  } else if (is_lis(*(p+1))) {
    // Parallel shape: halves in instruction slots 0, 2, 1, 3.
    set_imm(0 + p, (x >> 48) & 0xffff);
    set_imm(2 + p, (x >> 32) & 0xffff);
    set_imm(1 + p, (x >> 16) & 0xffff);
    set_imm(3 + p, x & 0xffff);
  } else {
    ShouldNotReachHere();
  }
}
 356 
 357 AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) {
 358   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
 359   int index = oop_recorder()->allocate_metadata_index(obj);
 360   RelocationHolder rspec = metadata_Relocation::spec(index);
 361   return AddressLiteral((address)obj, rspec);
 362 }
 363 
 364 AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) {
 365   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
 366   int index = oop_recorder()->find_index(obj);
 367   RelocationHolder rspec = metadata_Relocation::spec(index);
 368   return AddressLiteral((address)obj, rspec);
 369 }
 370 
 371 AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) {
 372   assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
 373   int oop_index = oop_recorder()->allocate_oop_index(obj);
 374   return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
 375 }
 376 
 377 AddressLiteral MacroAssembler::constant_oop_address(jobject obj) {
 378   assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
 379   int oop_index = oop_recorder()->find_index(obj);
 380   return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
 381 }
 382 
// Return *delayed_value_addr + offset as a constant if the delayed value is
// already available; otherwise emit code that loads it at runtime through
// tmp and return tmp as a register operand.
RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
                                                      Register tmp, int offset) {
  intptr_t value = *delayed_value_addr;
  if (value != 0) {
    // Value already computed: fold it into a constant operand.
    return RegisterOrConstant(value + offset);
  }

  // Load indirectly to solve generation ordering problem.
  // static address, no relocation
  int simm16_offset = load_const_optimized(tmp, delayed_value_addr, noreg, true);
  ld(tmp, simm16_offset, tmp); // must be aligned ((xa & 3) == 0)

  if (offset != 0) {
    addi(tmp, tmp, offset);
  }

  return RegisterOrConstant(tmp);
}
 401 
#ifndef PRODUCT
// Debug-printing hook for patched branches; not implemented on this port.
void MacroAssembler::pd_print_patched_instruction(address branch) {
  Unimplemented(); // TODO: PPC port
}
#endif // ndef PRODUCT
 407 
// Conditional far branch for destinations encodable in 24+2 bits.
// Emits exactly two instructions: an inverted short conditional branch that
// skips over an unconditional branch to dest. The shape is pattern-matched
// later by the is_bc_far_variant*_at predicates; do not change it.
void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) {

  // If requested by flag optimize, relocate the bc_far as a
  // runtime_call and prepare for optimizing it when the code gets
  // relocated.
  if (optimize == bc_far_optimize_on_relocate) {
    relocate(relocInfo::runtime_call_type);
  }

  // variant 2:
  //
  //    b!cxx SKIP
  //    bxx   DEST
  //  SKIP:
  //

  // Invert both the condition and the branch hint for the skip branch.
  const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
                                                opposite_bcond(inv_boint_bcond(boint)));

  // We emit two branches.
  // First, a conditional branch which jumps around the far branch.
  const address not_taken_pc = pc() + 2 * BytesPerInstWord;
  const address bc_pc        = pc();
  bc(opposite_boint, biint, not_taken_pc);

  // Verify the emitted encoding round-trips through the decoders.
  const int bc_instr = *(int*)bc_pc;
  assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition");
  assert(opposite_boint == inv_bo_field(bc_instr), "postcondition");
  assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))),
                                     opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))),
         "postcondition");
  assert(biint == inv_bi_field(bc_instr), "postcondition");

  // Second, an unconditional far branch which jumps to dest.
  // Note: target(dest) remembers the current pc (see CodeSection::target)
  //       and returns the current pc if the label is not bound yet; when
  //       the label gets bound, the unconditional far branch will be patched.
  const address target_pc = target(dest);
  const address b_pc  = pc();
  b(target_pc);

  assert(not_taken_pc == pc(),                     "postcondition");
  assert(dest.is_bound() || target_pc == b_pc, "postcondition");
}
 453 
 454 // 1 or 2 instructions
 455 void MacroAssembler::bc_far_optimized(int boint, int biint, Label& dest) {
 456   if (dest.is_bound() && is_within_range_of_bcxx(target(dest), pc())) {
 457     bc(boint, biint, dest);
 458   } else {
 459     bc_far(boint, biint, dest, MacroAssembler::bc_far_optimize_on_relocate);
 460   }
 461 }
 462 
 463 bool MacroAssembler::is_bc_far_at(address instruction_addr) {
 464   return is_bc_far_variant1_at(instruction_addr) ||
 465          is_bc_far_variant2_at(instruction_addr) ||
 466          is_bc_far_variant3_at(instruction_addr);
 467 }
 468 
 469 address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) {
 470   if (is_bc_far_variant1_at(instruction_addr)) {
 471     const address instruction_1_addr = instruction_addr;
 472     const int instruction_1 = *(int*)instruction_1_addr;
 473     return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr);
 474   } else if (is_bc_far_variant2_at(instruction_addr)) {
 475     const address instruction_2_addr = instruction_addr + 4;
 476     return bxx_destination(instruction_2_addr);
 477   } else if (is_bc_far_variant3_at(instruction_addr)) {
 478     return instruction_addr + 8;
 479   }
 480   // variant 4 ???
 481   ShouldNotReachHere();
 482   return NULL;
 483 }
// Patch the destination of the bc_far at instruction_addr to dest,
// re-emitting the two-instruction sequence in place and choosing the
// cheapest applicable variant. Flushes the icache for the patched bytes.
void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) {

  if (is_bc_far_variant3_at(instruction_addr)) {
    // variant 3, far cond branch to the next instruction, already patched to nops:
    //
    //    nop
    //    endgroup
    //  SKIP/DEST:
    //
    return;
  }

  // first, extract boint and biint from the current branch
  int boint = 0;
  int biint = 0;

  ResourceMark rm;
  const int code_size = 2 * BytesPerInstWord;
  // Assemble directly over the existing instructions.
  CodeBuffer buf(instruction_addr, code_size);
  MacroAssembler masm(&buf);
  if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) {
    // Far branch to next instruction: Optimize it by patching nops (produce variant 3).
    masm.nop();
    masm.endgroup();
  } else {
    if (is_bc_far_variant1_at(instruction_addr)) {
      // variant 1, the 1st instruction contains the destination address:
      //
      //    bcxx  DEST
      //    nop
      //
      const int instruction_1 = *(int*)(instruction_addr);
      boint = inv_bo_field(instruction_1);
      biint = inv_bi_field(instruction_1);
    } else if (is_bc_far_variant2_at(instruction_addr)) {
      // variant 2, the 2nd instruction contains the destination address:
      //
      //    b!cxx SKIP
      //    bxx   DEST
      //  SKIP:
      //
      // The stored condition is inverted; invert it back to the original.
      const int instruction_1 = *(int*)(instruction_addr);
      boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))),
          opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1))));
      biint = inv_bi_field(instruction_1);
    } else {
      // variant 4???
      ShouldNotReachHere();
    }

    // second, set the new branch destination and optimize the code
    if (dest != instruction_addr + 4 && // the bc_far is still unbound!
        masm.is_within_range_of_bcxx(dest, instruction_addr)) {
      // variant 1:
      //
      //    bcxx  DEST
      //    nop
      //
      masm.bc(boint, biint, dest);
      masm.nop();
    } else {
      // variant 2:
      //
      //    b!cxx SKIP
      //    bxx   DEST
      //  SKIP:
      //
      const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
                                                    opposite_bcond(inv_boint_bcond(boint)));
      const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord;
      masm.bc(opposite_boint, biint, not_taken_pc);
      masm.b(dest);
    }
  }
  // Make the rewritten instructions visible to the instruction fetcher.
  ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
}
 560 
// Emit a NOT mt-safe patchable 64 bit absolute call/jump.
// Always occupies 7 instruction words so it can later be repatched between
// the pc-relative form (variant 2) and the TOC-relative form (variant 1b).
void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) {
  // get current pc
  uint64_t start_pc = (uint64_t) pc();

  const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last
  const address pc_of_b  = (address) (start_pc + (0*BytesPerInstWord)); // b is first

  // relocate here
  if (rt != relocInfo::none) {
    relocate(rt);
  }

  if ( ReoptimizeCallSequences &&
       (( link && is_within_range_of_b(dest, pc_of_bl)) ||
        (!link && is_within_range_of_b(dest, pc_of_b)))) {
    // variant 2:
    // Emit an optimized, pc-relative call/jump.

    if (link) {
      // some padding
      nop();
      nop();
      nop();
      nop();
      nop();
      nop();

      // do the call
      assert(pc() == pc_of_bl, "just checking");
      bl(dest, relocInfo::none);
    } else {
      // do the jump
      assert(pc() == pc_of_b, "just checking");
      b(dest, relocInfo::none);

      // some padding
      nop();
      nop();
      nop();
      nop();
      nop();
      nop();
    }

    // Assert that we can identify the emitted call/jump.
    assert(is_bxx64_patchable_variant2_at((address)start_pc, link),
           "can't identify emitted call");
  } else {
    // variant 1:
    mr(R0, R11);  // spill R11 -> R0.

    // Load the destination address into CTR,
    // calculate destination relative to global toc.
    calculate_address_from_global_toc(R11, dest, true, true, false);

    mtctr(R11);
    mr(R11, R0);  // spill R11 <- R0.
    nop();

    // do the call/jump
    if (link) {
      bctrl();
    } else{
      bctr();
    }
    // Assert that we can identify the emitted call/jump.
    assert(is_bxx64_patchable_variant1b_at((address)start_pc, link),
           "can't identify emitted call");
  }

  // Assert that we can identify the emitted call/jump.
  assert(is_bxx64_patchable_at((address)start_pc, link),
         "can't identify emitted call");
  assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest,
         "wrong encoding of dest address");
}
 638 
 639 // Identify a bxx64_patchable instruction.
 640 bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) {
 641   return is_bxx64_patchable_variant1b_at(instruction_addr, link)
 642     //|| is_bxx64_patchable_variant1_at(instruction_addr, link)
 643       || is_bxx64_patchable_variant2_at(instruction_addr, link);
 644 }
 645 
// Does the call64_patchable instruction use a pc-relative encoding of
// the call destination?
bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) {
  // variant 2 is pc-relative
  return is_bxx64_patchable_variant2_at(instruction_addr, link);
}
 652 
 653 // Identify variant 1.
 654 bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) {
 655   unsigned int* instr = (unsigned int*) instruction_addr;
 656   return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
 657       && is_mtctr(instr[5]) // mtctr
 658     && is_load_const_at(instruction_addr);
 659 }
 660 
 661 // Identify variant 1b: load destination relative to global toc.
 662 bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) {
 663   unsigned int* instr = (unsigned int*) instruction_addr;
 664   return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
 665     && is_mtctr(instr[3]) // mtctr
 666     && is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr);
 667 }
 668 
 669 // Identify variant 2.
 670 bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) {
 671   unsigned int* instr = (unsigned int*) instruction_addr;
 672   if (link) {
 673     return is_bl (instr[6])  // bl dest is last
 674       && is_nop(instr[0])  // nop
 675       && is_nop(instr[1])  // nop
 676       && is_nop(instr[2])  // nop
 677       && is_nop(instr[3])  // nop
 678       && is_nop(instr[4])  // nop
 679       && is_nop(instr[5]); // nop
 680   } else {
 681     return is_b  (instr[0])  // b  dest is first
 682       && is_nop(instr[1])  // nop
 683       && is_nop(instr[2])  // nop
 684       && is_nop(instr[3])  // nop
 685       && is_nop(instr[4])  // nop
 686       && is_nop(instr[5])  // nop
 687       && is_nop(instr[6]); // nop
 688   }
 689 }
 690 
// Set dest address of a bxx64_patchable instruction.
// Re-emits the whole 7-word sequence in place over the old one (possibly
// switching variants) and flushes the icache for the patched bytes.
void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) {
  ResourceMark rm;
  int code_size = MacroAssembler::bxx64_patchable_size;
  CodeBuffer buf(instruction_addr, code_size);
  MacroAssembler masm(&buf);
  masm.bxx64_patchable(dest, relocInfo::none, link);
  ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
}
 700 
 701 // Get dest address of a bxx64_patchable instruction.
 702 address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) {
 703   if (is_bxx64_patchable_variant1_at(instruction_addr, link)) {
 704     return (address) (unsigned long) get_const(instruction_addr);
 705   } else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) {
 706     unsigned int* instr = (unsigned int*) instruction_addr;
 707     if (link) {
 708       const int instr_idx = 6; // bl is last
 709       int branchoffset = branch_destination(instr[instr_idx], 0);
 710       return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
 711     } else {
 712       const int instr_idx = 0; // b is first
 713       int branchoffset = branch_destination(instr[instr_idx], 0);
 714       return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
 715     }
 716   // Load dest relative to global toc.
 717   } else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) {
 718     return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord,
 719                                                                instruction_addr);
 720   } else {
 721     ShouldNotReachHere();
 722     return NULL;
 723   }
 724 }
 725 
// Uses ordering which corresponds to ABI:
//    _savegpr0_14:  std  r14,-144(r1)
//    _savegpr0_15:  std  r15,-136(r1)
//    _savegpr0_16:  std  r16,-128(r1)
// Spill all nonvolatile GPRs (R14-R31) followed by all nonvolatile FPRs
// (F14-F31) to consecutive 8-byte slots starting at dst + offset.
// restore_nonvolatile_gprs must use the same layout.
void MacroAssembler::save_nonvolatile_gprs(Register dst, int offset) {
  std(R14, offset, dst);   offset += 8;
  std(R15, offset, dst);   offset += 8;
  std(R16, offset, dst);   offset += 8;
  std(R17, offset, dst);   offset += 8;
  std(R18, offset, dst);   offset += 8;
  std(R19, offset, dst);   offset += 8;
  std(R20, offset, dst);   offset += 8;
  std(R21, offset, dst);   offset += 8;
  std(R22, offset, dst);   offset += 8;
  std(R23, offset, dst);   offset += 8;
  std(R24, offset, dst);   offset += 8;
  std(R25, offset, dst);   offset += 8;
  std(R26, offset, dst);   offset += 8;
  std(R27, offset, dst);   offset += 8;
  std(R28, offset, dst);   offset += 8;
  std(R29, offset, dst);   offset += 8;
  std(R30, offset, dst);   offset += 8;
  std(R31, offset, dst);   offset += 8;

  // FP registers
  stfd(F14, offset, dst);   offset += 8;
  stfd(F15, offset, dst);   offset += 8;
  stfd(F16, offset, dst);   offset += 8;
  stfd(F17, offset, dst);   offset += 8;
  stfd(F18, offset, dst);   offset += 8;
  stfd(F19, offset, dst);   offset += 8;
  stfd(F20, offset, dst);   offset += 8;
  stfd(F21, offset, dst);   offset += 8;
  stfd(F22, offset, dst);   offset += 8;
  stfd(F23, offset, dst);   offset += 8;
  stfd(F24, offset, dst);   offset += 8;
  stfd(F25, offset, dst);   offset += 8;
  stfd(F26, offset, dst);   offset += 8;
  stfd(F27, offset, dst);   offset += 8;
  stfd(F28, offset, dst);   offset += 8;
  stfd(F29, offset, dst);   offset += 8;
  stfd(F30, offset, dst);   offset += 8;
  stfd(F31, offset, dst);
}
 769 
// Uses ordering which corresponds to ABI:
//    _restgpr0_14:  ld   r14,-144(r1)
//    _restgpr0_15:  ld   r15,-136(r1)
//    _restgpr0_16:  ld   r16,-128(r1)
// Reload all nonvolatile GPRs (R14-R31) and FPRs (F14-F31) from the layout
// written by save_nonvolatile_gprs, starting at src + offset.
void MacroAssembler::restore_nonvolatile_gprs(Register src, int offset) {
  ld(R14, offset, src);   offset += 8;
  ld(R15, offset, src);   offset += 8;
  ld(R16, offset, src);   offset += 8;
  ld(R17, offset, src);   offset += 8;
  ld(R18, offset, src);   offset += 8;
  ld(R19, offset, src);   offset += 8;
  ld(R20, offset, src);   offset += 8;
  ld(R21, offset, src);   offset += 8;
  ld(R22, offset, src);   offset += 8;
  ld(R23, offset, src);   offset += 8;
  ld(R24, offset, src);   offset += 8;
  ld(R25, offset, src);   offset += 8;
  ld(R26, offset, src);   offset += 8;
  ld(R27, offset, src);   offset += 8;
  ld(R28, offset, src);   offset += 8;
  ld(R29, offset, src);   offset += 8;
  ld(R30, offset, src);   offset += 8;
  ld(R31, offset, src);   offset += 8;

  // FP registers
  lfd(F14, offset, src);   offset += 8;
  lfd(F15, offset, src);   offset += 8;
  lfd(F16, offset, src);   offset += 8;
  lfd(F17, offset, src);   offset += 8;
  lfd(F18, offset, src);   offset += 8;
  lfd(F19, offset, src);   offset += 8;
  lfd(F20, offset, src);   offset += 8;
  lfd(F21, offset, src);   offset += 8;
  lfd(F22, offset, src);   offset += 8;
  lfd(F23, offset, src);   offset += 8;
  lfd(F24, offset, src);   offset += 8;
  lfd(F25, offset, src);   offset += 8;
  lfd(F26, offset, src);   offset += 8;
  lfd(F27, offset, src);   offset += 8;
  lfd(F28, offset, src);   offset += 8;
  lfd(F29, offset, src);   offset += 8;
  lfd(F30, offset, src);   offset += 8;
  lfd(F31, offset, src);
}
 814 
 815 // For verify_oops.
 816 void MacroAssembler::save_volatile_gprs(Register dst, int offset) {
 817   std(R2,  offset, dst);   offset += 8;
 818   std(R3,  offset, dst);   offset += 8;
 819   std(R4,  offset, dst);   offset += 8;
 820   std(R5,  offset, dst);   offset += 8;
 821   std(R6,  offset, dst);   offset += 8;
 822   std(R7,  offset, dst);   offset += 8;
 823   std(R8,  offset, dst);   offset += 8;
 824   std(R9,  offset, dst);   offset += 8;
 825   std(R10, offset, dst);   offset += 8;
 826   std(R11, offset, dst);   offset += 8;
 827   std(R12, offset, dst);   offset += 8;
 828 
 829   stfd(F0, offset, dst);   offset += 8;
 830   stfd(F1, offset, dst);   offset += 8;
 831   stfd(F2, offset, dst);   offset += 8;
 832   stfd(F3, offset, dst);   offset += 8;
 833   stfd(F4, offset, dst);   offset += 8;
 834   stfd(F5, offset, dst);   offset += 8;
 835   stfd(F6, offset, dst);   offset += 8;
 836   stfd(F7, offset, dst);   offset += 8;
 837   stfd(F8, offset, dst);   offset += 8;
 838   stfd(F9, offset, dst);   offset += 8;
 839   stfd(F10, offset, dst);  offset += 8;
 840   stfd(F11, offset, dst);  offset += 8;
 841   stfd(F12, offset, dst);  offset += 8;
 842   stfd(F13, offset, dst);
 843 }
 844 
 845 // For verify_oops.
 846 void MacroAssembler::restore_volatile_gprs(Register src, int offset) {
 847   ld(R2,  offset, src);   offset += 8;
 848   ld(R3,  offset, src);   offset += 8;
 849   ld(R4,  offset, src);   offset += 8;
 850   ld(R5,  offset, src);   offset += 8;
 851   ld(R6,  offset, src);   offset += 8;
 852   ld(R7,  offset, src);   offset += 8;
 853   ld(R8,  offset, src);   offset += 8;
 854   ld(R9,  offset, src);   offset += 8;
 855   ld(R10, offset, src);   offset += 8;
 856   ld(R11, offset, src);   offset += 8;
 857   ld(R12, offset, src);   offset += 8;
 858 
 859   lfd(F0, offset, src);   offset += 8;
 860   lfd(F1, offset, src);   offset += 8;
 861   lfd(F2, offset, src);   offset += 8;
 862   lfd(F3, offset, src);   offset += 8;
 863   lfd(F4, offset, src);   offset += 8;
 864   lfd(F5, offset, src);   offset += 8;
 865   lfd(F6, offset, src);   offset += 8;
 866   lfd(F7, offset, src);   offset += 8;
 867   lfd(F8, offset, src);   offset += 8;
 868   lfd(F9, offset, src);   offset += 8;
 869   lfd(F10, offset, src);  offset += 8;
 870   lfd(F11, offset, src);  offset += 8;
 871   lfd(F12, offset, src);  offset += 8;
 872   lfd(F13, offset, src);
 873 }
 874 
// Save the condition register and link register into their ABI save
// slots of the current frame. Clobbers tmp.
void MacroAssembler::save_LR_CR(Register tmp) {
  mfcr(tmp);
  std(tmp, _abi(cr), R1_SP);
  mflr(tmp);
  std(tmp, _abi(lr), R1_SP);
  // Tmp must contain lr on exit! (see return_addr and prolog in ppc64.ad)
}
 882 
// Restore the link register and condition register from the ABI save
// slots of the current frame. Clobbers tmp (must differ from R1_SP).
void MacroAssembler::restore_LR_CR(Register tmp) {
  assert(tmp != R1_SP, "must be distinct");
  ld(tmp, _abi(lr), R1_SP);
  mtlr(tmp);
  ld(tmp, _abi(cr), R1_SP);
  mtcr(tmp);
}
 890 
// Load the current PC into 'result' by doing a branch-and-link to the
// immediately following instruction and reading LR. Destroys the
// previous LR contents. Returns the code-buffer address corresponding
// to the PC that ends up in 'result'.
address MacroAssembler::get_PC_trash_LR(Register result) {
  Label L;
  bl(L);  // LR := address of the next instruction.
  bind(L);
  address lr_pc = pc();
  mflr(result);
  return lr_pc;
}
 899 
// Resize the current frame by 'offset' bytes (register variant) while
// keeping the back link: the caller's SP is re-stored at the new top of
// stack with a single atomic stdux. 'offset' must be frame-aligned
// (checked under ASSERT). Clobbers tmp.
void MacroAssembler::resize_frame(Register offset, Register tmp) {
#ifdef ASSERT
  assert_different_registers(offset, tmp, R1_SP);
  andi_(tmp, offset, frame::alignment_in_bytes-1);
  asm_assert_eq("resize_frame: unaligned", 0x204);
#endif

  // tmp <- *(SP)
  ld(tmp, _abi(callers_sp), R1_SP);
  // addr <- SP + offset;
  // *(addr) <- tmp;
  // SP <- addr
  stdux(tmp, R1_SP, offset);
}
 914 
// Resize the current frame by a constant 'offset' (must fit in a signed
// 16-bit displacement and be frame-aligned), keeping the back link via
// a single atomic stdu. Clobbers tmp.
void MacroAssembler::resize_frame(int offset, Register tmp) {
  assert(is_simm(offset, 16), "too big an offset");
  assert_different_registers(tmp, R1_SP);
  assert((offset & (frame::alignment_in_bytes-1))==0, "resize_frame: unaligned");
  // tmp <- *(SP)
  ld(tmp, _abi(callers_sp), R1_SP);
  // addr <- SP + offset;
  // *(addr) <- tmp;
  // SP <- addr
  stdu(tmp, offset, R1_SP);
}
 926 
// Resize the current frame so that SP becomes 'addr', keeping the back
// link. Implemented as a relative resize by addr - SP. Clobbers tmp1
// and tmp2; addr may alias either temp.
void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) {
  // (addr == tmp1) || (addr == tmp2) is allowed here!
  assert(tmp1 != tmp2, "must be distinct");

  // compute offset w.r.t. current stack pointer
  // tmp_1 <- addr - SP (!)
  subf(tmp1, R1_SP, addr);

  // atomically update SP keeping back link.
  resize_frame(tmp1/* offset */, tmp2/* tmp */);
}
 938 
// Push a stack frame of size 'bytes' (register variant). The old SP is
// stored at the new top of stack (back link) with a single atomic
// stdux. 'bytes' must be frame-aligned (checked under ASSERT).
// Clobbers tmp (receives the negated size).
void MacroAssembler::push_frame(Register bytes, Register tmp) {
#ifdef ASSERT
  assert(bytes != R0, "r0 not allowed here");
  andi_(R0, bytes, frame::alignment_in_bytes-1);
  asm_assert_eq("push_frame(Reg, Reg): unaligned", 0x203);
#endif
  neg(tmp, bytes);
  stdux(R1_SP, R1_SP, tmp);
}
 948 
// Push a frame of size `bytes'. The size is aligned up to frame
// alignment first. tmp is only clobbered when the negated size does
// not fit in a signed 16-bit displacement.
void MacroAssembler::push_frame(unsigned int bytes, Register tmp) {
  long offset = align_addr(bytes, frame::alignment_in_bytes);
  if (is_simm(-offset, 16)) {
    // Small frame: single stdu with immediate displacement.
    stdu(R1_SP, -offset, R1_SP);
  } else {
    // Large frame: materialize the negated size in tmp.
    load_const_optimized(tmp, -offset);
    stdux(R1_SP, R1_SP, tmp);
  }
}
 959 
// Push a frame of size `bytes' plus abi_reg_args on top.
// tmp may be clobbered (see push_frame(unsigned int, Register)).
void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) {
  push_frame(bytes + frame::abi_reg_args_size, tmp);
}
 964 
// Setup up a new C frame with a spill area for non-volatile GPRs and
// additional space for local variables.
// tmp may be clobbered (see push_frame(unsigned int, Register)).
void MacroAssembler::push_frame_reg_args_nonvolatiles(unsigned int bytes,
                                                      Register tmp) {
  push_frame(bytes + frame::abi_reg_args_size + frame::spill_nonvolatiles_size, tmp);
}
 971 
// Pop current C frame by reloading SP from the back link stored at the
// top of the frame.
void MacroAssembler::pop_frame() {
  ld(R1_SP, _abi(callers_sp), R1_SP);
}
 976 
 977 #if defined(ABI_ELFv2)
// ELFv2: branch (and link, if requested) to the code at
// r_function_entry. The entry is copied to R12 first, per the ELFv2
// convention that R12 holds the entry address at a function's global
// entry point. Updates and returns _last_calls_return_pc.
address MacroAssembler::branch_to(Register r_function_entry, bool and_link) {
  // TODO(asmundak): make sure the caller uses R12 as function descriptor
  // most of the times.
  if (R12 != r_function_entry) {
    mr(R12, r_function_entry);
  }
  mtctr(R12);
  // Do a call or a branch.
  if (and_link) {
    bctrl();
  } else {
    bctr();
  }
  _last_calls_return_pc = pc();

  return _last_calls_return_pc;
}
 995 
// Call a C function via a function descriptor and use full C
// calling conventions. Updates and returns _last_calls_return_pc.
address MacroAssembler::call_c(Register r_function_entry) {
  return branch_to(r_function_entry, /*and_link=*/true);
}
1001 
// For tail calls: only branch, don't link, so callee returns to caller of this function.
// Updates and returns _last_calls_return_pc.
address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) {
  return branch_to(r_function_entry, /*and_link=*/false);
}
1006 
// Call a C function at a constant address. The entry is materialized in
// R12 (R0 as scratch); rt is currently unused on this (ELFv2) path.
address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) {
  load_const(R12, function_entry, R0);
  return branch_to(R12,  /*and_link=*/true);
}
1011 
1012 #else
// Generic version of a call to C function via a function descriptor
// with variable support for C calling conventions (TOC, ENV, etc.).
// Updates and returns _last_calls_return_pc.
// NOTE(review): save_toc_before_call and restore_toc_after_call are not
// referenced in this body.
address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call,
                                  bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) {
  // we emit standard ptrgl glue code here
  assert((function_descriptor != R0), "function_descriptor cannot be R0");

  // retrieve necessary entries from the function descriptor
  ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor);
  mtctr(R0);

  if (load_toc_of_callee) {
    ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor);
  }
  if (load_env_of_callee) {
    ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor);
  } else if (load_toc_of_callee) {
    // TOC loaded but no environment requested: clear R11 so the callee
    // sees a well-defined (null) environment pointer.
    li(R11, 0);
  }

  // do a call or a branch
  if (and_link) {
    bctrl();
  } else {
    bctr();
  }
  _last_calls_return_pc = pc();

  return _last_calls_return_pc;
}
1044 
// Call a C function via a function descriptor and use full C calling
// conventions.
// We don't use the TOC in generated code, so there is no need to save
// and restore its value.
address MacroAssembler::call_c(Register fd) {
  return branch_to(fd, /*and_link=*/true,
                       /*save toc=*/false,
                       /*restore toc=*/false,
                       /*load toc=*/true,
                       /*load env=*/true);
}
1056 
// For tail calls: branch without linking, so the callee returns to the
// caller of this function. Otherwise identical to call_c(Register).
address MacroAssembler::call_c_and_return_to_caller(Register fd) {
  return branch_to(fd, /*and_link=*/false,
                       /*save toc=*/false,
                       /*restore toc=*/false,
                       /*load toc=*/true,
                       /*load env=*/true);
}
1064 
1065 address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) {
1066   if (rt != relocInfo::none) {
1067     // this call needs to be relocatable
1068     if (!ReoptimizeCallSequences
1069         || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
1070         || fd == NULL   // support code-size estimation
1071         || !fd->is_friend_function()
1072         || fd->entry() == NULL) {
1073       // it's not a friend function as defined by class FunctionDescriptor,
1074       // so do a full call-c here.
1075       load_const(R11, (address)fd, R0);
1076 
1077       bool has_env = (fd != NULL && fd->env() != NULL);
1078       return branch_to(R11, /*and_link=*/true,
1079                             /*save toc=*/false,
1080                             /*restore toc=*/false,
1081                             /*load toc=*/true,
1082                             /*load env=*/has_env);
1083     } else {
1084       // It's a friend function. Load the entry point and don't care about
1085       // toc and env. Use an optimizable call instruction, but ensure the
1086       // same code-size as in the case of a non-friend function.
1087       nop();
1088       nop();
1089       nop();
1090       bl64_patchable(fd->entry(), rt);
1091       _last_calls_return_pc = pc();
1092       return _last_calls_return_pc;
1093     }
1094   } else {
1095     // This call does not need to be relocatable, do more aggressive
1096     // optimizations.
1097     if (!ReoptimizeCallSequences
1098       || !fd->is_friend_function()) {
1099       // It's not a friend function as defined by class FunctionDescriptor,
1100       // so do a full call-c here.
1101       load_const(R11, (address)fd, R0);
1102       return branch_to(R11, /*and_link=*/true,
1103                             /*save toc=*/false,
1104                             /*restore toc=*/false,
1105                             /*load toc=*/true,
1106                             /*load env=*/true);
1107     } else {
1108       // it's a friend function, load the entry point and don't care about
1109       // toc and env.
1110       address dest = fd->entry();
1111       if (is_within_range_of_b(dest, pc())) {
1112         bl(dest);
1113       } else {
1114         bl64_patchable(dest, rt);
1115       }
1116       _last_calls_return_pc = pc();
1117       return _last_calls_return_pc;
1118     }
1119   }
1120 }
1121 
// Call a C function.  All constants needed reside in TOC.
//
// Read the address to call from the TOC.
// Read env from TOC, if fd specifies an env.
// Read new TOC from TOC.
// Returns NULL if a needed TOC constant could not be placed, otherwise
// the return PC of the emitted call. Clobbers R11 (and R2_TOC on the
// full-call path).
address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd,
                                         relocInfo::relocType rt, Register toc) {
  if (!ReoptimizeCallSequences
    || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
    || !fd->is_friend_function()) {
    // It's not a friend function as defined by class FunctionDescriptor,
    // so do a full call-c here.
    assert(fd->entry() != NULL, "function must be linked");

    AddressLiteral fd_entry(fd->entry());
    bool success = load_const_from_method_toc(R11, fd_entry, toc, /*fixed_size*/ true);
    mtctr(R11);
    if (fd->env() == NULL) {
      // No environment: pass NULL in R11; the nop keeps the sequence at
      // the same fixed size as the env-loading variant below.
      li(R11, 0);
      nop();
    } else {
      AddressLiteral fd_env(fd->env());
      success = success && load_const_from_method_toc(R11, fd_env, toc, /*fixed_size*/ true);
    }
    AddressLiteral fd_toc(fd->toc());
    // Set R2_TOC (load from toc)
    success = success && load_const_from_method_toc(R2_TOC, fd_toc, toc, /*fixed_size*/ true);
    bctrl();
    _last_calls_return_pc = pc();
    if (!success) { return NULL; }
  } else {
    // It's a friend function, load the entry point and don't care about
    // toc and env. Use an optimizable call instruction, but ensure the
    // same code-size as in the case of a non-friend function.
    nop();
    bl64_patchable(fd->entry(), rt);
    _last_calls_return_pc = pc();
  }
  return _last_calls_return_pc;
}
1162 #endif // ABI_ELFv2
1163 
// Common code for calls into the VM runtime: records the last Java
// frame, passes the current thread as the first C argument, performs
// the call, and fetches the oop result (if any) from the thread.
// check_exceptions == true is not supported here (ShouldNotReachHere).
void MacroAssembler::call_VM_base(Register oop_result,
                                  Register last_java_sp,
                                  address  entry_point,
                                  bool     check_exceptions) {
  BLOCK_COMMENT("call_VM {");
  // Determine last_java_sp register.
  if (!last_java_sp->is_valid()) {
    last_java_sp = R1_SP;  // Default to the current SP.
  }
  set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1);

  // ARG1 must hold thread address.
  mr(R3_ARG1, R16_thread);
#if defined(ABI_ELFv2)
  address return_pc = call_c(entry_point, relocInfo::none);
#else
  address return_pc = call_c((FunctionDescriptor*)entry_point, relocInfo::none);
#endif

  reset_last_Java_frame();

  // Check for pending exceptions.
  if (check_exceptions) {
    // We don't check for exceptions here.
    ShouldNotReachHere();
  }

  // Get oop result if there is one and reset the value in the thread.
  if (oop_result->is_valid()) {
    get_vm_result(oop_result);
  }

  _last_calls_return_pc = return_pc;
  BLOCK_COMMENT("} call_VM");
}
1199 
// Call a VM leaf routine (no last-Java-frame bookkeeping, no exception
// check, no thread argument).
void MacroAssembler::call_VM_leaf_base(address entry_point) {
  BLOCK_COMMENT("call_VM_leaf {");
#if defined(ABI_ELFv2)
  call_c(entry_point, relocInfo::none);
#else
  call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, entry_point), relocInfo::none);
#endif
  BLOCK_COMMENT("} call_VM_leaf");
}
1209 
// Call into the VM with no explicit arguments; the thread is passed
// implicitly (see call_VM_base).
void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions) {
  call_VM_base(oop_result, noreg, entry_point, check_exceptions);
}
1213 
// Call into the VM with one argument (placed in R4_ARG2).
void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread.
  mr_if_needed(R4_ARG2, arg_1);
  call_VM(oop_result, entry_point, check_exceptions);
}
1220 
// Call into the VM with two arguments (R4_ARG2, R5_ARG3). Arguments are
// moved in order; the assert guards against a later move clobbering an
// earlier one.
void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread
  mr_if_needed(R4_ARG2, arg_1);
  assert(arg_2 != R4_ARG2, "smashed argument");
  mr_if_needed(R5_ARG3, arg_2);
  call_VM(oop_result, entry_point, check_exceptions);
}
1229 
// Call into the VM with three arguments (R4_ARG2, R5_ARG3, R6_ARG4).
// NOTE(review): unlike the leaf variant below, arg_3 is not asserted
// distinct from the earlier argument registers.
void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread
  mr_if_needed(R4_ARG2, arg_1);
  assert(arg_2 != R4_ARG2, "smashed argument");
  mr_if_needed(R5_ARG3, arg_2);
  mr_if_needed(R6_ARG4, arg_3);
  call_VM(oop_result, entry_point, check_exceptions);
}
1239 
// Call a VM leaf routine with no arguments.
void MacroAssembler::call_VM_leaf(address entry_point) {
  call_VM_leaf_base(entry_point);
}
1243 
// Call a VM leaf routine with one argument (placed in R3_ARG1).
void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) {
  mr_if_needed(R3_ARG1, arg_1);
  call_VM_leaf(entry_point);
}
1248 
// Call a VM leaf routine with two arguments (R3_ARG1, R4_ARG2).
void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) {
  mr_if_needed(R3_ARG1, arg_1);
  assert(arg_2 != R3_ARG1, "smashed argument");
  mr_if_needed(R4_ARG2, arg_2);
  call_VM_leaf(entry_point);
}
1255 
// Call a VM leaf routine with three arguments (R3_ARG1..R5_ARG3).
// Each assert guards against a later move clobbering an earlier one.
void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) {
  mr_if_needed(R3_ARG1, arg_1);
  assert(arg_2 != R3_ARG1, "smashed argument");
  mr_if_needed(R4_ARG2, arg_2);
  assert(arg_3 != R3_ARG1 && arg_3 != R4_ARG2, "smashed argument");
  mr_if_needed(R5_ARG3, arg_3);
  call_VM_leaf(entry_point);
}
1264 
// Check whether instruction is a read access to the polling page
// which was emitted by load_from_polling_page(..).
// If a ucontext is given, also verifies that the base register points
// at the polling page; otherwise assumes so. On success, optionally
// stores the polled address through polling_address_ptr.
bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext,
                                               address* polling_address_ptr) {
  if (!is_ld(instruction))
    return false; // It's not a ld. Fail.

  int rt = inv_rt_field(instruction);
  int ra = inv_ra_field(instruction);
  int ds = inv_ds_field(instruction);
  if (!(ds == 0 && ra != 0 && rt == 0)) {
    return false; // It's not a ld(r0, X, ra). Fail.
  }

  if (!ucontext) {
    // Set polling address.
    if (polling_address_ptr != NULL) {
      *polling_address_ptr = NULL;
    }
    return true; // No ucontext given. Can't check value of ra. Assume true.
  }

#ifdef LINUX
  // Ucontext given. Check that register ra contains the address of
  // the safepoint polling page.
  ucontext_t* uc = (ucontext_t*) ucontext;
  // Set polling address.
  address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds;
  if (polling_address_ptr != NULL) {
    *polling_address_ptr = addr;
  }
  return os::is_poll_address(addr);
#else
  // Not on Linux, ucontext must be NULL.
  ShouldNotReachHere();
  return false;
#endif
}
1303 
// Check whether instruction is a write access to the thread's memory
// serialization page (stw/stwu/stwx/stwux whose effective address,
// reconstructed from the ucontext registers, hits that page).
// Linux-only: requires ucontext register access.
bool MacroAssembler::is_memory_serialization(int instruction, JavaThread* thread, void* ucontext) {
#ifdef LINUX
  ucontext_t* uc = (ucontext_t*) ucontext;

  if (is_stwx(instruction) || is_stwux(instruction)) {
    // Indexed form: effective address = (ra) + (rb).
    int ra = inv_ra_field(instruction);
    int rb = inv_rb_field(instruction);

    // look up content of ra and rb in ucontext
    address ra_val=(address)uc->uc_mcontext.regs->gpr[ra];
    long rb_val=(long)uc->uc_mcontext.regs->gpr[rb];
    return os::is_memory_serialize_page(thread, ra_val+rb_val);
  } else if (is_stw(instruction) || is_stwu(instruction)) {
    // Displacement form: effective address = (ra) + d.
    int ra = inv_ra_field(instruction);
    int d1 = inv_d1_field(instruction);

    // look up content of ra in ucontext
    address ra_val=(address)uc->uc_mcontext.regs->gpr[ra];
    return os::is_memory_serialize_page(thread, ra_val+d1);
  } else {
    return false;
  }
#else
  // workaround not needed on !LINUX :-)
  ShouldNotCallThis();
  return false;
#endif
}
1332 
// Touch the stack page at SP-offset to trigger a fault early if the
// page is protected (stack-overflow detection). Clobbers R0, and R11
// when the offset needs the two-instruction (addis + ld/std) form.
void MacroAssembler::bang_stack_with_offset(int offset) {
  // When increasing the stack, the old stack pointer will be written
  // to the new top of stack according to the PPC64 abi.
  // Therefore, stack banging is not necessary when increasing
  // the stack by <= os::vm_page_size() bytes.
  // When increasing the stack by a larger amount, this method is
  // called repeatedly to bang the intermediate pages.

  // Stack grows down, caller passes positive offset.
  assert(offset > 0, "must bang with positive offset");

  long stdoffset = -offset;

  if (is_simm(stdoffset, 16)) {
    // Signed 16 bit offset, a simple std is ok.
    if (UseLoadInstructionsForStackBangingPPC64) {
      ld(R0, (int)(signed short)stdoffset, R1_SP);
    } else {
      std(R0,(int)(signed short)stdoffset, R1_SP);
    }
  } else if (is_simm(stdoffset, 31)) {
    // Split the 31-bit offset into high/low 16-bit halves.
    const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset);
    const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset);

    Register tmp = R11;
    addis(tmp, R1_SP, hi);
    if (UseLoadInstructionsForStackBangingPPC64) {
      ld(R0,  lo, tmp);
    } else {
      std(R0, lo, tmp);
    }
  } else {
    ShouldNotReachHere();
  }
}
1368 
// If instruction is a stack bang of the form
//    ld     R0,    x(Ry),       (see bang_stack_with_offset(), load variant)
//    std    R0,    x(Ry),       (see bang_stack_with_offset())
//    stdu   R1_SP, x(R1_SP),    (see push_frame(), resize_frame())
// or stdux  R1_SP, Rx, R1_SP    (see push_frame(), resize_frame())
// return the banged address (reconstructed from the ucontext registers).
// Otherwise, return NULL. Linux-only.
address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) {
#ifdef LINUX
  ucontext_t* uc = (ucontext_t*) ucontext;
  int rs = inv_rs_field(instruction);
  int ra = inv_ra_field(instruction);
  if (   (is_ld(instruction)   && rs == 0 &&  UseLoadInstructionsForStackBangingPPC64)
      || (is_std(instruction)  && rs == 0 && !UseLoadInstructionsForStackBangingPPC64)
      || (is_stdu(instruction) && rs == 1)) {
    int ds = inv_ds_field(instruction);
    // return banged address
    return ds+(address)uc->uc_mcontext.regs->gpr[ra];
  } else if (is_stdux(instruction) && rs == 1) {
    // stdux only banged the stack if it grew the stack (negative rb)
    // with R1_SP as base.
    int rb = inv_rb_field(instruction);
    address sp = (address)uc->uc_mcontext.regs->gpr[1];
    long rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
    return ra != 1 || rb_val >= 0 ? NULL         // not a stack bang
                                  : sp + rb_val; // banged address
  }
  return NULL; // not a stack bang
#else
  // workaround not needed on !LINUX :-)
  ShouldNotCallThis();
  return NULL;
#endif
}
1399 
// CmpxchgX sets condition register to cmpX(current, compare).
//
// 32-bit compare-and-exchange on *addr_base, emulated with a
// lwarx/stwcx. reservation loop:
//   dest_current_value  receives the value read from memory
//   compare_value       value expected in memory
//   exchange_value      value written on success
//   semantics           combination of MemBarRel / MemBarAcq /
//                       MemBarFenceAfter bits
//   int_flag_success    optional register set to 1 on success, 0 on failure
//   contention_hint     if true, do a plain load+compare first and skip
//                       the reservation when it already mismatches
// On exit, 'flag' is eq iff the exchange happened.
void MacroAssembler::cmpxchgw(ConditionRegister flag, Register dest_current_value,
                              Register compare_value, Register exchange_value,
                              Register addr_base, int semantics, bool cmpxchgx_hint,
                              Register int_flag_success, bool contention_hint) {
  Label retry;
  Label failed;
  Label done;

  // Save one branch if result is returned via register and
  // result register is different from the other ones.
  bool use_result_reg    = (int_flag_success != noreg);
  bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value &&
                            int_flag_success != exchange_value && int_flag_success != addr_base);

  if (use_result_reg && preset_result_reg) {
    li(int_flag_success, 0); // preset (assume cas failed)
  }

  // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
  if (contention_hint) { // Don't try to reserve if cmp fails.
    lwz(dest_current_value, 0, addr_base);
    cmpw(flag, dest_current_value, compare_value);
    bne(flag, failed);
  }

  // release/fence semantics
  if (semantics & MemBarRel) {
    release();
  }

  // atomic emulation loop
  bind(retry);

  lwarx(dest_current_value, addr_base, cmpxchgx_hint);
  cmpw(flag, dest_current_value, compare_value);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(flag, failed);
  } else {
    bne(                  flag, failed);
  }
  // branch to done  => (flag == ne), (dest_current_value != compare_value)
  // fall through    => (flag == eq), (dest_current_value == compare_value)

  stwcx_(exchange_value, addr_base);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
  } else {
    bne(                  CCR0, retry); // StXcx_ sets CCR0.
  }
  // fall through    => (flag == eq), (dest_current_value == compare_value), (swapped)

  // Result in register (must do this at the end because int_flag_success can be the
  // same register as one above).
  if (use_result_reg) {
    li(int_flag_success, 1);
  }

  if (semantics & MemBarFenceAfter) {
    fence();
  } else if (semantics & MemBarAcq) {
    isync();
  }

  if (use_result_reg && !preset_result_reg) {
    b(done);
  }

  bind(failed);
  if (use_result_reg && !preset_result_reg) {
    li(int_flag_success, 0);
  }

  bind(done);
  // (flag == ne) => (dest_current_value != compare_value), (!swapped)
  // (flag == eq) => (dest_current_value == compare_value), ( swapped)
}
1477 
// Performs atomic compare exchange:
//   if (compare_value == *addr_base)
//     *addr_base = exchange_value
//     int_flag_success = 1;
//   else
//     int_flag_success = 0;
//
// ConditionRegister flag       = cmp(compare_value, *addr_base)
// Register dest_current_value  = *addr_base
// Register compare_value       Used to compare with value in memory
// Register exchange_value      Written to memory if compare_value == *addr_base
// Register addr_base           The memory location to compareXChange
// Register int_flag_success    Set to 1 if exchange_value was written to *addr_base
// Label* failed_ext            Optional external failure label (mutually
//                              exclusive with int_flag_success, see assert)
//
// To avoid the costly compare exchange the value is tested beforehand.
// Several special cases exist to avoid that unnecessary information is generated.
//
// 64-bit variant of cmpxchgw, emulated with a ldarx/stdcx. loop.
void MacroAssembler::cmpxchgd(ConditionRegister flag,
                              Register dest_current_value, RegisterOrConstant compare_value, Register exchange_value,
                              Register addr_base, int semantics, bool cmpxchgx_hint,
                              Register int_flag_success, Label* failed_ext, bool contention_hint) {
  Label retry;
  Label failed_int;
  Label& failed = (failed_ext != NULL) ? *failed_ext : failed_int;
  Label done;

  // Save one branch if result is returned via register and result register is different from the other ones.
  bool use_result_reg    = (int_flag_success!=noreg);
  bool preset_result_reg = (int_flag_success!=dest_current_value && int_flag_success!=compare_value.register_or_noreg() &&
                            int_flag_success!=exchange_value && int_flag_success!=addr_base);
  assert(int_flag_success == noreg || failed_ext == NULL, "cannot have both");

  if (use_result_reg && preset_result_reg) {
    li(int_flag_success, 0); // preset (assume cas failed)
  }

  // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
  if (contention_hint) { // Don't try to reserve if cmp fails.
    ld(dest_current_value, 0, addr_base);
    cmpd(flag, compare_value, dest_current_value);
    bne(flag, failed);
  }

  // release/fence semantics
  if (semantics & MemBarRel) {
    release();
  }

  // atomic emulation loop
  bind(retry);

  ldarx(dest_current_value, addr_base, cmpxchgx_hint);
  cmpd(flag, compare_value, dest_current_value);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(flag, failed);
  } else {
    bne(                  flag, failed);
  }

  stdcx_(exchange_value, addr_base);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
  } else {
    bne(                  CCR0, retry); // stXcx_ sets CCR0
  }

  // result in register (must do this at the end because int_flag_success can be the same register as one above)
  if (use_result_reg) {
    li(int_flag_success, 1);
  }

  if (semantics & MemBarFenceAfter) {
    fence();
  } else if (semantics & MemBarAcq) {
    isync();
  }

  if (use_result_reg && !preset_result_reg) {
    b(done);
  }

  bind(failed_int);
  if (use_result_reg && !preset_result_reg) {
    li(int_flag_success, 0);
  }

  bind(done);
  // (flag == ne) => (dest_current_value != compare_value), (!swapped)
  // (flag == eq) => (dest_current_value == compare_value), ( swapped)
}
1568 
1569 // Look up the method for a megamorphic invokeinterface call.
1570 // The target method is determined by <intf_klass, itable_index>.
1571 // The receiver klass is in recv_klass.
1572 // On success, the result will be in method_result, and execution falls through.
1573 // On failure, execution transfers to the given label.
void MacroAssembler::lookup_interface_method(Register recv_klass,
                                             Register intf_klass,
                                             RegisterOrConstant itable_index,
                                             Register method_result,
                                             Register scan_temp,
                                             Register sethi_temp,
                                             Label& L_no_such_interface) {
  assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);
  assert(itable_index.is_constant() || itable_index.as_register() == method_result,
         "caller must use same register for non-constant itable index as for method");

  // Compute start of first itableOffsetEntry (which is at the end of the vtable).
  int vtable_base = InstanceKlass::vtable_start_offset() * wordSize;
  int itentry_off = itableMethodEntry::method_offset_in_bytes();
  int logMEsize   = exact_log2(itableMethodEntry::size() * wordSize);
  int scan_step   = itableOffsetEntry::size() * wordSize;
  int log_vte_size= exact_log2(vtableEntry::size() * wordSize);

  lwz(scan_temp, InstanceKlass::vtable_length_offset() * wordSize, recv_klass);
  // %%% We should store the aligned, prescaled offset in the klassoop.
  // Then the next several instructions would fold away.

  // scan_temp = recv_klass + vtable_base + (vtable_length << log_vte_size),
  // i.e. the address of the first itableOffsetEntry.
  sldi(scan_temp, scan_temp, log_vte_size);
  addi(scan_temp, scan_temp, vtable_base);
  add(scan_temp, recv_klass, scan_temp);

  // Adjust recv_klass by scaled itable_index, so we can free itable_index.
  // After this, recv_klass + scan->offset() points directly at the wanted
  // itableMethodEntry (recv_klass is destroyed here).
  if (itable_index.is_register()) {
    Register itable_offset = itable_index.as_register();
    sldi(itable_offset, itable_offset, logMEsize);
    if (itentry_off) addi(itable_offset, itable_offset, itentry_off);
    add(recv_klass, itable_offset, recv_klass);
  } else {
    long itable_offset = (long)itable_index.as_constant();
    load_const_optimized(sethi_temp, (itable_offset<<logMEsize)+itentry_off); // static address, no relocation
    add(recv_klass, sethi_temp, recv_klass);
  }

  // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
  //   if (scan->interface() == intf) {
  //     result = (klass + scan->offset() + itable_index);
  //   }
  // }
  Label search, found_method;

  // The loop is peeled once so that a hit on the very first itable entry
  // (the common case) falls straight through to found_method.
  for (int peel = 1; peel >= 0; peel--) {
    // %%%% Could load both offset and interface in one ldx, if they were
    // in the opposite order. This would save a load.
    ld(method_result, itableOffsetEntry::interface_offset_in_bytes(), scan_temp);

    // Check that this entry is non-null. A null entry means that
    // the receiver class doesn't implement the interface, and wasn't the
    // same as when the caller was compiled.
    cmpd(CCR0, method_result, intf_klass);

    if (peel) {
      beq(CCR0, found_method);
    } else {
      bne(CCR0, search);
      // (invert the test to fall through to found_method...)
    }

    if (!peel) break;

    bind(search);

    // A null interface slot terminates the scan: the interface is not
    // implemented by the receiver class.
    cmpdi(CCR0, method_result, 0);
    beq(CCR0, L_no_such_interface);
    addi(scan_temp, scan_temp, scan_step);
  }

  bind(found_method);

  // Got a hit. scan_temp points at the matching itableOffsetEntry; load its
  // offset and fetch the Method* relative to the adjusted recv_klass.
  int ito_offset = itableOffsetEntry::offset_offset_in_bytes();
  lwz(scan_temp, ito_offset, scan_temp);
  ldx(method_result, scan_temp, recv_klass);
}
1652 
1653 // virtual method calling
// Load the Method* at the given vtable index of recv_klass.
// Clobbers recv_klass (advanced by the scaled index) and, for a register
// index, scales vtable_index's register in place.
void MacroAssembler::lookup_virtual_method(Register recv_klass,
                                           RegisterOrConstant vtable_index,
                                           Register method_result) {

  assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg());

  const int base = InstanceKlass::vtable_start_offset() * wordSize;
  // Each vtable entry is exactly one word, so index << LogBytesPerWord scales it.
  assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");

  if (vtable_index.is_register()) {
    sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord);
    add(recv_klass, vtable_index.as_register(), recv_klass);
  } else {
    addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord);
  }
  // NOTE(review): the result is loaded into R19_method (the PPC method
  // register), not into the method_result parameter — confirm that all
  // callers pass method_result == R19_method.
  ld(R19_method, base + vtableEntry::method_offset_in_bytes(), recv_klass);
}
1671 
1672 /////////////////////////////////////////// subtype checking ////////////////////////////////////////////
1673 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
1674                                                    Register super_klass,
1675                                                    Register temp1_reg,
1676                                                    Register temp2_reg,
1677                                                    Label* L_success,
1678                                                    Label* L_failure,
1679                                                    Label* L_slow_path,
1680                                                    RegisterOrConstant super_check_offset) {
1681 
1682   const Register check_cache_offset = temp1_reg;
1683   const Register cached_super       = temp2_reg;
1684 
1685   assert_different_registers(sub_klass, super_klass, check_cache_offset, cached_super);
1686 
1687   int sco_offset = in_bytes(Klass::super_check_offset_offset());
1688   int sc_offset  = in_bytes(Klass::secondary_super_cache_offset());
1689 
1690   bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
1691   bool need_slow_path = (must_load_sco || super_check_offset.constant_or_zero() == sco_offset);
1692 
1693   Label L_fallthrough;
1694   int label_nulls = 0;
1695   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
1696   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
1697   if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
1698   assert(label_nulls <= 1 ||
1699          (L_slow_path == &L_fallthrough && label_nulls <= 2 && !need_slow_path),
1700          "at most one NULL in the batch, usually");
1701 
1702   // If the pointers are equal, we are done (e.g., String[] elements).
1703   // This self-check enables sharing of secondary supertype arrays among
1704   // non-primary types such as array-of-interface. Otherwise, each such
1705   // type would need its own customized SSA.
1706   // We move this check to the front of the fast path because many
1707   // type checks are in fact trivially successful in this manner,
1708   // so we get a nicely predicted branch right at the start of the check.
1709   cmpd(CCR0, sub_klass, super_klass);
1710   beq(CCR0, *L_success);
1711 
1712   // Check the supertype display:
1713   if (must_load_sco) {
1714     // The super check offset is always positive...
1715   lwz(check_cache_offset, sco_offset, super_klass);
1716     super_check_offset = RegisterOrConstant(check_cache_offset);
1717     // super_check_offset is register.
1718     assert_different_registers(sub_klass, super_klass, cached_super, super_check_offset.as_register());
1719   }
1720   // The loaded value is the offset from KlassOopDesc.
1721 
1722   ld(cached_super, super_check_offset, sub_klass);
1723   cmpd(CCR0, cached_super, super_klass);
1724 
1725   // This check has worked decisively for primary supers.
1726   // Secondary supers are sought in the super_cache ('super_cache_addr').
1727   // (Secondary supers are interfaces and very deeply nested subtypes.)
1728   // This works in the same check above because of a tricky aliasing
1729   // between the super_cache and the primary super display elements.
1730   // (The 'super_check_addr' can address either, as the case requires.)
1731   // Note that the cache is updated below if it does not help us find
1732   // what we need immediately.
1733   // So if it was a primary super, we can just fail immediately.
1734   // Otherwise, it's the slow path for us (no success at this point).
1735 
1736 #define FINAL_JUMP(label) if (&(label) != &L_fallthrough) { b(label); }
1737 
1738   if (super_check_offset.is_register()) {
1739     beq(CCR0, *L_success);
1740     cmpwi(CCR0, super_check_offset.as_register(), sc_offset);
1741     if (L_failure == &L_fallthrough) {
1742       beq(CCR0, *L_slow_path);
1743     } else {
1744       bne(CCR0, *L_failure);
1745       FINAL_JUMP(*L_slow_path);
1746     }
1747   } else {
1748     if (super_check_offset.as_constant() == sc_offset) {
1749       // Need a slow path; fast failure is impossible.
1750       if (L_slow_path == &L_fallthrough) {
1751         beq(CCR0, *L_success);
1752       } else {
1753         bne(CCR0, *L_slow_path);
1754         FINAL_JUMP(*L_success);
1755       }
1756     } else {
1757       // No slow path; it's a fast decision.
1758       if (L_failure == &L_fallthrough) {
1759         beq(CCR0, *L_success);
1760       } else {
1761         bne(CCR0, *L_failure);
1762         FINAL_JUMP(*L_success);
1763       }
1764     }
1765   }
1766 
1767   bind(L_fallthrough);
1768 #undef FINAL_JUMP
1769 }
1770 
// Slow-path subtype check: linearly scans sub_klass' secondary supers array
// for super_klass. On a hit, super_klass is stored into the secondary super
// cache. If result_reg is provided, it is set to 0 on hit and 1 on miss.
// If neither L_success nor result_reg is provided, returns via blr with
// CR0.eq indicating the result. Clobbers temp1_reg, temp2_reg, CCR0 and CTR.
void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp1_reg,
                                                   Register temp2_reg,
                                                   Label* L_success,
                                                   Register result_reg) {
  const Register array_ptr = temp1_reg; // current value from cache array
  const Register temp      = temp2_reg;

  assert_different_registers(sub_klass, super_klass, array_ptr, temp);

  int source_offset = in_bytes(Klass::secondary_supers_offset());
  int target_offset = in_bytes(Klass::secondary_super_cache_offset());

  int length_offset = Array<Klass*>::length_offset_in_bytes();
  int base_offset   = Array<Klass*>::base_offset_in_bytes();

  Label hit, loop, failure, fallthru;

  ld(array_ptr, source_offset, sub_klass);

  // TODO: PPC port: assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated.");
  lwz(temp, length_offset, array_ptr);
  cmpwi(CCR0, temp, 0);
  beq(CCR0, result_reg!=noreg ? failure : fallthru); // length 0

  // The array length drives the loop via the count register.
  mtctr(temp); // load ctr

  bind(loop);
  // Oops in table are NO MORE compressed.
  ld(temp, base_offset, array_ptr);
  cmpd(CCR0, temp, super_klass);
  beq(CCR0, hit);
  addi(array_ptr, array_ptr, BytesPerWord);
  bdnz(loop);

  bind(failure);
  if (result_reg!=noreg) li(result_reg, 1); // load non-zero result (indicates a miss)
  b(fallthru);

  bind(hit);
  std(super_klass, target_offset, sub_klass); // save result to cache
  if (result_reg != noreg) { li(result_reg, 0); } // load zero result (indicates a hit)
  if (L_success != NULL) { b(*L_success); }
  else if (result_reg == noreg) { blr(); } // return with CR0.eq if neither label nor result reg provided

  bind(fallthru);
}
1819 
1820 // Try fast path, then go to slow one if not successful
1821 void MacroAssembler::check_klass_subtype(Register sub_klass,
1822                          Register super_klass,
1823                          Register temp1_reg,
1824                          Register temp2_reg,
1825                          Label& L_success) {
1826   Label L_failure;
1827   check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success, &L_failure);
1828   check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success);
1829   bind(L_failure); // Fallthru if not successful.
1830 }
1831 
// Verify that the MethodType of the method handle in mh_reg equals
// mtype_reg; branch to wrong_method_type if they differ.
// Clobbers temp_reg and CCR0.
void MacroAssembler::check_method_handle_type(Register mtype_reg, Register mh_reg,
                                              Register temp_reg,
                                              Label& wrong_method_type) {
  assert_different_registers(mtype_reg, mh_reg, temp_reg);
  // Compare method type against that of the receiver.
  // The field offset is a delayed value, patched once MethodHandle offsets
  // are computed at runtime.
  load_heap_oop_not_null(temp_reg, delayed_value(java_lang_invoke_MethodHandle::type_offset_in_bytes, temp_reg), mh_reg);
  cmpd(CCR0, temp_reg, mtype_reg);
  bne(CCR0, wrong_method_type);
}
1841 
1842 RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot,
1843                                                    Register temp_reg,
1844                                                    int extra_slot_offset) {
1845   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
1846   int stackElementSize = Interpreter::stackElementSize;
1847   int offset = extra_slot_offset * stackElementSize;
1848   if (arg_slot.is_constant()) {
1849     offset += arg_slot.as_constant() * stackElementSize;
1850     return offset;
1851   } else {
1852     assert(temp_reg != noreg, "must specify");
1853     sldi(temp_reg, arg_slot.as_register(), exact_log2(stackElementSize));
1854     if (offset != 0)
1855       addi(temp_reg, temp_reg, offset);
1856     return temp_reg;
1857   }
1858 }
1859 
1860 // Supports temp2_reg = R0.
// Emit the biased-locking fast path for lock acquisition.
// On entry mark_reg holds the object's mark word. If the lock is (or can be
// made) biased toward the current thread, branches to 'done'. If a CAS on
// the mark word fails, branches to slow_case (or to done when no slow_case
// is given). Otherwise falls through to 'cas_label' for the normal
// CAS-based locking. Clobbers temp_reg, temp2_reg, cr_reg and CCR0.
void MacroAssembler::biased_locking_enter(ConditionRegister cr_reg, Register obj_reg,
                                          Register mark_reg, Register temp_reg,
                                          Register temp2_reg, Label& done, Label* slow_case) {
  assert(UseBiasedLocking, "why call this otherwise?");

#ifdef ASSERT
  assert_different_registers(obj_reg, mark_reg, temp_reg, temp2_reg);
#endif

  Label cas_label;

  // Branch to done if fast path fails and no slow_case provided.
  Label *slow_case_int = (slow_case != NULL) ? slow_case : &done;

  // Biased locking
  // See whether the lock is currently biased toward our thread and
  // whether the epoch is still valid
  // Note that the runtime guarantees sufficient alignment of JavaThread
  // pointers to allow age to be placed into low bits
  assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits,
         "biased locking makes assumptions about bit layout");

  if (PrintBiasedLockingStatistics) {
    // Non-atomic increment of the total entry counter (statistics only).
    load_const(temp2_reg, (address) BiasedLocking::total_entry_count_addr(), temp_reg);
    lwzx(temp_reg, temp2_reg);
    addi(temp_reg, temp_reg, 1);
    stwx(temp_reg, temp2_reg);
  }

  // If the mark word does not carry the biased-lock pattern at all, go
  // straight to the normal CAS-based locking.
  andi(temp_reg, mark_reg, markOopDesc::biased_lock_mask_in_place);
  cmpwi(cr_reg, temp_reg, markOopDesc::biased_lock_pattern);
  bne(cr_reg, cas_label);

  load_klass(temp_reg, obj_reg);

  // temp_reg := (prototype_header | current thread) XOR mark, with the age
  // bits masked out. Zero means: already biased to us with a valid epoch.
  load_const_optimized(temp2_reg, ~((int) markOopDesc::age_mask_in_place));
  ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
  orr(temp_reg, R16_thread, temp_reg);
  xorr(temp_reg, mark_reg, temp_reg);
  andr(temp_reg, temp_reg, temp2_reg);
  cmpdi(cr_reg, temp_reg, 0);
  if (PrintBiasedLockingStatistics) {
    Label l;
    bne(cr_reg, l);
    load_const(temp2_reg, (address) BiasedLocking::biased_lock_entry_count_addr());
    lwzx(mark_reg, temp2_reg);
    addi(mark_reg, mark_reg, 1);
    stwx(mark_reg, temp2_reg);
    // restore mark_reg
    ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg);
    bind(l);
  }
  beq(cr_reg, done);

  Label try_revoke_bias;
  Label try_rebias;

  // At this point we know that the header has the bias pattern and
  // that we are not the bias owner in the current epoch. We need to
  // figure out more details about the state of the header in order to
  // know what operations can be legally performed on the object's
  // header.

  // If the low three bits in the xor result aren't clear, that means
  // the prototype header is no longer biased and we have to revoke
  // the bias on this object.
  andi(temp2_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
  cmpwi(cr_reg, temp2_reg, 0);
  bne(cr_reg, try_revoke_bias);

  // Biasing is still enabled for this data type. See whether the
  // epoch of the current bias is still valid, meaning that the epoch
  // bits of the mark word are equal to the epoch bits of the
  // prototype header. (Note that the prototype header's epoch bits
  // only change at a safepoint.) If not, attempt to rebias the object
  // toward the current thread. Note that we must be absolutely sure
  // that the current epoch is invalid in order to do this because
  // otherwise the manipulations it performs on the mark word are
  // illegal.

  int shift_amount = 64 - markOopDesc::epoch_shift;
  // rotate epoch bits to right (little) end and set other bits to 0
  // [ big part | epoch | little part ] -> [ 0..0 | epoch ]
  rldicl_(temp2_reg, temp_reg, shift_amount, 64 - markOopDesc::epoch_bits);
  // branch if epoch bits are != 0, i.e. they differ, because the epoch has been incremented
  bne(CCR0, try_rebias);

  // The epoch of the current bias is still valid but we know nothing
  // about the owner; it might be set or it might be clear. Try to
  // acquire the bias of the object using an atomic operation. If this
  // fails we will go in to the runtime to revoke the object's bias.
  // Note that we first construct the presumed unbiased header so we
  // don't accidentally blow away another thread's valid bias.
  andi(mark_reg, mark_reg, (markOopDesc::biased_lock_mask_in_place |
                                markOopDesc::age_mask_in_place |
                                markOopDesc::epoch_mask_in_place));
  orr(temp_reg, R16_thread, mark_reg);

  assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

  // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg).
  cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
           /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
           /*where=*/obj_reg,
           MacroAssembler::MemBarAcq,
           MacroAssembler::cmpxchgx_hint_acquire_lock(),
           noreg, slow_case_int); // bail out if failed

  // If the biasing toward our thread failed, this means that
  // another thread succeeded in biasing it toward itself and we
  // need to revoke that bias. The revocation will occur in the
  // interpreter runtime in the slow case.
  if (PrintBiasedLockingStatistics) {
    load_const(temp2_reg, (address) BiasedLocking::anonymously_biased_lock_entry_count_addr(), temp_reg);
    lwzx(temp_reg, temp2_reg);
    addi(temp_reg, temp_reg, 1);
    stwx(temp_reg, temp2_reg);
  }
  b(done);

  bind(try_rebias);
  // At this point we know the epoch has expired, meaning that the
  // current "bias owner", if any, is actually invalid. Under these
  // circumstances _only_, we are allowed to use the current header's
  // value as the comparison value when doing the cas to acquire the
  // bias in the current epoch. In other words, we allow transfer of
  // the bias from one thread to another directly in this situation.
  load_klass(temp_reg, obj_reg);
  andi(temp2_reg, mark_reg, markOopDesc::age_mask_in_place);
  orr(temp2_reg, R16_thread, temp2_reg);
  ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
  orr(temp_reg, temp2_reg, temp_reg);

  assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

  cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
                 /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
                 /*where=*/obj_reg,
                 MacroAssembler::MemBarAcq,
                 MacroAssembler::cmpxchgx_hint_acquire_lock(),
                 noreg, slow_case_int); // bail out if failed

  // If the biasing toward our thread failed, this means that
  // another thread succeeded in biasing it toward itself and we
  // need to revoke that bias. The revocation will occur in the
  // interpreter runtime in the slow case.
  if (PrintBiasedLockingStatistics) {
    load_const(temp2_reg, (address) BiasedLocking::rebiased_lock_entry_count_addr(), temp_reg);
    lwzx(temp_reg, temp2_reg);
    addi(temp_reg, temp_reg, 1);
    stwx(temp_reg, temp2_reg);
  }
  b(done);

  bind(try_revoke_bias);
  // The prototype mark in the klass doesn't have the bias bit set any
  // more, indicating that objects of this data type are not supposed
  // to be biased any more. We are going to try to reset the mark of
  // this object to the prototype value and fall through to the
  // CAS-based locking scheme. Note that if our CAS fails, it means
  // that another thread raced us for the privilege of revoking the
  // bias of this particular object, so it's okay to continue in the
  // normal locking code.
  load_klass(temp_reg, obj_reg);
  ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
  andi(temp2_reg, mark_reg, markOopDesc::age_mask_in_place);
  orr(temp_reg, temp_reg, temp2_reg);

  assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

  // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg).
  // Note: no bail-out registers here — failure is tolerated (see below).
  cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
                 /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
                 /*where=*/obj_reg,
                 MacroAssembler::MemBarAcq,
                 MacroAssembler::cmpxchgx_hint_acquire_lock());

  // reload markOop in mark_reg before continuing with lightweight locking
  ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg);

  // Fall through to the normal CAS-based lock, because no matter what
  // the result of the above CAS, some thread must have succeeded in
  // removing the bias bit from the object's header.
  if (PrintBiasedLockingStatistics) {
    Label l;
    bne(cr_reg, l);
    load_const(temp2_reg, (address) BiasedLocking::revoked_lock_entry_count_addr(), temp_reg);
    lwzx(temp_reg, temp2_reg);
    addi(temp_reg, temp_reg, 1);
    stwx(temp_reg, temp2_reg);
    bind(l);
  }

  bind(cas_label);
}
2056 
// Emit the biased-locking check for lock release: if the mark word (loaded
// from mark_addr) carries the biased-lock pattern, unlocking is a no-op and
// control branches to 'done'. Clobbers temp_reg and cr_reg.
void MacroAssembler::biased_locking_exit (ConditionRegister cr_reg, Register mark_addr, Register temp_reg, Label& done) {
  // Check for biased locking unlock case, which is a no-op
  // Note: we do not have to check the thread ID for two reasons.
  // First, the interpreter checks for IllegalMonitorStateException at
  // a higher level. Second, if the bias was revoked while we held the
  // lock, the object could not be rebiased toward another thread, so
  // the bias bit would be clear.

  ld(temp_reg, 0, mark_addr);
  andi(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);

  cmpwi(cr_reg, temp_reg, markOopDesc::biased_lock_pattern);
  beq(cr_reg, done);
}
2071 
2072 // allocation (for C1)
void MacroAssembler::eden_allocate(
  Register obj,                      // result: pointer to object after successful allocation
  Register var_size_in_bytes,        // object size in bytes if unknown at compile time; invalid otherwise
  int      con_size_in_bytes,        // object size in bytes if   known at compile time
  Register t1,                       // temp register
  Register t2,                       // temp register
  Label&   slow_case                 // continuation point if fast allocation fails
) {
  // No inline eden allocation on this platform: always take the slow path.
  b(slow_case);
}
2083 
// Bump-pointer allocation from the current thread's TLAB; on success obj
// holds the old top and tlab_top is advanced, otherwise branches to
// slow_case. Clobbers t1, R0 and CCR0.
void MacroAssembler::tlab_allocate(
  Register obj,                      // result: pointer to object after successful allocation
  Register var_size_in_bytes,        // object size in bytes if unknown at compile time; invalid otherwise
  int      con_size_in_bytes,        // object size in bytes if   known at compile time
  Register t1,                       // temp register
  Label&   slow_case                 // continuation point if fast allocation fails
) {
  // make sure arguments make sense
  assert_different_registers(obj, var_size_in_bytes, t1);
  assert(0 <= con_size_in_bytes && is_simm13(con_size_in_bytes), "illegal object size");
  assert((con_size_in_bytes & MinObjAlignmentInBytesMask) == 0, "object size is not multiple of alignment");

  const Register new_top = t1;
  //verify_tlab(); not implemented

  // obj = current TLAB top; R0 = TLAB end.
  ld(obj, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
  ld(R0, in_bytes(JavaThread::tlab_end_offset()), R16_thread);
  // new_top = obj + size (constant or variable).
  if (var_size_in_bytes == noreg) {
    addi(new_top, obj, con_size_in_bytes);
  } else {
    add(new_top, obj, var_size_in_bytes);
  }
  // Slow path if the allocation would overrun the TLAB end.
  cmpld(CCR0, new_top, R0);
  bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_case);

#ifdef ASSERT
  // make sure new free pointer is properly aligned
  {
    Label L;
    andi_(R0, new_top, MinObjAlignmentInBytesMask);
    beq(CCR0, L);
    stop("updated TLAB free is not properly aligned", 0x934);
    bind(L);
  }
#endif // ASSERT

  // update the tlab top pointer
  std(new_top, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
  //verify_tlab(); not implemented
}
// Not implemented on this platform; traps via unimplemented() if reached.
void MacroAssembler::tlab_refill(Label& retry_tlab, Label& try_eden, Label& slow_case) {
  unimplemented("tlab_refill");
}
// Not implemented on this platform; traps via unimplemented() if reached.
void MacroAssembler::incr_allocated_bytes(RegisterOrConstant size_in_bytes, Register t1, Register t2) {
  unimplemented("incr_allocated_bytes");
}
2130 
// Emit a trampoline stub that loads the real call destination from the
// constant pool (TOC slot at destination_toc_offset) and jumps to it via
// CTR. The stub is tied to the call instruction at
// insts_call_instruction_offset through a trampoline_stub relocation.
// Returns the stub's address, or NULL if the code cache is full.
address MacroAssembler::emit_trampoline_stub(int destination_toc_offset,
                                             int insts_call_instruction_offset, Register Rtoc) {
  // Start the stub.
  address stub = start_a_stub(64);
  if (stub == NULL) { return NULL; } // CodeCache full: bail out

  // Create a trampoline stub relocation which relates this trampoline stub
  // with the call instruction at insts_call_instruction_offset in the
  // instructions code-section.
  relocate(trampoline_stub_Relocation::spec(code()->insts()->start() + insts_call_instruction_offset));
  const int stub_start_offset = offset();

  // For java_to_interp stubs we use R11_scratch1 as scratch register
  // and in call trampoline stubs we use R12_scratch2. This way we
  // can distinguish them (see is_NativeCallTrampolineStub_at()).
  Register reg_scratch = R12_scratch2;

  // Now, create the trampoline stub's code:
  // - load the TOC
  // - load the call target from the constant pool
  // - call
  if (Rtoc == noreg) {
    calculate_address_from_global_toc(reg_scratch, method_toc());
    Rtoc = reg_scratch;
  }

  ld_largeoffset_unchecked(reg_scratch, destination_toc_offset, Rtoc, false);
  mtctr(reg_scratch);
  bctr();

  const address stub_start_addr = addr_at(stub_start_offset);

  // Assert that the encoded destination_toc_offset can be identified and that it is correct.
  assert(destination_toc_offset == NativeCallTrampolineStub_at(stub_start_addr)->destination_toc_offset(),
         "encoded offset into the constant pool must match");
  // Trampoline_stub_size should be good.
  assert((uint)(offset() - stub_start_offset) <= trampoline_stub_size, "should be good size");
  assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");

  // End the stub.
  end_a_stub();
  return stub;
}
2174 
2175 // TM on PPC64.
// Atomically add simm16 to the 64-bit word at addr using a ldarx/stdcx.
// retry loop. The new (stored) value is left in result. Sets CCR0 (by
// stdcx_); no memory barriers are emitted.
void MacroAssembler::atomic_inc_ptr(Register addr, Register result, int simm16) {
  Label retry;
  bind(retry);
  ldarx(result, addr, /*hint*/ false);
  addi(result, result, simm16);
  stdcx_(result, addr);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
  } else {
    bne(                  CCR0, retry); // stXcx_ sets CCR0
  }
}
2188 
// Atomically OR uimm16 into the 32-bit word at addr using a lwarx/stwcx.
// retry loop. The new (stored) value is left in result. Sets CCR0 (by
// stwcx_); no memory barriers are emitted.
void MacroAssembler::atomic_ori_int(Register addr, Register result, int uimm16) {
  Label retry;
  bind(retry);
  lwarx(result, addr, /*hint*/ false);
  ori(result, result, uimm16);
  stwcx_(result, addr);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
  } else {
    bne(                  CCR0, retry); // stXcx_ sets CCR0
  }
}
2201 
2202 #if INCLUDE_RTM_OPT
2203 
2204 // Update rtm_counters based on abort status
2205 // input: abort_status
2206 //        rtm_counters (RTMLockingCounters*)
// Increment the RTM abort counter and, when PrintPreciseRTMLockingStatistics
// is set, the per-cause abortX counters selected by the failure bits in
// abort_status. Counter updates are not atomic (statistics only).
// rtm_counters_Reg is used as a temp but restored before returning;
// R0 and CCR0 are clobbered.
void MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters_Reg) {
  // Mapping to keep PreciseRTMLockingStatistics similar to x86.
  // x86 ppc (! means inverted, ? means not the same)
  //  0   31  Set if abort caused by XABORT instruction.
  //  1  ! 7  If set, the transaction may succeed on a retry. This bit is always clear if bit 0 is set.
  //  2   13  Set if another logical processor conflicted with a memory address that was part of the transaction that aborted.
  //  3   10  Set if an internal buffer overflowed.
  //  4  ?12  Set if a debug breakpoint was hit.
  //  5  ?32  Set if an abort occurred during execution of a nested transaction.
  const  int tm_failure_bit[] = {Assembler::tm_tabort, // Note: Seems like signal handler sets this, too.
                                 Assembler::tm_failure_persistent, // inverted: transient
                                 Assembler::tm_trans_cf,
                                 Assembler::tm_footprint_of,
                                 Assembler::tm_non_trans_cf,
                                 Assembler::tm_suspended};
  const bool tm_failure_inv[] = {false, true, false, false, false, false};
  assert(sizeof(tm_failure_bit)/sizeof(int) == RTMLockingCounters::ABORT_STATUS_LIMIT, "adapt mapping!");

  const Register addr_Reg = R0;
  // Keep track of offset to where rtm_counters_Reg had pointed to.
  int counters_offs = RTMLockingCounters::abort_count_offset();
  addi(addr_Reg, rtm_counters_Reg, counters_offs);
  const Register temp_Reg = rtm_counters_Reg;

  //atomic_inc_ptr(addr_Reg, temp_Reg); We don't increment atomically
  ldx(temp_Reg, addr_Reg);
  addi(temp_Reg, temp_Reg, 1);
  stdx(temp_Reg, addr_Reg);

  if (PrintPreciseRTMLockingStatistics) {
    int counters_offs_delta = RTMLockingCounters::abortX_count_offset() - counters_offs;

    //mftexasr(abort_status); done by caller
    for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
      // Advance addr_Reg to the next counter; the first delta skips from
      // abort_count to abortX_count, subsequent deltas are one word.
      counters_offs += counters_offs_delta;
      li(temp_Reg, counters_offs_delta); // can't use addi with R0
      add(addr_Reg, addr_Reg, temp_Reg); // point to next counter
      counters_offs_delta = sizeof(uintx);

      Label check_abort;
      // Isolate failure bit i of abort_status into the MSB and test it.
      rldicr_(temp_Reg, abort_status, tm_failure_bit[i], 0);
      if (tm_failure_inv[i]) {
        bne(CCR0, check_abort);
      } else {
        beq(CCR0, check_abort);
      }
      //atomic_inc_ptr(addr_Reg, temp_Reg); We don't increment atomically
      ldx(temp_Reg, addr_Reg);
      addi(temp_Reg, temp_Reg, 1);
      stdx(temp_Reg, addr_Reg);
      bind(check_abort);
    }
  }
  // Restore rtm_counters_Reg to its original value.
  li(temp_Reg, -counters_offs); // can't use addi with R0
  add(rtm_counters_Reg, addr_Reg, temp_Reg); // restore
}
2263 
2264 // Branch if (random & (count-1) != 0), count is 2^n
2265 // tmp and CR0 are killed
2266 void MacroAssembler::branch_on_random_using_tb(Register tmp, int count, Label& brLabel) {
2267   mftb(tmp);
2268   andi_(tmp, tmp, count-1);
2269   bne(CCR0, brLabel);
2270 }
2271 
// Perform abort ratio calculation, set no_rtm bit if high ratio.
// May alternatively set the "always rtm" state in the MDO if enough
// transactions completed successfully.
// input:  rtm_counters_Reg (RTMLockingCounters* address) - KILLED
void MacroAssembler::rtm_abort_ratio_calculation(Register rtm_counters_Reg,
                                                 RTMLockingCounters* rtm_counters,
                                                 Metadata* method_data) {
  Label L_done, L_check_always_rtm1, L_check_always_rtm2;

  if (RTMLockingCalculationDelay > 0) {
    // Delay calculation.
    // Skip everything while the calculation flag (stored at
    // rtm_calculation_flag_addr, set elsewhere once the delay has elapsed)
    // is still zero.
    ld(rtm_counters_Reg, (RegisterOrConstant)(intptr_t)RTMLockingCounters::rtm_calculation_flag_addr());
    cmpdi(CCR0, rtm_counters_Reg, 0);
    beq(CCR0, L_done);
    load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload
  }
  // Abort ratio calculation only if abort_count > RTMAbortThreshold.
  //   Aborted transactions = abort_count * 100
  //   All transactions = total_count *  RTMTotalCountIncrRate
  //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
  ld(R0, RTMLockingCounters::abort_count_offset(), rtm_counters_Reg);
  cmpdi(CCR0, R0, RTMAbortThreshold);
  blt(CCR0, L_check_always_rtm2); // Too few aborts to judge; check "always rtm" instead.
  mulli(R0, R0, 100); // Scale aborts for the percentage comparison below.

  const Register tmpReg = rtm_counters_Reg; // Reuse; the reload paths restore it.
  ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg);
  mulli(tmpReg, tmpReg, RTMTotalCountIncrRate);
  mulli(tmpReg, tmpReg, RTMAbortRatio);
  cmpd(CCR0, R0, tmpReg);
  blt(CCR0, L_check_always_rtm1); // jump to reload
  if (method_data != NULL) {
    // Set rtm_state to "no rtm" in MDO.
    // Not using a metadata relocation. Method and Class Loader are kept alive anyway.
    // (See nmethod::metadata_do and CodeBuffer::finalize_oop_references.)
    load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg);
    atomic_ori_int(R0, tmpReg, NoRTM);
  }
  b(L_done);

  bind(L_check_always_rtm1);
  load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload
  bind(L_check_always_rtm2);
  // Abort ratio is acceptable: if enough transactions were executed,
  // permanently enable RTM for this lock site.
  ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg);
  cmpdi(CCR0, tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
  blt(CCR0, L_done);
  if (method_data != NULL) {
    // Set rtm_state to "always rtm" in MDO.
    // Not using a metadata relocation. See above.
    load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg);
    atomic_ori_int(R0, tmpReg, UseRTM);
  }
  bind(L_done);
}
2324 
2325 // Update counters and perform abort ratio calculation.
2326 // input: abort_status_Reg
2327 void MacroAssembler::rtm_profiling(Register abort_status_Reg, Register temp_Reg,
2328                                    RTMLockingCounters* rtm_counters,
2329                                    Metadata* method_data,
2330                                    bool profile_rtm) {
2331 
2332   assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2333   // Update rtm counters based on state at abort.
2334   // Reads abort_status_Reg, updates flags.
2335   assert_different_registers(abort_status_Reg, temp_Reg);
2336   load_const_optimized(temp_Reg, (address)rtm_counters, R0);
2337   rtm_counters_update(abort_status_Reg, temp_Reg);
2338   if (profile_rtm) {
2339     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2340     rtm_abort_ratio_calculation(temp_Reg, rtm_counters, method_data);
2341   }
2342 }
2343 
// Retry on abort if abort's status indicates non-persistent failure.
// inputs: retry_count_Reg
//       : abort_status_Reg
// output: retry_count_Reg decremented by 1
// R0 and CR0 are killed.
void MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg,
                                             Label& retryLabel, Label* checkRetry) {
  Label doneRetry;
  // Don't retry if the failure is flagged persistent in the abort status.
  rldicr_(R0, abort_status_Reg, tm_failure_persistent, 0);
  bne(CCR0, doneRetry);
  if (checkRetry) { bind(*checkRetry); } // Optional entry that skips the persistence check.
  addic_(retry_count_Reg, retry_count_Reg, -1); // Decrement; CR0 gets the sign of the result.
  blt(CCR0, doneRetry);                         // Retry budget exhausted.
  smt_yield(); // Can't use wait(). No permission (SIGILL).
  b(retryLabel);
  bind(doneRetry);
}
2360 
// Spin and retry if lock is busy.
// inputs: owner_addr_Reg (address of the monitor's owner field)
//       : retry_count_Reg
// output: retry_count_Reg decremented by 1
// CTR is killed
void MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register owner_addr_Reg, Label& retryLabel) {
  Label SpinLoop, doneRetry;
  addic_(retry_count_Reg, retry_count_Reg, -1); // Decrement; CR0 gets the sign of the result.
  blt(CCR0, doneRetry);                         // Retry budget exhausted.
  li(R0, RTMSpinLoopCount);
  mtctr(R0); // Bound the spin with the count register.

  bind(SpinLoop);
  smt_yield(); // Can't use waitrsv(). No permission (SIGILL).
  bdz(retryLabel);           // Spun RTMSpinLoopCount times: give up spinning and retry.
  ld(R0, 0, owner_addr_Reg); // Keep spinning while the monitor has an owner.
  cmpdi(CCR0, R0, 0);
  bne(CCR0, SpinLoop);
  b(retryLabel);             // Owner became NULL: retry the transaction.

  bind(doneRetry);
}
2383 
// Use RTM for normal stack locks.
// Input: objReg (object to lock)
// Branches to IsInflated if the mark word has the monitor bit set, to
// DONE_LABEL on transactional lock success; otherwise falls through to
// the caller's regular locking path.
// mark_word, tmp and retry_on_abort_count_Reg are killed.
void MacroAssembler::rtm_stack_locking(ConditionRegister flag,
                                       Register obj, Register mark_word, Register tmp,
                                       Register retry_on_abort_count_Reg,
                                       RTMLockingCounters* stack_rtm_counters,
                                       Metadata* method_data, bool profile_rtm,
                                       Label& DONE_LABEL, Label& IsInflated) {
  assert(UseRTMForStackLocks, "why call this otherwise?");
  assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
  Label L_rtm_retry, L_decrement_retry, L_on_abort;

  if (RTMRetryCount > 0) {
    load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
    bind(L_rtm_retry);
  }
  andi_(R0, mark_word, markOopDesc::monitor_value);  // inflated vs stack-locked|neutral|biased
  bne(CCR0, IsInflated);

  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    Label L_noincrement;
    if (RTMTotalCountIncrRate > 1) {
      // Sampling: only count every RTMTotalCountIncrRate-th lock operation.
      branch_on_random_using_tb(tmp, (int)RTMTotalCountIncrRate, L_noincrement);
    }
    assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
    load_const_optimized(tmp, (address)stack_rtm_counters->total_count_addr(), R0);
    //atomic_inc_ptr(tmp, /*temp, will be reloaded*/mark_word); We don't increment atomically
    ldx(mark_word, tmp);
    addi(mark_word, mark_word, 1);
    stdx(mark_word, tmp);
    bind(L_noincrement);
  }
  tbegin_();             // Begin the transaction.
  beq(CCR0, L_on_abort); // Branch taken when we get here via failure/abort.
  ld(mark_word, oopDesc::mark_offset_in_bytes(), obj);         // Reload in transaction, conflicts need to be tracked.
  andi(R0, mark_word, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
  cmpwi(flag, R0, markOopDesc::unlocked_value);                // bits = 001 unlocked
  beq(flag, DONE_LABEL);                                       // all done if unlocked

  // Lock is busy: either end the transaction and go to the retry path ...
  if (UseRTMXendForLockBusy) {
    tend_();
    b(L_decrement_retry);
  } else {
    // ... or abort, which transfers control to L_on_abort.
    tabort_();
  }
  bind(L_on_abort);
  const Register abort_status_Reg = tmp;
  mftexasr(abort_status_Reg); // Failure reason; used for counters and the retry decision.
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    rtm_profiling(abort_status_Reg, /*temp*/mark_word, stack_rtm_counters, method_data, profile_rtm);
  }
  ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // reload
  if (RTMRetryCount > 0) {
    // Retry on lock abort if abort status is not permanent.
    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry, &L_decrement_retry);
  } else {
    bind(L_decrement_retry);
  }
}
2443 
// Use RTM for inflating locks
// inputs: obj       (object to lock)
//         mark_word (current header - KILLED)
//         boxReg    (on-stack box address (displaced header location) - KILLED)
// Branches to DONE_LABEL with flag == EQ on success (monitor unowned inside
// the transaction, or owner CAS succeeded); otherwise falls through.
// retry_on_busy_count_Reg and retry_on_abort_count_Reg are killed.
void MacroAssembler::rtm_inflated_locking(ConditionRegister flag,
                                          Register obj, Register mark_word, Register boxReg,
                                          Register retry_on_busy_count_Reg, Register retry_on_abort_count_Reg,
                                          RTMLockingCounters* rtm_counters,
                                          Metadata* method_data, bool profile_rtm,
                                          Label& DONE_LABEL) {
  assert(UseRTMLocking, "why call this otherwise?");
  Label L_rtm_retry, L_decrement_retry, L_on_abort;
  // Clean monitor_value bit to get valid pointer.
  int owner_offset = ObjectMonitor::owner_offset_in_bytes() - markOopDesc::monitor_value;

  // Store non-null, using boxReg instead of (intptr_t)markOopDesc::unused_mark().
  std(boxReg, BasicLock::displaced_header_offset_in_bytes(), boxReg);
  const Register tmpReg = boxReg;
  const Register owner_addr_Reg = mark_word;
  addi(owner_addr_Reg, mark_word, owner_offset); // Address of the monitor's owner field.

  if (RTMRetryCount > 0) {
    load_const_optimized(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy.
    load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort.
    bind(L_rtm_retry);
  }
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    Label L_noincrement;
    if (RTMTotalCountIncrRate > 1) {
      // Sampling: only count every RTMTotalCountIncrRate-th lock operation.
      branch_on_random_using_tb(R0, (int)RTMTotalCountIncrRate, L_noincrement);
    }
    assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
    load_const(R0, (address)rtm_counters->total_count_addr(), tmpReg);
    //atomic_inc_ptr(R0, tmpReg); We don't increment atomically
    ldx(tmpReg, R0);
    addi(tmpReg, tmpReg, 1);
    stdx(tmpReg, R0);
    bind(L_noincrement);
  }
  tbegin_();             // Begin the transaction.
  beq(CCR0, L_on_abort); // Branch taken when we get here via failure/abort.
  // We don't reload mark word. Will only be reset at safepoint.
  ld(R0, 0, owner_addr_Reg); // Load in transaction, conflicts need to be tracked.
  cmpdi(flag, R0, 0);
  beq(flag, DONE_LABEL); // Monitor unowned: locked transactionally.

  // Owner is set: either end the transaction and go to the retry path ...
  if (UseRTMXendForLockBusy) {
    tend_();
    b(L_decrement_retry);
  } else {
    // ... or abort, which transfers control to L_on_abort.
    tabort_();
  }
  bind(L_on_abort);
  const Register abort_status_Reg = tmpReg;
  mftexasr(abort_status_Reg); // Failure reason; used for counters and the retry decision.
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    rtm_profiling(abort_status_Reg, /*temp*/ owner_addr_Reg, rtm_counters, method_data, profile_rtm);
    // Restore owner_addr_Reg
    ld(mark_word, oopDesc::mark_offset_in_bytes(), obj);
#ifdef ASSERT
    andi_(R0, mark_word, markOopDesc::monitor_value);
    asm_assert_ne("must be inflated", 0xa754); // Deflating only allowed at safepoint.
#endif
    addi(owner_addr_Reg, mark_word, owner_offset);
  }
  if (RTMRetryCount > 0) {
    // Retry on lock abort if abort status is not permanent.
    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
  }

  // Appears unlocked - try to swing _owner from null to non-null.
  cmpxchgd(flag, /*current val*/ R0, (intptr_t)0, /*new val*/ R16_thread, owner_addr_Reg,
           MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
           MacroAssembler::cmpxchgx_hint_acquire_lock(), noreg, &L_decrement_retry, true);

  if (RTMRetryCount > 0) {
    // success done else retry
    b(DONE_LABEL);
    bind(L_decrement_retry);
    // Spin and retry if lock is busy.
    rtm_retry_lock_on_busy(retry_on_busy_count_Reg, owner_addr_Reg, L_rtm_retry);
  } else {
    bind(L_decrement_retry);
  }
}
2529 
2530 #endif //  INCLUDE_RTM_OPT
2531 
// "The box" is the space on the stack where we copy the object mark.
// Fast-path monitor enter. Tries, in order: biased locking, RTM stack
// locking, thin-lock CAS on the mark word, the recursive stack-lock check,
// and (for inflated monitors) RTM or a CAS of the monitor's owner field.
// On exit, flag == EQ indicates success and flag == NE indicates failure;
// the caller takes the slow path on NE.
// temp, displaced_header and current_header are killed.
void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register oop, Register box,
                                               Register temp, Register displaced_header, Register current_header,
                                               bool try_bias,
                                               RTMLockingCounters* rtm_counters,
                                               RTMLockingCounters* stack_rtm_counters,
                                               Metadata* method_data,
                                               bool use_rtm, bool profile_rtm) {
  assert_different_registers(oop, box, temp, displaced_header, current_header);
  assert(flag != CCR0, "bad condition register");
  Label cont;
  Label object_has_monitor;
  Label cas_failed;

  // Load markOop from object into displaced_header.
  ld(displaced_header, oopDesc::mark_offset_in_bytes(), oop);


  // Always do locking in runtime.
  if (EmitSync & 0x01) {
    cmpdi(flag, oop, 0); // Oop can't be 0 here => always false.
    return;
  }

  if (try_bias) {
    biased_locking_enter(flag, oop, displaced_header, temp, current_header, cont);
  }

#if INCLUDE_RTM_OPT
  if (UseRTMForStackLocks && use_rtm) {
    rtm_stack_locking(flag, oop, displaced_header, temp, /*temp*/ current_header,
                      stack_rtm_counters, method_data, profile_rtm,
                      cont, object_has_monitor);
  }
#endif // INCLUDE_RTM_OPT

  // Handle existing monitor.
  if ((EmitSync & 0x02) == 0) {
    // The object has an existing monitor iff (mark & monitor_value) != 0.
    andi_(temp, displaced_header, markOopDesc::monitor_value);
    bne(CCR0, object_has_monitor);
  }

  // Set displaced_header to be (markOop of object | UNLOCK_VALUE).
  ori(displaced_header, displaced_header, markOopDesc::unlocked_value);

  // Load Compare Value application register.

  // Initialize the box. (Must happen before we update the object mark!)
  std(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);

  // Must fence, otherwise, preceding store(s) may float below cmpxchg.
  // Compare object markOop with mark and if equal exchange scratch1 with object markOop.
  cmpxchgd(/*flag=*/flag,
           /*current_value=*/current_header,
           /*compare_value=*/displaced_header,
           /*exchange_value=*/box,
           /*where=*/oop,
           MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
           MacroAssembler::cmpxchgx_hint_acquire_lock(),
           noreg,
           &cas_failed,
           /*check without membar and ldarx first*/true);
  assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

  // If the compare-and-exchange succeeded, then we found an unlocked
  // object and we have now locked it.
  b(cont);

  bind(cas_failed);
  // We did not see an unlocked object so try the fast recursive case.

  // Check if the owner is self by comparing the value in the markOop of object
  // (current_header) with the stack pointer.
  sub(current_header, current_header, R1_SP);
  load_const_optimized(temp, ~(os::vm_page_size()-1) | markOopDesc::lock_mask_in_place);

  and_(R0/*==0?*/, current_header, temp);
  // If condition is true we are cont and hence we can store 0 as the
  // displaced header in the box, which indicates that it is a recursive lock.
  mcrf(flag,CCR0); // Transfer the recursive-lock result to the caller's flag.
  std(R0/*==0, perhaps*/, BasicLock::displaced_header_offset_in_bytes(), box);

  // Handle existing monitor.
  if ((EmitSync & 0x02) == 0) {
    b(cont);

    bind(object_has_monitor);
    // The object's monitor m is unlocked iff m->owner == NULL,
    // otherwise m->owner may contain a thread or a stack address.

#if INCLUDE_RTM_OPT
    // Use the same RTM locking code in 32- and 64-bit VM.
    if (use_rtm) {
      rtm_inflated_locking(flag, oop, displaced_header, box, temp, /*temp*/ current_header,
                           rtm_counters, method_data, profile_rtm, cont);
    } else {
#endif // INCLUDE_RTM_OPT

    // Try to CAS m->owner from NULL to current thread.
    addi(temp, displaced_header, ObjectMonitor::owner_offset_in_bytes()-markOopDesc::monitor_value);
    cmpxchgd(/*flag=*/flag,
             /*current_value=*/current_header,
             /*compare_value=*/(intptr_t)0,
             /*exchange_value=*/R16_thread,
             /*where=*/temp,
             MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
             MacroAssembler::cmpxchgx_hint_acquire_lock());

    // Store a non-null value into the box.
    std(box, BasicLock::displaced_header_offset_in_bytes(), box);

#   ifdef ASSERT
    bne(flag, cont);
    // We have acquired the monitor, check some invariants.
    addi(/*monitor=*/temp, temp, -ObjectMonitor::owner_offset_in_bytes());
    // Invariant 1: _recursions should be 0.
    //assert(ObjectMonitor::recursions_size_in_bytes() == 8, "unexpected size");
    asm_assert_mem8_is_zero(ObjectMonitor::recursions_offset_in_bytes(), temp,
                            "monitor->_recursions should be 0", -1);
    // Invariant 2: OwnerIsThread shouldn't be 0.
    //assert(ObjectMonitor::OwnerIsThread_size_in_bytes() == 4, "unexpected size");
    //asm_assert_mem4_isnot_zero(ObjectMonitor::OwnerIsThread_offset_in_bytes(), temp,
    //                           "monitor->OwnerIsThread shouldn't be 0", -1);
#   endif

#if INCLUDE_RTM_OPT
    } // use_rtm()
#endif
  }

  bind(cont);
  // flag == EQ indicates success
  // flag == NE indicates failure
}
2667 
// Fast-path monitor exit; the counterpart of compiler_fast_lock_object.
// Handles biased unlock, RTM stack/inflated unlock, recursive unlock,
// thin-lock CAS and uncontended inflated-monitor release.
// On exit, flag == EQ indicates success and flag == NE indicates failure;
// the caller takes the slow path on NE.
// temp, displaced_header and current_header are killed.
void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register oop, Register box,
                                                 Register temp, Register displaced_header, Register current_header,
                                                 bool try_bias, bool use_rtm) {
  assert_different_registers(oop, box, temp, displaced_header, current_header);
  assert(flag != CCR0, "bad condition register");
  Label cont;
  Label object_has_monitor;

  // Always do locking in runtime.
  if (EmitSync & 0x01) {
    cmpdi(flag, oop, 0); // Oop can't be 0 here => always false.
    return;
  }

  if (try_bias) {
    biased_locking_exit(flag, oop, current_header, cont);
  }

#if INCLUDE_RTM_OPT
  if (UseRTMForStackLocks && use_rtm) {
    assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
    Label L_regular_unlock;
    ld(current_header, oopDesc::mark_offset_in_bytes(), oop);         // fetch markword
    andi(R0, current_header, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
    cmpwi(flag, R0, markOopDesc::unlocked_value);                     // bits = 001 unlocked
    bne(flag, L_regular_unlock);                                      // else RegularLock
    tend_();                                                          // otherwise end...
    b(cont);                                                          // ... and we're done
    bind(L_regular_unlock);
  }
#endif

  // Find the lock address and load the displaced header from the stack.
  ld(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);

  // If the displaced header is 0, we have a recursive unlock.
  cmpdi(flag, displaced_header, 0);
  beq(flag, cont);

  // Handle existing monitor.
  if ((EmitSync & 0x02) == 0) {
    // The object has an existing monitor iff (mark & monitor_value) != 0.
    RTM_OPT_ONLY( if (!(UseRTMForStackLocks && use_rtm)) ) // skip load if already done
    ld(current_header, oopDesc::mark_offset_in_bytes(), oop);
    andi_(R0, current_header, markOopDesc::monitor_value);
    bne(CCR0, object_has_monitor);
  }

  // Check if it is still a light weight lock, this is true if we see
  // the stack address of the basicLock in the markOop of the object.
  // Cmpxchg sets flag to cmpd(current_header, box).
  cmpxchgd(/*flag=*/flag,
           /*current_value=*/current_header,
           /*compare_value=*/box,
           /*exchange_value=*/displaced_header,
           /*where=*/oop,
           MacroAssembler::MemBarRel,
           MacroAssembler::cmpxchgx_hint_release_lock(),
           noreg,
           &cont);

  assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

  // Handle existing monitor.
  if ((EmitSync & 0x02) == 0) {
    b(cont);

    bind(object_has_monitor);
    addi(current_header, current_header, -markOopDesc::monitor_value); // monitor
    ld(temp,             ObjectMonitor::owner_offset_in_bytes(), current_header);

    // It's inflated.
#if INCLUDE_RTM_OPT
    if (use_rtm) {
      Label L_regular_inflated_unlock;
      // Clean monitor_value bit to get valid pointer
      cmpdi(flag, temp, 0);
      bne(flag, L_regular_inflated_unlock); // Owned: not locked transactionally.
      tend_();                              // Owner is NULL: commit the transaction.
      b(cont);
      bind(L_regular_inflated_unlock);
    }
#endif

    // Check that this thread is the owner and there are no recursions.
    ld(displaced_header, ObjectMonitor::recursions_offset_in_bytes(), current_header);
    xorr(temp, R16_thread, temp);      // Will be 0 if we are the owner.
    orr(temp, temp, displaced_header); // Will be 0 if there are 0 recursions.
    cmpdi(flag, temp, 0);
    bne(flag, cont);

    // Only release directly if EntryList and cxq are both empty.
    ld(temp,             ObjectMonitor::EntryList_offset_in_bytes(), current_header);
    ld(displaced_header, ObjectMonitor::cxq_offset_in_bytes(), current_header);
    orr(temp, temp, displaced_header); // Will be 0 if both are 0.
    cmpdi(flag, temp, 0);
    bne(flag, cont);
    release(); // Order critical-section accesses before clearing the owner.
    std(temp, ObjectMonitor::owner_offset_in_bytes(), current_header); // temp is 0 here.
  }

  bind(cont);
  // flag == EQ indicates success
  // flag == NE indicates failure
}
2771 
// Write serialization page so VM thread can do a pseudo remote membar.
// We use the current thread pointer to calculate a thread specific
// offset to write to within the page. This minimizes bus traffic
// due to cache line collision.
// tmp1 and tmp2 are killed; the value stored (R0) is irrelevant, only the
// store itself matters.
void MacroAssembler::serialize_memory(Register thread, Register tmp1, Register tmp2) {
  // Derive a thread-specific offset from the thread pointer.
  srdi(tmp2, thread, os::get_serialize_page_shift_count());

  // Keep the offset inside the page (and int-aligned).
  int mask = os::vm_page_size() - sizeof(int);
  if (Assembler::is_simm(mask, 16)) {
    // Mask fits into andi's 16-bit immediate.
    andi(tmp2, tmp2, mask);
  } else {
    // Build the 32-bit mask in tmp1: high halfword via lis, low via ori.
    lis(tmp1, (int)((signed short) (mask >> 16)));
    ori(tmp1, tmp1, mask & 0x0000ffff);
    andr(tmp2, tmp2, tmp1);
  }

  load_const(tmp1, (long) os::get_memory_serialize_page());
  release(); // Order preceding accesses before the page write.
  stwx(R0, tmp1, tmp2);
}
2792 
2793 
2794 // GC barrier helper macros
2795 
// Write the card table byte if needed.
// Post-barrier for the non-G1 card-table collectors (CardTableForRS /
// CardTableExtension). Rstore_addr and Rtmp are killed; Rnew_val is only
// read by the debug-only null check.
void MacroAssembler::card_write_barrier_post(Register Rstore_addr, Register Rnew_val, Register Rtmp) {
  CardTableModRefBS* bs =
    barrier_set_cast<CardTableModRefBS>(Universe::heap()->barrier_set());
  assert(bs->kind() == BarrierSet::CardTableForRS ||
         bs->kind() == BarrierSet::CardTableExtension, "wrong barrier");
#ifdef ASSERT
  cmpdi(CCR0, Rnew_val, 0);
  asm_assert_ne("null oop not allowed", 0x321);
#endif
  card_table_write(bs->byte_map_base, Rtmp, Rstore_addr);
}
2808 
// Write the card table byte.
// Marks the card covering Robj dirty. Rtmp is killed, and Robj is clobbered
// (it is shifted into the card index in place).
void MacroAssembler::card_table_write(jbyte* byte_map_base, Register Rtmp, Register Robj) {
  assert_different_registers(Robj, Rtmp, R0);
  load_const_optimized(Rtmp, (address)byte_map_base, R0);
  srdi(Robj, Robj, CardTableModRefBS::card_shift); // Card index; clobbers Robj.
  li(R0, 0); // dirty
  if (UseConcMarkSweepGC) membar(Assembler::StoreStore); // Order the oop store before the card mark (CMS only).
  stbx(R0, Rtmp, Robj);
}
2818 
2819 #if INCLUDE_ALL_GCS
// General G1 pre-barrier generator.
// Goal: record the previous value if it is not null.
// Rtmp1 and Rtmp2 are killed. If Robj != noreg the previous value is loaded
// into Rpre_val from (Robj, offset); otherwise Rpre_val must already contain it.
void MacroAssembler::g1_write_barrier_pre(Register Robj, RegisterOrConstant offset, Register Rpre_val,
                                          Register Rtmp1, Register Rtmp2, bool needs_frame) {
  Label runtime, filtered;

  // Is marking active?
  if (in_bytes(SATBMarkQueue::byte_width_of_active()) == 4) {
    lwz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_active()), R16_thread);
  } else {
    guarantee(in_bytes(SATBMarkQueue::byte_width_of_active()) == 1, "Assumption");
    lbz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_active()), R16_thread);
  }
  cmpdi(CCR0, Rtmp1, 0);
  beq(CCR0, filtered); // Nothing to record while marking is inactive.

  // Do we need to load the previous value?
  if (Robj != noreg) {
    // Load the previous value...
    if (UseCompressedOops) {
      lwz(Rpre_val, offset, Robj);
    } else {
      ld(Rpre_val, offset, Robj);
    }
    // Previous value has been loaded into Rpre_val.
  }
  assert(Rpre_val != noreg, "must have a real register");

  // Is the previous value null?
  cmpdi(CCR0, Rpre_val, 0);
  beq(CCR0, filtered);

  if (Robj != noreg && UseCompressedOops) {
    decode_heap_oop_not_null(Rpre_val);
  }

  // Not filtered: record the previous value in the thread-local SATB mark
  // queue buffer, or call into the runtime if that buffer is full.

  // Can we store original value in the thread's buffer?
  // Is index == 0?
  // (The index field is typed as size_t.)
  const Register Rbuffer = Rtmp1, Rindex = Rtmp2;

  ld(Rindex, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_index()), R16_thread);
  cmpdi(CCR0, Rindex, 0);
  beq(CCR0, runtime); // If index == 0, goto runtime.
  ld(Rbuffer, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_buf()), R16_thread);

  addi(Rindex, Rindex, -wordSize); // Decrement index.
  std(Rindex, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_index()), R16_thread);

  // Record the previous value.
  stdx(Rpre_val, Rbuffer, Rindex);
  b(filtered);

  bind(runtime);

  // The runtime call requires a proper frame (saved LR/CR plus an ABI frame).
  if (needs_frame) {
    save_LR_CR(Rtmp1);
    push_frame_reg_args(0, Rtmp2);
  }

  // NOTE(review): R31 is used to preserve a volatile Rpre_val across the C
  // call — assumes R31 is not live here; confirm against callers.
  if (Rpre_val->is_volatile() && Robj == noreg) mr(R31, Rpre_val); // Save pre_val across C call if it was preloaded.
  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), Rpre_val, R16_thread);
  if (Rpre_val->is_volatile() && Robj == noreg) mr(Rpre_val, R31); // restore

  if (needs_frame) {
    pop_frame();
    restore_LR_CR(Rtmp1);
  }

  bind(filtered);
}
2897 
// General G1 post-barrier generator
// Store cross-region card.
// Rtmp1, Rtmp2 and Rtmp3 are killed. If filtered_ext is given, control is
// transferred there on the filtered paths instead of falling through.
void MacroAssembler::g1_write_barrier_post(Register Rstore_addr, Register Rnew_val, Register Rtmp1, Register Rtmp2, Register Rtmp3, Label *filtered_ext) {
  Label runtime, filtered_int;
  Label& filtered = (filtered_ext != NULL) ? *filtered_ext : filtered_int;
  assert_different_registers(Rstore_addr, Rnew_val, Rtmp1, Rtmp2);

  G1SATBCardTableLoggingModRefBS* bs =
    barrier_set_cast<G1SATBCardTableLoggingModRefBS>(Universe::heap()->barrier_set());

  // Does store cross heap regions?
  if (G1RSBarrierRegionFilter) {
    xorr(Rtmp1, Rstore_addr, Rnew_val);
    srdi_(Rtmp1, Rtmp1, HeapRegion::LogOfHRGrainBytes);
    beq(CCR0, filtered); // Same region: nothing to do.
  }

  // Crosses regions, storing NULL?
#ifdef ASSERT
  cmpdi(CCR0, Rnew_val, 0);
  asm_assert_ne("null oop not allowed (G1)", 0x322); // Checked by caller on PPC64, so following branch is obsolete:
  //beq(CCR0, filtered);
#endif

  // Storing region crossing non-NULL, is card already dirty?
  assert(sizeof(*bs->byte_map_base) == sizeof(jbyte), "adjust this code");
  const Register Rcard_addr = Rtmp1;
  Register Rbase = Rtmp2;
  load_const_optimized(Rbase, (address)bs->byte_map_base, /*temp*/ Rtmp3);

  srdi(Rcard_addr, Rstore_addr, CardTableModRefBS::card_shift);

  // Get the address of the card.
  lbzx(/*card value*/ Rtmp3, Rbase, Rcard_addr);
  cmpwi(CCR0, Rtmp3, (int)G1SATBCardTableModRefBS::g1_young_card_val());
  beq(CCR0, filtered); // Young card: filtered without dirtying.

  membar(Assembler::StoreLoad); // Order the oop store before re-reading the card.
  lbzx(/*card value*/ Rtmp3, Rbase, Rcard_addr);  // Reload after membar.
  cmpwi(CCR0, Rtmp3 /* card value */, CardTableModRefBS::dirty_card_val());
  beq(CCR0, filtered); // Already dirty: nothing to do.

  // Storing a region crossing, non-NULL oop, card is clean.
  // Dirty card and log.
  li(Rtmp3, CardTableModRefBS::dirty_card_val());
  //release(); // G1: oops are allowed to get visible after dirty marking.
  stbx(Rtmp3, Rbase, Rcard_addr);

  add(Rcard_addr, Rbase, Rcard_addr); // This is the address which needs to get enqueued.
  Rbase = noreg; // end of lifetime

  // Try to enqueue the card address into the thread-local dirty card queue,
  // falling back to the runtime if the buffer is full.
  const Register Rqueue_index = Rtmp2,
                 Rqueue_buf   = Rtmp3;
  ld(Rqueue_index, in_bytes(JavaThread::dirty_card_queue_offset() + DirtyCardQueue::byte_offset_of_index()), R16_thread);
  cmpdi(CCR0, Rqueue_index, 0);
  beq(CCR0, runtime); // index == 0 then jump to runtime
  ld(Rqueue_buf, in_bytes(JavaThread::dirty_card_queue_offset() + DirtyCardQueue::byte_offset_of_buf()), R16_thread);

  addi(Rqueue_index, Rqueue_index, -wordSize); // decrement index
  std(Rqueue_index, in_bytes(JavaThread::dirty_card_queue_offset() + DirtyCardQueue::byte_offset_of_index()), R16_thread);

  stdx(Rcard_addr, Rqueue_buf, Rqueue_index); // store card
  b(filtered);

  bind(runtime);

  // Save the live input values.
  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), Rcard_addr, R16_thread);

  bind(filtered_int);
}
2969 #endif // INCLUDE_ALL_GCS
2970 
// Values for last_Java_pc, and last_Java_sp must comply to the rules
// in frame_ppc.hpp.
// last_Java_pc may be noreg if the caller has no pc to publish.
void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) {
  // Always set last_Java_pc and flags first because once last_Java_sp
  // is visible has_last_Java_frame is true and users will look at the
  // rest of the fields. (Note: flags should always be zero before we
  // get here so doesn't need to be set.)

  // Verify that last_Java_pc was zeroed on return to Java
  asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread,
                          "last_Java_pc not zeroed before leaving Java", 0x200);

  // When returning from calling out from Java mode the frame anchor's
  // last_Java_pc will always be set to NULL. It is set here so that
  // if we are doing a call to native (not VM) that we capture the
  // known pc and don't have to rely on the native call having a
  // standard frame linkage where we can find the pc.
  if (last_Java_pc != noreg)
    std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);

  // Set last_Java_sp last.
  std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
}
2994 
// Clear the current thread's last Java frame anchor: zero last_Java_sp
// first (which makes has_last_Java_frame false, see set_last_Java_frame),
// then last_Java_pc.
void MacroAssembler::reset_last_Java_frame(void) {
  // The anchor must currently describe a frame.
  asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()),
                             R16_thread, "SP was not set, still zero", 0x202);

  BLOCK_COMMENT("reset_last_Java_frame {");
  li(R0, 0);

  // _last_Java_sp = 0
  std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);

  // _last_Java_pc = 0
  std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
  BLOCK_COMMENT("} reset_last_Java_frame");
}
3009 
// Record the TOP_IJAVA_FRAME at sp as the thread's last Java frame.
// The pc stored in the anchor is the frame manager's saved LR under
// CC_INTERP, otherwise the current code emission address. tmp1 is
// clobbered to hold that pc.
void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1) {
  assert_different_registers(sp, tmp1);

  // sp points to a TOP_IJAVA_FRAME, retrieve frame's PC via
  // TOP_IJAVA_FRAME_ABI.
  // FIXME: assert that we really have a TOP_IJAVA_FRAME here!
#ifdef CC_INTERP
  ld(tmp1/*pc*/, _top_ijava_frame_abi(frame_manager_lr), sp);
#else
  address entry = pc();
  load_const_optimized(tmp1, entry);
#endif

  set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1);
}
3025 
// Fetch the oop stored in the thread-local vm_result slot into oop_result
// and clear the slot so the value cannot be consumed twice. The loaded
// oop is passed through verify_oop.
void MacroAssembler::get_vm_result(Register oop_result) {
  // Read:
  //   R16_thread
  //   R16_thread->in_bytes(JavaThread::vm_result_offset())
  //
  // Updated:
  //   oop_result
  //   R16_thread->in_bytes(JavaThread::vm_result_offset())

  verify_thread();

  ld(oop_result, in_bytes(JavaThread::vm_result_offset()), R16_thread);
  li(R0, 0);
  std(R0, in_bytes(JavaThread::vm_result_offset()), R16_thread);

  verify_oop(oop_result);
}
3043 
// Fetch the metadata result stored in the thread-local vm_result_2 slot
// into metadata_result and clear the slot. No verification is done
// (metadata is not an oop).
void MacroAssembler::get_vm_result_2(Register metadata_result) {
  // Read:
  //   R16_thread
  //   R16_thread->in_bytes(JavaThread::vm_result_2_offset())
  //
  // Updated:
  //   metadata_result
  //   R16_thread->in_bytes(JavaThread::vm_result_2_offset())

  ld(metadata_result, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
  li(R0, 0);
  std(R0, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
}
3057 
3058 Register MacroAssembler::encode_klass_not_null(Register dst, Register src) {
3059   Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided.
3060   if (Universe::narrow_klass_base() != 0) {
3061     // Use dst as temp if it is free.
3062     sub_const_optimized(dst, current, Universe::narrow_klass_base(), R0);
3063     current = dst;
3064   }
3065   if (Universe::narrow_klass_shift() != 0) {
3066     srdi(dst, current, Universe::narrow_klass_shift());
3067     current = dst;
3068   }
3069   return current;
3070 }
3071 
3072 void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) {
3073   if (UseCompressedClassPointers) {
3074     Register compressedKlass = encode_klass_not_null(ck, klass);
3075     stw(compressedKlass, oopDesc::klass_offset_in_bytes(), dst_oop);
3076   } else {
3077     std(klass, oopDesc::klass_offset_in_bytes(), dst_oop);
3078   }
3079 }
3080 
3081 void MacroAssembler::store_klass_gap(Register dst_oop, Register val) {
3082   if (UseCompressedClassPointers) {
3083     if (val == noreg) {
3084       val = R0;
3085       li(val, 0);
3086     }
3087     stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop); // klass gap if compressed
3088   }
3089 }
3090 
3091 int MacroAssembler::instr_size_for_decode_klass_not_null() {
3092   if (!UseCompressedClassPointers) return 0;
3093   int num_instrs = 1;  // shift or move
3094   if (Universe::narrow_klass_base() != 0) num_instrs = 7;  // shift + load const + add
3095   return num_instrs * BytesPerInstWord;
3096 }
3097 
3098 void MacroAssembler::decode_klass_not_null(Register dst, Register src) {
3099   assert(dst != R0, "Dst reg may not be R0, as R0 is used here.");
3100   if (src == noreg) src = dst;
3101   Register shifted_src = src;
3102   if (Universe::narrow_klass_shift() != 0 ||
3103       Universe::narrow_klass_base() == 0 && src != dst) {  // Move required.
3104     shifted_src = dst;
3105     sldi(shifted_src, src, Universe::narrow_klass_shift());
3106   }
3107   if (Universe::narrow_klass_base() != 0) {
3108     add_const_optimized(dst, shifted_src, Universe::narrow_klass_base(), R0);
3109   }
3110 }
3111 
3112 void MacroAssembler::load_klass(Register dst, Register src) {
3113   if (UseCompressedClassPointers) {
3114     lwz(dst, oopDesc::klass_offset_in_bytes(), src);
3115     // Attention: no null check here!
3116     decode_klass_not_null(dst, dst);
3117   } else {
3118     ld(dst, oopDesc::klass_offset_in_bytes(), src);
3119   }
3120 }
3121 
// Clear Array
// Kills both input registers. tmp == R0 is allowed.
//
// base_ptr points to the (8-byte aligned) area to clear, cnt_dwords is its
// size in doublewords. Works in three phases:
//   1) startloop: clear dword-wise up to the next cache-line boundary,
//   2) fastloop:  clear whole cache lines with dcbz,
//   3) restloop:  clear the remaining dwords.
// Blocks too small for >= dcbz_min full cache lines branch directly to
// phase 3 (small_rest).
void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp) {
  // Procedure for large arrays (uses data cache block zero instruction).
    Label startloop, fast, fastloop, small_rest, restloop, done;
    const int cl_size         = VM_Version::L1_data_cache_line_size(),
              cl_dwords       = cl_size>>3,                // dwords per cache line
              cl_dw_addr_bits = exact_log2(cl_dwords),     // address bits selecting the dword within a line
              dcbz_min        = 1;                     // Min count of dcbz executions, needs to be >0.

//2:
    cmpdi(CCR1, cnt_dwords, ((dcbz_min+1)<<cl_dw_addr_bits)-1); // Big enough? (ensure >=dcbz_min lines included).
    blt(CCR1, small_rest);                                      // Too small.
    rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits);           // Extract dword offset within first cache line.
    beq(CCR0, fast);                                            // Already 128byte aligned.

    subfic(tmp, tmp, cl_dwords);
    mtctr(tmp);                        // Set ctr to hit 128byte boundary (0<ctr<cl_dwords).
    subf(cnt_dwords, tmp, cnt_dwords); // rest.
    li(tmp, 0);                        // Zero to store.
//10:
  bind(startloop);                     // Clear at the beginning to reach 128byte boundary.
    std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
    addi(base_ptr, base_ptr, 8);
    bdnz(startloop);
//13:
  bind(fast);                                  // Clear 128byte blocks.
    srdi(tmp, cnt_dwords, cl_dw_addr_bits);    // Loop count for 128byte loop (>0).
    andi(cnt_dwords, cnt_dwords, cl_dwords-1); // Rest in dwords.
    mtctr(tmp);                                // Load counter.
//16:
  bind(fastloop);
    dcbz(base_ptr);                    // Clear 128byte aligned block.
    addi(base_ptr, base_ptr, cl_size);
    bdnz(fastloop);
    if (InsertEndGroupPPC64) { endgroup(); } else { nop(); }
//20:
  bind(small_rest);
    cmpdi(CCR0, cnt_dwords, 0);        // size 0?
    beq(CCR0, done);                   // rest == 0
    li(tmp, 0);                        // Zero to store.
    mtctr(cnt_dwords);                 // Load counter.
//24:
  bind(restloop);                      // Clear rest.
    std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
    addi(base_ptr, base_ptr, 8);
    bdnz(restloop);
//27:
  bind(done);
}
3172 
3173 /////////////////////////////////////////// String intrinsics ////////////////////////////////////////////
3174 
// Search for a single jchar in an jchar[].
//
// Assumes that result differs from all other registers.
//
// Haystack, needle are the addresses of jchar-arrays.
// NeedleChar is needle[0] if it is known at compile time.
// Haycnt is the length of the haystack. We assume haycnt >=1.
//
// Preserves haystack, haycnt, kills all other registers.
//
// If needle == R0, we search for the constant needleChar.
//
// On exit, result holds the character index of the first match, or -1 if
// the needle character does not occur in the haystack.
void MacroAssembler::string_indexof_1(Register result, Register haystack, Register haycnt,
                                      Register needle, jchar needleChar,
                                      Register tmp1, Register tmp2) {

  assert_different_registers(result, haystack, haycnt, needle, tmp1, tmp2);

  Label L_InnerLoop, L_FinalCheck, L_Found1, L_Found2, L_Found3, L_NotFound, L_End;
  Register needle0 = needle, // Contains needle[0].
           addr = tmp1,
           ch1 = tmp2,
           ch2 = R0;

//2 (variable) or 3 (const):
   if (needle != R0) lhz(needle0, 0, needle); // Preload needle character, needle has len==1.
   dcbtct(haystack, 0x00);                        // Indicate R/O access to haystack.

   srwi_(tmp2, haycnt, 1);   // Shift right by exact_log2(UNROLL_FACTOR).
   mr(addr, haystack);
   // srwi_ set CCR0: zero unrolled iterations means go straight to the tail check.
   beq(CCR0, L_FinalCheck);
   mtctr(tmp2);              // Move to count register.
//8:
  bind(L_InnerLoop);             // Main work horse (2x unrolled search loop).
   lhz(ch1, 0, addr);        // Load characters from haystack.
   lhz(ch2, 2, addr);
   (needle != R0) ? cmpw(CCR0, ch1, needle0) : cmplwi(CCR0, ch1, needleChar);
   (needle != R0) ? cmpw(CCR1, ch2, needle0) : cmplwi(CCR1, ch2, needleChar);
   beq(CCR0, L_Found1);   // Did we find the needle?
   beq(CCR1, L_Found2);
   addi(addr, addr, 4);
   bdnz(L_InnerLoop);
//16:
  bind(L_FinalCheck);
   // Odd haycnt leaves one last character uncovered by the unrolled loop.
   andi_(R0, haycnt, 1);
   beq(CCR0, L_NotFound);
   lhz(ch1, 0, addr);        // One position left at which we have to compare.
   (needle != R0) ? cmpw(CCR1, ch1, needle0) : cmplwi(CCR1, ch1, needleChar);
   beq(CCR1, L_Found3);
//21:
  bind(L_NotFound);
   li(result, -1);           // Not found.
   b(L_End);

  bind(L_Found2);
   addi(addr, addr, 2);      // Match was at the second character of the pair.
//24:
  bind(L_Found1);
  bind(L_Found3);                  // Return index ...
   subf(addr, haystack, addr); // relative to haystack,
   srdi(result, addr, 1);      // in characters.
  bind(L_End);
}
3237 
3238 
// Implementation of IndexOf for jchar arrays.
//
// The length of haystack and needle are not constant, i.e. passed in a register.
//
// Preserves registers haystack, needle.
// Kills registers haycnt, needlecnt.
// Assumes that result differs from all other registers.
// Haystack, needle are the addresses of jchar-arrays.
// Haycnt, needlecnt are the lengths of them, respectively.
//
// Needlecntval must be zero or 15-bit unsigned immediate and > 1.
//
// On exit, result holds the character index of the first occurrence of the
// needle within the haystack, or -1 if there is none. The numbered comments
// (//3:, //5:, ...) track instruction offsets used to keep the emitted code
// aligned (see the nop padding below).
void MacroAssembler::string_indexof(Register result, Register haystack, Register haycnt,
                                    Register needle, ciTypeArray* needle_values, Register needlecnt, int needlecntval,
                                    Register tmp1, Register tmp2, Register tmp3, Register tmp4) {

  // Ensure 0<needlecnt<=haycnt in ideal graph as prerequisite!
  Label L_TooShort, L_Found, L_NotFound, L_End;
  Register last_addr = haycnt, // Kill haycnt at the beginning.
           addr      = tmp1,
           n_start   = tmp2,
           ch1       = tmp3,
           ch2       = R0;

  // **************************************************************************************************
  // Prepare for main loop: optimized for needle count >=2, bail out otherwise.
  // **************************************************************************************************

//1 (variable) or 3 (const):
   dcbtct(needle, 0x00);    // Indicate R/O access to str1.
   dcbtct(haystack, 0x00);  // Indicate R/O access to str2.

  // Compute last haystack addr to use if no match gets found.
  if (needlecntval == 0) { // variable needlecnt
//3:
   subf(ch1, needlecnt, haycnt);      // Last character index to compare is haycnt-needlecnt.
   addi(addr, haystack, -2);          // Accesses use pre-increment.
   cmpwi(CCR6, needlecnt, 2);
   blt(CCR6, L_TooShort);          // Variable needlecnt: handle short needle separately.
   slwi(ch1, ch1, 1);                 // Scale to number of bytes.
   lwz(n_start, 0, needle);           // Load first 2 characters of needle.
   add(last_addr, haystack, ch1);     // Point to last address to compare (haystack+2*(haycnt-needlecnt)).
   addi(needlecnt, needlecnt, -2);    // Rest of needle.
  } else { // constant needlecnt
  guarantee(needlecntval != 1, "IndexOf with single-character needle must be handled separately");
  assert((needlecntval & 0x7fff) == needlecntval, "wrong immediate");
//5:
   addi(ch1, haycnt, -needlecntval);  // Last character index to compare is haycnt-needlecnt.
   lwz(n_start, 0, needle);           // Load first 2 characters of needle.
   addi(addr, haystack, -2);          // Accesses use pre-increment.
   slwi(ch1, ch1, 1);                 // Scale to number of bytes.
   add(last_addr, haystack, ch1);     // Point to last address to compare (haystack+2*(haycnt-needlecnt)).
   li(needlecnt, needlecntval-2);     // Rest of needle.
  }

  // Main Loop (now we have at least 3 characters).
//11:
  Label L_OuterLoop, L_InnerLoop, L_FinalCheck, L_Comp1, L_Comp2, L_Comp3;
  bind(L_OuterLoop); // Search for 1st 2 characters.
  Register addr_diff = tmp4;
   subf(addr_diff, addr, last_addr); // Difference between already checked address and last address to check.
   addi(addr, addr, 2);              // This is the new address we want to use for comparing.
   srdi_(ch2, addr_diff, 2);
   beq(CCR0, L_FinalCheck);       // 2 characters left?
   mtctr(ch2);                       // addr_diff/4
//16:
  bind(L_InnerLoop);                // Main work horse (2x unrolled search loop)
   lwz(ch1, 0, addr);           // Load 2 characters of haystack (ignore alignment).
   lwz(ch2, 2, addr);
   cmpw(CCR0, ch1, n_start); // Compare 2 characters (1 would be sufficient but try to reduce branches to CompLoop).
   cmpw(CCR1, ch2, n_start);
   beq(CCR0, L_Comp1);       // Did we find the needle start?
   beq(CCR1, L_Comp2);
   addi(addr, addr, 4);
   bdnz(L_InnerLoop);
//24:
  bind(L_FinalCheck);
   rldicl_(addr_diff, addr_diff, 64-1, 63); // Remaining characters not covered by InnerLoop: (addr_diff>>1)&1.
   beq(CCR0, L_NotFound);
   lwz(ch1, 0, addr);                       // One position left at which we have to compare.
   cmpw(CCR1, ch1, n_start);
   beq(CCR1, L_Comp3);
//29:
  bind(L_NotFound);
   li(result, -1); // not found
   b(L_End);


   // **************************************************************************************************
   // Special Case: unfortunately, the variable needle case can be called with needlecnt<2
   // **************************************************************************************************
//31:
 if ((needlecntval>>1) !=1 ) { // Const needlecnt is 2 or 3? Reduce code size.
  int nopcnt = 5;
  if (needlecntval !=0 ) ++nopcnt; // Balance alignment (other case: see below).
  if (needlecntval == 0) {         // We have to handle these cases separately.
  // Single-character needle: simple scan over the whole haystack.
  Label L_OneCharLoop;
  bind(L_TooShort);
   mtctr(haycnt);
   lhz(n_start, 0, needle);    // First character of needle
  bind(L_OneCharLoop);
   lhzu(ch1, 2, addr);
   cmpw(CCR1, ch1, n_start);
   beq(CCR1, L_Found);      // Did we find the one character needle?
   bdnz(L_OneCharLoop);
   li(result, -1);             // Not found.
   b(L_End);
  } // 8 instructions, so no impact on alignment.
  for (int x = 0; x < nopcnt; ++x) nop();
 }

  // **************************************************************************************************
  // Regular Case Part II: compare rest of needle (first 2 characters have been compared already)
  // **************************************************************************************************

  // Compare the rest
//36 if needlecntval==0, else 37:
  bind(L_Comp2);
   addi(addr, addr, 2); // First comparison has failed, 2nd one hit.
  bind(L_Comp1);            // Addr points to possible needle start.
  bind(L_Comp3);            // Could have created a copy and use a different return address but saving code size here.
  if (needlecntval != 2) {  // Const needlecnt==2?
   if (needlecntval != 3) {
    if (needlecntval == 0) beq(CCR6, L_Found); // Variable needlecnt==2?
    Register ind_reg = tmp4;
    li(ind_reg, 2*2);   // First 2 characters are already compared, use index 2.
    mtctr(needlecnt);   // Decremented by 2, still > 0.
//40:
   Label L_CompLoop;
   bind(L_CompLoop);
    lhzx(ch2, needle, ind_reg);
    lhzx(ch1, addr, ind_reg);
    cmpw(CCR1, ch1, ch2);
    bne(CCR1, L_OuterLoop);  // Mismatch: resume scanning for the needle start.
    addi(ind_reg, ind_reg, 2);
    bdnz(L_CompLoop);
   } else { // No loop required if there's only one needle character left.
    lhz(ch2, 2*2, needle);
    lhz(ch1, 2*2, addr);
    cmpw(CCR1, ch1, ch2);
    bne(CCR1, L_OuterLoop);
   }
  }
  // Return index ...
//46:
  bind(L_Found);
   subf(addr, haystack, addr); // relative to haystack, ...
   srdi(result, addr, 1);      // in characters.
//48:
  bind(L_End);
}
3389 
// Implementation of Compare for jchar arrays.
//
// Kills the registers str1, str2, cnt1, cnt2.
// Kills cr0, ctr.
// Assumes that result differes from the input registers.
//
// result_reg receives the difference of the first mismatching character
// pair, or, if one string is a prefix of the other, the length difference
// cnt1-cnt2 (see the tail of this function).
void MacroAssembler::string_compare(Register str1_reg, Register str2_reg, Register cnt1_reg, Register cnt2_reg,
                                    Register result_reg, Register tmp_reg) {
   assert_different_registers(result_reg, str1_reg, str2_reg, cnt1_reg, cnt2_reg, tmp_reg);

   Label Ldone, Lslow_case, Lslow_loop, Lfast_loop;
   Register cnt_diff = R0,
            limit_reg = cnt1_reg,
            chr1_reg = result_reg,
            chr2_reg = cnt2_reg,
            addr_diff = str2_reg;   // str2 is only accessed via str1+addr_diff below.

   // Offset 0 should be 32 byte aligned.
//-4:
    dcbtct(str1_reg, 0x00);  // Indicate R/O access to str1.
    dcbtct(str2_reg, 0x00);  // Indicate R/O access to str2.
//-2:
   // Compute min(cnt1, cnt2) and check if 0 (bail out if we don't need to compare characters).
    subf(result_reg, cnt2_reg, cnt1_reg);  // difference between cnt1/2
    subf_(addr_diff, str1_reg, str2_reg);  // alias?
    beq(CCR0, Ldone);                   // return cnt difference if both ones are identical
    srawi(limit_reg, result_reg, 31);      // generate signmask (cnt1/2 must be non-negative so cnt_diff can't overflow)
    mr(cnt_diff, result_reg);
    andr(limit_reg, result_reg, limit_reg); // difference or zero (negative): cnt1<cnt2 ? cnt1-cnt2 : 0
    add_(limit_reg, cnt2_reg, limit_reg);  // min(cnt1, cnt2)==0?
    beq(CCR0, Ldone);                   // return cnt difference if one has 0 length

    lhz(chr1_reg, 0, str1_reg);            // optional: early out if first characters mismatch
    lhzx(chr2_reg, str1_reg, addr_diff);   // optional: early out if first characters mismatch
    addi(tmp_reg, limit_reg, -1);          // min(cnt1, cnt2)-1
    subf_(result_reg, chr2_reg, chr1_reg); // optional: early out if first characters mismatch
    bne(CCR0, Ldone);                   // optional: early out if first characters mismatch

   // Set loop counter by scaling down tmp_reg
    srawi_(chr2_reg, tmp_reg, exact_log2(4)); // (min(cnt1, cnt2)-1)/4
    ble(CCR0, Lslow_case);                 // need >4 characters for fast loop
    andi(limit_reg, tmp_reg, 4-1);            // remaining characters

   // Adapt str1_reg str2_reg for the first loop iteration
    mtctr(chr2_reg);                 // (min(cnt1, cnt2)-1)/4
    addi(limit_reg, limit_reg, 4+1); // compare last 5-8 characters in slow_case if mismatch found in fast_loop
//16:
   // Compare the rest of the characters
   bind(Lfast_loop);
    ld(chr1_reg, 0, str1_reg);             // 4 jchars at once.
    ldx(chr2_reg, str1_reg, addr_diff);
    cmpd(CCR0, chr2_reg, chr1_reg);
    bne(CCR0, Lslow_case); // return chr1_reg
    addi(str1_reg, str1_reg, 4*2);
    bdnz(Lfast_loop);
    addi(limit_reg, limit_reg, -4); // no mismatch found in fast_loop, only 1-4 characters missing
//23:
   bind(Lslow_case);
    mtctr(limit_reg);
//24:
   bind(Lslow_loop);                       // jchar-wise tail / mismatch localization.
    lhz(chr1_reg, 0, str1_reg);
    lhzx(chr2_reg, str1_reg, addr_diff);
    subf_(result_reg, chr2_reg, chr1_reg);
    bne(CCR0, Ldone); // return chr1_reg
    addi(str1_reg, str1_reg, 1*2);
    bdnz(Lslow_loop);
//30:
   // If strings are equal up to min length, return the length difference.
    mr(result_reg, cnt_diff);
    nop(); // alignment
//32:
   // Otherwise, return the difference between the first mismatched chars.
   bind(Ldone);
}
3464 
3465 
// Compare char[] arrays.
//
// str1_reg   USE only
// str2_reg   USE only
// cnt_reg    USE_DEF, due to tmp reg shortage
// result_reg DEF only, might compromise USE only registers
//
// Sets result_reg to 1 if the first cnt_reg jchars of both arrays are
// equal, to 0 otherwise. Compares 4 jchars (one doubleword) per main loop
// iteration, then the remaining 0-3 jchars one at a time.
void MacroAssembler::char_arrays_equals(Register str1_reg, Register str2_reg, Register cnt_reg, Register result_reg,
                                        Register tmp1_reg, Register tmp2_reg, Register tmp3_reg, Register tmp4_reg,
                                        Register tmp5_reg) {

  // Str1 may be the same register as str2 which can occur e.g. after scalar replacement.
  assert_different_registers(result_reg, str1_reg, cnt_reg, tmp1_reg, tmp2_reg, tmp3_reg, tmp4_reg, tmp5_reg);
  assert_different_registers(result_reg, str2_reg, cnt_reg, tmp1_reg, tmp2_reg, tmp3_reg, tmp4_reg, tmp5_reg);

  // Offset 0 should be 32 byte aligned.
  Label Linit_cbc, Lcbc, Lloop, Ldone_true, Ldone_false;
  Register index_reg = tmp5_reg;
  Register cbc_iter  = tmp4_reg;

//-1:
  dcbtct(str1_reg, 0x00);  // Indicate R/O access to str1.
  dcbtct(str2_reg, 0x00);  // Indicate R/O access to str2.
//1:
  andi(cbc_iter, cnt_reg, 4-1);            // Remaining iterations after 4 java characters per iteration loop.
  li(index_reg, 0); // init
  li(result_reg, 0); // assume false
  srwi_(tmp2_reg, cnt_reg, exact_log2(4)); // Div: 4 java characters per iteration (main loop).

  cmpwi(CCR1, cbc_iter, 0);             // CCR1 = (cbc_iter==0)
  beq(CCR0, Linit_cbc);                 // too short
    mtctr(tmp2_reg);
//8:
    bind(Lloop);                        // Doubleword (4 jchars) comparison loop.
      ldx(tmp1_reg, str1_reg, index_reg);
      ldx(tmp2_reg, str2_reg, index_reg);
      cmpd(CCR0, tmp1_reg, tmp2_reg);
      bne(CCR0, Ldone_false);  // Unequal char pair found -> done.
      addi(index_reg, index_reg, 4*sizeof(jchar));
      bdnz(Lloop);
//14:
  bind(Linit_cbc);
  beq(CCR1, Ldone_true);                // No remainder -> arrays are equal.
    mtctr(cbc_iter);
//16:
    bind(Lcbc);                         // Character-by-character tail loop.
      lhzx(tmp1_reg, str1_reg, index_reg);
      lhzx(tmp2_reg, str2_reg, index_reg);
      cmpw(CCR0, tmp1_reg, tmp2_reg);
      bne(CCR0, Ldone_false);  // Unequal char pair found -> done.
      addi(index_reg, index_reg, 1*sizeof(jchar));
      bdnz(Lcbc);
    nop();
  bind(Ldone_true);
  li(result_reg, 1);
//24:
  bind(Ldone_false);
}
3523 
3524 
// Compare char[] arrays of constant length cntval for equality.
// Sets result_reg to 1 if the first cntval jchars match, to 0 otherwise.
// Short arrays (cntval < 16) are compared with a fully unrolled sequence;
// longer ones use a doubleword loop followed by unrolled word/halfword
// tail compares.
void MacroAssembler::char_arrays_equalsImm(Register str1_reg, Register str2_reg, int cntval, Register result_reg,
                                           Register tmp1_reg, Register tmp2_reg) {
  // Str1 may be the same register as str2 which can occur e.g. after scalar replacement.
  assert_different_registers(result_reg, str1_reg, tmp1_reg, tmp2_reg);
  assert_different_registers(result_reg, str2_reg, tmp1_reg, tmp2_reg);
  assert(sizeof(jchar) == 2, "must be");
  assert(cntval >= 0 && ((cntval & 0x7fff) == cntval), "wrong immediate");

  Label Ldone_false;

  if (cntval < 16) { // short case
    if (cntval != 0) li(result_reg, 0); // assume false

    const int num_bytes = cntval*sizeof(jchar);
    int index = 0;
    // Unrolled doubleword (4-jchar) compares.
    for (int next_index; (next_index = index + 8) <= num_bytes; index = next_index) {
      ld(tmp1_reg, index, str1_reg);
      ld(tmp2_reg, index, str2_reg);
      cmpd(CCR0, tmp1_reg, tmp2_reg);
      bne(CCR0, Ldone_false);
    }
    if (cntval & 2) {
      // One word (2 jchars) left.
      lwz(tmp1_reg, index, str1_reg);
      lwz(tmp2_reg, index, str2_reg);
      cmpw(CCR0, tmp1_reg, tmp2_reg);
      bne(CCR0, Ldone_false);
      index += 4;
    }
    if (cntval & 1) {
      // One final jchar left.
      lhz(tmp1_reg, index, str1_reg);
      lhz(tmp2_reg, index, str2_reg);
      cmpw(CCR0, tmp1_reg, tmp2_reg);
      bne(CCR0, Ldone_false);
    }
    // fallthrough: true
  } else {
    Label Lloop;
    Register index_reg = tmp1_reg;
    const int loopcnt = cntval/4;
    assert(loopcnt > 0, "must be");
    // Offset 0 should be 32 byte aligned.
    //2:
    dcbtct(str1_reg, 0x00);  // Indicate R/O access to str1.
    dcbtct(str2_reg, 0x00);  // Indicate R/O access to str2.
    li(tmp2_reg, loopcnt);
    li(index_reg, 0); // init
    li(result_reg, 0); // assume false
    mtctr(tmp2_reg);
    //8:
    bind(Lloop);             // Doubleword (4-jchar) comparison loop.
    ldx(R0, str1_reg, index_reg);
    ldx(tmp2_reg, str2_reg, index_reg);
    cmpd(CCR0, R0, tmp2_reg);
    bne(CCR0, Ldone_false);  // Unequal char pair found -> done.
    addi(index_reg, index_reg, 4*sizeof(jchar));
    bdnz(Lloop);
    //14:
    if (cntval & 2) {
      // One word (2 jchars) of remainder.
      lwzx(R0, str1_reg, index_reg);
      lwzx(tmp2_reg, str2_reg, index_reg);
      cmpw(CCR0, R0, tmp2_reg);
      bne(CCR0, Ldone_false);
      if (cntval & 1) addi(index_reg, index_reg, 2*sizeof(jchar));
    }
    if (cntval & 1) {
      // One final jchar of remainder.
      lhzx(R0, str1_reg, index_reg);
      lhzx(tmp2_reg, str2_reg, index_reg);
      cmpw(CCR0, R0, tmp2_reg);
      bne(CCR0, Ldone_false);
    }
    // fallthru: true
  }
  li(result_reg, 1);
  bind(Ldone_false);
}
3600 
// Helpers for Intrinsic Emitters
//
// Revert the byte order of a 32bit value in a register
//   src: 0x44556677
//   dst: 0x77665544
// Three steps to obtain the result:
//  1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word
//     into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared.
//     This value initializes dst.
//  2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost
//     byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go.
//     This value is mask inserted into dst with a [0..23] mask of 1s.
//  3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position.
//     This value is mask inserted into dst with a [8..15] mask of 1s.
//
// src must differ from dst (enforced by the assert) because src is read by
// all three instructions while dst is being built up. src is preserved.
void MacroAssembler::load_reverse_32(Register dst, Register src) {
  assert_different_registers(dst, src);

  rldicl(dst, src, (4+1)*8, 56);       // Rotate byte 4 into position 7 (rightmost), clear all to the left.
  rlwimi(dst, src,     3*8,  0, 23);   // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone.
  rlwimi(dst, src,     1*8,  8, 15);   // Insert byte 6 into position 5, leave the rest alone.
}
3622 
// Calculate the column addresses of the crc32 lookup table into distinct registers.
// This loop-invariant calculation is moved out of the loop body, reducing the loop
// body size from 20 to 16 instructions.
// Returns the offset that was used to calculate the address of column tc3.
// Due to register shortage, setting tc3 may overwrite table. With the return offset
// at hand, the original table address can be easily reconstructed.
// The column order depends on endianness: little-endian uses the DOLIT4
// columns (3,2,1,0), big-endian the DOBIG4 columns (4,5,6,7); see the
// zlib-style pseudo code below.
int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) {

#ifdef VM_LITTLE_ENDIAN
  // This is what we implement (the DOLIT4 part):
  // ========================================================================= */
  // #define DOLIT4 c ^= *buf4++; \
  //         c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^ \
  //             crc_table[1][(c >> 16) & 0xff] ^ crc_table[0][c >> 24]
  // #define DOLIT32 DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4
  // ========================================================================= */
  const int ix0 = 3*(4*CRC32_COLUMN_SIZE);
  const int ix1 = 2*(4*CRC32_COLUMN_SIZE);
  const int ix2 = 1*(4*CRC32_COLUMN_SIZE);
  const int ix3 = 0*(4*CRC32_COLUMN_SIZE);
#else
  // This is what we implement (the DOBIG4 part):
  // =========================================================================
  // #define DOBIG4 c ^= *++buf4; \
  //         c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \
  //             crc_table[6][(c >> 16) & 0xff] ^ crc_table[7][c >> 24]
  // #define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4
  // =========================================================================
  const int ix0 = 4*(4*CRC32_COLUMN_SIZE);
  const int ix1 = 5*(4*CRC32_COLUMN_SIZE);
  const int ix2 = 6*(4*CRC32_COLUMN_SIZE);
  const int ix3 = 7*(4*CRC32_COLUMN_SIZE);
#endif
  assert_different_registers(table, tc0, tc1, tc2);
  assert(table == tc3, "must be!");

  addi(tc0, table, ix0);
  addi(tc1, table, ix1);
  addi(tc2, table, ix2);
  if (ix3 != 0) addi(tc3, table, ix3);  // tc3 aliases table; skip a no-op add.

  return ix3;
}
3666 
3667 /**
3668  * uint32_t crc;
3669  * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
3670  */
3671 void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) {
3672   assert_different_registers(crc, table, tmp);
3673   assert_different_registers(val, table);
3674 
3675   if (crc == val) {                   // Must rotate first to use the unmodified value.
3676     rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
3677                                       // As we use a word (4-byte) instruction, we have to adapt the mask bit positions.
3678     srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
3679   } else {
3680     srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
3681     rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
3682   }
3683   lwzx(tmp, table, tmp);
3684   xorr(crc, crc, tmp);
3685 }
3686 
3687 /**
3688  * uint32_t crc;
3689  * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
3690  */
3691 void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
3692   fold_byte_crc32(crc, crc, table, tmp);
3693 }
3694 
3695 /**
3696  * Emits code to update CRC-32 with a byte value according to constants in table.
3697  *
3698  * @param [in,out]crc   Register containing the crc.
3699  * @param [in]val       Register containing the byte to fold into the CRC.
3700  * @param [in]table     Register containing the table of crc constants.
3701  *
3702  * uint32_t crc;
3703  * val = crc_table[(val ^ crc) & 0xFF];
3704  * crc = val ^ (crc >> 8);
3705  */
3706 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
3707   BLOCK_COMMENT("update_byte_crc32:");
3708   xorr(val, val, crc);
3709   fold_byte_crc32(crc, val, table, val);
3710 }
3711 
3712 /**
3713  * @param crc   register containing existing CRC (32-bit)
3714  * @param buf   register pointing to input byte buffer (byte*)
3715  * @param len   register containing number of bytes
3716  * @param table register pointing to CRC table
3717  */
3718 void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table,
3719                                            Register data, bool loopAlignment, bool invertCRC) {
3720   assert_different_registers(crc, buf, len, table, data);
3721 
3722   Label L_mainLoop, L_done;
3723   const int mainLoop_stepping  = 1;
3724   const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4;
3725 
3726   // Process all bytes in a single-byte loop.
3727   clrldi_(len, len, 32);                         // Enforce 32 bit. Anything to do?
3728   beq(CCR0, L_done);
3729 
3730   if (invertCRC) {
3731     nand(crc, crc, crc);                         // ~c
3732   }
3733 
3734   mtctr(len);
3735   align(mainLoop_alignment);
3736   BIND(L_mainLoop);
3737     lbz(data, 0, buf);                           // Byte from buffer, zero-extended.
3738     addi(buf, buf, mainLoop_stepping);           // Advance buffer position.
3739     update_byte_crc32(crc, data, table);
3740     bdnz(L_mainLoop);                            // Iterate.
3741 
3742   if (invertCRC) {
3743     nand(crc, crc, crc);                         // ~c
3744   }
3745 
3746   bind(L_done);
3747 }
3748 
3749 /**
3750  * Emits code to update CRC-32 with a 4-byte value according to constants in table
3751  * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c
3752  */
3753 // A not on the lookup table address(es):
3754 // The lookup table consists of two sets of four columns each.
3755 // The columns {0..3} are used for little-endian machines.
3756 // The columns {4..7} are used for big-endian machines.
3757 // To save the effort of adding the column offset to the table address each time
3758 // a table element is looked up, it is possible to pass the pre-calculated
3759 // column addresses.
3760 // Uses R9..R12 as work register. Must be saved/restored by caller, if necessary.
3761 void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc,
3762                                         Register t0,  Register t1,  Register t2,  Register t3,
3763                                         Register tc0, Register tc1, Register tc2, Register tc3) {
3764   assert_different_registers(crc, t3);
3765 
3766   // XOR crc with next four bytes of buffer.
3767   lwz(t3, bufDisp, buf);
3768   if (bufInc != 0) {
3769     addi(buf, buf, bufInc);
3770   }
3771   xorr(t3, t3, crc);
3772 
3773   // Chop crc into 4 single-byte pieces, shifted left 2 bits, to form the table indices.
3774   rlwinm(t0, t3,  2,         24-2, 31-2);  // ((t1 >>  0) & 0xff) << 2
3775   rlwinm(t1, t3,  32+(2- 8), 24-2, 31-2);  // ((t1 >>  8) & 0xff) << 2
3776   rlwinm(t2, t3,  32+(2-16), 24-2, 31-2);  // ((t1 >> 16) & 0xff) << 2
3777   rlwinm(t3, t3,  32+(2-24), 24-2, 31-2);  // ((t1 >> 24) & 0xff) << 2
3778 
3779   // Use the pre-calculated column addresses.
3780   // Load pre-calculated table values.
3781   lwzx(t0, tc0, t0);
3782   lwzx(t1, tc1, t1);
3783   lwzx(t2, tc2, t2);
3784   lwzx(t3, tc3, t3);
3785 
3786   // Calculate new crc from table values.
3787   xorr(t0,  t0, t1);
3788   xorr(t2,  t2, t3);
3789   xorr(crc, t0, t2);  // Now crc contains the final checksum value.
3790 }
3791 
3792 /**
3793  * @param crc   register containing existing CRC (32-bit)
3794  * @param buf   register pointing to input byte buffer (byte*)
3795  * @param len   register containing number of bytes
3796  * @param table register pointing to CRC table
3797  *
3798  * Uses R9..R12 as work register. Must be saved/restored by caller!
3799  */
3800 void MacroAssembler::kernel_crc32_2word(Register crc, Register buf, Register len, Register table,
3801                                         Register t0,  Register t1,  Register t2,  Register t3,
3802                                         Register tc0, Register tc1, Register tc2, Register tc3) {
3803   assert_different_registers(crc, buf, len, table);
3804 
3805   Label L_mainLoop, L_tail;
3806   Register  tmp  = t0;
3807   Register  data = t0;
3808   Register  tmp2 = t1;
3809   const int mainLoop_stepping  = 8;
3810   const int tailLoop_stepping  = 1;
3811   const int log_stepping       = exact_log2(mainLoop_stepping);
3812   const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
3813   const int complexThreshold   = 2*mainLoop_stepping;
3814 
3815   // Don't test for len <= 0 here. This pathological case should not occur anyway.
3816   // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles.
3817   // The situation itself is detected and handled correctly by the conditional branches
3818   // following  aghi(len, -stepping) and aghi(len, +stepping).
3819   assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
3820 
3821   BLOCK_COMMENT("kernel_crc32_2word {");
3822 
3823   nand(crc, crc, crc);                           // ~c
3824 
3825   // Check for short (<mainLoop_stepping) buffer.
3826   cmpdi(CCR0, len, complexThreshold);
3827   blt(CCR0, L_tail);
3828 
3829   // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
3830   // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
3831   {
3832     // Align buf addr to mainLoop_stepping boundary.
3833     neg(tmp2, buf);                           // Calculate # preLoop iterations for alignment.
3834     rldicl(tmp2, tmp2, 0, 64-log_stepping);   // Rotate tmp2 0 bits, insert into tmp2, anding with mask with 1s from 62..63.
3835 
3836     if (complexThreshold > mainLoop_stepping) {
3837       sub(len, len, tmp2);                       // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
3838     } else {
3839       sub(tmp, len, tmp2);                       // Remaining bytes for main loop.
3840       cmpdi(CCR0, tmp, mainLoop_stepping);
3841       blt(CCR0, L_tail);                         // For less than one mainloop_stepping left, do only tail processing
3842       mr(len, tmp);                              // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
3843     }
3844     update_byteLoop_crc32(crc, buf, tmp2, table, data, false, false);
3845   }
3846 
3847   srdi(tmp2, len, log_stepping);                 // #iterations for mainLoop
3848   andi(len, len, mainLoop_stepping-1);           // remaining bytes for tailLoop
3849   mtctr(tmp2);
3850 
3851 #ifdef VM_LITTLE_ENDIAN
3852   Register crc_rv = crc;
3853 #else
3854   Register crc_rv = tmp;                         // Load_reverse needs separate registers to work on.
3855                                                  // Occupies tmp, but frees up crc.
3856   load_reverse_32(crc_rv, crc);                  // Revert byte order because we are dealing with big-endian data.
3857   tmp = crc;
3858 #endif
3859 
3860   int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
3861 
3862   align(mainLoop_alignment);                     // Octoword-aligned loop address. Shows 2% improvement.
3863   BIND(L_mainLoop);
3864     update_1word_crc32(crc_rv, buf, table, 0, 0, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
3865     update_1word_crc32(crc_rv, buf, table, 4, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
3866     bdnz(L_mainLoop);
3867 
3868 #ifndef VM_LITTLE_ENDIAN
3869   load_reverse_32(crc, crc_rv);                  // Revert byte order because we are dealing with big-endian data.
3870   tmp = crc_rv;                                  // Tmp uses it's original register again.
3871 #endif
3872 
3873   // Restore original table address for tailLoop.
3874   if (reconstructTableOffset != 0) {
3875     addi(table, table, -reconstructTableOffset);
3876   }
3877 
3878   // Process last few (<complexThreshold) bytes of buffer.
3879   BIND(L_tail);
3880   update_byteLoop_crc32(crc, buf, len, table, data, false, false);
3881 
3882   nand(crc, crc, crc);                           // ~c
3883   BLOCK_COMMENT("} kernel_crc32_2word");
3884 }
3885 
3886 /**
3887  * @param crc   register containing existing CRC (32-bit)
3888  * @param buf   register pointing to input byte buffer (byte*)
3889  * @param len   register containing number of bytes
3890  * @param table register pointing to CRC table
3891  *
3892  * uses R9..R12 as work register. Must be saved/restored by caller!
3893  */
3894 void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table,
3895                                         Register t0,  Register t1,  Register t2,  Register t3,
3896                                         Register tc0, Register tc1, Register tc2, Register tc3) {
3897   assert_different_registers(crc, buf, len, table);
3898 
3899   Label L_mainLoop, L_tail;
3900   Register  tmp          = t0;
3901   Register  data         = t0;
3902   Register  tmp2         = t1;
3903   const int mainLoop_stepping  = 4;
3904   const int tailLoop_stepping  = 1;
3905   const int log_stepping       = exact_log2(mainLoop_stepping);
3906   const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
3907   const int complexThreshold   = 2*mainLoop_stepping;
3908 
3909   // Don't test for len <= 0 here. This pathological case should not occur anyway.
3910   // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles.
3911   // The situation itself is detected and handled correctly by the conditional branches
3912   // following  aghi(len, -stepping) and aghi(len, +stepping).
3913   assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
3914 
3915   BLOCK_COMMENT("kernel_crc32_1word {");
3916 
3917   nand(crc, crc, crc);                           // ~c
3918 
3919   // Check for short (<mainLoop_stepping) buffer.
3920   cmpdi(CCR0, len, complexThreshold);
3921   blt(CCR0, L_tail);
3922 
3923   // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
3924   // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
3925   {
3926     // Align buf addr to mainLoop_stepping boundary.
3927     neg(tmp2, buf);                              // Calculate # preLoop iterations for alignment.
3928     rldicl(tmp2, tmp2, 0, 64-log_stepping);      // Rotate tmp2 0 bits, insert into tmp2, anding with mask with 1s from 62..63.
3929 
3930     if (complexThreshold > mainLoop_stepping) {
3931       sub(len, len, tmp2);                       // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
3932     } else {
3933       sub(tmp, len, tmp2);                       // Remaining bytes for main loop.
3934       cmpdi(CCR0, tmp, mainLoop_stepping);
3935       blt(CCR0, L_tail);                         // For less than one mainloop_stepping left, do only tail processing
3936       mr(len, tmp);                              // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
3937     }
3938     update_byteLoop_crc32(crc, buf, tmp2, table, data, false, false);
3939   }
3940 
3941   srdi(tmp2, len, log_stepping);                 // #iterations for mainLoop
3942   andi(len, len, mainLoop_stepping-1);           // remaining bytes for tailLoop
3943   mtctr(tmp2);
3944 
3945 #ifdef VM_LITTLE_ENDIAN
3946   Register crc_rv = crc;
3947 #else
3948   Register crc_rv = tmp;                         // Load_reverse needs separate registers to work on.
3949                                                  // Occupies tmp, but frees up crc.
3950   load_reverse_32(crc_rv, crc);                  // Revert byte order because we are dealing with big-endian data.
3951   tmp = crc;
3952 #endif
3953 
3954   int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
3955 
3956   align(mainLoop_alignment);                     // Octoword-aligned loop address. Shows 2% improvement.
3957   BIND(L_mainLoop);
3958     update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
3959     bdnz(L_mainLoop);
3960 
3961 #ifndef VM_LITTLE_ENDIAN
3962   load_reverse_32(crc, crc_rv);                  // Revert byte order because we are dealing with big-endian data.
3963   tmp = crc_rv;                                  // Tmp uses it's original register again.
3964 #endif
3965 
3966   // Restore original table address for tailLoop.
3967   if (reconstructTableOffset != 0) {
3968     addi(table, table, -reconstructTableOffset);
3969   }
3970 
3971   // Process last few (<complexThreshold) bytes of buffer.
3972   BIND(L_tail);
3973   update_byteLoop_crc32(crc, buf, len, table, data, false, false);
3974 
3975   nand(crc, crc, crc);                           // ~c
3976   BLOCK_COMMENT("} kernel_crc32_1word");
3977 }
3978 
3979 /**
3980  * @param crc   register containing existing CRC (32-bit)
3981  * @param buf   register pointing to input byte buffer (byte*)
3982  * @param len   register containing number of bytes
3983  * @param table register pointing to CRC table
3984  *
3985  * Uses R7_ARG5, R8_ARG6 as work registers.
3986  */
3987 void MacroAssembler::kernel_crc32_1byte(Register crc, Register buf, Register len, Register table,
3988                                         Register t0,  Register t1,  Register t2,  Register t3) {
3989   assert_different_registers(crc, buf, len, table);
3990 
3991   Register  data = t0;                   // Holds the current byte to be folded into crc.
3992 
3993   BLOCK_COMMENT("kernel_crc32_1byte {");
3994 
3995   // Process all bytes in a single-byte loop.
3996   update_byteLoop_crc32(crc, buf, len, table, data, true, true);
3997 
3998   BLOCK_COMMENT("} kernel_crc32_1byte");
3999 }
4000 
// Updates crc with exactly one byte from the buffer (len is unused).
// 'tmp' is a scratch register; crc is inverted before and after the update.
void MacroAssembler::kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp) {
  assert_different_registers(crc, buf, /* len,  not used!! */ table, tmp);

  BLOCK_COMMENT("kernel_crc32_singleByte:");
  nand(crc, crc, crc);       // ~c

  lbz(tmp, 0, buf);          // Byte from buffer, zero-extended.
  update_byte_crc32(crc, tmp, table);

  nand(crc, crc, crc);       // ~c
}
4012 
// dest_lo += src1 + src2
// dest_hi += carry1 + carry2
// i.e. adds src1 and src2 to the 128-bit value dest_hi:dest_lo, propagating
// the carry of each 64-bit addition into dest_hi. Kills R0 (used as zero).
void MacroAssembler::add2_with_carry(Register dest_hi,
                                     Register dest_lo,
                                     Register src1, Register src2) {
  li(R0, 0);
  addc(dest_lo, dest_lo, src1);     // dest_lo += src1, sets CA.
  adde(dest_hi, dest_hi, R0);       // dest_hi += carry of first add.
  addc(dest_lo, dest_lo, src2);     // dest_lo += src2, sets CA.
  adde(dest_hi, dest_hi, R0);       // dest_hi += carry of second add.
}
4024 
// Multiply 64 bit by 64 bit first loop.
// First loop of BigInteger::multiplyToLen: multiplies x[xstart] by each
// 64-bit chunk of y, storing 64-bit partial products into z and carrying
// between iterations. The 32-bit int arrays are consumed as 64-bit pairs;
// on little-endian the halves are rotated into big-endian int order.
void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart,
                                           Register x_xstart,
                                           Register y, Register y_idx,
                                           Register z,
                                           Register carry,
                                           Register product_high, Register product,
                                           Register idx, Register kdx,
                                           Register tmp) {
  //  jlong carry, x[], y[], z[];
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //    huge_128 product = y[idx] * x[xstart] + carry;
  //    z[kdx] = (jlong)product;
  //    carry  = (jlong)(product >>> 64);
  //  }
  //  z[xstart] = carry;

  Label L_first_loop, L_first_loop_exit;
  Label L_one_x, L_one_y, L_multiply;

  addic_(xstart, xstart, -1);
  blt(CCR0, L_one_x);   // Special case: length of x is 1.

  // Load next two integers of x.
  sldi(tmp, xstart, LogBytesPerInt);
  ldx(x_xstart, x, tmp);
#ifdef VM_LITTLE_ENDIAN
  rldicl(x_xstart, x_xstart, 32, 0);     // Swap 32-bit halves: int order is big-endian in the array.
#endif

  align(32, 16);
  bind(L_first_loop);

  cmpdi(CCR0, idx, 1);
  blt(CCR0, L_first_loop_exit);          // idx <= 0: done.
  addi(idx, idx, -2);
  beq(CCR0, L_one_y);                    // idx == 1: only one int of y left.

  // Load next two integers of y.
  sldi(tmp, idx, LogBytesPerInt);
  ldx(y_idx, y, tmp);
#ifdef VM_LITTLE_ENDIAN
  rldicl(y_idx, y_idx, 32, 0);           // Swap 32-bit halves.
#endif


  bind(L_multiply);
  multiply64(product_high, product, x_xstart, y_idx);

  li(tmp, 0);
  addc(product, product, carry);         // Add carry to result.
  adde(product_high, product_high, tmp); // Add carry of the last addition.
  addi(kdx, kdx, -2);

  // Store result.
#ifdef VM_LITTLE_ENDIAN
  rldicl(product, product, 32, 0);       // Swap back into memory int order.
#endif
  sldi(tmp, kdx, LogBytesPerInt);
  stdx(product, z, tmp);
  mr_if_needed(carry, product_high);     // Propagate high half as next carry.
  b(L_first_loop);


  bind(L_one_y); // Load one 32 bit portion of y as (0,value).

  lwz(y_idx, 0, y);
  b(L_multiply);


  bind(L_one_x); // Load one 32 bit portion of x as (0,value).

  lwz(x_xstart, 0, x);
  b(L_first_loop);

  bind(L_first_loop_exit);
}
4102 
// Multiply 64 bit by 64 bit and add 128 bit.
// Computes product_high:product = y[idx] * x_xstart + z[idx] + carry and
// stores the low 64 bits back into z[idx] (both indices scaled by int size,
// plus 'offset' bytes). Kills 'yz_idx' and 'tmp'; R0 is clobbered by
// add2_with_carry.
void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y,
                                            Register z, Register yz_idx,
                                            Register idx, Register carry,
                                            Register product_high, Register product,
                                            Register tmp, int offset) {

  //  huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
  //  z[kdx] = (jlong)product;

  sldi(tmp, idx, LogBytesPerInt);
  if (offset) {
    addi(tmp, tmp, offset);
  }
  ldx(yz_idx, y, tmp);
#ifdef VM_LITTLE_ENDIAN
  rldicl(yz_idx, yz_idx, 32, 0);         // Swap 32-bit halves: int order is big-endian in the array.
#endif

  multiply64(product_high, product, x_xstart, yz_idx);
  ldx(yz_idx, z, tmp);
#ifdef VM_LITTLE_ENDIAN
  rldicl(yz_idx, yz_idx, 32, 0);         // Swap 32-bit halves.
#endif

  add2_with_carry(product_high, product, carry, yz_idx);

  sldi(tmp, idx, LogBytesPerInt);        // Recompute store offset (tmp was clobbered above).
  if (offset) {
    addi(tmp, tmp, offset);
  }
#ifdef VM_LITTLE_ENDIAN
  rldicl(product, product, 32, 0);       // Swap back into memory int order.
#endif
  stdx(product, z, tmp);
}
4139 
// Multiply 128 bit by 128 bit. Unrolled inner loop.
// Inner ("third") loop of multiplyToLen, unrolled to process four ints
// (two 64-bit chunks) per iteration, with scalar cleanup for the
// remaining 0..3 ints. Kills R0 (used as loop count jdx).
void MacroAssembler::multiply_128_x_128_loop(Register x_xstart,
                                             Register y, Register z,
                                             Register yz_idx, Register idx, Register carry,
                                             Register product_high, Register product,
                                             Register carry2, Register tmp) {

  //  jlong carry, x[], y[], z[];
  //  int kdx = ystart+1;
  //  for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
  //    huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
  //    z[kdx+idx+1] = (jlong)product;
  //    jlong carry2 = (jlong)(product >>> 64);
  //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
  //    z[kdx+idx] = (jlong)product;
  //    carry = (jlong)(product >>> 64);
  //  }
  //  idx += 2;
  //  if (idx > 0) {
  //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
  //    z[kdx+idx] = (jlong)product;
  //    carry = (jlong)(product >>> 64);
  //  }

  Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
  const Register jdx = R0;

  // Scale the index.
  srdi_(jdx, idx, 2);                    // jdx = idx / 4 = # unrolled iterations.
  beq(CCR0, L_third_loop_exit);
  mtctr(jdx);

  align(32, 16);
  bind(L_third_loop);

  addi(idx, idx, -4);

  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8);
  mr_if_needed(carry2, product_high);

  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0);
  mr_if_needed(carry, product_high);
  bdnz(L_third_loop);

  bind(L_third_loop_exit);  // Handle any left-over operand parts.

  andi_(idx, idx, 0x3);                  // 0..3 ints remain.
  beq(CCR0, L_post_third_loop_done);

  Label L_check_1;

  addic_(idx, idx, -2);
  blt(CCR0, L_check_1);                  // Less than 2 ints left: maybe one single int.

  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0);
  mr_if_needed(carry, product_high);

  bind(L_check_1);

  addi(idx, idx, 0x2);
  andi_(idx, idx, 0x1);
  addic_(idx, idx, -1);
  blt(CCR0, L_post_third_loop_done);     // No single int left.

  // Process the final single 32-bit int: product = y[idx]*x_xstart + z[idx] + carry.
  sldi(tmp, idx, LogBytesPerInt);
  lwzx(yz_idx, y, tmp);
  multiply64(product_high, product, x_xstart, yz_idx);
  lwzx(yz_idx, z, tmp);

  add2_with_carry(product_high, product, yz_idx, carry);

  sldi(tmp, idx, LogBytesPerInt);        // Recompute offset (tmp clobbered by add2_with_carry's R0 use is avoided; tmp itself was reused).
  stwx(product, z, tmp);                 // Store low 32 bits.
  srdi(product, product, 32);

  sldi(product_high, product_high, 32);
  orr(product, product, product_high);   // carry = high 96..32 bits of the 128-bit product.
  mr_if_needed(carry, product);

  bind(L_post_third_loop_done);
}   // multiply_128_x_128_loop
4221 
// Intrinsic for BigInteger::multiplyToLen: z = x * y where x, y, z are
// int arrays (big-endian int order) of lengths xlen, ylen, zlen = xlen+ylen.
// tmp1..tmp13 are scratch; see the per-register aliases below.
void MacroAssembler::multiply_to_len(Register x, Register xlen,
                                     Register y, Register ylen,
                                     Register z, Register zlen,
                                     Register tmp1, Register tmp2,
                                     Register tmp3, Register tmp4,
                                     Register tmp5, Register tmp6,
                                     Register tmp7, Register tmp8,
                                     Register tmp9, Register tmp10,
                                     Register tmp11, Register tmp12,
                                     Register tmp13) {

  ShortBranchVerifier sbv(this);

  assert_different_registers(x, xlen, y, ylen, z, zlen,
                             tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
  assert_different_registers(x, xlen, y, ylen, z, zlen,
                             tmp1, tmp2, tmp3, tmp4, tmp5, tmp7);
  assert_different_registers(x, xlen, y, ylen, z, zlen,
                             tmp1, tmp2, tmp3, tmp4, tmp5, tmp8);

  const Register idx = tmp1;
  const Register kdx = tmp2;
  const Register xstart = tmp3;

  const Register y_idx = tmp4;
  const Register carry = tmp5;
  const Register product = tmp6;
  const Register product_high = tmp7;
  const Register x_xstart = tmp8;
  const Register tmp = tmp9;

  // First Loop.
  //
  //  final static long LONG_MASK = 0xffffffffL;
  //  int xstart = xlen - 1;
  //  int ystart = ylen - 1;
  //  long carry = 0;
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
  //    z[kdx] = (int)product;
  //    carry = product >>> 32;
  //  }
  //  z[xstart] = (int)carry;

  mr_if_needed(idx, ylen);        // idx = ylen
  mr_if_needed(kdx, zlen);        // kdx = xlen + ylen
  li(carry, 0);                   // carry = 0

  Label L_done;

  addic_(xstart, xlen, -1);
  blt(CCR0, L_done);              // xlen == 0: nothing to do.

  multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z,
                        carry, product_high, product, idx, kdx, tmp);

  Label L_second_loop;

  cmpdi(CCR0, kdx, 0);
  beq(CCR0, L_second_loop);

  Label L_carry;

  addic_(kdx, kdx, -1);
  beq(CCR0, L_carry);

  // Store lower 32 bits of carry.
  sldi(tmp, kdx, LogBytesPerInt);
  stwx(carry, z, tmp);
  srdi(carry, carry, 32);
  addi(kdx, kdx, -1);


  bind(L_carry);

  // Store upper 32 bits of carry.
  sldi(tmp, kdx, LogBytesPerInt);
  stwx(carry, z, tmp);

  // Second and third (nested) loops.
  //
  //  for (int i = xstart-1; i >= 0; i--) { // Second loop
  //    carry = 0;
  //    for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
  //      long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
  //                     (z[k] & LONG_MASK) + carry;
  //      z[k] = (int)product;
  //      carry = product >>> 32;
  //    }
  //    z[i] = (int)carry;
  //  }
  //
  //  i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx

  bind(L_second_loop);

  li(carry, 0);                   // carry = 0;

  addic_(xstart, xstart, -1);     // i = xstart-1;
  blt(CCR0, L_done);

  Register zsave = tmp10;

  mr(zsave, z);                   // Save z; it is advanced below and restored after the inner loop.


  Label L_last_x;

  sldi(tmp, xstart, LogBytesPerInt);
  add(z, z, tmp);                 // z = z + k - j
  addi(z, z, 4);
  addic_(xstart, xstart, -1);     // i = xstart-1;
  blt(CCR0, L_last_x);            // Only one int of x left: load it as (0,value).

  sldi(tmp, xstart, LogBytesPerInt);
  ldx(x_xstart, x, tmp);
#ifdef VM_LITTLE_ENDIAN
  rldicl(x_xstart, x_xstart, 32, 0);  // Swap 32-bit halves: int order is big-endian in the array.
#endif


  Label L_third_loop_prologue;

  bind(L_third_loop_prologue);

  Register xsave = tmp11;
  Register xlensave = tmp12;
  Register ylensave = tmp13;

  // Save x, xstart, ylen: the inner loop reuses their registers as scratch.
  mr(xsave, x);
  mr(xlensave, xstart);
  mr(ylensave, ylen);


  multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen,
                          carry, product_high, product, x, tmp);

  mr(z, zsave);
  mr(x, xsave);
  mr(xlen, xlensave);   // This is the decrement of the loop counter!
  mr(ylen, ylensave);

  // Store the 64-bit carry into z[xlen..xlen+1] (as two 32-bit ints).
  addi(tmp3, xlen, 1);
  sldi(tmp, tmp3, LogBytesPerInt);
  stwx(carry, z, tmp);
  addic_(tmp3, tmp3, -1);
  blt(CCR0, L_done);

  srdi(carry, carry, 32);
  sldi(tmp, tmp3, LogBytesPerInt);
  stwx(carry, z, tmp);
  b(L_second_loop);

  // Next infrequent code is moved outside loops.
  bind(L_last_x);

  lwz(x_xstart, 0, x);            // Load one 32-bit portion of x as (0,value).
  b(L_third_loop_prologue);

  bind(L_done);
}   // multiply_to_len
4383 
4384 void MacroAssembler::asm_assert(bool check_equal, const char *msg, int id) {
4385 #ifdef ASSERT
4386   Label ok;
4387   if (check_equal) {
4388     beq(CCR0, ok);
4389   } else {
4390     bne(CCR0, ok);
4391   }
4392   stop(msg, id);
4393   bind(ok);
4394 #endif
4395 }
4396 
4397 void MacroAssembler::asm_assert_mems_zero(bool check_equal, int size, int mem_offset,
4398                                           Register mem_base, const char* msg, int id) {
4399 #ifdef ASSERT
4400   switch (size) {
4401     case 4:
4402       lwz(R0, mem_offset, mem_base);
4403       cmpwi(CCR0, R0, 0);
4404       break;
4405     case 8:
4406       ld(R0, mem_offset, mem_base);
4407       cmpdi(CCR0, R0, 0);
4408       break;
4409     default:
4410       ShouldNotReachHere();
4411   }
4412   asm_assert(check_equal, msg, id);
4413 #endif // ASSERT
4414 }
4415 
4416 void MacroAssembler::verify_thread() {
4417   if (VerifyThread) {
4418     unimplemented("'VerifyThread' currently not implemented on PPC");
4419   }
4420 }
4421 
// READ: oop. KILL: R0. Volatile floats perhaps.
// Calls the verify_oop stub on 'oop' with 'msg' for diagnostics.
// All volatile GPRs (except R0) are saved/restored around the call,
// so the caller's register state is preserved. No-op unless -XX:+VerifyOops.
void MacroAssembler::verify_oop(Register oop, const char* msg) {
  if (!VerifyOops) {
    return;
  }

  address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
  const Register tmp = R11; // Will be preserved.
  const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
  save_volatile_gprs(R1_SP, -nbytes_save); // except R0

  mr_if_needed(R4_ARG2, oop);              // Oop to verify is the second argument.
  save_LR_CR(tmp); // save in old frame
  push_frame_reg_args(nbytes_save, tmp);
  // load FunctionDescriptor** / entry_address *
  load_const_optimized(tmp, fd, R0);
  // load FunctionDescriptor* / entry_address
  ld(tmp, 0, tmp);
  load_const_optimized(R3_ARG1, (address)msg, R0);
  // Call destination for its side effect.
  call_c(tmp);

  pop_frame();
  restore_LR_CR(tmp);
  restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
}
4448 
// Like verify_oop, but loads the oop to verify from memory at offs(base)
// instead of taking it in a register. Kills R0; all other volatile GPRs
// are saved/restored. No-op unless -XX:+VerifyOops.
void MacroAssembler::verify_oop_addr(RegisterOrConstant offs, Register base, const char* msg) {
  if (!VerifyOops) {
    return;
  }

  address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
  const Register tmp = R11; // Will be preserved.
  const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
  save_volatile_gprs(R1_SP, -nbytes_save); // except R0

  ld(R4_ARG2, offs, base);                 // Load the oop to verify as the second argument.
  save_LR_CR(tmp); // save in old frame
  push_frame_reg_args(nbytes_save, tmp);
  // load FunctionDescriptor** / entry_address *
  load_const_optimized(tmp, fd, R0);
  // load FunctionDescriptor* / entry_address
  ld(tmp, 0, tmp);
  load_const_optimized(R3_ARG1, (address)msg, R0);
  // Call destination for its side effect.
  call_c(tmp);

  pop_frame();
  restore_LR_CR(tmp);
  restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
}
4474 
// Human-readable names for the stop types passed to MacroAssembler::stop(),
// indexed by the stop type constant (stop_stop .. stop_shouldnotreachhere).
const char* stop_types[] = {
  "stop",
  "untested",
  "unimplemented",
  "shouldnotreachhere"
};
4481 
// Runtime target of MacroAssembler::stop(): prints the stop type and message,
// then aborts the VM via guarantee(false, ...). 'tp' is reduced modulo the
// number of stop types to keep the table index in range.
static void stop_on_request(int tp, const char* msg) {
  tty->print("PPC assembly code requires stop: (%s) %s\n", stop_types[tp%/*stop_end*/4], msg);
  guarantee(false, "PPC assembly code requires stop: %s", msg);
}
4486 
// Call a C-function that prints output.
// Emits a call to stop_on_request(type, msg), followed by an illegal
// instruction and the 32-bit 'id' so the stop site can be identified
// in a crash dump.
void MacroAssembler::stop(int type, const char* msg, int id) {
#ifndef PRODUCT
  block_comment(err_msg("stop: %s %s {", stop_types[type%stop_end], msg));
#else
  block_comment("stop {");
#endif

  // setup arguments
  load_const_optimized(R3_ARG1, type);
  load_const_optimized(R4_ARG2, (void *)msg, /*tmp=*/R0);
  call_VM_leaf(CAST_FROM_FN_PTR(address, stop_on_request), R3_ARG1, R4_ARG2);
  illtrap();          // Trap in case stop_on_request ever returns.
  emit_int32(id);     // Embed id for post-mortem identification.
  block_comment("} stop;");
}
4503 
4504 #ifndef PRODUCT
4505 // Write pattern 0x0101010101010101 in memory region [low-before, high+after].
4506 // Val, addr are temp registers.
4507 // If low == addr, addr is killed.
4508 // High is preserved.
// Write pattern 0x0101010101010101 in memory region [low-before, high+after].
// Val, addr are temp registers.
// If low == addr, addr is killed.
// High is preserved.
// Emits code only when -XX:+ZapMemory is set; region bounds are in words
// (before/after are word counts relative to low/high).
void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) {
  if (!ZapMemory) return;

  assert_different_registers(low, val);

  BLOCK_COMMENT("zap memory region {");
  load_const_optimized(val, 0x0101010101010101);
  int size = before + after;
  if (low == high && size < 5 && size > 0) {
    // Small fixed-size region with identical bounds: unroll into a short
    // sequence of stores instead of emitting a loop.
    int offset = -before*BytesPerWord;
    for (int i = 0; i < size; ++i) {
      std(val, offset, low);
      offset += (1*BytesPerWord);
    }
  } else {
    // General case: loop from (low - before) up to and including
    // (high + after), storing one 8-byte word per iteration.
    addi(addr, low, -before*BytesPerWord);
    assert_different_registers(high, val);
    if (after) addi(high, high, after * BytesPerWord);
    Label loop;
    bind(loop);
    std(val, 0, addr);
    addi(addr, addr, 8);
    // CCR6 is used for the compare — presumably to avoid clobbering
    // condition registers the surrounding code may rely on.
    cmpd(CCR6, addr, high);
    ble(CCR6, loop);
    if (after) addi(high, high, -after * BytesPerWord);  // Correct back to old value.
  }
  BLOCK_COMMENT("} zap memory region");
}
4537 
4538 #endif // !PRODUCT
4539 
// Scoped emitter: loads the byte flag at flag_addr and emits a branch that
// skips all code generated between this constructor and the matching
// destructor (which binds the label) when the flag is zero at runtime.
// temp is clobbered.
SkipIfEqualZero::SkipIfEqualZero(MacroAssembler* masm, Register temp, const bool* flag_addr) : _masm(masm), _label() {
  // load_const_optimized with relocation returns a simm16 remainder so the
  // low 16 bits of the address can be folded into the lbz displacement.
  int simm16_offset = masm->load_const_optimized(temp, (address)flag_addr, R0, true);
  // The flag is read with a single byte load, which requires bool to be 1 byte.
  assert(sizeof(bool) == 1, "PowerPC ABI");
  masm->lbz(temp, simm16_offset, temp);
  masm->cmpwi(CCR0, temp, 0);
  masm->beq(CCR0, _label);
}
4547 
// Binds the skip target, ending the conditionally-executed region that
// began at construction.
SkipIfEqualZero::~SkipIfEqualZero() {
  _masm->bind(_label);
}