1 /*
   2  * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright 2012, 2015 SAP AG. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.inline.hpp"
  28 #include "compiler/disassembler.hpp"
  29 #include "gc/shared/cardTableModRefBS.hpp"
  30 #include "gc/shared/collectedHeap.inline.hpp"
  31 #include "interpreter/interpreter.hpp"
  32 #include "memory/resourceArea.hpp"
  33 #include "prims/methodHandles.hpp"
  34 #include "runtime/biasedLocking.hpp"
  35 #include "runtime/icache.hpp"
  36 #include "runtime/interfaceSupport.hpp"
  37 #include "runtime/objectMonitor.hpp"
  38 #include "runtime/os.hpp"
  39 #include "runtime/sharedRuntime.hpp"
  40 #include "runtime/stubRoutines.hpp"
  41 #include "utilities/macros.hpp"
  42 #if INCLUDE_ALL_GCS
  43 #include "gc/g1/g1CollectedHeap.inline.hpp"
  44 #include "gc/g1/g1SATBCardTableModRefBS.hpp"
  45 #include "gc/g1/heapRegion.hpp"
  46 #endif // INCLUDE_ALL_GCS
  47 
  48 #ifdef PRODUCT
  49 #define BLOCK_COMMENT(str) // nothing
  50 #else
  51 #define BLOCK_COMMENT(str) block_comment(str)
  52 #endif
  53 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  54 
  55 #ifdef ASSERT
  56 // On RISC, there's no benefit to verifying instruction boundaries.
  57 bool AbstractAssembler::pd_check_instruction_mark() { return false; }
  58 #endif
  59 
  60 void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) {
  61   assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range");
  62   if (Assembler::is_simm(si31, 16)) {
  63     ld(d, si31, a);
  64     if (emit_filler_nop) nop();
  65   } else {
  66     const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31);
  67     const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31);
  68     addis(d, a, hi);
  69     ld(d, lo, d);
  70   }
  71 }
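// A note on the split above (a sketch of the usual rounding split assumed to be
// implemented by largeoffset_si16_si16_hi/_lo): the 31-bit offset is divided into
// a high half for the addis and a sign-extended low half for the d-form ld, so
// that (hi << 16) + lo reconstructs si31. For illustration:
//   si31 = 0x12345678  ->  addis(d, a, 0x1234); ld(d,  0x5678, d)
//   si31 = 0x1234f678  ->  addis(d, a, 0x1235); ld(d, -0x0988, d)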
  72 
  73 void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) {
  74   assert_different_registers(d, a);
  75   ld_largeoffset_unchecked(d, si31, a, emit_filler_nop);
  76 }
  77 
  78 void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base,
  79                                       size_t size_in_bytes, bool is_signed) {
  80   switch (size_in_bytes) {
  81   case  8:              ld(dst, offs, base);                         break;
  82   case  4:  is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break;
  83   case  2:  is_signed ? lha(dst, offs, base) : lhz(dst, offs, base); break;
  84   case  1:  lbz(dst, offs, base); if (is_signed) extsb(dst, dst);    break; // lba doesn't exist :(
  85   default:  ShouldNotReachHere();
  86   }
  87 }
  88 
  89 void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base,
  90                                        size_t size_in_bytes) {
  91   switch (size_in_bytes) {
  92   case  8:  std(dst, offs, base); break;
  93   case  4:  stw(dst, offs, base); break;
  94   case  2:  sth(dst, offs, base); break;
  95   case  1:  stb(dst, offs, base); break;
  96   default:  ShouldNotReachHere();
  97   }
  98 }
  99 
 100 void MacroAssembler::align(int modulus, int max, int rem) {
 101   int padding = (rem + modulus - (offset() % modulus)) % modulus;
 102   if (padding > max) return;
 103   for (int c = (padding >> 2); c > 0; --c) { nop(); }
 104 }
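// For illustration of the padding computation above: with offset() == 20,
// modulus == 16 and rem == 0, padding = (0 + 16 - (20 % 16)) % 16 = 12, so three
// nops are emitted (provided max allows it) and the next instruction starts at
// offset 32.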
 105 
 106 // Issue instructions that calculate a given address (e.g. a method's TOC) from the global TOC.
 107 void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16,
 108                                                        bool add_relocation, bool emit_dummy_addr) {
 109   int offset = -1;
 110   if (emit_dummy_addr) {
 111     offset = -128; // dummy address
 112   } else if (addr != (address)(intptr_t)-1) {
 113     offset = MacroAssembler::offset_to_global_toc(addr);
 114   }
 115 
 116   if (hi16) {
 117     addis(dst, R29, MacroAssembler::largeoffset_si16_si16_hi(offset));
 118   }
 119   if (lo16) {
 120     if (add_relocation) {
 121       // Relocate at the addi to avoid confusion with a load from the method's TOC.
 122       relocate(internal_word_Relocation::spec(addr));
 123     }
 124     addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset));
 125   }
 126 }
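// With both hi16 and lo16 set, the sequence emitted above is
//
//    addis dst, R29, largeoffset_si16_si16_hi(offset)   // R29 holds the global TOC
//    addi  dst, dst, largeoffset_si16_si16_lo(offset)
//
// which is the shape that patch_calculate_address_from_global_toc_at() and
// get_address_of_calculate_address_from_global_toc_at() below expect to find.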
 127 
 128 int MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) {
 129   const int offset = MacroAssembler::offset_to_global_toc(addr);
 130 
 131   const address inst2_addr = a;
 132   const int inst2 = *(int *)inst2_addr;
 133 
 134   // The relocation points to the second instruction, the addi,
 135   // and the addi reads and writes the same register dst.
 136   const int dst = inv_rt_field(inst2);
 137   assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");
 138 
 139   // Now, find the preceding addis which writes to dst.
 140   int inst1 = 0;
 141   address inst1_addr = inst2_addr - BytesPerInstWord;
 142   while (inst1_addr >= bound) {
 143     inst1 = *(int *) inst1_addr;
 144     if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
 145       // Stop, found the addis which writes dst.
 146       break;
 147     }
 148     inst1_addr -= BytesPerInstWord;
 149   }
 150 
 151   assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
 152   set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset));
 153   set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset));
 154   return (int)((intptr_t)addr - (intptr_t)inst1_addr);
 155 }
 156 
 157 address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) {
 158   const address inst2_addr = a;
 159   const int inst2 = *(int *)inst2_addr;
 160 
 161   // The relocation points to the second instruction, the addi,
 162   // and the addi reads and writes the same register dst.
 163   const int dst = inv_rt_field(inst2);
 164   assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");
 165 
 166   // Now, find the preceding addis which writes to dst.
 167   int inst1 = 0;
 168   address inst1_addr = inst2_addr - BytesPerInstWord;
 169   while (inst1_addr >= bound) {
 170     inst1 = *(int *) inst1_addr;
 171     if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
 172       // stop, found the addis which writes dst
 173       break;
 174     }
 175     inst1_addr -= BytesPerInstWord;
 176   }
 177 
 178   assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
 179 
 180   int offset = (get_imm(inst1_addr, 0) << 16) + get_imm(inst2_addr, 0);
 181   // -1 is a special case
 182   if (offset == -1) {
 183     return (address)(intptr_t)-1;
 184   } else {
 185     return global_toc() + offset;
 186   }
 187 }
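// Note: assuming get_imm() sign-extends the 16-bit addi immediate, the
// reconstruction above inverts the rounding split; e.g. an addis immediate of 0
// combined with an addi immediate of -1 yields offset == -1, the "address not
// yet known" marker emitted by calculate_address_from_global_toc().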
 188 
 189 #ifdef _LP64
 190 // Patch compressed oops or klass constants.
 191 // Assembler sequence is
 192 // 1) compressed oops:
 193 //    lis  rx = const.hi
 194 //    ori rx = rx | const.lo
 195 // 2) compressed klass:
 196 //    lis  rx = const.hi
 197 //    clrldi rx = rx & 0xFFFFffff // clearMS32b, optional
 198 //    ori rx = rx | const.lo
 199 // The clrldi, if present, is skipped over when patching.
 200 int MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) {
 201   assert(UseCompressedOops, "Should only patch compressed oops");
 202 
 203   const address inst2_addr = a;
 204   const int inst2 = *(int *)inst2_addr;
 205 
 206   // The relocation points to the second instruction, the ori,
 207   // and the ori reads and writes the same register dst.
 208   const int dst = inv_rta_field(inst2);
 209   assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
 210   // Now, find the preceding addis which writes to dst.
 211   int inst1 = 0;
 212   address inst1_addr = inst2_addr - BytesPerInstWord;
 213   bool inst1_found = false;
 214   while (inst1_addr >= bound) {
 215     inst1 = *(int *)inst1_addr;
 216     if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
 217     inst1_addr -= BytesPerInstWord;
 218   }
 219   assert(inst1_found, "inst is not lis");
 220 
 221   int xc = (data >> 16) & 0xffff;
 222   int xd = (data >>  0) & 0xffff;
 223 
 224   set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo
 225   set_imm((int *)inst2_addr,        (xd)); // unsigned int
 226   return (int)((intptr_t)inst2_addr - (intptr_t)inst1_addr);
 227 }
 228 
 229 // Get compressed oop or klass constant.
 230 narrowOop MacroAssembler::get_narrow_oop(address a, address bound) {
 231   assert(UseCompressedOops, "Should only patch compressed oops");
 232 
 233   const address inst2_addr = a;
 234   const int inst2 = *(int *)inst2_addr;
 235 
 236   // The relocation points to the second instruction, the ori,
 237   // and the ori reads and writes the same register dst.
 238   const int dst = inv_rta_field(inst2);
 239   assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
 240   // Now, find the preceding lis which writes to dst.
 241   int inst1 = 0;
 242   address inst1_addr = inst2_addr - BytesPerInstWord;
 243   bool inst1_found = false;
 244 
 245   while (inst1_addr >= bound) {
 246     inst1 = *(int *) inst1_addr;
 247     if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break;}
 248     inst1_addr -= BytesPerInstWord;
 249   }
 250   assert(inst1_found, "inst is not lis");
 251 
 252   uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff));
 253   uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16);
 254 
 255   return (int) (xl | xh);
 256 }
 257 #endif // _LP64
 258 
 259 void MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a, Register toc) {
 260   int toc_offset = 0;
 261   // Use RelocationHolder::none for the constant pool entry, otherwise
 262   // we will end up with a failing NativeCall::verify(x) where x is
 263   // the address of the constant pool entry.
 264   // FIXME: We should insert relocation information for oops at the constant
 265   // pool entries instead of inserting it at the loads; patching of a constant
 266   // pool entry should be less expensive.
 267   address oop_address = address_constant((address)a.value(), RelocationHolder::none);
 268   // Relocate at the pc of the load.
 269   relocate(a.rspec());
 270   toc_offset = (int)(oop_address - code()->consts()->start());
 271   ld_largeoffset_unchecked(dst, toc_offset, toc, true);
 272 }
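// Summary: via ld_largeoffset_unchecked() the load emitted above is either a single
//    ld    dst, toc_offset(toc)
// for 16-bit offsets, or
//    addis dst, toc, hi;  ld dst, lo(dst)
// for larger ones. is_load_const_from_method_toc_at() below recognizes exactly these
// two shapes, and get_offset_of_load_const_from_method_toc_at() reconstructs
// toc_offset from them.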
 273 
 274 bool MacroAssembler::is_load_const_from_method_toc_at(address a) {
 275   const address inst1_addr = a;
 276   const int inst1 = *(int *)inst1_addr;
 277 
 278   // The relocation points to the ld or the addis.
 279   return (is_ld(inst1)) ||
 280          (is_addis(inst1) && inv_ra_field(inst1) != 0);
 281 }
 282 
 283 int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) {
 284   assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc");
 285 
 286   const address inst1_addr = a;
 287   const int inst1 = *(int *)inst1_addr;
 288 
 289   if (is_ld(inst1)) {
 290     return inv_d1_field(inst1);
 291   } else if (is_addis(inst1)) {
 292     const int dst = inv_rt_field(inst1);
 293 
 294     // Now, find the succeeding ld which reads and writes to dst.
 295     address inst2_addr = inst1_addr + BytesPerInstWord;
 296     int inst2 = 0;
 297     while (true) {
 298       inst2 = *(int *) inst2_addr;
 299       if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) {
 300         // Stop, found the ld which reads and writes dst.
 301         break;
 302       }
 303       inst2_addr += BytesPerInstWord;
 304     }
 305     return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2);
 306   }
 307   ShouldNotReachHere();
 308   return 0;
 309 }
 310 
 311 // Get the constant from a `load_const' sequence.
 312 long MacroAssembler::get_const(address a) {
 313   assert(is_load_const_at(a), "not a load of a constant");
 314   const int *p = (const int*) a;
 315   unsigned long x = (((unsigned long) (get_imm(a,0) & 0xffff)) << 48);
 316   if (is_ori(*(p+1))) {
 317     x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 32);
 318     x |= (((unsigned long) (get_imm(a,3) & 0xffff)) << 16);
 319     x |= (((unsigned long) (get_imm(a,4) & 0xffff)));
 320   } else if (is_lis(*(p+1))) {
 321     x |= (((unsigned long) (get_imm(a,2) & 0xffff)) << 32);
 322     x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 16);
 323     x |= (((unsigned long) (get_imm(a,3) & 0xffff)));
 324   } else {
 325     ShouldNotReachHere();
 326     return (long) 0;
 327   }
 328   return (long) x;
 329 }
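// Layout assumed by get_const() above and patch_const() below (a summary of the
// indexing in this code, not of the emitting routines): the four 16-bit halves of
// the constant sit in the immediate fields of
//  - words 0, 1, 3 and 4 (bits 63..48, 47..32, 31..16, 15..0) in the "ori" form,
//    word 2 presumably being the shift that moves the upper half into place, or
//  - words 0, 2, 1 and 3 (same bit order) in the "lis" form.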
 330 
 331 // Patch the 64-bit constant of a `load_const' sequence. This is a low-
 332 // level procedure. It neither flushes the instruction cache nor is it
 333 // MT-safe.
 334 void MacroAssembler::patch_const(address a, long x) {
 335   assert(is_load_const_at(a), "not a load of a constant");
 336   int *p = (int*) a;
 337   if (is_ori(*(p+1))) {
 338     set_imm(0 + p, (x >> 48) & 0xffff);
 339     set_imm(1 + p, (x >> 32) & 0xffff);
 340     set_imm(3 + p, (x >> 16) & 0xffff);
 341     set_imm(4 + p, x & 0xffff);
 342   } else if (is_lis(*(p+1))) {
 343     set_imm(0 + p, (x >> 48) & 0xffff);
 344     set_imm(2 + p, (x >> 32) & 0xffff);
 345     set_imm(1 + p, (x >> 16) & 0xffff);
 346     set_imm(3 + p, x & 0xffff);
 347   } else {
 348     ShouldNotReachHere();
 349   }
 350 }
 351 
 352 AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) {
 353   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
 354   int index = oop_recorder()->allocate_metadata_index(obj);
 355   RelocationHolder rspec = metadata_Relocation::spec(index);
 356   return AddressLiteral((address)obj, rspec);
 357 }
 358 
 359 AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) {
 360   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
 361   int index = oop_recorder()->find_index(obj);
 362   RelocationHolder rspec = metadata_Relocation::spec(index);
 363   return AddressLiteral((address)obj, rspec);
 364 }
 365 
 366 AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) {
 367   assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
 368   int oop_index = oop_recorder()->allocate_oop_index(obj);
 369   return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
 370 }
 371 
 372 AddressLiteral MacroAssembler::constant_oop_address(jobject obj) {
 373   assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
 374   int oop_index = oop_recorder()->find_index(obj);
 375   return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
 376 }
 377 
 378 RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
 379                                                       Register tmp, int offset) {
 380   intptr_t value = *delayed_value_addr;
 381   if (value != 0) {
 382     return RegisterOrConstant(value + offset);
 383   }
 384 
 385   // Load indirectly to solve generation ordering problem.
 386   // static address, no relocation
 387   int simm16_offset = load_const_optimized(tmp, delayed_value_addr, noreg, true);
 388   ld(tmp, simm16_offset, tmp); // must be aligned ((xa & 3) == 0)
 389 
 390   if (offset != 0) {
 391     addi(tmp, tmp, offset);
 392   }
 393 
 394   return RegisterOrConstant(tmp);
 395 }
 396 
 397 #ifndef PRODUCT
 398 void MacroAssembler::pd_print_patched_instruction(address branch) {
 399   Unimplemented(); // TODO: PPC port
 400 }
 401 #endif // ndef PRODUCT
 402 
 403 // Conditional far branch for destinations encodable in 24+2 bits.
 404 void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) {
 405 
 406   // If requested by flag optimize, relocate the bc_far as a
 407   // runtime_call and prepare for optimizing it when the code gets
 408   // relocated.
 409   if (optimize == bc_far_optimize_on_relocate) {
 410     relocate(relocInfo::runtime_call_type);
 411   }
 412 
 413   // variant 2:
 414   //
 415   //    b!cxx SKIP
 416   //    bxx   DEST
 417   //  SKIP:
 418   //
 419 
 420   const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
 421                                                 opposite_bcond(inv_boint_bcond(boint)));
 422 
 423   // We emit two branches.
 424   // First, a conditional branch which jumps around the far branch.
 425   const address not_taken_pc = pc() + 2 * BytesPerInstWord;
 426   const address bc_pc        = pc();
 427   bc(opposite_boint, biint, not_taken_pc);
 428 
 429   const int bc_instr = *(int*)bc_pc;
 430   assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition");
 431   assert(opposite_boint == inv_bo_field(bc_instr), "postcondition");
 432   assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))),
 433                                      opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))),
 434          "postcondition");
 435   assert(biint == inv_bi_field(bc_instr), "postcondition");
 436 
 437   // Second, an unconditional far branch which jumps to dest.
 438   // Note: target(dest) remembers the current pc (see CodeSection::target)
 439   //       and returns the current pc if the label is not bound yet; when
 440   //       the label gets bound, the unconditional far branch will be patched.
 441   const address target_pc = target(dest);
 442   const address b_pc  = pc();
 443   b(target_pc);
 444 
 445   assert(not_taken_pc == pc(),                     "postcondition");
 446   assert(dest.is_bound() || target_pc == b_pc, "postcondition");
 447 }
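// Note: bc_far() always emits variant 2 above. set_dest_of_bc_far_at() below may
// later rewrite the site to variant 1 (bcxx DEST; endgroup) once the destination
// is within bcxx range, or to variant 3 (nop; endgroup) when the destination is
// the instruction immediately following the two-word branch site.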
 448 
 449 bool MacroAssembler::is_bc_far_at(address instruction_addr) {
 450   return is_bc_far_variant1_at(instruction_addr) ||
 451          is_bc_far_variant2_at(instruction_addr) ||
 452          is_bc_far_variant3_at(instruction_addr);
 453 }
 454 
 455 address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) {
 456   if (is_bc_far_variant1_at(instruction_addr)) {
 457     const address instruction_1_addr = instruction_addr;
 458     const int instruction_1 = *(int*)instruction_1_addr;
 459     return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr);
 460   } else if (is_bc_far_variant2_at(instruction_addr)) {
 461     const address instruction_2_addr = instruction_addr + 4;
 462     return bxx_destination(instruction_2_addr);
 463   } else if (is_bc_far_variant3_at(instruction_addr)) {
 464     return instruction_addr + 8;
 465   }
 466   // variant 4 ???
 467   ShouldNotReachHere();
 468   return NULL;
 469 }
 470 void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) {
 471 
 472   if (is_bc_far_variant3_at(instruction_addr)) {
 473     // variant 3, far cond branch to the next instruction, already patched to nops:
 474     //
 475     //    nop
 476     //    endgroup
 477     //  SKIP/DEST:
 478     //
 479     return;
 480   }
 481 
 482   // first, extract boint and biint from the current branch
 483   int boint = 0;
 484   int biint = 0;
 485 
 486   ResourceMark rm;
 487   const int code_size = 2 * BytesPerInstWord;
 488   CodeBuffer buf(instruction_addr, code_size);
 489   MacroAssembler masm(&buf);
 490   if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) {
 491     // Far branch to next instruction: Optimize it by patching nops (produce variant 3).
 492     masm.nop();
 493     masm.endgroup();
 494   } else {
 495     if (is_bc_far_variant1_at(instruction_addr)) {
 496       // variant 1, the 1st instruction contains the destination address:
 497       //
 498       //    bcxx  DEST
 499       //    endgroup
 500       //
 501       const int instruction_1 = *(int*)(instruction_addr);
 502       boint = inv_bo_field(instruction_1);
 503       biint = inv_bi_field(instruction_1);
 504     } else if (is_bc_far_variant2_at(instruction_addr)) {
 505       // variant 2, the 2nd instruction contains the destination address:
 506       //
 507       //    b!cxx SKIP
 508       //    bxx   DEST
 509       //  SKIP:
 510       //
 511       const int instruction_1 = *(int*)(instruction_addr);
 512       boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))),
 513           opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1))));
 514       biint = inv_bi_field(instruction_1);
 515     } else {
 516       // variant 4???
 517       ShouldNotReachHere();
 518     }
 519 
 520     // second, set the new branch destination and optimize the code
 521     if (dest != instruction_addr + 4 && // the bc_far is still unbound!
 522         masm.is_within_range_of_bcxx(dest, instruction_addr)) {
 523       // variant 1:
 524       //
 525       //    bcxx  DEST
 526       //    endgroup
 527       //
 528       masm.bc(boint, biint, dest);
 529       masm.endgroup();
 530     } else {
 531       // variant 2:
 532       //
 533       //    b!cxx SKIP
 534       //    bxx   DEST
 535       //  SKIP:
 536       //
 537       const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
 538                                                     opposite_bcond(inv_boint_bcond(boint)));
 539       const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord;
 540       masm.bc(opposite_boint, biint, not_taken_pc);
 541       masm.b(dest);
 542     }
 543   }
 544   ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
 545 }
 546 
 547 // Emit a NOT MT-safe, patchable 64-bit absolute call/jump.
 548 void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) {
 549   // get current pc
 550   uint64_t start_pc = (uint64_t) pc();
 551 
 552   const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last
 553   const address pc_of_b  = (address) (start_pc + (0*BytesPerInstWord)); // b is first
 554 
 555   // relocate here
 556   if (rt != relocInfo::none) {
 557     relocate(rt);
 558   }
 559 
 560   if ( ReoptimizeCallSequences &&
 561        (( link && is_within_range_of_b(dest, pc_of_bl)) ||
 562         (!link && is_within_range_of_b(dest, pc_of_b)))) {
 563     // variant 2:
 564     // Emit an optimized, pc-relative call/jump.
 565 
 566     if (link) {
 567       // some padding
 568       nop();
 569       nop();
 570       nop();
 571       nop();
 572       nop();
 573       nop();
 574 
 575       // do the call
 576       assert(pc() == pc_of_bl, "just checking");
 577       bl(dest, relocInfo::none);
 578     } else {
 579       // do the jump
 580       assert(pc() == pc_of_b, "just checking");
 581       b(dest, relocInfo::none);
 582 
 583       // some padding
 584       nop();
 585       nop();
 586       nop();
 587       nop();
 588       nop();
 589       nop();
 590     }
 591 
 592     // Assert that we can identify the emitted call/jump.
 593     assert(is_bxx64_patchable_variant2_at((address)start_pc, link),
 594            "can't identify emitted call");
 595   } else {
 596     // variant 1:
 597     mr(R0, R11);  // spill R11 -> R0.
 598 
 599     // Load the destination address into CTR,
 600     // calculate destination relative to global toc.
 601     calculate_address_from_global_toc(R11, dest, true, true, false);
 602 
 603     mtctr(R11);
 604     mr(R11, R0);  // spill R11 <- R0.
 605     nop();
 606 
 607     // do the call/jump
 608     if (link) {
 609       bctrl();
 610     } else {
 611       bctr();
 612     }
 613     // Assert that we can identify the emitted call/jump.
 614     assert(is_bxx64_patchable_variant1b_at((address)start_pc, link),
 615            "can't identify emitted call");
 616   }
 617 
 618   // Assert that we can identify the emitted call/jump.
 619   assert(is_bxx64_patchable_at((address)start_pc, link),
 620          "can't identify emitted call");
 621   assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest,
 622          "wrong encoding of dest address");
 623 }
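// For reference, the two 7-instruction layouts emitted above (and recognized by
// the is_bxx64_patchable_* predicates below) are:
//
//  variant 1b (dest computed relative to the global TOC):
//    0: mr R0, R11    1: addis R11, R29, hi    2: addi R11, R11, lo
//    3: mtctr R11     4: mr R11, R0            5: nop    6: bctr[l]
//
//  variant 2 (pc-relative):
//    link:   nop, nop, nop, nop, nop, nop, bl dest   (bl is last)
//    !link:  b dest, nop, nop, nop, nop, nop, nop    (b is first)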
 624 
 625 // Identify a bxx64_patchable instruction.
 626 bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) {
 627   return is_bxx64_patchable_variant1b_at(instruction_addr, link)
 628     //|| is_bxx64_patchable_variant1_at(instruction_addr, link)
 629       || is_bxx64_patchable_variant2_at(instruction_addr, link);
 630 }
 631 
 632 // Does the bxx64_patchable instruction use a pc-relative encoding of
 633 // the call destination?
 634 bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) {
 635   // variant 2 is pc-relative
 636   return is_bxx64_patchable_variant2_at(instruction_addr, link);
 637 }
 638 
 639 // Identify variant 1.
 640 bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) {
 641   unsigned int* instr = (unsigned int*) instruction_addr;
 642   return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
 643     && is_mtctr(instr[5]) // mtctr
 644     && is_load_const_at(instruction_addr);
 645 }
 646 
 647 // Identify variant 1b: load destination relative to global toc.
 648 bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) {
 649   unsigned int* instr = (unsigned int*) instruction_addr;
 650   return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
 651     && is_mtctr(instr[3]) // mtctr
 652     && is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr);
 653 }
 654 
 655 // Identify variant 2.
 656 bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) {
 657   unsigned int* instr = (unsigned int*) instruction_addr;
 658   if (link) {
 659     return is_bl (instr[6])  // bl dest is last
 660       && is_nop(instr[0])  // nop
 661       && is_nop(instr[1])  // nop
 662       && is_nop(instr[2])  // nop
 663       && is_nop(instr[3])  // nop
 664       && is_nop(instr[4])  // nop
 665       && is_nop(instr[5]); // nop
 666   } else {
 667     return is_b  (instr[0])  // b  dest is first
 668       && is_nop(instr[1])  // nop
 669       && is_nop(instr[2])  // nop
 670       && is_nop(instr[3])  // nop
 671       && is_nop(instr[4])  // nop
 672       && is_nop(instr[5])  // nop
 673       && is_nop(instr[6]); // nop
 674   }
 675 }
 676 
 677 // Set dest address of a bxx64_patchable instruction.
 678 void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) {
 679   ResourceMark rm;
 680   int code_size = MacroAssembler::bxx64_patchable_size;
 681   CodeBuffer buf(instruction_addr, code_size);
 682   MacroAssembler masm(&buf);
 683   masm.bxx64_patchable(dest, relocInfo::none, link);
 684   ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
 685 }
 686 
 687 // Get dest address of a bxx64_patchable instruction.
 688 address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) {
 689   if (is_bxx64_patchable_variant1_at(instruction_addr, link)) {
 690     return (address) (unsigned long) get_const(instruction_addr);
 691   } else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) {
 692     unsigned int* instr = (unsigned int*) instruction_addr;
 693     if (link) {
 694       const int instr_idx = 6; // bl is last
 695       int branchoffset = branch_destination(instr[instr_idx], 0);
 696       return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
 697     } else {
 698       const int instr_idx = 0; // b is first
 699       int branchoffset = branch_destination(instr[instr_idx], 0);
 700       return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
 701     }
 702   // Load dest relative to global toc.
 703   } else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) {
 704     return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord,
 705                                                                instruction_addr);
 706   } else {
 707     ShouldNotReachHere();
 708     return NULL;
 709   }
 710 }
 711 
 712 // Uses ordering which corresponds to ABI:
 713 //    _savegpr0_14:  std  r14,-144(r1)
 714 //    _savegpr0_15:  std  r15,-136(r1)
 715 //    _savegpr0_16:  std  r16,-128(r1)
 716 void MacroAssembler::save_nonvolatile_gprs(Register dst, int offset) {
 717   std(R14, offset, dst);   offset += 8;
 718   std(R15, offset, dst);   offset += 8;
 719   std(R16, offset, dst);   offset += 8;
 720   std(R17, offset, dst);   offset += 8;
 721   std(R18, offset, dst);   offset += 8;
 722   std(R19, offset, dst);   offset += 8;
 723   std(R20, offset, dst);   offset += 8;
 724   std(R21, offset, dst);   offset += 8;
 725   std(R22, offset, dst);   offset += 8;
 726   std(R23, offset, dst);   offset += 8;
 727   std(R24, offset, dst);   offset += 8;
 728   std(R25, offset, dst);   offset += 8;
 729   std(R26, offset, dst);   offset += 8;
 730   std(R27, offset, dst);   offset += 8;
 731   std(R28, offset, dst);   offset += 8;
 732   std(R29, offset, dst);   offset += 8;
 733   std(R30, offset, dst);   offset += 8;
 734   std(R31, offset, dst);   offset += 8;
 735 
 736   stfd(F14, offset, dst);   offset += 8;
 737   stfd(F15, offset, dst);   offset += 8;
 738   stfd(F16, offset, dst);   offset += 8;
 739   stfd(F17, offset, dst);   offset += 8;
 740   stfd(F18, offset, dst);   offset += 8;
 741   stfd(F19, offset, dst);   offset += 8;
 742   stfd(F20, offset, dst);   offset += 8;
 743   stfd(F21, offset, dst);   offset += 8;
 744   stfd(F22, offset, dst);   offset += 8;
 745   stfd(F23, offset, dst);   offset += 8;
 746   stfd(F24, offset, dst);   offset += 8;
 747   stfd(F25, offset, dst);   offset += 8;
 748   stfd(F26, offset, dst);   offset += 8;
 749   stfd(F27, offset, dst);   offset += 8;
 750   stfd(F28, offset, dst);   offset += 8;
 751   stfd(F29, offset, dst);   offset += 8;
 752   stfd(F30, offset, dst);   offset += 8;
 753   stfd(F31, offset, dst);
 754 }
 755 
 756 // Uses ordering which corresponds to ABI:
 757 //    _restgpr0_14:  ld   r14,-144(r1)
 758 //    _restgpr0_15:  ld   r15,-136(r1)
 759 //    _restgpr0_16:  ld   r16,-128(r1)
 760 void MacroAssembler::restore_nonvolatile_gprs(Register src, int offset) {
 761   ld(R14, offset, src);   offset += 8;
 762   ld(R15, offset, src);   offset += 8;
 763   ld(R16, offset, src);   offset += 8;
 764   ld(R17, offset, src);   offset += 8;
 765   ld(R18, offset, src);   offset += 8;
 766   ld(R19, offset, src);   offset += 8;
 767   ld(R20, offset, src);   offset += 8;
 768   ld(R21, offset, src);   offset += 8;
 769   ld(R22, offset, src);   offset += 8;
 770   ld(R23, offset, src);   offset += 8;
 771   ld(R24, offset, src);   offset += 8;
 772   ld(R25, offset, src);   offset += 8;
 773   ld(R26, offset, src);   offset += 8;
 774   ld(R27, offset, src);   offset += 8;
 775   ld(R28, offset, src);   offset += 8;
 776   ld(R29, offset, src);   offset += 8;
 777   ld(R30, offset, src);   offset += 8;
 778   ld(R31, offset, src);   offset += 8;
 779 
 780   // FP registers
 781   lfd(F14, offset, src);   offset += 8;
 782   lfd(F15, offset, src);   offset += 8;
 783   lfd(F16, offset, src);   offset += 8;
 784   lfd(F17, offset, src);   offset += 8;
 785   lfd(F18, offset, src);   offset += 8;
 786   lfd(F19, offset, src);   offset += 8;
 787   lfd(F20, offset, src);   offset += 8;
 788   lfd(F21, offset, src);   offset += 8;
 789   lfd(F22, offset, src);   offset += 8;
 790   lfd(F23, offset, src);   offset += 8;
 791   lfd(F24, offset, src);   offset += 8;
 792   lfd(F25, offset, src);   offset += 8;
 793   lfd(F26, offset, src);   offset += 8;
 794   lfd(F27, offset, src);   offset += 8;
 795   lfd(F28, offset, src);   offset += 8;
 796   lfd(F29, offset, src);   offset += 8;
 797   lfd(F30, offset, src);   offset += 8;
 798   lfd(F31, offset, src);
 799 }
 800 
 801 // For verify_oops.
 802 void MacroAssembler::save_volatile_gprs(Register dst, int offset) {
 803   std(R2,  offset, dst);   offset += 8;
 804   std(R3,  offset, dst);   offset += 8;
 805   std(R4,  offset, dst);   offset += 8;
 806   std(R5,  offset, dst);   offset += 8;
 807   std(R6,  offset, dst);   offset += 8;
 808   std(R7,  offset, dst);   offset += 8;
 809   std(R8,  offset, dst);   offset += 8;
 810   std(R9,  offset, dst);   offset += 8;
 811   std(R10, offset, dst);   offset += 8;
 812   std(R11, offset, dst);   offset += 8;
 813   std(R12, offset, dst);
 814 }
 815 
 816 // For verify_oops.
 817 void MacroAssembler::restore_volatile_gprs(Register src, int offset) {
 818   ld(R2,  offset, src);   offset += 8;
 819   ld(R3,  offset, src);   offset += 8;
 820   ld(R4,  offset, src);   offset += 8;
 821   ld(R5,  offset, src);   offset += 8;
 822   ld(R6,  offset, src);   offset += 8;
 823   ld(R7,  offset, src);   offset += 8;
 824   ld(R8,  offset, src);   offset += 8;
 825   ld(R9,  offset, src);   offset += 8;
 826   ld(R10, offset, src);   offset += 8;
 827   ld(R11, offset, src);   offset += 8;
 828   ld(R12, offset, src);
 829 }
 830 
 831 void MacroAssembler::save_LR_CR(Register tmp) {
 832   mfcr(tmp);
 833   std(tmp, _abi(cr), R1_SP);
 834   mflr(tmp);
 835   std(tmp, _abi(lr), R1_SP);
 836   // Tmp must contain lr on exit! (see return_addr and prolog in ppc64.ad)
 837 }
 838 
 839 void MacroAssembler::restore_LR_CR(Register tmp) {
 840   assert(tmp != R1_SP, "must be distinct");
 841   ld(tmp, _abi(lr), R1_SP);
 842   mtlr(tmp);
 843   ld(tmp, _abi(cr), R1_SP);
 844   mtcr(tmp);
 845 }
 846 
 847 address MacroAssembler::get_PC_trash_LR(Register result) {
 848   Label L;
 849   bl(L);
 850   bind(L);
 851   address lr_pc = pc();
 852   mflr(result);
 853   return lr_pc;
 854 }
 855 
 856 void MacroAssembler::resize_frame(Register offset, Register tmp) {
 857 #ifdef ASSERT
 858   assert_different_registers(offset, tmp, R1_SP);
 859   andi_(tmp, offset, frame::alignment_in_bytes-1);
 860   asm_assert_eq("resize_frame: unaligned", 0x204);
 861 #endif
 862 
 863   // tmp <- *(SP)
 864   ld(tmp, _abi(callers_sp), R1_SP);
 865   // addr <- SP + offset;
 866   // *(addr) <- tmp;
 867   // SP <- addr
 868   stdux(tmp, R1_SP, offset);
 869 }
 870 
 871 void MacroAssembler::resize_frame(int offset, Register tmp) {
 872   assert(is_simm(offset, 16), "too big an offset");
 873   assert_different_registers(tmp, R1_SP);
 874   assert((offset & (frame::alignment_in_bytes-1))==0, "resize_frame: unaligned");
 875   // tmp <- *(SP)
 876   ld(tmp, _abi(callers_sp), R1_SP);
 877   // addr <- SP + offset;
 878   // *(addr) <- tmp;
 879   // SP <- addr
 880   stdu(tmp, offset, R1_SP);
 881 }
 882 
 883 void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) {
 884   // (addr == tmp1) || (addr == tmp2) is allowed here!
 885   assert(tmp1 != tmp2, "must be distinct");
 886 
 887   // compute offset w.r.t. current stack pointer
 888   // tmp1 <- addr - SP (!)
 889   subf(tmp1, R1_SP, addr);
 890 
 891   // atomically update SP keeping back link.
 892   resize_frame(tmp1/* offset */, tmp2/* tmp */);
 893 }
 894 
 895 void MacroAssembler::push_frame(Register bytes, Register tmp) {
 896 #ifdef ASSERT
 897   assert(bytes != R0, "r0 not allowed here");
 898   andi_(R0, bytes, frame::alignment_in_bytes-1);
 899   asm_assert_eq("push_frame(Reg, Reg): unaligned", 0x203);
 900 #endif
 901   neg(tmp, bytes);
 902   stdux(R1_SP, R1_SP, tmp);
 903 }
 904 
 905 // Push a frame of size `bytes'.
 906 void MacroAssembler::push_frame(unsigned int bytes, Register tmp) {
 907   long offset = align_addr(bytes, frame::alignment_in_bytes);
 908   if (is_simm(-offset, 16)) {
 909     stdu(R1_SP, -offset, R1_SP);
 910   } else {
 911     load_const(tmp, -offset);
 912     stdux(R1_SP, R1_SP, tmp);
 913   }
 914 }
 915 
 916 // Push a frame of size `bytes' plus abi_reg_args on top.
 917 void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) {
 918   push_frame(bytes + frame::abi_reg_args_size, tmp);
 919 }
 920 
 921 // Set up a new C frame with a spill area for non-volatile GPRs and
 922 // additional space for local variables.
 923 void MacroAssembler::push_frame_reg_args_nonvolatiles(unsigned int bytes,
 924                                                       Register tmp) {
 925   push_frame(bytes + frame::abi_reg_args_size + frame::spill_nonvolatiles_size, tmp);
 926 }
 927 
 928 // Pop current C frame.
 929 void MacroAssembler::pop_frame() {
 930   ld(R1_SP, _abi(callers_sp), R1_SP);
 931 }
 932 
 933 #if defined(ABI_ELFv2)
 934 address MacroAssembler::branch_to(Register r_function_entry, bool and_link) {
 935   // TODO(asmundak): make sure the caller uses R12 as function descriptor
 936   // most of the time.
 937   if (R12 != r_function_entry) {
 938     mr(R12, r_function_entry);
 939   }
 940   mtctr(R12);
 941   // Do a call or a branch.
 942   if (and_link) {
 943     bctrl();
 944   } else {
 945     bctr();
 946   }
 947   _last_calls_return_pc = pc();
 948 
 949   return _last_calls_return_pc;
 950 }
 951 
 952 // Call a C function via a function descriptor and use full C
 953 // calling conventions. Updates and returns _last_calls_return_pc.
 954 address MacroAssembler::call_c(Register r_function_entry) {
 955   return branch_to(r_function_entry, /*and_link=*/true);
 956 }
 957 
 958 // For tail calls: only branch, don't link, so callee returns to caller of this function.
 959 address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) {
 960   return branch_to(r_function_entry, /*and_link=*/false);
 961 }
 962 
 963 address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) {
 964   load_const(R12, function_entry, R0);
 965   return branch_to(R12,  /*and_link=*/true);
 966 }
 967 
 968 #else
 969 // Generic version of a call to C function via a function descriptor
 970 // with variable support for C calling conventions (TOC, ENV, etc.).
 971 // Updates and returns _last_calls_return_pc.
 972 address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call,
 973                                   bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) {
 974   // we emit standard ptrgl glue code here
 975   assert((function_descriptor != R0), "function_descriptor cannot be R0");
 976 
 977   // retrieve necessary entries from the function descriptor
 978   ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor);
 979   mtctr(R0);
 980 
 981   if (load_toc_of_callee) {
 982     ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor);
 983   }
 984   if (load_env_of_callee) {
 985     ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor);
 986   } else if (load_toc_of_callee) {
 987     li(R11, 0);
 988   }
 989 
 990   // do a call or a branch
 991   if (and_link) {
 992     bctrl();
 993   } else {
 994     bctr();
 995   }
 996   _last_calls_return_pc = pc();
 997 
 998   return _last_calls_return_pc;
 999 }
1000 
1001 // Call a C function via a function descriptor and use full C calling
1002 // conventions.
1003 // We don't use the TOC in generated code, so there is no need to save
1004 // and restore its value.
1005 address MacroAssembler::call_c(Register fd) {
1006   return branch_to(fd, /*and_link=*/true,
1007                        /*save toc=*/false,
1008                        /*restore toc=*/false,
1009                        /*load toc=*/true,
1010                        /*load env=*/true);
1011 }
1012 
1013 address MacroAssembler::call_c_and_return_to_caller(Register fd) {
1014   return branch_to(fd, /*and_link=*/false,
1015                        /*save toc=*/false,
1016                        /*restore toc=*/false,
1017                        /*load toc=*/true,
1018                        /*load env=*/true);
1019 }
1020 
1021 address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) {
1022   if (rt != relocInfo::none) {
1023     // this call needs to be relocatable
1024     if (!ReoptimizeCallSequences
1025         || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
1026         || fd == NULL   // support code-size estimation
1027         || !fd->is_friend_function()
1028         || fd->entry() == NULL) {
1029       // it's not a friend function as defined by class FunctionDescriptor,
1030       // so do a full call-c here.
1031       load_const(R11, (address)fd, R0);
1032 
1033       bool has_env = (fd != NULL && fd->env() != NULL);
1034       return branch_to(R11, /*and_link=*/true,
1035                             /*save toc=*/false,
1036                             /*restore toc=*/false,
1037                             /*load toc=*/true,
1038                             /*load env=*/has_env);
1039     } else {
1040       // It's a friend function. Load the entry point and don't care about
1041       // toc and env. Use an optimizable call instruction, but ensure the
1042       // same code-size as in the case of a non-friend function.
1043       nop();
1044       nop();
1045       nop();
1046       bl64_patchable(fd->entry(), rt);
1047       _last_calls_return_pc = pc();
1048       return _last_calls_return_pc;
1049     }
1050   } else {
1051     // This call does not need to be relocatable, so we can apply more
1052     // aggressive optimizations.
1053     if (!ReoptimizeCallSequences
1054       || !fd->is_friend_function()) {
1055       // It's not a friend function as defined by class FunctionDescriptor,
1056       // so do a full call-c here.
1057       load_const(R11, (address)fd, R0);
1058       return branch_to(R11, /*and_link=*/true,
1059                             /*save toc=*/false,
1060                             /*restore toc=*/false,
1061                             /*load toc=*/true,
1062                             /*load env=*/true);
1063     } else {
1064       // it's a friend function, load the entry point and don't care about
1065       // toc and env.
1066       address dest = fd->entry();
1067       if (is_within_range_of_b(dest, pc())) {
1068         bl(dest);
1069       } else {
1070         bl64_patchable(dest, rt);
1071       }
1072       _last_calls_return_pc = pc();
1073       return _last_calls_return_pc;
1074     }
1075   }
1076 }
1077 
1078 // Call a C function.  All constants needed reside in TOC.
1079 //
1080 // Read the address to call from the TOC.
1081 // Read env from TOC, if fd specifies an env.
1082 // Read new TOC from TOC.
1083 address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd,
1084                                          relocInfo::relocType rt, Register toc) {
1085   if (!ReoptimizeCallSequences
1086     || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
1087     || !fd->is_friend_function()) {
1088     // It's not a friend function as defined by class FunctionDescriptor,
1089     // so do a full call-c here.
1090     assert(fd->entry() != NULL, "function must be linked");
1091 
1092     AddressLiteral fd_entry(fd->entry());
1093     load_const_from_method_toc(R11, fd_entry, toc);
1094     mtctr(R11);
1095     if (fd->env() == NULL) {
1096       li(R11, 0);
1097       nop();
1098     } else {
1099       AddressLiteral fd_env(fd->env());
1100       load_const_from_method_toc(R11, fd_env, toc);
1101     }
1102     AddressLiteral fd_toc(fd->toc());
1103     load_toc_from_toc(R2_TOC, fd_toc, toc);
1104     // R2_TOC is killed.
1105     bctrl();
1106     _last_calls_return_pc = pc();
1107   } else {
1108     // It's a friend function, load the entry point and don't care about
1109     // toc and env. Use an optimizable call instruction, but ensure the
1110     // same code-size as in the case of a non-friend function.
1111     nop();
1112     bl64_patchable(fd->entry(), rt);
1113     _last_calls_return_pc = pc();
1114   }
1115   return _last_calls_return_pc;
1116 }
1117 #endif // ABI_ELFv2
1118 
1119 void MacroAssembler::call_VM_base(Register oop_result,
1120                                   Register last_java_sp,
1121                                   address  entry_point,
1122                                   bool     check_exceptions) {
1123   BLOCK_COMMENT("call_VM {");
1124   // Determine last_java_sp register.
1125   if (!last_java_sp->is_valid()) {
1126     last_java_sp = R1_SP;
1127   }
1128   set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1);
1129 
1130   // ARG1 must hold thread address.
1131   mr(R3_ARG1, R16_thread);
1132 #if defined(ABI_ELFv2)
1133   address return_pc = call_c(entry_point, relocInfo::none);
1134 #else
1135   address return_pc = call_c((FunctionDescriptor*)entry_point, relocInfo::none);
1136 #endif
1137 
1138   reset_last_Java_frame();
1139 
1140   // Check for pending exceptions.
1141   if (check_exceptions) {
1142     // Exception checking is not implemented on this path; callers must pass check_exceptions == false.
1143     ShouldNotReachHere();
1144   }
1145 
1146   // Get oop result if there is one and reset the value in the thread.
1147   if (oop_result->is_valid()) {
1148     get_vm_result(oop_result);
1149   }
1150 
1151   _last_calls_return_pc = return_pc;
1152   BLOCK_COMMENT("} call_VM");
1153 }
1154 
1155 void MacroAssembler::call_VM_leaf_base(address entry_point) {
1156   BLOCK_COMMENT("call_VM_leaf {");
1157 #if defined(ABI_ELFv2)
1158   call_c(entry_point, relocInfo::none);
1159 #else
1160   call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, entry_point), relocInfo::none);
1161 #endif
1162   BLOCK_COMMENT("} call_VM_leaf");
1163 }
1164 
1165 void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions) {
1166   call_VM_base(oop_result, noreg, entry_point, check_exceptions);
1167 }
1168 
1169 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1,
1170                              bool check_exceptions) {
1171   // R3_ARG1 is reserved for the thread.
1172   mr_if_needed(R4_ARG2, arg_1);
1173   call_VM(oop_result, entry_point, check_exceptions);
1174 }
1175 
1176 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2,
1177                              bool check_exceptions) {
1178   // R3_ARG1 is reserved for the thread
1179   mr_if_needed(R4_ARG2, arg_1);
1180   assert(arg_2 != R4_ARG2, "smashed argument");
1181   mr_if_needed(R5_ARG3, arg_2);
1182   call_VM(oop_result, entry_point, check_exceptions);
1183 }
1184 
1185 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3,
1186                              bool check_exceptions) {
1187   // R3_ARG1 is reserved for the thread
1188   mr_if_needed(R4_ARG2, arg_1);
1189   assert(arg_2 != R4_ARG2, "smashed argument");
1190   mr_if_needed(R5_ARG3, arg_2);
1191   mr_if_needed(R6_ARG4, arg_3);
1192   call_VM(oop_result, entry_point, check_exceptions);
1193 }
1194 
1195 void MacroAssembler::call_VM_leaf(address entry_point) {
1196   call_VM_leaf_base(entry_point);
1197 }
1198 
1199 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) {
1200   mr_if_needed(R3_ARG1, arg_1);
1201   call_VM_leaf(entry_point);
1202 }
1203 
1204 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) {
1205   mr_if_needed(R3_ARG1, arg_1);
1206   assert(arg_2 != R3_ARG1, "smashed argument");
1207   mr_if_needed(R4_ARG2, arg_2);
1208   call_VM_leaf(entry_point);
1209 }
1210 
1211 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) {
1212   mr_if_needed(R3_ARG1, arg_1);
1213   assert(arg_2 != R3_ARG1, "smashed argument");
1214   mr_if_needed(R4_ARG2, arg_2);
1215   assert(arg_3 != R3_ARG1 && arg_3 != R4_ARG2, "smashed argument");
1216   mr_if_needed(R5_ARG3, arg_3);
1217   call_VM_leaf(entry_point);
1218 }
1219 
1220 // Check whether instruction is a read access to the polling page
1221 // which was emitted by load_from_polling_page(..).
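// The shape matched below is a zero-displacement load into R0 from the register
// holding the polling page address, i.e.  ld R0, 0(ra)  with ra != 0.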
1222 bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext,
1223                                                address* polling_address_ptr) {
1224   if (!is_ld(instruction))
1225     return false; // It's not a ld. Fail.
1226 
1227   int rt = inv_rt_field(instruction);
1228   int ra = inv_ra_field(instruction);
1229   int ds = inv_ds_field(instruction);
1230   if (!(ds == 0 && ra != 0 && rt == 0)) {
1231     return false; // It's not a ld(r0, X, ra). Fail.
1232   }
1233 
1234   if (!ucontext) {
1235     // Set polling address.
1236     if (polling_address_ptr != NULL) {
1237       *polling_address_ptr = NULL;
1238     }
1239     return true; // No ucontext given. Can't check value of ra. Assume true.
1240   }
1241 
1242 #ifdef LINUX
1243   // Ucontext given. Check that register ra contains the address of
1244   // the safepoint polling page.
1245   ucontext_t* uc = (ucontext_t*) ucontext;
1246   // Set polling address.
1247   address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds;
1248   if (polling_address_ptr != NULL) {
1249     *polling_address_ptr = addr;
1250   }
1251   return os::is_poll_address(addr);
1252 #else
1253   // Not on Linux, ucontext must be NULL.
1254   ShouldNotReachHere();
1255   return false;
1256 #endif
1257 }
1258 
1259 bool MacroAssembler::is_memory_serialization(int instruction, JavaThread* thread, void* ucontext) {
1260 #ifdef LINUX
1261   ucontext_t* uc = (ucontext_t*) ucontext;
1262 
1263   if (is_stwx(instruction) || is_stwux(instruction)) {
1264     int ra = inv_ra_field(instruction);
1265     int rb = inv_rb_field(instruction);
1266 
1267     // look up content of ra and rb in ucontext
1268     address ra_val=(address)uc->uc_mcontext.regs->gpr[ra];
1269     long rb_val=(long)uc->uc_mcontext.regs->gpr[rb];
1270     return os::is_memory_serialize_page(thread, ra_val+rb_val);
1271   } else if (is_stw(instruction) || is_stwu(instruction)) {
1272     int ra = inv_ra_field(instruction);
1273     int d1 = inv_d1_field(instruction);
1274 
1275     // look up content of ra in ucontext
1276     address ra_val=(address)uc->uc_mcontext.regs->gpr[ra];
1277     return os::is_memory_serialize_page(thread, ra_val+d1);
1278   } else {
1279     return false;
1280   }
1281 #else
1282   // workaround not needed on !LINUX :-)
1283   ShouldNotCallThis();
1284   return false;
1285 #endif
1286 }
1287 
1288 void MacroAssembler::bang_stack_with_offset(int offset) {
1289   // When increasing the stack, the old stack pointer will be written
1290   // to the new top of stack according to the PPC64 ABI.
1291   // Therefore, stack banging is not necessary when increasing
1292   // the stack by <= os::vm_page_size() bytes.
1293   // When increasing the stack by a larger amount, this method is
1294   // called repeatedly to bang the intermediate pages.
1295 
1296   // Stack grows down, caller passes positive offset.
1297   assert(offset > 0, "must bang with positive offset");
1298 
1299   long stdoffset = -offset;
1300 
1301   if (is_simm(stdoffset, 16)) {
1302     // Signed 16 bit offset, a simple std is ok.
1303     if (UseLoadInstructionsForStackBangingPPC64) {
1304       ld(R0, (int)(signed short)stdoffset, R1_SP);
1305     } else {
1306       std(R0, (int)(signed short)stdoffset, R1_SP);
1307     }
1308   } else if (is_simm(stdoffset, 31)) {
1309     const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset);
1310     const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset);
1311 
1312     Register tmp = R11;
1313     addis(tmp, R1_SP, hi);
1314     if (UseLoadInstructionsForStackBangingPPC64) {
1315       ld(R0,  lo, tmp);
1316     } else {
1317       std(R0, lo, tmp);
1318     }
1319   } else {
1320     ShouldNotReachHere();
1321   }
1322 }
1323 
1324 // If instruction is a stack bang of the form
1325 //    std    R0,    x(Ry),       (see bang_stack_with_offset())
1326 //    stdu   R1_SP, x(R1_SP),    (see push_frame(), resize_frame())
1327 // or stdux  R1_SP, Rx, R1_SP    (see push_frame(), resize_frame())
1328 // return the banged address. Otherwise, return 0.
1329 address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) {
1330 #ifdef LINUX
1331   ucontext_t* uc = (ucontext_t*) ucontext;
1332   int rs = inv_rs_field(instruction);
1333   int ra = inv_ra_field(instruction);
1334   if (   (is_ld(instruction)   && rs == 0 &&  UseLoadInstructionsForStackBangingPPC64)
1335       || (is_std(instruction)  && rs == 0 && !UseLoadInstructionsForStackBangingPPC64)
1336       || (is_stdu(instruction) && rs == 1)) {
1337     int ds = inv_ds_field(instruction);
1338     // return banged address
1339     return ds+(address)uc->uc_mcontext.regs->gpr[ra];
1340   } else if (is_stdux(instruction) && rs == 1) {
1341     int rb = inv_rb_field(instruction);
1342     address sp = (address)uc->uc_mcontext.regs->gpr[1];
1343     long rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
1344     return ra != 1 || rb_val >= 0 ? NULL         // not a stack bang
1345                                   : sp + rb_val; // banged address
1346   }
1347   return NULL; // not a stack bang
1348 #else
1349   // workaround not needed on !LINUX :-)
1350   ShouldNotCallThis();
1351   return NULL;
1352 #endif
1353 }
1354 
1355 // CmpxchgX sets condition register to cmpX(current, compare).
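// Semantics (same pattern as the cmpxchgd() documented further below, but on a
// 32-bit word):
//   dest_current_value = *addr_base;
//   if (dest_current_value == compare_value) {
//     *addr_base = exchange_value;
//     if (int_flag_success != noreg) int_flag_success = 1;
//   } else {
//     if (int_flag_success != noreg) int_flag_success = 0;
//   }
//   // flag is eq iff the swap happened.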
1356 void MacroAssembler::cmpxchgw(ConditionRegister flag, Register dest_current_value,
1357                               Register compare_value, Register exchange_value,
1358                               Register addr_base, int semantics, bool cmpxchgx_hint,
1359                               Register int_flag_success, bool contention_hint) {
1360   Label retry;
1361   Label failed;
1362   Label done;
1363 
1364   // Save one branch if result is returned via register and
1365   // result register is different from the other ones.
1366   bool use_result_reg    = (int_flag_success != noreg);
1367   bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value &&
1368                             int_flag_success != exchange_value && int_flag_success != addr_base);
1369 
1370   // release/fence semantics
1371   if (semantics & MemBarRel) {
1372     release();
1373   }
1374 
1375   if (use_result_reg && preset_result_reg) {
1376     li(int_flag_success, 0); // preset (assume cas failed)
1377   }
1378 
1379   // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1380   if (contention_hint) { // Don't try to reserve if cmp fails.
1381     lwz(dest_current_value, 0, addr_base);
1382     cmpw(flag, dest_current_value, compare_value);
1383     bne(flag, failed);
1384   }
1385 
1386   // atomic emulation loop
1387   bind(retry);
1388 
1389   lwarx(dest_current_value, addr_base, cmpxchgx_hint);
1390   cmpw(flag, dest_current_value, compare_value);
1391   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1392     bne_predict_not_taken(flag, failed);
1393   } else {
1394     bne(                  flag, failed);
1395   }
1396   // branch to failed => (flag == ne), (dest_current_value != compare_value)
1397   // fall through    => (flag == eq), (dest_current_value == compare_value)
1398 
1399   stwcx_(exchange_value, addr_base);
1400   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1401     bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1402   } else {
1403     bne(                  CCR0, retry); // StXcx_ sets CCR0.
1404   }
1405   // fall through    => (flag == eq), (dest_current_value == compare_value), (swapped)
1406 
1407   // Result in register (must do this at the end because int_flag_success can be the
1408   // same register as one above).
1409   if (use_result_reg) {
1410     li(int_flag_success, 1);
1411   }
1412 
1413   if (semantics & MemBarFenceAfter) {
1414     fence();
1415   } else if (semantics & MemBarAcq) {
1416     isync();
1417   }
1418 
1419   if (use_result_reg && !preset_result_reg) {
1420     b(done);
1421   }
1422 
1423   bind(failed);
1424   if (use_result_reg && !preset_result_reg) {
1425     li(int_flag_success, 0);
1426   }
1427 
1428   bind(done);
1429   // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1430   // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1431 }
1432 
1433 // Performs atomic compare exchange:
1434 //   if (compare_value == *addr_base)
1435 //     *addr_base = exchange_value
1436 //     int_flag_success = 1;
1437 //   else
1438 //     int_flag_success = 0;
1439 //
1440 // ConditionRegister flag       = cmp(compare_value, *addr_base)
1441 // Register dest_current_value  = *addr_base
1442 // Register compare_value       Used to compare with value in memory
1443 // Register exchange_value      Written to memory if compare_value == *addr_base
1444 // Register addr_base           The memory location to compareXChange
1445 // Register int_flag_success    Set to 1 if exchange_value was written to *addr_base
1446 //
1447 // To avoid the costly compare exchange, the value can be tested beforehand (contention_hint).
1448 // Several special cases exist to avoid generating unnecessary code.
1449 //
1450 void MacroAssembler::cmpxchgd(ConditionRegister flag,
1451                               Register dest_current_value, RegisterOrConstant compare_value, Register exchange_value,
1452                               Register addr_base, int semantics, bool cmpxchgx_hint,
1453                               Register int_flag_success, Label* failed_ext, bool contention_hint) {
1454   Label retry;
1455   Label failed_int;
1456   Label& failed = (failed_ext != NULL) ? *failed_ext : failed_int;
1457   Label done;
1458 
1459   // Save one branch if result is returned via register and result register is different from the other ones.
1460   bool use_result_reg    = (int_flag_success != noreg);
1461   bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value.register_or_noreg() &&
1462                             int_flag_success != exchange_value && int_flag_success != addr_base);
1463   assert(int_flag_success == noreg || failed_ext == NULL, "cannot have both");
1464 
1465   // release/fence semantics
1466   if (semantics & MemBarRel) {
1467     release();
1468   }
1469 
1470   if (use_result_reg && preset_result_reg) {
1471     li(int_flag_success, 0); // preset (assume cas failed)
1472   }
1473 
1474   // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1475   if (contention_hint) { // Don't try to reserve if cmp fails.
1476     ld(dest_current_value, 0, addr_base);
1477     cmpd(flag, compare_value, dest_current_value);
1478     bne(flag, failed);
1479   }
1480 
1481   // atomic emulation loop
1482   bind(retry);
1483 
1484   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1485   cmpd(flag, compare_value, dest_current_value);
1486   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1487     bne_predict_not_taken(flag, failed);
1488   } else {
1489     bne(                  flag, failed);
1490   }
1491 
1492   stdcx_(exchange_value, addr_base);
1493   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1494     bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
1495   } else {
1496     bne(                  CCR0, retry); // stXcx_ sets CCR0
1497   }
1498 
1499   // result in register (must do this at the end because int_flag_success can be the same register as one above)
1500   if (use_result_reg) {
1501     li(int_flag_success, 1);
1502   }
1503 
1504   // POWER6 doesn't need isync in CAS.
1505   // Always emit isync to be on the safe side.
1506   if (semantics & MemBarFenceAfter) {
1507     fence();
1508   } else if (semantics & MemBarAcq) {
1509     isync();
1510   }
1511 
1512   if (use_result_reg && !preset_result_reg) {
1513     b(done);
1514   }
1515 
1516   bind(failed_int);
1517   if (use_result_reg && !preset_result_reg) {
1518     li(int_flag_success, 0);
1519   }
1520 
1521   bind(done);
1522   // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1523   // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1524 }
1525 
1526 // Look up the method for a megamorphic invokeinterface call.
1527 // The target method is determined by <intf_klass, itable_index>.
1528 // The receiver klass is in recv_klass.
1529 // On success, the result will be in method_result, and execution falls through.
1530 // On failure, execution transfers to the given label.
1531 void MacroAssembler::lookup_interface_method(Register recv_klass,
1532                                              Register intf_klass,
1533                                              RegisterOrConstant itable_index,
1534                                              Register method_result,
1535                                              Register scan_temp,
1536                                              Register sethi_temp,
1537                                              Label& L_no_such_interface) {
1538   assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);
1539   assert(itable_index.is_constant() || itable_index.as_register() == method_result,
1540          "caller must use same register for non-constant itable index as for method");
1541 
1542   // Compute start of first itableOffsetEntry (which is at the end of the vtable).
1543   int vtable_base = InstanceKlass::vtable_start_offset() * wordSize;
1544   int itentry_off = itableMethodEntry::method_offset_in_bytes();
1545   int logMEsize   = exact_log2(itableMethodEntry::size() * wordSize);
1546   int scan_step   = itableOffsetEntry::size() * wordSize;
1547   int log_vte_size= exact_log2(vtableEntry::size() * wordSize);
1548 
1549   lwz(scan_temp, InstanceKlass::vtable_length_offset() * wordSize, recv_klass);
1550   // %%% We should store the aligned, prescaled offset in the klassoop.
1551   // Then the next several instructions would fold away.
1552 
1553   sldi(scan_temp, scan_temp, log_vte_size);
1554   addi(scan_temp, scan_temp, vtable_base);
1555   add(scan_temp, recv_klass, scan_temp);
1556 
1557   // Adjust recv_klass by scaled itable_index, so we can free itable_index.
1558   if (itable_index.is_register()) {
1559     Register itable_offset = itable_index.as_register();
1560     sldi(itable_offset, itable_offset, logMEsize);
1561     if (itentry_off) addi(itable_offset, itable_offset, itentry_off);
1562     add(recv_klass, itable_offset, recv_klass);
1563   } else {
1564     long itable_offset = (long)itable_index.as_constant();
1565     load_const_optimized(sethi_temp, (itable_offset<<logMEsize)+itentry_off); // static address, no relocation
1566     add(recv_klass, sethi_temp, recv_klass);
1567   }
1568 
1569   // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
1570   //   if (scan->interface() == intf) {
1571   //     result = (klass + scan->offset() + itable_index);
1572   //   }
1573   // }
1574   Label search, found_method;
1575 
1576   for (int peel = 1; peel >= 0; peel--) {
1577     // %%%% Could load both offset and interface in one ldx, if they were
1578     // in the opposite order. This would save a load.
1579     ld(method_result, itableOffsetEntry::interface_offset_in_bytes(), scan_temp);
1580 
1581     // Check that this entry is non-null. A null entry means that
1582     // the receiver class doesn't implement the interface, and wasn't the
1583     // same as when the caller was compiled.
1584     cmpd(CCR0, method_result, intf_klass);
1585 
1586     if (peel) {
1587       beq(CCR0, found_method);
1588     } else {
1589       bne(CCR0, search);
1590       // (invert the test to fall through to found_method...)
1591     }
1592 
1593     if (!peel) break;
1594 
1595     bind(search);
1596 
1597     cmpdi(CCR0, method_result, 0);
1598     beq(CCR0, L_no_such_interface);
1599     addi(scan_temp, scan_temp, scan_step);
1600   }
1601 
1602   bind(found_method);
1603 
1604   // Got a hit.
1605   int ito_offset = itableOffsetEntry::offset_offset_in_bytes();
1606   lwz(scan_temp, ito_offset, scan_temp);
1607   ldx(method_result, scan_temp, recv_klass);
1608 }
1609 
1610 // virtual method calling
1611 void MacroAssembler::lookup_virtual_method(Register recv_klass,
1612                                            RegisterOrConstant vtable_index,
1613                                            Register method_result) {
1614 
1615   assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg());
1616 
1617   const int base = InstanceKlass::vtable_start_offset() * wordSize;
1618   assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
1619 
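  // In effect (sketch):
  //   R19_method = *(recv_klass + vtable_start + vtable_index * wordSize + method_offset)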
1620   if (vtable_index.is_register()) {
1621     sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord);
1622     add(recv_klass, vtable_index.as_register(), recv_klass);
1623   } else {
1624     addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord);
1625   }
1626   ld(R19_method, base + vtableEntry::method_offset_in_bytes(), recv_klass);
1627 }
1628 
1629 /////////////////////////////////////////// subtype checking ////////////////////////////////////////////
1630 
1631 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
1632                                                    Register super_klass,
1633                                                    Register temp1_reg,
1634                                                    Register temp2_reg,
1635                                                    Label& L_success,
1636                                                    Label& L_failure) {
1637 
1638   const Register check_cache_offset = temp1_reg;
1639   const Register cached_super       = temp2_reg;
1640 
1641   assert_different_registers(sub_klass, super_klass, check_cache_offset, cached_super);
1642 
1643   int sco_offset = in_bytes(Klass::super_check_offset_offset());
1644   int sc_offset  = in_bytes(Klass::secondary_super_cache_offset());
1645 
1646   // If the pointers are equal, we are done (e.g., String[] elements).
1647   // This self-check enables sharing of secondary supertype arrays among
1648   // non-primary types such as array-of-interface. Otherwise, each such
1649   // type would need its own customized SSA.
1650   // We move this check to the front of the fast path because many
1651   // type checks are in fact trivially successful in this manner,
1652   // so we get a nicely predicted branch right at the start of the check.
1653   cmpd(CCR0, sub_klass, super_klass);
1654   beq(CCR0, L_success);
1655 
1656   // Check the supertype display:
1657   lwz(check_cache_offset, sco_offset, super_klass);
1658   // The loaded value is the offset from KlassOopDesc.
1659 
1660   ldx(cached_super, check_cache_offset, sub_klass);
1661   cmpd(CCR0, cached_super, super_klass);
1662   beq(CCR0, L_success);
1663 
1664   // This check has worked decisively for primary supers.
1665   // Secondary supers are sought in the super_cache ('super_cache_addr').
1666   // (Secondary supers are interfaces and very deeply nested subtypes.)
1667   // This works in the same check above because of a tricky aliasing
1668   // between the super_cache and the primary super display elements.
1669   // (The 'super_check_addr' can address either, as the case requires.)
1670   // Note that the cache is updated below if it does not help us find
1671   // what we need immediately.
1672   // So if it was a primary super, we can just fail immediately.
1673   // Otherwise, it's the slow path for us (no success at this point).
1674 
1675   cmpwi(CCR0, check_cache_offset, sc_offset);
1676   bne(CCR0, L_failure);
1677   // bind(slow_path); // fallthru
1678 }
1679 
1680 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
1681                                                    Register super_klass,
1682                                                    Register temp1_reg,
1683                                                    Register temp2_reg,
1684                                                    Label* L_success,
1685                                                    Register result_reg) {
1686   const Register array_ptr = temp1_reg; // current value from cache array
1687   const Register temp      = temp2_reg;
1688 
1689   assert_different_registers(sub_klass, super_klass, array_ptr, temp);
1690 
1691   int source_offset = in_bytes(Klass::secondary_supers_offset());
1692   int target_offset = in_bytes(Klass::secondary_super_cache_offset());
1693 
1694   int length_offset = Array<Klass*>::length_offset_in_bytes();
1695   int base_offset   = Array<Klass*>::base_offset_in_bytes();
1696 
1697   Label hit, loop, failure, fallthru;
1698 
1699   ld(array_ptr, source_offset, sub_klass);
1700 
1701   //assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated.");
1702   lwz(temp, length_offset, array_ptr);
1703   cmpwi(CCR0, temp, 0);
1704   beq(CCR0, result_reg!=noreg ? failure : fallthru); // length 0
1705 
1706   mtctr(temp); // load ctr
1707 
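  // Scan the secondary supers array (sketch):
  //   for (i = 0; i < length; i++)
  //     if (secondary_supers[i] == super_klass) goto hit;  // hit also updates the super cache
  //   goto failure;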
1708   bind(loop);
1709   // Oops in the table are no longer compressed.
1710   ld(temp, base_offset, array_ptr);
1711   cmpd(CCR0, temp, super_klass);
1712   beq(CCR0, hit);
1713   addi(array_ptr, array_ptr, BytesPerWord);
1714   bdnz(loop);
1715 
1716   bind(failure);
1717   if (result_reg!=noreg) li(result_reg, 1); // load non-zero result (indicates a miss)
1718   b(fallthru);
1719 
1720   bind(hit);
1721   std(super_klass, target_offset, sub_klass); // save result to cache
1722   if (result_reg != noreg) li(result_reg, 0); // load zero result (indicates a hit)
1723   if (L_success != NULL) b(*L_success);
1724 
1725   bind(fallthru);
1726 }
1727 
1728 // Try fast path, then go to slow one if not successful
1729 void MacroAssembler::check_klass_subtype(Register sub_klass,
1730                          Register super_klass,
1731                          Register temp1_reg,
1732                          Register temp2_reg,
1733                          Label& L_success) {
1734   Label L_failure;
1735   check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, L_success, L_failure);
1736   check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success);
1737   bind(L_failure); // Fallthru if not successful.
1738 }
1739 
1740 void MacroAssembler::check_method_handle_type(Register mtype_reg, Register mh_reg,
1741                                               Register temp_reg,
1742                                               Label& wrong_method_type) {
1743   assert_different_registers(mtype_reg, mh_reg, temp_reg);
1744   // Compare method type against that of the receiver.
1745   load_heap_oop_not_null(temp_reg, delayed_value(java_lang_invoke_MethodHandle::type_offset_in_bytes, temp_reg), mh_reg);
1746   cmpd(CCR0, temp_reg, mtype_reg);
1747   bne(CCR0, wrong_method_type);
1748 }
1749 
1750 RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot,
1751                                                    Register temp_reg,
1752                                                    int extra_slot_offset) {
1753   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
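  // Yields (arg_slot + extra_slot_offset) * stackElementSize, either folded into a
  // constant or computed into temp_reg.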
1754   int stackElementSize = Interpreter::stackElementSize;
1755   int offset = extra_slot_offset * stackElementSize;
1756   if (arg_slot.is_constant()) {
1757     offset += arg_slot.as_constant() * stackElementSize;
1758     return offset;
1759   } else {
1760     assert(temp_reg != noreg, "must specify");
1761     sldi(temp_reg, arg_slot.as_register(), exact_log2(stackElementSize));
1762     if (offset != 0)
1763       addi(temp_reg, temp_reg, offset);
1764     return temp_reg;
1765   }
1766 }
1767 
1768 void MacroAssembler::biased_locking_enter(ConditionRegister cr_reg, Register obj_reg,
1769                                           Register mark_reg, Register temp_reg,
1770                                           Register temp2_reg, Label& done, Label* slow_case) {
1771   assert(UseBiasedLocking, "why call this otherwise?");
1772 
1773 #ifdef ASSERT
1774   assert_different_registers(obj_reg, mark_reg, temp_reg, temp2_reg);
1775 #endif
1776 
1777   Label cas_label;
1778 
1779   // Branch to done if fast path fails and no slow_case provided.
1780   Label *slow_case_int = (slow_case != NULL) ? slow_case : &done;
1781 
1782   // Biased locking
1783   // See whether the lock is currently biased toward our thread and
1784   // whether the epoch is still valid
1785   // Note that the runtime guarantees sufficient alignment of JavaThread
1786   // pointers to allow age to be placed into low bits
1787   assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits,
1788          "biased locking makes assumptions about bit layout");
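  // Biased mark word layout on 64 bit (cf. markOop.hpp, sketch):
  //   [ JavaThread* | epoch | age | biased_lock bit | lock bits (01) ]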
1789 
1790   if (PrintBiasedLockingStatistics) {
1791     load_const(temp_reg, (address) BiasedLocking::total_entry_count_addr(), temp2_reg);
1792     lwz(temp2_reg, 0, temp_reg);
1793     addi(temp2_reg, temp2_reg, 1);
1794     stw(temp2_reg, 0, temp_reg);
1795   }
1796 
1797   andi(temp_reg, mark_reg, markOopDesc::biased_lock_mask_in_place);
1798   cmpwi(cr_reg, temp_reg, markOopDesc::biased_lock_pattern);
1799   bne(cr_reg, cas_label);
1800 
1801   load_klass(temp_reg, obj_reg);
1802 
1803   load_const_optimized(temp2_reg, ~((int) markOopDesc::age_mask_in_place));
1804   ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
1805   orr(temp_reg, R16_thread, temp_reg);
1806   xorr(temp_reg, mark_reg, temp_reg);
1807   andr(temp_reg, temp_reg, temp2_reg);
1808   cmpdi(cr_reg, temp_reg, 0);
1809   if (PrintBiasedLockingStatistics) {
1810     Label l;
1811     bne(cr_reg, l);
1812     load_const(mark_reg, (address) BiasedLocking::biased_lock_entry_count_addr());
1813     lwz(temp2_reg, 0, mark_reg);
1814     addi(temp2_reg, temp2_reg, 1);
1815     stw(temp2_reg, 0, mark_reg);
1816     // restore mark_reg
1817     ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg);
1818     bind(l);
1819   }
1820   beq(cr_reg, done);
1821 
1822   Label try_revoke_bias;
1823   Label try_rebias;
1824 
1825   // At this point we know that the header has the bias pattern and
1826   // that we are not the bias owner in the current epoch. We need to
1827   // figure out more details about the state of the header in order to
1828   // know what operations can be legally performed on the object's
1829   // header.
1830 
1831   // If the low three bits in the xor result aren't clear, that means
1832   // the prototype header is no longer biased and we have to revoke
1833   // the bias on this object.
1834   andi(temp2_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
1835   cmpwi(cr_reg, temp2_reg, 0);
1836   bne(cr_reg, try_revoke_bias);
1837 
1838   // Biasing is still enabled for this data type. See whether the
1839   // epoch of the current bias is still valid, meaning that the epoch
1840   // bits of the mark word are equal to the epoch bits of the
1841   // prototype header. (Note that the prototype header's epoch bits
1842   // only change at a safepoint.) If not, attempt to rebias the object
1843   // toward the current thread. Note that we must be absolutely sure
1844   // that the current epoch is invalid in order to do this because
1845   // otherwise the manipulations it performs on the mark word are
1846   // illegal.
1847 
1848   int shift_amount = 64 - markOopDesc::epoch_shift;
1849   // rotate epoch bits to right (little) end and set other bits to 0
1850   // [ big part | epoch | little part ] -> [ 0..0 | epoch ]
1851   rldicl_(temp2_reg, temp_reg, shift_amount, 64 - markOopDesc::epoch_bits);
1852   // branch if epoch bits are != 0, i.e. they differ, because the epoch has been incremented
1853   bne(CCR0, try_rebias);
1854 
1855   // The epoch of the current bias is still valid but we know nothing
1856   // about the owner; it might be set or it might be clear. Try to
1857   // acquire the bias of the object using an atomic operation. If this
1858   // fails we will go in to the runtime to revoke the object's bias.
1859   // Note that we first construct the presumed unbiased header so we
1860   // don't accidentally blow away another thread's valid bias.
1861   andi(mark_reg, mark_reg, (markOopDesc::biased_lock_mask_in_place |
1862                                 markOopDesc::age_mask_in_place |
1863                                 markOopDesc::epoch_mask_in_place));
1864   orr(temp_reg, R16_thread, mark_reg);
1865 
1866   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
1867 
1868   // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg).
1869   cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
1870            /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
1871            /*where=*/obj_reg,
1872            MacroAssembler::MemBarAcq,
1873            MacroAssembler::cmpxchgx_hint_acquire_lock(),
1874            noreg, slow_case_int); // bail out if failed
1875 
1876   // If the biasing toward our thread failed, this means that
1877   // another thread succeeded in biasing it toward itself and we
1878   // need to revoke that bias. The revocation will occur in the
1879   // interpreter runtime in the slow case.
1880   if (PrintBiasedLockingStatistics) {
1881     load_const(temp_reg, (address) BiasedLocking::anonymously_biased_lock_entry_count_addr(), temp2_reg);
1882     lwz(temp2_reg, 0, temp_reg);
1883     addi(temp2_reg, temp2_reg, 1);
1884     stw(temp2_reg, 0, temp_reg);
1885   }
1886   b(done);
1887 
1888   bind(try_rebias);
1889   // At this point we know the epoch has expired, meaning that the
1890   // current "bias owner", if any, is actually invalid. Under these
1891   // circumstances _only_, we are allowed to use the current header's
1892   // value as the comparison value when doing the cas to acquire the
1893   // bias in the current epoch. In other words, we allow transfer of
1894   // the bias from one thread to another directly in this situation.
1895   andi(temp_reg, mark_reg, markOopDesc::age_mask_in_place);
1896   orr(temp_reg, R16_thread, temp_reg);
1897   load_klass(temp2_reg, obj_reg);
1898   ld(temp2_reg, in_bytes(Klass::prototype_header_offset()), temp2_reg);
1899   orr(temp_reg, temp_reg, temp2_reg);
1900 
1901   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
1902 
1903   // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg).
1904   cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
1905                  /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
1906                  /*where=*/obj_reg,
1907                  MacroAssembler::MemBarAcq,
1908                  MacroAssembler::cmpxchgx_hint_acquire_lock(),
1909                  noreg, slow_case_int); // bail out if failed
1910 
1911   // If the biasing toward our thread failed, this means that
1912   // another thread succeeded in biasing it toward itself and we
1913   // need to revoke that bias. The revocation will occur in the
1914   // interpreter runtime in the slow case.
1915   if (PrintBiasedLockingStatistics) {
1916     load_const(temp_reg, (address) BiasedLocking::rebiased_lock_entry_count_addr(), temp2_reg);
1917     lwz(temp2_reg, 0, temp_reg);
1918     addi(temp2_reg, temp2_reg, 1);
1919     stw(temp2_reg, 0, temp_reg);
1920   }
1921   b(done);
1922 
1923   bind(try_revoke_bias);
1924   // The prototype mark in the klass doesn't have the bias bit set any
1925   // more, indicating that objects of this data type are not supposed
1926   // to be biased any more. We are going to try to reset the mark of
1927   // this object to the prototype value and fall through to the
1928   // CAS-based locking scheme. Note that if our CAS fails, it means
1929   // that another thread raced us for the privilege of revoking the
1930   // bias of this particular object, so it's okay to continue in the
1931   // normal locking code.
1932   load_klass(temp_reg, obj_reg);
1933   ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
1934   andi(temp2_reg, mark_reg, markOopDesc::age_mask_in_place);
1935   orr(temp_reg, temp_reg, temp2_reg);
1936 
1937   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
1938 
1939   // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg).
1940   cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
1941                  /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
1942                  /*where=*/obj_reg,
1943                  MacroAssembler::MemBarAcq,
1944                  MacroAssembler::cmpxchgx_hint_acquire_lock());
1945 
1946   // reload markOop in mark_reg before continuing with lightweight locking
1947   ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg);
1948 
1949   // Fall through to the normal CAS-based lock, because no matter what
1950   // the result of the above CAS, some thread must have succeeded in
1951   // removing the bias bit from the object's header.
1952   if (PrintBiasedLockingStatistics) {
1953     Label l;
1954     bne(cr_reg, l);
1955     load_const(temp_reg, (address) BiasedLocking::revoked_lock_entry_count_addr(), temp2_reg);
1956     lwz(temp2_reg, 0, temp_reg);
1957     addi(temp2_reg, temp2_reg, 1);
1958     stw(temp2_reg, 0, temp_reg);
1959     bind(l);
1960   }
1961 
1962   bind(cas_label);
1963 }
1964 
1965 void MacroAssembler::biased_locking_exit (ConditionRegister cr_reg, Register mark_addr, Register temp_reg, Label& done) {
1966   // Check for biased locking unlock case, which is a no-op
1967   // Note: we do not have to check the thread ID for two reasons.
1968   // First, the interpreter checks for IllegalMonitorStateException at
1969   // a higher level. Second, if the bias was revoked while we held the
1970   // lock, the object could not be rebiased toward another thread, so
1971   // the bias bit would be clear.
1972 
1973   ld(temp_reg, 0, mark_addr);
1974   andi(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
1975 
1976   cmpwi(cr_reg, temp_reg, markOopDesc::biased_lock_pattern);
1977   beq(cr_reg, done);
1978 }
1979 
1980 // TM on PPC64.
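// The two helpers below are simple read-modify-write loops (sketch):
//   atomic_inc_ptr: *addr += simm16  (64-bit, ldarx/stdcx_)
//   atomic_ori_int: *addr |= uimm16  (32-bit, lwarx/stwcx_)
// In both cases 'result' ends up holding the new value.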
1981 void MacroAssembler::atomic_inc_ptr(Register addr, Register result, int simm16) {
1982   Label retry;
1983   bind(retry);
1984   ldarx(result, addr, /*hint*/ false);
1985   addi(result, result, simm16);
1986   stdcx_(result, addr);
1987   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1988     bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
1989   } else {
1990     bne(                  CCR0, retry); // stXcx_ sets CCR0
1991   }
1992 }
1993 
1994 void MacroAssembler::atomic_ori_int(Register addr, Register result, int uimm16) {
1995   Label retry;
1996   bind(retry);
1997   lwarx(result, addr, /*hint*/ false);
1998   ori(result, result, uimm16);
1999   stwcx_(result, addr);
2000   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
2001     bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
2002   } else {
2003     bne(                  CCR0, retry); // stXcx_ sets CCR0
2004   }
2005 }
2006 
2007 #if INCLUDE_RTM_OPT
2008 
2009 // Update rtm_counters based on abort status
2010 // input: abort_status
2011 //        rtm_counters (RTMLockingCounters*)
2012 void MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters_Reg) {
2013   // Mapping to keep PreciseRTMLockingStatistics similar to x86.
2014   // x86 ppc (! means inverted, ? means not the same)
2015   //  0   31  Set if abort caused by XABORT instruction.
2016   //  1  ! 7  If set, the transaction may succeed on a retry. This bit is always clear if bit 0 is set.
2017   //  2   13  Set if another logical processor conflicted with a memory address that was part of the transaction that aborted.
2018   //  3   10  Set if an internal buffer overflowed.
2019   //  4  ?12  Set if a debug breakpoint was hit.
2020   //  5  ?32  Set if an abort occurred during execution of a nested transaction.
2021   const  int tm_failure_bit[] = {Assembler::tm_tabort, // Note: Seems like signal handler sets this, too.
2022                                  Assembler::tm_failure_persistent, // inverted: transient
2023                                  Assembler::tm_trans_cf,
2024                                  Assembler::tm_footprint_of,
2025                                  Assembler::tm_non_trans_cf,
2026                                  Assembler::tm_suspended};
2027   const bool tm_failure_inv[] = {false, true, false, false, false, false};
2028   assert(sizeof(tm_failure_bit)/sizeof(int) == RTMLockingCounters::ABORT_STATUS_LIMIT, "adapt mapping!");
2029 
2030   const Register addr_Reg = R0;
2031   // Keep track of offset to where rtm_counters_Reg had pointed to.
2032   int counters_offs = RTMLockingCounters::abort_count_offset();
2033   addi(addr_Reg, rtm_counters_Reg, counters_offs);
2034   const Register temp_Reg = rtm_counters_Reg;
2035 
2036   //atomic_inc_ptr(addr_Reg, temp_Reg); We don't increment atomically
2037   ldx(temp_Reg, addr_Reg);
2038   addi(temp_Reg, temp_Reg, 1);
2039   stdx(temp_Reg, addr_Reg);
2040 
2041   if (PrintPreciseRTMLockingStatistics) {
2042     int counters_offs_delta = RTMLockingCounters::abortX_count_offset() - counters_offs;
2043 
2044     //mftexasr(abort_status); done by caller
2045     for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
2046       counters_offs += counters_offs_delta;
2047       li(temp_Reg, counters_offs_delta); // can't use addi with R0
2048       add(addr_Reg, addr_Reg, temp_Reg); // point to next counter
2049       counters_offs_delta = sizeof(uintx);
2050 
2051       Label check_abort;
2052       rldicr_(temp_Reg, abort_status, tm_failure_bit[i], 0);
2053       if (tm_failure_inv[i]) {
2054         bne(CCR0, check_abort);
2055       } else {
2056         beq(CCR0, check_abort);
2057       }
2058       //atomic_inc_ptr(addr_Reg, temp_Reg); We don't increment atomically
2059       ldx(temp_Reg, addr_Reg);
2060       addi(temp_Reg, temp_Reg, 1);
2061       stdx(temp_Reg, addr_Reg);
2062       bind(check_abort);
2063     }
2064   }
2065   li(temp_Reg, -counters_offs); // can't use addi with R0
2066   add(rtm_counters_Reg, addr_Reg, temp_Reg); // restore
2067 }
2068 
2069 // Branch if (random & (count-1) != 0), count is 2^n
2070 // tmp and CR0 are killed
2071 void MacroAssembler::branch_on_random_using_tb(Register tmp, int count, Label& brLabel) {
2072   mftb(tmp);
2073   andi_(tmp, tmp, count-1);
2074   bne(CCR0, brLabel);
2075 }
2076 
2077 // Perform abort ratio calculation, set no_rtm bit if high ratio.
2078 // input:  rtm_counters_Reg (RTMLockingCounters* address) - KILLED
2079 void MacroAssembler::rtm_abort_ratio_calculation(Register rtm_counters_Reg,
2080                                                  RTMLockingCounters* rtm_counters,
2081                                                  Metadata* method_data) {
2082   Label L_done, L_check_always_rtm1, L_check_always_rtm2;
2083 
2084   if (RTMLockingCalculationDelay > 0) {
2085     // Delay calculation.
2086     ld(rtm_counters_Reg, (RegisterOrConstant)(intptr_t)RTMLockingCounters::rtm_calculation_flag_addr());
2087     cmpdi(CCR0, rtm_counters_Reg, 0);
2088     beq(CCR0, L_done);
2089     load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload
2090   }
2091   // Abort ratio calculation only if abort_count > RTMAbortThreshold.
2092   //   Aborted transactions = abort_count * 100
2093   //   All transactions = total_count *  RTMTotalCountIncrRate
2094   //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
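  //   i.e. (once abort_count >= RTMAbortThreshold) set no_rtm if
  //        abort_count * 100 >= total_count * RTMTotalCountIncrRate * RTMAbortRatio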
2095   ld(R0, RTMLockingCounters::abort_count_offset(), rtm_counters_Reg);
2096   cmpdi(CCR0, R0, RTMAbortThreshold);
2097   blt(CCR0, L_check_always_rtm2);
2098   mulli(R0, R0, 100);
2099 
2100   const Register tmpReg = rtm_counters_Reg;
2101   ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg);
2102   mulli(tmpReg, tmpReg, RTMTotalCountIncrRate);
2103   mulli(tmpReg, tmpReg, RTMAbortRatio);
2104   cmpd(CCR0, R0, tmpReg);
2105   blt(CCR0, L_check_always_rtm1); // jump to reload
2106   if (method_data != NULL) {
2107     // Set rtm_state to "no rtm" in MDO.
2108     // Not using a metadata relocation. Method and Class Loader are kept alive anyway.
2109     // (See nmethod::metadata_do and CodeBuffer::finalize_oop_references.)
2110     load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg);
2111     atomic_ori_int(R0, tmpReg, NoRTM);
2112   }
2113   b(L_done);
2114 
2115   bind(L_check_always_rtm1);
2116   load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload
2117   bind(L_check_always_rtm2);
2118   ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg);
2119   cmpdi(CCR0, tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
2120   blt(CCR0, L_done);
2121   if (method_data != NULL) {
2122     // Set rtm_state to "always rtm" in MDO.
2123     // Not using a metadata relocation. See above.
2124     load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg);
2125     atomic_ori_int(R0, tmpReg, UseRTM);
2126   }
2127   bind(L_done);
2128 }
2129 
2130 // Update counters and perform abort ratio calculation.
2131 // input: abort_status_Reg
2132 void MacroAssembler::rtm_profiling(Register abort_status_Reg, Register temp_Reg,
2133                                    RTMLockingCounters* rtm_counters,
2134                                    Metadata* method_data,
2135                                    bool profile_rtm) {
2136 
2137   assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2138   // Update rtm counters based on state at abort.
2139   // Reads abort_status_Reg, updates flags.
2140   assert_different_registers(abort_status_Reg, temp_Reg);
2141   load_const_optimized(temp_Reg, (address)rtm_counters, R0);
2142   rtm_counters_update(abort_status_Reg, temp_Reg);
2143   if (profile_rtm) {
2144     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2145     rtm_abort_ratio_calculation(temp_Reg, rtm_counters, method_data);
2146   }
2147 }
2148 
2149 // Retry on abort if abort's status indicates non-persistent failure.
2150 // inputs: retry_count_Reg
2151 //       : abort_status_Reg
2152 // output: retry_count_Reg decremented by 1
2153 void MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg,
2154                                              Label& retryLabel, Label* checkRetry) {
2155   Label doneRetry;
2156   rldicr_(R0, abort_status_Reg, tm_failure_persistent, 0);
2157   bne(CCR0, doneRetry);
2158   if (checkRetry) { bind(*checkRetry); }
2159   addic_(retry_count_Reg, retry_count_Reg, -1);
2160   blt(CCR0, doneRetry);
2161   smt_yield(); // Can't use wait(). No permission (SIGILL).
2162   b(retryLabel);
2163   bind(doneRetry);
2164 }
2165 
2166 // Spin and retry if lock is busy.
2167 // inputs: box_Reg (monitor address)
2168 //       : retry_count_Reg
2169 // output: retry_count_Reg decremented by 1
2170 // CTR is killed
2171 void MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register owner_addr_Reg, Label& retryLabel) {
2172   Label SpinLoop, doneRetry;
2173   addic_(retry_count_Reg, retry_count_Reg, -1);
2174   blt(CCR0, doneRetry);
2175   li(R0, RTMSpinLoopCount);
2176   mtctr(R0);
2177 
2178   bind(SpinLoop);
2179   smt_yield(); // Can't use waitrsv(). No permission (SIGILL).
2180   bdz(retryLabel);
2181   ld(R0, 0, owner_addr_Reg);
2182   cmpdi(CCR0, R0, 0);
2183   bne(CCR0, SpinLoop);
2184   b(retryLabel);
2185 
2186   bind(doneRetry);
2187 }
2188 
2189 // Use RTM for normal stack locks.
2190 // Input: objReg (object to lock)
2191 void MacroAssembler::rtm_stack_locking(ConditionRegister flag,
2192                                        Register obj, Register mark_word, Register tmp,
2193                                        Register retry_on_abort_count_Reg,
2194                                        RTMLockingCounters* stack_rtm_counters,
2195                                        Metadata* method_data, bool profile_rtm,
2196                                        Label& DONE_LABEL, Label& IsInflated) {
2197   assert(UseRTMForStackLocks, "why call this otherwise?");
2198   assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
2199   Label L_rtm_retry, L_decrement_retry, L_on_abort;
2200 
2201   if (RTMRetryCount > 0) {
2202     load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
2203     bind(L_rtm_retry);
2204   }
2205   andi_(R0, mark_word, markOopDesc::monitor_value);  // inflated vs stack-locked|neutral|biased
2206   bne(CCR0, IsInflated);
2207 
2208   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2209     Label L_noincrement;
2210     if (RTMTotalCountIncrRate > 1) {
2211       branch_on_random_using_tb(tmp, (int)RTMTotalCountIncrRate, L_noincrement);
2212     }
2213     assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
2214     load_const_optimized(tmp, (address)stack_rtm_counters->total_count_addr(), R0);
2215     //atomic_inc_ptr(tmp, /*temp, will be reloaded*/mark_word); We don't increment atomically
2216     ldx(mark_word, tmp);
2217     addi(mark_word, mark_word, 1);
2218     stdx(mark_word, tmp);
2219     bind(L_noincrement);
2220   }
2221   tbegin_();
2222   beq(CCR0, L_on_abort);
2223   ld(mark_word, oopDesc::mark_offset_in_bytes(), obj);         // Reload in transaction, conflicts need to be tracked.
2224   andi(R0, mark_word, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
2225   cmpwi(flag, R0, markOopDesc::unlocked_value);                // bits = 001 unlocked
2226   beq(flag, DONE_LABEL);                                       // all done if unlocked
2227 
2228   if (UseRTMXendForLockBusy) {
2229     tend_();
2230     b(L_decrement_retry);
2231   } else {
2232     tabort_();
2233   }
2234   bind(L_on_abort);
2235   const Register abort_status_Reg = tmp;
2236   mftexasr(abort_status_Reg);
2237   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2238     rtm_profiling(abort_status_Reg, /*temp*/mark_word, stack_rtm_counters, method_data, profile_rtm);
2239   }
2240   ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // reload
2241   if (RTMRetryCount > 0) {
2242     // Retry on lock abort if abort status is not permanent.
2243     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry, &L_decrement_retry);
2244   } else {
2245     bind(L_decrement_retry);
2246   }
2247 }
2248 
2249 // Use RTM for inflating locks
2250 // inputs: obj       (object to lock)
2251 //         mark_word (current header - KILLED)
2252 //         boxReg    (on-stack box address (displaced header location) - KILLED)
2253 void MacroAssembler::rtm_inflated_locking(ConditionRegister flag,
2254                                           Register obj, Register mark_word, Register boxReg,
2255                                           Register retry_on_busy_count_Reg, Register retry_on_abort_count_Reg,
2256                                           RTMLockingCounters* rtm_counters,
2257                                           Metadata* method_data, bool profile_rtm,
2258                                           Label& DONE_LABEL) {
2259   assert(UseRTMLocking, "why call this otherwise?");
2260   Label L_rtm_retry, L_decrement_retry, L_on_abort;
2261   // Clean monitor_value bit to get valid pointer.
2262   int owner_offset = ObjectMonitor::owner_offset_in_bytes() - markOopDesc::monitor_value;
2263 
2264   // Store non-null, using boxReg instead of (intptr_t)markOopDesc::unused_mark().
2265   std(boxReg, BasicLock::displaced_header_offset_in_bytes(), boxReg);
2266   const Register tmpReg = boxReg;
2267   const Register owner_addr_Reg = mark_word;
2268   addi(owner_addr_Reg, mark_word, owner_offset);
2269 
2270   if (RTMRetryCount > 0) {
2271     load_const_optimized(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy.
2272     load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort.
2273     bind(L_rtm_retry);
2274   }
2275   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2276     Label L_noincrement;
2277     if (RTMTotalCountIncrRate > 1) {
2278       branch_on_random_using_tb(R0, (int)RTMTotalCountIncrRate, L_noincrement);
2279     }
2280     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2281     load_const(R0, (address)rtm_counters->total_count_addr(), tmpReg);
2282     //atomic_inc_ptr(R0, tmpReg); We don't increment atomically
2283     ldx(tmpReg, R0);
2284     addi(tmpReg, tmpReg, 1);
2285     stdx(tmpReg, R0);
2286     bind(L_noincrement);
2287   }
2288   tbegin_();
2289   beq(CCR0, L_on_abort);
2290   // We don't reload mark word. Will only be reset at safepoint.
2291   ld(R0, 0, owner_addr_Reg); // Load in transaction, conflicts need to be tracked.
2292   cmpdi(flag, R0, 0);
2293   beq(flag, DONE_LABEL);
2294 
2295   if (UseRTMXendForLockBusy) {
2296     tend_();
2297     b(L_decrement_retry);
2298   } else {
2299     tabort_();
2300   }
2301   bind(L_on_abort);
2302   const Register abort_status_Reg = tmpReg;
2303   mftexasr(abort_status_Reg);
2304   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2305     rtm_profiling(abort_status_Reg, /*temp*/ owner_addr_Reg, rtm_counters, method_data, profile_rtm);
2306     // Restore owner_addr_Reg
2307     ld(mark_word, oopDesc::mark_offset_in_bytes(), obj);
2308 #ifdef ASSERT
2309     andi_(R0, mark_word, markOopDesc::monitor_value);
2310     asm_assert_ne("must be inflated", 0xa754); // Deflating only allowed at safepoint.
2311 #endif
2312     addi(owner_addr_Reg, mark_word, owner_offset);
2313   }
2314   if (RTMRetryCount > 0) {
2315     // Retry on lock abort if abort status is not permanent.
2316     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
2317   }
2318 
2319   // Appears unlocked - try to swing _owner from null to non-null.
2320   cmpxchgd(flag, /*current val*/ R0, (intptr_t)0, /*new val*/ R16_thread, owner_addr_Reg,
2321            MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2322            MacroAssembler::cmpxchgx_hint_acquire_lock(), noreg, &L_decrement_retry, true);
2323 
2324   if (RTMRetryCount > 0) {
2325     // success done else retry
2326     b(DONE_LABEL);
2327     bind(L_decrement_retry);
2328     // Spin and retry if lock is busy.
2329     rtm_retry_lock_on_busy(retry_on_busy_count_Reg, owner_addr_Reg, L_rtm_retry);
2330   } else {
2331     bind(L_decrement_retry);
2332   }
2333 }
2334 
2335 #endif //  INCLUDE_RTM_OPT
2336 
2337 // "The box" is the space on the stack where we copy the object mark.
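// Fast-lock protocol in a nutshell (sketch; biased-locking and RTM variants aside):
//   box->displaced_header = obj->mark | unlocked_value;
//   if (CAS(&obj->mark, displaced_header, box) succeeds)  -> locked,    flag == EQ
//   else if (obj->mark points into our own stack)         -> recursive, flag == EQ, box gets 0
//   inflated monitors instead try to CAS monitor->owner from NULL to the thread.
//   flag == NE tells the caller to take the slow path into the runtime.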
2338 void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register oop, Register box,
2339                                                Register temp, Register displaced_header, Register current_header,
2340                                                bool try_bias,
2341                                                RTMLockingCounters* rtm_counters,
2342                                                RTMLockingCounters* stack_rtm_counters,
2343                                                Metadata* method_data,
2344                                                bool use_rtm, bool profile_rtm) {
2345   assert_different_registers(oop, box, temp, displaced_header, current_header);
2346   assert(flag != CCR0, "bad condition register");
2347   Label cont;
2348   Label object_has_monitor;
2349   Label cas_failed;
2350 
2351   // Load markOop from object into displaced_header.
2352   ld(displaced_header, oopDesc::mark_offset_in_bytes(), oop);
2353 
2354 
2355   // Always do locking in runtime.
2356   if (EmitSync & 0x01) {
2357     cmpdi(flag, oop, 0); // Oop can't be 0 here => always false.
2358     return;
2359   }
2360 
2361   if (try_bias) {
2362     biased_locking_enter(flag, oop, displaced_header, temp, current_header, cont);
2363   }
2364 
2365 #if INCLUDE_RTM_OPT
2366   if (UseRTMForStackLocks && use_rtm) {
2367     rtm_stack_locking(flag, oop, displaced_header, temp, /*temp*/ current_header,
2368                       stack_rtm_counters, method_data, profile_rtm,
2369                       cont, object_has_monitor);
2370   }
2371 #endif // INCLUDE_RTM_OPT
2372 
2373   // Handle existing monitor.
2374   if ((EmitSync & 0x02) == 0) {
2375     // The object has an existing monitor iff (mark & monitor_value) != 0.
2376     andi_(temp, displaced_header, markOopDesc::monitor_value);
2377     bne(CCR0, object_has_monitor);
2378   }
2379 
2380   // Set displaced_header to be (markOop of object | UNLOCK_VALUE).
2381   ori(displaced_header, displaced_header, markOopDesc::unlocked_value);
2382 
2383   // Load Compare Value application register.
2384 
2385   // Initialize the box. (Must happen before we update the object mark!)
2386   std(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2387 
2388   // Must fence, otherwise, preceding store(s) may float below cmpxchg.
2389   // Compare object markOop with mark and if equal exchange scratch1 with object markOop.
2390   // CmpxchgX sets cr_reg to cmpX(current, displaced).
2391   membar(Assembler::StoreStore);
2392   cmpxchgd(/*flag=*/flag,
2393            /*current_value=*/current_header,
2394            /*compare_value=*/displaced_header,
2395            /*exchange_value=*/box,
2396            /*where=*/oop,
2397            MacroAssembler::MemBarAcq,
2398            MacroAssembler::cmpxchgx_hint_acquire_lock(),
2399            noreg,
2400            &cas_failed);
2401   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2402 
2403   // If the compare-and-exchange succeeded, then we found an unlocked
2404   // object and we have now locked it.
2405   b(cont);
2406 
2407   bind(cas_failed);
2408   // We did not see an unlocked object so try the fast recursive case.
2409 
2410   // Check if the owner is self by comparing the value in the markOop of object
2411   // (current_header) with the stack pointer.
2412   sub(current_header, current_header, R1_SP);
2413   load_const_optimized(temp, (address) (~(os::vm_page_size()-1) |
2414                                         markOopDesc::lock_mask_in_place));
2415 
2416   and_(R0/*==0?*/, current_header, temp);
2417   // If the condition is true we are done (cont) and hence we can store 0 as the
2418   // displaced header in the box, which indicates that it is a recursive lock.
2419   mcrf(flag, CCR0);
2420   std(R0/*==0, perhaps*/, BasicLock::displaced_header_offset_in_bytes(), box);
2421 
2422   // Handle existing monitor.
2423   if ((EmitSync & 0x02) == 0) {
2424     b(cont);
2425 
2426     bind(object_has_monitor);
2427     // The object's monitor m is unlocked iff m->owner == NULL,
2428     // otherwise m->owner may contain a thread or a stack address.
2429 
2430 #if INCLUDE_RTM_OPT
2431     // Use the same RTM locking code in 32- and 64-bit VM.
2432     if (use_rtm) {
2433       rtm_inflated_locking(flag, oop, displaced_header, box, temp, /*temp*/ current_header,
2434                            rtm_counters, method_data, profile_rtm, cont);
2435     } else {
2436 #endif // INCLUDE_RTM_OPT
2437 
2438     // Try to CAS m->owner from NULL to current thread.
2439     addi(temp, displaced_header, ObjectMonitor::owner_offset_in_bytes()-markOopDesc::monitor_value);
2440     li(displaced_header, 0);
2441     // CmpxchgX sets flag to cmpX(current, displaced).
2442     cmpxchgd(/*flag=*/flag,
2443              /*current_value=*/current_header,
2444              /*compare_value=*/(intptr_t)0,
2445              /*exchange_value=*/R16_thread,
2446              /*where=*/temp,
2447              MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2448              MacroAssembler::cmpxchgx_hint_acquire_lock());
2449 
2450     // Store a non-null value into the box.
2451     std(box, BasicLock::displaced_header_offset_in_bytes(), box);
2452 
2453 #   ifdef ASSERT
2454     bne(flag, cont);
2455     // We have acquired the monitor, check some invariants.
2456     addi(/*monitor=*/temp, temp, -ObjectMonitor::owner_offset_in_bytes());
2457     // Invariant 1: _recursions should be 0.
2458     //assert(ObjectMonitor::recursions_size_in_bytes() == 8, "unexpected size");
2459     asm_assert_mem8_is_zero(ObjectMonitor::recursions_offset_in_bytes(), temp,
2460                             "monitor->_recursions should be 0", -1);
2461     // Invariant 2: OwnerIsThread shouldn't be 0.
2462     //assert(ObjectMonitor::OwnerIsThread_size_in_bytes() == 4, "unexpected size");
2463     //asm_assert_mem4_isnot_zero(ObjectMonitor::OwnerIsThread_offset_in_bytes(), temp,
2464     //                           "monitor->OwnerIsThread shouldn't be 0", -1);
2465 #   endif
2466 
2467 #if INCLUDE_RTM_OPT
2468     } // use_rtm()
2469 #endif
2470   }
2471 
2472   bind(cont);
2473   // flag == EQ indicates success
2474   // flag == NE indicates failure
2475 }
2476 
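// Fast-unlock counterpart (sketch): a zero displaced header in the box means a recursive
// unlock; otherwise CAS the box address in obj->mark back to the displaced header. An
// inflated monitor is released by clearing monitor->owner once EntryList and cxq are empty.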
2477 void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register oop, Register box,
2478                                                  Register temp, Register displaced_header, Register current_header,
2479                                                  bool try_bias, bool use_rtm) {
2480   assert_different_registers(oop, box, temp, displaced_header, current_header);
2481   assert(flag != CCR0, "bad condition register");
2482   Label cont;
2483   Label object_has_monitor;
2484 
2485   // Always do locking in runtime.
2486   if (EmitSync & 0x01) {
2487     cmpdi(flag, oop, 0); // Oop can't be 0 here => always false.
2488     return;
2489   }
2490 
2491   if (try_bias) {
2492     biased_locking_exit(flag, oop, current_header, cont);
2493   }
2494 
2495 #if INCLUDE_RTM_OPT
2496   if (UseRTMForStackLocks && use_rtm) {
2497     assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
2498     Label L_regular_unlock;
2499     ld(current_header, oopDesc::mark_offset_in_bytes(), oop);         // fetch markword
2500     andi(R0, current_header, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
2501     cmpwi(flag, R0, markOopDesc::unlocked_value);                     // bits = 001 unlocked
2502     bne(flag, L_regular_unlock);                                      // else RegularLock
2503     tend_();                                                          // otherwise end...
2504     b(cont);                                                          // ... and we're done
2505     bind(L_regular_unlock);
2506   }
2507 #endif
2508 
2509   // Find the lock address and load the displaced header from the stack.
2510   ld(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2511 
2512   // If the displaced header is 0, we have a recursive unlock.
2513   cmpdi(flag, displaced_header, 0);
2514   beq(flag, cont);
2515 
2516   // Handle existing monitor.
2517   if ((EmitSync & 0x02) == 0) {
2518     // The object has an existing monitor iff (mark & monitor_value) != 0.
2519     RTM_OPT_ONLY( if (!(UseRTMForStackLocks && use_rtm)) ) // skip load if already done
2520     ld(current_header, oopDesc::mark_offset_in_bytes(), oop);
2521     andi_(R0, current_header, markOopDesc::monitor_value);
2522     bne(CCR0, object_has_monitor);
2523   }
2524 
2525   // Check if it is still a lightweight lock; this is true if we see
2526   // the stack address of the basicLock in the markOop of the object.
2527   // Cmpxchg sets flag to cmpd(current_header, box).
2528   cmpxchgd(/*flag=*/flag,
2529            /*current_value=*/current_header,
2530            /*compare_value=*/box,
2531            /*exchange_value=*/displaced_header,
2532            /*where=*/oop,
2533            MacroAssembler::MemBarRel,
2534            MacroAssembler::cmpxchgx_hint_release_lock(),
2535            noreg,
2536            &cont);
2537 
2538   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2539 
2540   // Handle existing monitor.
2541   if ((EmitSync & 0x02) == 0) {
2542     b(cont);
2543 
2544     bind(object_has_monitor);
2545     addi(current_header, current_header, -markOopDesc::monitor_value); // monitor
2546     ld(temp,             ObjectMonitor::owner_offset_in_bytes(), current_header);
2547 
2548     // It's inflated.
2549 #if INCLUDE_RTM_OPT
2550     if (use_rtm) {
2551       Label L_regular_inflated_unlock;
2552       // Clean monitor_value bit to get valid pointer
2553       cmpdi(flag, temp, 0);
2554       bne(flag, L_regular_inflated_unlock);
2555       tend_();
2556       b(cont);
2557       bind(L_regular_inflated_unlock);
2558     }
2559 #endif
2560 
2561     ld(displaced_header, ObjectMonitor::recursions_offset_in_bytes(), current_header);
2562     xorr(temp, R16_thread, temp);      // Will be 0 if we are the owner.
2563     orr(temp, temp, displaced_header); // Will be 0 if there are 0 recursions.
2564     cmpdi(flag, temp, 0);
2565     bne(flag, cont);
2566 
2567     ld(temp,             ObjectMonitor::EntryList_offset_in_bytes(), current_header);
2568     ld(displaced_header, ObjectMonitor::cxq_offset_in_bytes(), current_header);
2569     orr(temp, temp, displaced_header); // Will be 0 if both are 0.
2570     cmpdi(flag, temp, 0);
2571     bne(flag, cont);
2572     release();
2573     std(temp, ObjectMonitor::owner_offset_in_bytes(), current_header);
2574   }
2575 
2576   bind(cont);
2577   // flag == EQ indicates success
2578   // flag == NE indicates failure
2579 }
2580 
2581 // Write serialization page so VM thread can do a pseudo remote membar.
2582 // We use the current thread pointer to calculate a thread specific
2583 // offset to write to within the page. This minimizes bus traffic
2584 // due to cache line collision.
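// Store offset within the page (sketch): (thread >> serialize_page_shift_count) & (page_size - sizeof(int)).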
2585 void MacroAssembler::serialize_memory(Register thread, Register tmp1, Register tmp2) {
2586   srdi(tmp2, thread, os::get_serialize_page_shift_count());
2587 
2588   int mask = os::vm_page_size() - sizeof(int);
2589   if (Assembler::is_simm(mask, 16)) {
2590     andi(tmp2, tmp2, mask);
2591   } else {
2592     lis(tmp1, (int)((signed short) (mask >> 16)));
2593     ori(tmp1, tmp1, mask & 0x0000ffff);
2594     andr(tmp2, tmp2, tmp1);
2595   }
2596 
2597   load_const(tmp1, (long) os::get_memory_serialize_page());
2598   release();
2599   stwx(R0, tmp1, tmp2);
2600 }
2601 
2602 
2603 // GC barrier helper macros
2604 
2605 // Write the card table byte if needed.
2606 void MacroAssembler::card_write_barrier_post(Register Rstore_addr, Register Rnew_val, Register Rtmp) {
2607   CardTableModRefBS* bs =
2608     barrier_set_cast<CardTableModRefBS>(Universe::heap()->barrier_set());
2609   assert(bs->kind() == BarrierSet::CardTableForRS ||
2610          bs->kind() == BarrierSet::CardTableExtension, "wrong barrier");
2611 #ifdef ASSERT
2612   cmpdi(CCR0, Rnew_val, 0);
2613   asm_assert_ne("null oop not allowed", 0x321);
2614 #endif
2615   card_table_write(bs->byte_map_base, Rtmp, Rstore_addr);
2616 }
2617 
2618 // Write the card table byte.
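// In effect (sketch): byte_map_base[Robj >> card_shift] = 0 (dirty); note that Robj is clobbered.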
2619 void MacroAssembler::card_table_write(jbyte* byte_map_base, Register Rtmp, Register Robj) {
2620   assert_different_registers(Robj, Rtmp, R0);
2621   load_const_optimized(Rtmp, (address)byte_map_base, R0);
2622   srdi(Robj, Robj, CardTableModRefBS::card_shift);
2623   li(R0, 0); // dirty
2624   if (UseConcMarkSweepGC) membar(Assembler::StoreStore);
2625   stbx(R0, Rtmp, Robj);
2626 }
2627 
2628 #if INCLUDE_ALL_GCS
2629 // General G1 pre-barrier generator.
2630 // Goal: record the previous value if it is not null.
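// Sketch of the emitted logic:
//   if (!satb_mark_queue.active) return;
//   pre_val = (Robj != noreg) ? *(Robj + offset) : Rpre_val;
//   if (pre_val == NULL) return;
//   if (satb_mark_queue.index != 0) { index -= wordSize; buf[index] = pre_val; }
//   else                            { call SharedRuntime::g1_wb_pre(pre_val, thread); }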
2631 void MacroAssembler::g1_write_barrier_pre(Register Robj, RegisterOrConstant offset, Register Rpre_val,
2632                                           Register Rtmp1, Register Rtmp2, bool needs_frame) {
2633   Label runtime, filtered;
2634 
2635   // Is marking active?
2636   if (in_bytes(PtrQueue::byte_width_of_active()) == 4) {
2637     lwz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + PtrQueue::byte_offset_of_active()), R16_thread);
2638   } else {
2639     guarantee(in_bytes(PtrQueue::byte_width_of_active()) == 1, "Assumption");
2640     lbz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + PtrQueue::byte_offset_of_active()), R16_thread);
2641   }
2642   cmpdi(CCR0, Rtmp1, 0);
2643   beq(CCR0, filtered);
2644 
2645   // Do we need to load the previous value?
2646   if (Robj != noreg) {
2647     // Load the previous value...
2648     if (UseCompressedOops) {
2649       lwz(Rpre_val, offset, Robj);
2650     } else {
2651       ld(Rpre_val, offset, Robj);
2652     }
2653     // Previous value has been loaded into Rpre_val.
2654   }
2655   assert(Rpre_val != noreg, "must have a real register");
2656 
2657   // Is the previous value null?
2658   cmpdi(CCR0, Rpre_val, 0);
2659   beq(CCR0, filtered);
2660 
2661   if (Robj != noreg && UseCompressedOops) {
2662     decode_heap_oop_not_null(Rpre_val);
2663   }
2664 
2665   // OK, it's not filtered, so we'll need to enqueue the previous value.
2666   // Try to store it into the thread's SATB buffer first; if the buffer is
2667   // full (index == 0), call into the runtime. A preloaded Rpre_val in a
2668   // volatile register must be preserved across that runtime call (see below).
2669 
2670   // Can we store original value in the thread's buffer?
2671   // Is index == 0?
2672   // (The index field is typed as size_t.)
2673   const Register Rbuffer = Rtmp1, Rindex = Rtmp2;
2674 
2675   ld(Rindex, in_bytes(JavaThread::satb_mark_queue_offset() + PtrQueue::byte_offset_of_index()), R16_thread);
2676   cmpdi(CCR0, Rindex, 0);
2677   beq(CCR0, runtime); // If index == 0, goto runtime.
2678   ld(Rbuffer, in_bytes(JavaThread::satb_mark_queue_offset() + PtrQueue::byte_offset_of_buf()), R16_thread);
2679 
2680   addi(Rindex, Rindex, -wordSize); // Decrement index.
2681   std(Rindex, in_bytes(JavaThread::satb_mark_queue_offset() + PtrQueue::byte_offset_of_index()), R16_thread);
2682 
2683   // Record the previous value.
2684   stdx(Rpre_val, Rbuffer, Rindex);
2685   b(filtered);
2686 
2687   bind(runtime);
2688 
2689   // The VM call needs a frame so that registers can be saved across it.
2690   if (needs_frame) {
2691     save_LR_CR(Rtmp1);
2692     push_frame_reg_args(0, Rtmp2);
2693   }
2694 
2695   if (Rpre_val->is_volatile() && Robj == noreg) mr(R31, Rpre_val); // Save pre_val across C call if it was preloaded.
2696   call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), Rpre_val, R16_thread);
2697   if (Rpre_val->is_volatile() && Robj == noreg) mr(Rpre_val, R31); // restore
2698 
2699   if (needs_frame) {
2700     pop_frame();
2701     restore_LR_CR(Rtmp1);
2702   }
2703 
2704   bind(filtered);
2705 }
2706 
2707 // General G1 post-barrier generator
2708 // Store cross-region card.
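// Pseudo-code sketch of the emitted barrier (illustrative only):
//   if (((store_addr ^ new_val) >> HeapRegion::LogOfHRGrainBytes) == 0) goto filtered;  // same region
//   card_addr = byte_map_base + (store_addr >> card_shift);
//   if (*card_addr == g1_young_card_val()) goto filtered;
//   StoreLoad();
//   if (*card_addr == dirty_card_val()) goto filtered;
//   *card_addr = dirty_card_val();
//   if (dirty_card_queue.index != 0) { index -= wordSize; buf[index] = card_addr; }
//   else SharedRuntime::g1_wb_post(card_addr, thread);  // runtime call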
2709 void MacroAssembler::g1_write_barrier_post(Register Rstore_addr, Register Rnew_val, Register Rtmp1, Register Rtmp2, Register Rtmp3, Label *filtered_ext) {
2710   Label runtime, filtered_int;
2711   Label& filtered = (filtered_ext != NULL) ? *filtered_ext : filtered_int;
2712   assert_different_registers(Rstore_addr, Rnew_val, Rtmp1, Rtmp2);
2713 
2714   G1SATBCardTableLoggingModRefBS* bs =
2715     barrier_set_cast<G1SATBCardTableLoggingModRefBS>(Universe::heap()->barrier_set());
2716 
2717   // Does store cross heap regions?
2718   if (G1RSBarrierRegionFilter) {
2719     xorr(Rtmp1, Rstore_addr, Rnew_val);
2720     srdi_(Rtmp1, Rtmp1, HeapRegion::LogOfHRGrainBytes);
2721     beq(CCR0, filtered);
2722   }
2723 
2724   // Crosses regions, storing NULL?
2725 #ifdef ASSERT
2726   cmpdi(CCR0, Rnew_val, 0);
2727   asm_assert_ne("null oop not allowed (G1)", 0x322); // Checked by caller on PPC64, so following branch is obsolete:
2728   //beq(CCR0, filtered);
2729 #endif
2730 
2731   // Storing region crossing non-NULL, is card already dirty?
2732   assert(sizeof(*bs->byte_map_base) == sizeof(jbyte), "adjust this code");
2733   const Register Rcard_addr = Rtmp1;
2734   Register Rbase = Rtmp2;
2735   load_const_optimized(Rbase, (address)bs->byte_map_base, /*temp*/ Rtmp3);
2736 
2737   srdi(Rcard_addr, Rstore_addr, CardTableModRefBS::card_shift);
2738 
2739   // Get the address of the card.
2740   lbzx(/*card value*/ Rtmp3, Rbase, Rcard_addr);
2741   cmpwi(CCR0, Rtmp3, (int)G1SATBCardTableModRefBS::g1_young_card_val());
2742   beq(CCR0, filtered);
2743 
2744   membar(Assembler::StoreLoad);
2745   lbzx(/*card value*/ Rtmp3, Rbase, Rcard_addr);  // Reload after membar.
2746   cmpwi(CCR0, Rtmp3 /* card value */, CardTableModRefBS::dirty_card_val());
2747   beq(CCR0, filtered);
2748 
2749   // Storing a region crossing, non-NULL oop, card is clean.
2750   // Dirty card and log.
2751   li(Rtmp3, CardTableModRefBS::dirty_card_val());
2752   //release(); // G1: oops are allowed to get visible after dirty marking.
2753   stbx(Rtmp3, Rbase, Rcard_addr);
2754 
2755   add(Rcard_addr, Rbase, Rcard_addr); // This is the address which needs to get enqueued.
2756   Rbase = noreg; // end of lifetime
2757 
2758   const Register Rqueue_index = Rtmp2,
2759                  Rqueue_buf   = Rtmp3;
2760   ld(Rqueue_index, in_bytes(JavaThread::dirty_card_queue_offset() + PtrQueue::byte_offset_of_index()), R16_thread);
2761   cmpdi(CCR0, Rqueue_index, 0);
2762   beq(CCR0, runtime); // index == 0 then jump to runtime
2763   ld(Rqueue_buf, in_bytes(JavaThread::dirty_card_queue_offset() + PtrQueue::byte_offset_of_buf()), R16_thread);
2764 
2765   addi(Rqueue_index, Rqueue_index, -wordSize); // decrement index
2766   std(Rqueue_index, in_bytes(JavaThread::dirty_card_queue_offset() + PtrQueue::byte_offset_of_index()), R16_thread);
2767 
2768   stdx(Rcard_addr, Rqueue_buf, Rqueue_index); // store card
2769   b(filtered);
2770 
2771   bind(runtime);
2772 
2773   // Save the live input values.
2774   call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), Rcard_addr, R16_thread);
2775 
2776   bind(filtered_int);
2777 }
2778 #endif // INCLUDE_ALL_GCS
2779 
2780 // Values for last_Java_pc and last_Java_sp must comply with the rules
2781 // in frame_ppc.hpp.
2782 void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) {
2783   // Always set last_Java_pc and flags first because once last_Java_sp
2784   // is visible, has_last_Java_frame is true and users will look at the
2785   // rest of the fields. (Note: flags should always be zero before we
2786   // get here, so they don't need to be set.)
2787 
2788   // Verify that last_Java_pc was zeroed on return to Java
2789   asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread,
2790                           "last_Java_pc not zeroed before leaving Java", 0x200);
2791 
2792   // When returning from a call out of Java mode, the frame anchor's
2793   // last_Java_pc will always be set to NULL. It is set here so that,
2794   // if we are doing a call to native code (not the VM), we capture the
2795   // known pc and don't have to rely on the native call having a
2796   // standard frame linkage from which the pc could be found.
2797   if (last_Java_pc != noreg)
2798     std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
2799 
2800   // Set last_Java_sp last.
2801   std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
2802 }
2803 
2804 void MacroAssembler::reset_last_Java_frame(void) {
2805   asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()),
2806                              R16_thread, "SP was not set, still zero", 0x202);
2807 
2808   BLOCK_COMMENT("reset_last_Java_frame {");
2809   li(R0, 0);
2810 
2811   // _last_Java_sp = 0
2812   std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
2813 
2814   // _last_Java_pc = 0
2815   std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
2816   BLOCK_COMMENT("} reset_last_Java_frame");
2817 }
2818 
2819 void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1) {
2820   assert_different_registers(sp, tmp1);
2821 
2822   // sp points to a TOP_IJAVA_FRAME, retrieve frame's PC via
2823   // TOP_IJAVA_FRAME_ABI.
2824   // FIXME: assert that we really have a TOP_IJAVA_FRAME here!
2825 #ifdef CC_INTERP
2826   ld(tmp1/*pc*/, _top_ijava_frame_abi(frame_manager_lr), sp);
2827 #else
2828   address entry = pc();
2829   load_const_optimized(tmp1, entry);
2830 #endif
2831 
2832   set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1);
2833 }
2834 
2835 void MacroAssembler::get_vm_result(Register oop_result) {
2836   // Read:
2837   //   R16_thread
2838   //   R16_thread->in_bytes(JavaThread::vm_result_offset())
2839   //
2840   // Updated:
2841   //   oop_result
2842   //   R16_thread->in_bytes(JavaThread::vm_result_offset())
2843 
2844   verify_thread();
2845 
2846   ld(oop_result, in_bytes(JavaThread::vm_result_offset()), R16_thread);
2847   li(R0, 0);
2848   std(R0, in_bytes(JavaThread::vm_result_offset()), R16_thread);
2849 
2850   verify_oop(oop_result);
2851 }
2852 
2853 void MacroAssembler::get_vm_result_2(Register metadata_result) {
2854   // Read:
2855   //   R16_thread
2856   //   R16_thread->in_bytes(JavaThread::vm_result_2_offset())
2857   //
2858   // Updated:
2859   //   metadata_result
2860   //   R16_thread->in_bytes(JavaThread::vm_result_2_offset())
2861 
2862   ld(metadata_result, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
2863   li(R0, 0);
2864   std(R0, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
2865 }
2866 
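// Compress a klass pointer. In C terms, roughly (illustrative only):
//   narrow_klass = (klass - Universe::narrow_klass_base()) >> Universe::narrow_klass_shift();
// Returns the register that ends up holding the compressed value.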
2867 Register MacroAssembler::encode_klass_not_null(Register dst, Register src) {
2868   Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided.
2869   if (Universe::narrow_klass_base() != 0) {
2870     // Use dst as temp if it is free.
2871     sub_const_optimized(dst, current, Universe::narrow_klass_base(), R0);
2872     current = dst;
2873   }
2874   if (Universe::narrow_klass_shift() != 0) {
2875     srdi(dst, current, Universe::narrow_klass_shift());
2876     current = dst;
2877   }
2878   return current;
2879 }
2880 
2881 void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) {
2882   if (UseCompressedClassPointers) {
2883     Register compressedKlass = encode_klass_not_null(ck, klass);
2884     stw(compressedKlass, oopDesc::klass_offset_in_bytes(), dst_oop);
2885   } else {
2886     std(klass, oopDesc::klass_offset_in_bytes(), dst_oop);
2887   }
2888 }
2889 
2890 void MacroAssembler::store_klass_gap(Register dst_oop, Register val) {
2891   if (UseCompressedClassPointers) {
2892     if (val == noreg) {
2893       val = R0;
2894       li(val, 0);
2895     }
2896     stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop); // klass gap if compressed
2897   }
2898 }
2899 
2900 int MacroAssembler::instr_size_for_decode_klass_not_null() {
2901   if (!UseCompressedClassPointers) return 0;
2902   int num_instrs = 1;  // shift or move
2903   if (Universe::narrow_klass_base() != 0) num_instrs = 7;  // shift + load const + add
2904   return num_instrs * BytesPerInstWord;
2905 }
2906 
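// Decompress a narrow klass. In C terms, roughly (illustrative only):
//   klass = Universe::narrow_klass_base() + ((uintptr_t)narrow_klass << Universe::narrow_klass_shift());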
2907 void MacroAssembler::decode_klass_not_null(Register dst, Register src) {
2908   assert(dst != R0, "Dst reg may not be R0, as R0 is used here.");
2909   if (src == noreg) src = dst;
2910   Register shifted_src = src;
2911   if (Universe::narrow_klass_shift() != 0 ||
2912       Universe::narrow_klass_base() == 0 && src != dst) {  // Move required.
2913     shifted_src = dst;
2914     sldi(shifted_src, src, Universe::narrow_klass_shift());
2915   }
2916   if (Universe::narrow_klass_base() != 0) {
2917     add_const_optimized(dst, shifted_src, Universe::narrow_klass_base(), R0);
2918   }
2919 }
2920 
2921 void MacroAssembler::load_klass(Register dst, Register src) {
2922   if (UseCompressedClassPointers) {
2923     lwz(dst, oopDesc::klass_offset_in_bytes(), src);
2924     // Attention: no null check here!
2925     decode_klass_not_null(dst, dst);
2926   } else {
2927     ld(dst, oopDesc::klass_offset_in_bytes(), src);
2928   }
2929 }
2930 
2931 void MacroAssembler::load_klass_with_trap_null_check(Register dst, Register src) {
2932   if (!os::zero_page_read_protected()) {
2933     if (TrapBasedNullChecks) {
2934       trap_null_check(src);
2935     }
2936   }
2937   load_klass(dst, src);
2938 }
2939 
2940 void MacroAssembler::reinit_heapbase(Register d, Register tmp) {
2941   if (Universe::heap() != NULL) {
2942     load_const_optimized(R30, Universe::narrow_ptrs_base(), tmp);
2943   } else {
2944     // Heap not yet allocated. Load indirectly.
2945     int simm16_offset = load_const_optimized(R30, Universe::narrow_ptrs_base_addr(), tmp, true);
2946     ld(R30, simm16_offset, R30);
2947   }
2948 }
2949 
2950 // Clear Array
2951 // Kills both input registers. tmp == R0 is allowed.
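// Rough structure of the code emitted below (illustrative only):
//   1. If fewer than ~2 cache lines are to be cleared, go straight to the small-rest loop.
//   2. Clear 8-byte blocks until base_ptr is cache-line aligned.
//   3. Clear whole cache lines with dcbz.
//   4. Clear the remaining doublewords one by one.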
2952 void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp) {
2953   // Procedure for large arrays (uses data cache block zero instruction).
2954     Label startloop, fast, fastloop, small_rest, restloop, done;
2955     const int cl_size         = VM_Version::get_cache_line_size(),
2956               cl_dwords       = cl_size>>3,
2957               cl_dw_addr_bits = exact_log2(cl_dwords),
2958               dcbz_min        = 1;                     // Min count of dcbz executions, needs to be >0.
2959 
2960 //2:
2961     cmpdi(CCR1, cnt_dwords, ((dcbz_min+1)<<cl_dw_addr_bits)-1); // Big enough? (ensure >=dcbz_min lines included).
2962     blt(CCR1, small_rest);                                      // Too small.
2963     rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits);           // Extract dword offset within first cache line.
2964     beq(CCR0, fast);                                            // Already 128byte aligned.
2965 
2966     subfic(tmp, tmp, cl_dwords);
2967     mtctr(tmp);                        // Set ctr to hit 128byte boundary (0<ctr<cl_dwords).
2968     subf(cnt_dwords, tmp, cnt_dwords); // rest.
2969     li(tmp, 0);
2970 //10:
2971   bind(startloop);                     // Clear at the beginning to reach 128byte boundary.
2972     std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
2973     addi(base_ptr, base_ptr, 8);
2974     bdnz(startloop);
2975 //13:
2976   bind(fast);                                  // Clear 128byte blocks.
2977     srdi(tmp, cnt_dwords, cl_dw_addr_bits);    // Loop count for 128byte loop (>0).
2978     andi(cnt_dwords, cnt_dwords, cl_dwords-1); // Rest in dwords.
2979     mtctr(tmp);                                // Load counter.
2980 //16:
2981   bind(fastloop);
2982     dcbz(base_ptr);                    // Clear 128byte aligned block.
2983     addi(base_ptr, base_ptr, cl_size);
2984     bdnz(fastloop);
2985     if (InsertEndGroupPPC64) { endgroup(); } else { nop(); }
2986 //20:
2987   bind(small_rest);
2988     cmpdi(CCR0, cnt_dwords, 0);        // size 0?
2989     beq(CCR0, done);                   // rest == 0
2990     li(tmp, 0);
2991     mtctr(cnt_dwords);                 // Load counter.
2992 //24:
2993   bind(restloop);                      // Clear rest.
2994     std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
2995     addi(base_ptr, base_ptr, 8);
2996     bdnz(restloop);
2997 //27:
2998   bind(done);
2999 }
3000 
3001 /////////////////////////////////////////// String intrinsics ////////////////////////////////////////////
3002 
3003 // Search for a single jchar in an jchar[].
3004 //
3005 // Assumes that result differs from all other registers.
3006 //
3007 // Haystack, needle are the addresses of jchar-arrays.
3008 // NeedleChar is needle[0] if it is known at compile time.
3009 // Haycnt is the length of the haystack. We assume haycnt >=1.
3010 //
3011 // Preserves haystack, haycnt, kills all other registers.
3012 //
3013 // If needle == R0, we search for the constant needleChar.
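// Conceptually, the code below implements (illustrative only):
//   for (int i = 0; i < haycnt; ++i) {
//     if (haystack[i] == needle0) return i;   // needle0 is needle[0] or the constant needleChar
//   }
//   return -1;
// The search loop is 2x unrolled.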
3014 void MacroAssembler::string_indexof_1(Register result, Register haystack, Register haycnt,
3015                                       Register needle, jchar needleChar,
3016                                       Register tmp1, Register tmp2) {
3017 
3018   assert_different_registers(result, haystack, haycnt, needle, tmp1, tmp2);
3019 
3020   Label L_InnerLoop, L_FinalCheck, L_Found1, L_Found2, L_Found3, L_NotFound, L_End;
3021   Register needle0 = needle, // Contains needle[0].
3022            addr = tmp1,
3023            ch1 = tmp2,
3024            ch2 = R0;
3025 
3026 //2 (variable) or 3 (const):
3027    if (needle != R0) lhz(needle0, 0, needle); // Preload needle character, needle has len==1.
3028    dcbtct(haystack, 0x00);                        // Indicate R/O access to haystack.
3029 
3030    srwi_(tmp2, haycnt, 1);   // Shift right by exact_log2(UNROLL_FACTOR).
3031    mr(addr, haystack);
3032    beq(CCR0, L_FinalCheck);
3033    mtctr(tmp2);              // Move to count register.
3034 //8:
3035   bind(L_InnerLoop);             // Main work horse (2x unrolled search loop).
3036    lhz(ch1, 0, addr);        // Load characters from haystack.
3037    lhz(ch2, 2, addr);
3038    (needle != R0) ? cmpw(CCR0, ch1, needle0) : cmplwi(CCR0, ch1, needleChar);
3039    (needle != R0) ? cmpw(CCR1, ch2, needle0) : cmplwi(CCR1, ch2, needleChar);
3040    beq(CCR0, L_Found1);   // Did we find the needle?
3041    beq(CCR1, L_Found2);
3042    addi(addr, addr, 4);
3043    bdnz(L_InnerLoop);
3044 //16:
3045   bind(L_FinalCheck);
3046    andi_(R0, haycnt, 1);
3047    beq(CCR0, L_NotFound);
3048    lhz(ch1, 0, addr);        // One position left at which we have to compare.
3049    (needle != R0) ? cmpw(CCR1, ch1, needle0) : cmplwi(CCR1, ch1, needleChar);
3050    beq(CCR1, L_Found3);
3051 //21:
3052   bind(L_NotFound);
3053    li(result, -1);           // Not found.
3054    b(L_End);
3055 
3056   bind(L_Found2);
3057    addi(addr, addr, 2);
3058 //24:
3059   bind(L_Found1);
3060   bind(L_Found3);                  // Return index ...
3061    subf(addr, haystack, addr); // relative to haystack,
3062    srdi(result, addr, 1);      // in characters.
3063   bind(L_End);
3064 }
3065 
3066 
3067 // Implementation of IndexOf for jchar arrays.
3068 //
3069 // The length of haystack and needle are not constant, i.e. passed in a register.
3070 //
3071 // Preserves registers haystack, needle.
3072 // Kills registers haycnt, needlecnt.
3073 // Assumes that result differs from all other registers.
3074 // Haystack, needle are the addresses of jchar-arrays.
3075 // Haycnt, needlecnt are the lengths of them, respectively.
3076 //
3077 // needlecntval must be zero, or a 15-bit unsigned immediate greater than 1.
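// Conceptually, the code below implements (illustrative only):
//   for (int i = 0; i + needlecnt <= haycnt; ++i) {
//     if (haystack[i] == needle[0] && haystack[i+1] == needle[1]) {
//       // compare the remaining needlecnt-2 characters; return i if they all match
//     }
//   }
//   return -1;
// A separate single-character loop handles the variable-length case with needlecnt < 2.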
3078 void MacroAssembler::string_indexof(Register result, Register haystack, Register haycnt,
3079                                     Register needle, ciTypeArray* needle_values, Register needlecnt, int needlecntval,
3080                                     Register tmp1, Register tmp2, Register tmp3, Register tmp4) {
3081 
3082   // Ensure 0<needlecnt<=haycnt in ideal graph as prerequisite!
3083   Label L_TooShort, L_Found, L_NotFound, L_End;
3084   Register last_addr = haycnt, // Kill haycnt at the beginning.
3085            addr      = tmp1,
3086            n_start   = tmp2,
3087            ch1       = tmp3,
3088            ch2       = R0;
3089 
3090   // **************************************************************************************************
3091   // Prepare for main loop: optimized for needle count >=2, bail out otherwise.
3092   // **************************************************************************************************
3093 
3094 //1 (variable) or 3 (const):
3095    dcbtct(needle, 0x00);    // Indicate R/O access to str1.
3096    dcbtct(haystack, 0x00);  // Indicate R/O access to str2.
3097 
3098   // Compute last haystack addr to use if no match gets found.
3099   if (needlecntval == 0) { // variable needlecnt
3100 //3:
3101    subf(ch1, needlecnt, haycnt);      // Last character index to compare is haycnt-needlecnt.
3102    addi(addr, haystack, -2);          // Accesses use pre-increment.
3103    cmpwi(CCR6, needlecnt, 2);
3104    blt(CCR6, L_TooShort);          // Variable needlecnt: handle short needle separately.
3105    slwi(ch1, ch1, 1);                 // Scale to number of bytes.
3106    lwz(n_start, 0, needle);           // Load first 2 characters of needle.
3107    add(last_addr, haystack, ch1);     // Point to last address to compare (haystack+2*(haycnt-needlecnt)).
3108    addi(needlecnt, needlecnt, -2);    // Rest of needle.
3109   } else { // constant needlecnt
3110   guarantee(needlecntval != 1, "IndexOf with single-character needle must be handled separately");
3111   assert((needlecntval & 0x7fff) == needlecntval, "wrong immediate");
3112 //5:
3113    addi(ch1, haycnt, -needlecntval);  // Last character index to compare is haycnt-needlecnt.
3114    lwz(n_start, 0, needle);           // Load first 2 characters of needle.
3115    addi(addr, haystack, -2);          // Accesses use pre-increment.
3116    slwi(ch1, ch1, 1);                 // Scale to number of bytes.
3117    add(last_addr, haystack, ch1);     // Point to last address to compare (haystack+2*(haycnt-needlecnt)).
3118    li(needlecnt, needlecntval-2);     // Rest of needle.
3119   }
3120 
3121   // Main Loop (now we have at least 3 characters).
3122 //11:
3123   Label L_OuterLoop, L_InnerLoop, L_FinalCheck, L_Comp1, L_Comp2, L_Comp3;
3124   bind(L_OuterLoop); // Search for 1st 2 characters.
3125   Register addr_diff = tmp4;
3126    subf(addr_diff, addr, last_addr); // Difference between already checked address and last address to check.
3127    addi(addr, addr, 2);              // This is the new address we want to use for comparing.
3128    srdi_(ch2, addr_diff, 2);
3129    beq(CCR0, L_FinalCheck);       // 2 characters left?
3130    mtctr(ch2);                       // addr_diff/4
3131 //16:
3132   bind(L_InnerLoop);                // Main work horse (2x unrolled search loop)
3133    lwz(ch1, 0, addr);           // Load 2 characters of haystack (ignore alignment).
3134    lwz(ch2, 2, addr);
3135    cmpw(CCR0, ch1, n_start); // Compare 2 characters (1 would be sufficient but try to reduce branches to CompLoop).
3136    cmpw(CCR1, ch2, n_start);
3137    beq(CCR0, L_Comp1);       // Did we find the needle start?
3138    beq(CCR1, L_Comp2);
3139    addi(addr, addr, 4);
3140    bdnz(L_InnerLoop);
3141 //24:
3142   bind(L_FinalCheck);
3143    rldicl_(addr_diff, addr_diff, 64-1, 63); // Remaining characters not covered by InnerLoop: (addr_diff>>1)&1.
3144    beq(CCR0, L_NotFound);
3145    lwz(ch1, 0, addr);                       // One position left at which we have to compare.
3146    cmpw(CCR1, ch1, n_start);
3147    beq(CCR1, L_Comp3);
3148 //29:
3149   bind(L_NotFound);
3150    li(result, -1); // not found
3151    b(L_End);
3152 
3153 
3154    // **************************************************************************************************
3155    // Special Case: unfortunately, the variable needle case can be called with needlecnt<2
3156    // **************************************************************************************************
3157 //31:
3158  if ((needlecntval>>1) !=1 ) { // Const needlecnt is 2 or 3? Reduce code size.
3159   int nopcnt = 5;
3160   if (needlecntval !=0 ) ++nopcnt; // Balance alignment (other case: see below).
3161   if (needlecntval == 0) {         // We have to handle these cases separately.
3162   Label L_OneCharLoop;
3163   bind(L_TooShort);
3164    mtctr(haycnt);
3165    lhz(n_start, 0, needle);    // First character of needle
3166   bind(L_OneCharLoop);
3167    lhzu(ch1, 2, addr);
3168    cmpw(CCR1, ch1, n_start);
3169    beq(CCR1, L_Found);      // Did we find the one character needle?
3170    bdnz(L_OneCharLoop);
3171    li(result, -1);             // Not found.
3172    b(L_End);
3173   } // 8 instructions, so no impact on alignment.
3174   for (int x = 0; x < nopcnt; ++x) nop();
3175  }
3176 
3177   // **************************************************************************************************
3178   // Regular Case Part II: compare rest of needle (first 2 characters have been compared already)
3179   // **************************************************************************************************
3180 
3181   // Compare the rest
3182 //36 if needlecntval==0, else 37:
3183   bind(L_Comp2);
3184    addi(addr, addr, 2); // First comparison has failed, 2nd one hit.
3185   bind(L_Comp1);            // Addr points to possible needle start.
3186   bind(L_Comp3);            // Could have created a copy and used a different return address, but we save code size here.
3187   if (needlecntval != 2) {  // Const needlecnt==2?
3188    if (needlecntval != 3) {
3189     if (needlecntval == 0) beq(CCR6, L_Found); // Variable needlecnt==2?
3190     Register ind_reg = tmp4;
3191     li(ind_reg, 2*2);   // First 2 characters are already compared, use index 2.
3192     mtctr(needlecnt);   // Decremented by 2, still > 0.
3193 //40:
3194    Label L_CompLoop;
3195    bind(L_CompLoop);
3196     lhzx(ch2, needle, ind_reg);
3197     lhzx(ch1, addr, ind_reg);
3198     cmpw(CCR1, ch1, ch2);
3199     bne(CCR1, L_OuterLoop);
3200     addi(ind_reg, ind_reg, 2);
3201     bdnz(L_CompLoop);
3202    } else { // No loop required if there's only one needle character left.
3203     lhz(ch2, 2*2, needle);
3204     lhz(ch1, 2*2, addr);
3205     cmpw(CCR1, ch1, ch2);
3206     bne(CCR1, L_OuterLoop);
3207    }
3208   }
3209   // Return index ...
3210 //46:
3211   bind(L_Found);
3212    subf(addr, haystack, addr); // relative to haystack, ...
3213    srdi(result, addr, 1);      // in characters.
3214 //48:
3215   bind(L_End);
3216 }
3217 
3218 // Implementation of Compare for jchar arrays.
3219 //
3220 // Kills the registers str1, str2, cnt1, cnt2.
3221 // Kills cr0, ctr.
3222 // Assumes that result differs from the input registers.
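// Conceptually, the code below implements (illustrative only):
//   int min = MIN2(cnt1, cnt2);
//   for (int i = 0; i < min; ++i) {
//     if (str1[i] != str2[i]) return str1[i] - str2[i];
//   }
//   return cnt1 - cnt2;
// The fast loop compares 4 characters (one doubleword) per iteration; mismatches and the
// remainder are handled by a character-wise slow loop.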
3223 void MacroAssembler::string_compare(Register str1_reg, Register str2_reg, Register cnt1_reg, Register cnt2_reg,
3224                                     Register result_reg, Register tmp_reg) {
3225    assert_different_registers(result_reg, str1_reg, str2_reg, cnt1_reg, cnt2_reg, tmp_reg);
3226 
3227    Label Ldone, Lslow_case, Lslow_loop, Lfast_loop;
3228    Register cnt_diff = R0,
3229             limit_reg = cnt1_reg,
3230             chr1_reg = result_reg,
3231             chr2_reg = cnt2_reg,
3232             addr_diff = str2_reg;
3233 
3234    // Offset 0 should be 32 byte aligned.
3235 //-4:
3236     dcbtct(str1_reg, 0x00);  // Indicate R/O access to str1.
3237     dcbtct(str2_reg, 0x00);  // Indicate R/O access to str2.
3238 //-2:
3239    // Compute min(cnt1, cnt2) and check if 0 (bail out if we don't need to compare characters).
3240     subf(result_reg, cnt2_reg, cnt1_reg);  // difference between cnt1/2
3241     subf_(addr_diff, str1_reg, str2_reg);  // alias?
3242     beq(CCR0, Ldone);                   // return cnt difference if both ones are identical
3243     srawi(limit_reg, result_reg, 31);      // generate signmask (cnt1/2 must be non-negative so cnt_diff can't overflow)
3244     mr(cnt_diff, result_reg);
3245     andr(limit_reg, result_reg, limit_reg); // difference or zero (negative): cnt1<cnt2 ? cnt1-cnt2 : 0
3246     add_(limit_reg, cnt2_reg, limit_reg);  // min(cnt1, cnt2)==0?
3247     beq(CCR0, Ldone);                   // return cnt difference if one has 0 length
3248 
3249     lhz(chr1_reg, 0, str1_reg);            // optional: early out if first characters mismatch
3250     lhzx(chr2_reg, str1_reg, addr_diff);   // optional: early out if first characters mismatch
3251     addi(tmp_reg, limit_reg, -1);          // min(cnt1, cnt2)-1
3252     subf_(result_reg, chr2_reg, chr1_reg); // optional: early out if first characters mismatch
3253     bne(CCR0, Ldone);                   // optional: early out if first characters mismatch
3254 
3255    // Set loop counter by scaling down tmp_reg
3256     srawi_(chr2_reg, tmp_reg, exact_log2(4)); // (min(cnt1, cnt2)-1)/4
3257     ble(CCR0, Lslow_case);                 // need >4 characters for fast loop
3258     andi(limit_reg, tmp_reg, 4-1);            // remaining characters
3259 
3260    // Adapt str1_reg str2_reg for the first loop iteration
3261     mtctr(chr2_reg);                 // (min(cnt1, cnt2)-1)/4
3262     addi(limit_reg, limit_reg, 4+1); // compare last 5-8 characters in slow_case if mismatch found in fast_loop
3263 //16:
3264    // Compare the rest of the characters
3265    bind(Lfast_loop);
3266     ld(chr1_reg, 0, str1_reg);
3267     ldx(chr2_reg, str1_reg, addr_diff);
3268     cmpd(CCR0, chr2_reg, chr1_reg);
3269     bne(CCR0, Lslow_case); // return chr1_reg
3270     addi(str1_reg, str1_reg, 4*2);
3271     bdnz(Lfast_loop);
3272     addi(limit_reg, limit_reg, -4); // no mismatch found in fast_loop, only 1-4 characters missing
3273 //23:
3274    bind(Lslow_case);
3275     mtctr(limit_reg);
3276 //24:
3277    bind(Lslow_loop);
3278     lhz(chr1_reg, 0, str1_reg);
3279     lhzx(chr2_reg, str1_reg, addr_diff);
3280     subf_(result_reg, chr2_reg, chr1_reg);
3281     bne(CCR0, Ldone); // return chr1_reg
3282     addi(str1_reg, str1_reg, 1*2);
3283     bdnz(Lslow_loop);
3284 //30:
3285    // If strings are equal up to min length, return the length difference.
3286     mr(result_reg, cnt_diff);
3287     nop(); // alignment
3288 //32:
3289    // Otherwise, return the difference between the first mismatched chars.
3290    bind(Ldone);
3291 }
3292 
3293 
3294 // Compare char[] arrays.
3295 //
3296 // str1_reg   USE only
3297 // str2_reg   USE only
3298 // cnt_reg    USE_DEF, due to tmp reg shortage
3299 // result_reg DEF only, might compromise USE only registers
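// Conceptually, the code below implements (illustrative only):
//   for (int i = 0; i < cnt; ++i) {
//     if (str1[i] != str2[i]) return false;
//   }
//   return true;
// The main loop compares 4 characters (one doubleword) per iteration; the remaining
// 0..3 characters are compared in a separate loop.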
3300 void MacroAssembler::char_arrays_equals(Register str1_reg, Register str2_reg, Register cnt_reg, Register result_reg,
3301                                         Register tmp1_reg, Register tmp2_reg, Register tmp3_reg, Register tmp4_reg,
3302                                         Register tmp5_reg) {
3303 
3304   // Str1 may be the same register as str2, which can occur e.g. after scalar replacement.
3305   assert_different_registers(result_reg, str1_reg, cnt_reg, tmp1_reg, tmp2_reg, tmp3_reg, tmp4_reg, tmp5_reg);
3306   assert_different_registers(result_reg, str2_reg, cnt_reg, tmp1_reg, tmp2_reg, tmp3_reg, tmp4_reg, tmp5_reg);
3307 
3308   // Offset 0 should be 32 byte aligned.
3309   Label Linit_cbc, Lcbc, Lloop, Ldone_true, Ldone_false;
3310   Register index_reg = tmp5_reg;
3311   Register cbc_iter  = tmp4_reg;
3312 
3313 //-1:
3314   dcbtct(str1_reg, 0x00);  // Indicate R/O access to str1.
3315   dcbtct(str2_reg, 0x00);  // Indicate R/O access to str2.
3316 //1:
3317   andi(cbc_iter, cnt_reg, 4-1);            // Remaining iterations after 4 java characters per iteration loop.
3318   li(index_reg, 0); // init
3319   li(result_reg, 0); // assume false
3320   srwi_(tmp2_reg, cnt_reg, exact_log2(4)); // Div: 4 java characters per iteration (main loop).
3321 
3322   cmpwi(CCR1, cbc_iter, 0);             // CCR1 = (cbc_iter==0)
3323   beq(CCR0, Linit_cbc);                 // too short
3324     mtctr(tmp2_reg);
3325 //8:
3326     bind(Lloop);
3327       ldx(tmp1_reg, str1_reg, index_reg);
3328       ldx(tmp2_reg, str2_reg, index_reg);
3329       cmpd(CCR0, tmp1_reg, tmp2_reg);
3330       bne(CCR0, Ldone_false);  // Unequal char pair found -> done.
3331       addi(index_reg, index_reg, 4*sizeof(jchar));
3332       bdnz(Lloop);
3333 //14:
3334   bind(Linit_cbc);
3335   beq(CCR1, Ldone_true);
3336     mtctr(cbc_iter);
3337 //16:
3338     bind(Lcbc);
3339       lhzx(tmp1_reg, str1_reg, index_reg);
3340       lhzx(tmp2_reg, str2_reg, index_reg);
3341       cmpw(CCR0, tmp1_reg, tmp2_reg);
3342       bne(CCR0, Ldone_false);  // Unequal char pair found -> done.
3343       addi(index_reg, index_reg, 1*sizeof(jchar));
3344       bdnz(Lcbc);
3345     nop();
3346   bind(Ldone_true);
3347   li(result_reg, 1);
3348 //24:
3349   bind(Ldone_false);
3350 }
3351 
3352 
3353 void MacroAssembler::char_arrays_equalsImm(Register str1_reg, Register str2_reg, int cntval, Register result_reg,
3354                                            Register tmp1_reg, Register tmp2_reg) {
3355   // Str1 may be the same register as str2, which can occur e.g. after scalar replacement.
3356   assert_different_registers(result_reg, str1_reg, tmp1_reg, tmp2_reg);
3357   assert_different_registers(result_reg, str2_reg, tmp1_reg, tmp2_reg);
3358   assert(sizeof(jchar) == 2, "must be");
3359   assert(cntval >= 0 && ((cntval & 0x7fff) == cntval), "wrong immediate");
3360 
3361   Label Ldone_false;
3362 
3363   if (cntval < 16) { // short case
3364     if (cntval != 0) li(result_reg, 0); // assume false
3365 
3366     const int num_bytes = cntval*sizeof(jchar);
3367     int index = 0;
3368     for (int next_index; (next_index = index + 8) <= num_bytes; index = next_index) {
3369       ld(tmp1_reg, index, str1_reg);
3370       ld(tmp2_reg, index, str2_reg);
3371       cmpd(CCR0, tmp1_reg, tmp2_reg);
3372       bne(CCR0, Ldone_false);
3373     }
3374     if (cntval & 2) {
3375       lwz(tmp1_reg, index, str1_reg);
3376       lwz(tmp2_reg, index, str2_reg);
3377       cmpw(CCR0, tmp1_reg, tmp2_reg);
3378       bne(CCR0, Ldone_false);
3379       index += 4;
3380     }
3381     if (cntval & 1) {
3382       lhz(tmp1_reg, index, str1_reg);
3383       lhz(tmp2_reg, index, str2_reg);
3384       cmpw(CCR0, tmp1_reg, tmp2_reg);
3385       bne(CCR0, Ldone_false);
3386     }
3387     // fallthrough: true
3388   } else {
3389     Label Lloop;
3390     Register index_reg = tmp1_reg;
3391     const int loopcnt = cntval/4;
3392     assert(loopcnt > 0, "must be");
3393     // Offset 0 should be 32 byte aligned.
3394     //2:
3395     dcbtct(str1_reg, 0x00);  // Indicate R/O access to str1.
3396     dcbtct(str2_reg, 0x00);  // Indicate R/O access to str2.
3397     li(tmp2_reg, loopcnt);
3398     li(index_reg, 0); // init
3399     li(result_reg, 0); // assume false
3400     mtctr(tmp2_reg);
3401     //8:
3402     bind(Lloop);
3403     ldx(R0, str1_reg, index_reg);
3404     ldx(tmp2_reg, str2_reg, index_reg);
3405     cmpd(CCR0, R0, tmp2_reg);
3406     bne(CCR0, Ldone_false);  // Unequal char pair found -> done.
3407     addi(index_reg, index_reg, 4*sizeof(jchar));
3408     bdnz(Lloop);
3409     //14:
3410     if (cntval & 2) {
3411       lwzx(R0, str1_reg, index_reg);
3412       lwzx(tmp2_reg, str2_reg, index_reg);
3413       cmpw(CCR0, R0, tmp2_reg);
3414       bne(CCR0, Ldone_false);
3415       if (cntval & 1) addi(index_reg, index_reg, 2*sizeof(jchar));
3416     }
3417     if (cntval & 1) {
3418       lhzx(R0, str1_reg, index_reg);
3419       lhzx(tmp2_reg, str2_reg, index_reg);
3420       cmpw(CCR0, R0, tmp2_reg);
3421       bne(CCR0, Ldone_false);
3422     }
3423     // fallthru: true
3424   }
3425   li(result_reg, 1);
3426   bind(Ldone_false);
3427 }
3428 
3429 // Helpers for Intrinsic Emitters
3430 //
3431 // Revert the byte order of a 32bit value in a register
3432 //   src: 0x44556677
3433 //   dst: 0x77665544
3434 // Three steps to obtain the result:
3435 //  1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word
3436 //     into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared.
3437 //     This value initializes dst.
3438 //  2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost
3439 //     byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go.
3440 //     This value is mask inserted into dst with a [0..23] mask of 1s.
3441 //  3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position.
3442 //     This value is mask inserted into dst with a [8..15] mask of 1s.
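// Worked example for src = 0x44556677 (for illustration):
//   after 1) dst = 0x00000044
//   after 2) dst = 0x77445544
//   after 3) dst = 0x77665544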
3443 void MacroAssembler::load_reverse_32(Register dst, Register src) {
3444   assert_different_registers(dst, src);
3445 
3446   rldicl(dst, src, (4+1)*8, 56);       // Rotate byte 4 into position 7 (rightmost), clear all to the left.
3447   rlwimi(dst, src,     3*8,  0, 23);   // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone.
3448   rlwimi(dst, src,     1*8,  8, 15);   // Insert byte 6 into position 5, leave the rest alone.
3449 }
3450 
3451 // Calculate the column addresses of the crc32 lookup table into distinct registers.
3452 // This loop-invariant calculation is moved out of the loop body, reducing the loop
3453 // body size from 20 to 16 instructions.
3454 // Returns the offset that was used to calculate the address of column tc3.
3455 // Due to register shortage, setting tc3 may overwrite table. With the return offset
3456 // at hand, the original table address can be easily reconstructed.
3457 int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) {
3458 
3459 #ifdef VM_LITTLE_ENDIAN
3460   // This is what we implement (the DOLIT4 part):
3461   // ========================================================================= */
3462   // #define DOLIT4 c ^= *buf4++; \
3463   //         c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^ \
3464   //             crc_table[1][(c >> 16) & 0xff] ^ crc_table[0][c >> 24]
3465   // #define DOLIT32 DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4
3466   // ========================================================================= */
3467   const int ix0 = 3*(4*CRC32_COLUMN_SIZE);
3468   const int ix1 = 2*(4*CRC32_COLUMN_SIZE);
3469   const int ix2 = 1*(4*CRC32_COLUMN_SIZE);
3470   const int ix3 = 0*(4*CRC32_COLUMN_SIZE);
3471 #else
3472   // This is what we implement (the DOBIG4 part):
3473   // =========================================================================
3474   // #define DOBIG4 c ^= *++buf4; \
3475   //         c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \
3476   //             crc_table[6][(c >> 16) & 0xff] ^ crc_table[7][c >> 24]
3477   // #define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4
3478   // =========================================================================
3479   const int ix0 = 4*(4*CRC32_COLUMN_SIZE);
3480   const int ix1 = 5*(4*CRC32_COLUMN_SIZE);
3481   const int ix2 = 6*(4*CRC32_COLUMN_SIZE);
3482   const int ix3 = 7*(4*CRC32_COLUMN_SIZE);
3483 #endif
3484   assert_different_registers(table, tc0, tc1, tc2);
3485   assert(table == tc3, "must be!");
3486 
3487   if (ix0 != 0) addi(tc0, table, ix0);
3488   if (ix1 != 0) addi(tc1, table, ix1);
3489   if (ix2 != 0) addi(tc2, table, ix2);
3490   if (ix3 != 0) addi(tc3, table, ix3);
3491 
3492   return ix3;
3493 }
3494 
3495 /**
3496  * uint32_t crc;
3497  * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
3498  */
3499 void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) {
3500   assert_different_registers(crc, table, tmp);
3501   assert_different_registers(val, table);
3502 
3503   if (crc == val) {                   // Must rotate first to use the unmodified value.
3504     rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
3505                                       // As we use a word (4-byte) instruction, we have to adapt the mask bit positions.
3506     srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
3507   } else {
3508     srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
3509     rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
3510   }
3511   lwzx(tmp, table, tmp);
3512   xorr(crc, crc, tmp);
3513 }
3514 
3515 /**
3516  * uint32_t crc;
3517  * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
3518  */
3519 void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
3520   fold_byte_crc32(crc, crc, table, tmp);
3521 }
3522 
3523 /**
3524  * Emits code to update CRC-32 with a byte value according to constants in table.
3525  *
3526  * @param [in,out]crc   Register containing the crc.
3527  * @param [in]val       Register containing the byte to fold into the CRC.
3528  * @param [in]table     Register containing the table of crc constants.
3529  *
3530  * uint32_t crc;
3531  * val = crc_table[(val ^ crc) & 0xFF];
3532  * crc = val ^ (crc >> 8);
3533  */
3534 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
3535   BLOCK_COMMENT("update_byte_crc32:");
3536   xorr(val, val, crc);
3537   fold_byte_crc32(crc, val, table, val);
3538 }
3539 
3540 /**
3541  * @param crc   register containing existing CRC (32-bit)
3542  * @param buf   register pointing to input byte buffer (byte*)
3543  * @param len   register containing number of bytes
3544  * @param table register pointing to CRC table
3545  */
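// Conceptually, the loop below computes (illustrative only):
//   if (len > 0) {
//     if (invertCRC) crc = ~crc;
//     do {
//       crc = table[(crc ^ *buf++) & 0xff] ^ (crc >> 8);
//     } while (--len > 0);
//     if (invertCRC) crc = ~crc;
//   }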
3546 void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table,
3547                                            Register data, bool loopAlignment, bool invertCRC) {
3548   assert_different_registers(crc, buf, len, table, data);
3549 
3550   Label L_mainLoop, L_done;
3551   const int mainLoop_stepping  = 1;
3552   const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4;
3553 
3554   // Process all bytes in a single-byte loop.
3555   cmpdi(CCR0, len, 0);                           // Anything to do?
3556   mtctr(len);
3557   beq(CCR0, L_done);
3558 
3559   if (invertCRC) {
3560     nand(crc, crc, crc);                         // ~c
3561   }
3562 
3563   align(mainLoop_alignment);
3564   BIND(L_mainLoop);
3565     lbz(data, 0, buf);                           // Byte from buffer, zero-extended.
3566     addi(buf, buf, mainLoop_stepping);           // Advance buffer position.
3567     update_byte_crc32(crc, data, table);
3568     bdnz(L_mainLoop);                            // Iterate.
3569 
3570   if (invertCRC) {
3571     nand(crc, crc, crc);                         // ~c
3572   }
3573 
3574   bind(L_done);
3575 }
3576 
3577 /**
3578  * Emits code to update CRC-32 with a 4-byte value according to constants in table
3579  * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c
3580  */
3581 // A note on the lookup table address(es):
3582 // The lookup table consists of two sets of four columns each.
3583 // The columns {0..3} are used for little-endian machines.
3584 // The columns {4..7} are used for big-endian machines.
3585 // To save the effort of adding the column offset to the table address each time
3586 // a table element is looked up, it is possible to pass the pre-calculated
3587 // column addresses.
3588 // Uses R9..R12 as work registers. Must be saved/restored by caller, if necessary.
3589 void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc,
3590                                         Register t0,  Register t1,  Register t2,  Register t3,
3591                                         Register tc0, Register tc1, Register tc2, Register tc3) {
3592   assert_different_registers(crc, t3);
3593 
3594   // XOR crc with next four bytes of buffer.
3595   lwz(t3, bufDisp, buf);
3596   if (bufInc != 0) {
3597     addi(buf, buf, bufInc);
3598   }
3599   xorr(t3, t3, crc);
3600 
3601   // Chop crc into 4 single-byte pieces, shifted left 2 bits, to form the table indices.
3602   rlwinm(t0, t3,  2,         24-2, 31-2);  // ((t1 >>  0) & 0xff) << 2
3603   rlwinm(t1, t3,  32+(2- 8), 24-2, 31-2);  // ((t1 >>  8) & 0xff) << 2
3604   rlwinm(t2, t3,  32+(2-16), 24-2, 31-2);  // ((t1 >> 16) & 0xff) << 2
3605   rlwinm(t3, t3,  32+(2-24), 24-2, 31-2);  // ((t1 >> 24) & 0xff) << 2
3606 
3607   // Use the pre-calculated column addresses.
3608   // Load pre-calculated table values.
3609   lwzx(t0, tc0, t0);
3610   lwzx(t1, tc1, t1);
3611   lwzx(t2, tc2, t2);
3612   lwzx(t3, tc3, t3);
3613 
3614   // Calculate new crc from table values.
3615   xorr(t0,  t0, t1);
3616   xorr(t2,  t2, t3);
3617   xorr(crc, t0, t2);  // Now crc contains the final checksum value.
3618 }
3619 
3620 /**
3621  * @param crc   register containing existing CRC (32-bit)
3622  * @param buf   register pointing to input byte buffer (byte*)
3623  * @param len   register containing number of bytes
3624  * @param table register pointing to CRC table
3625  *
3626  * Uses R9..R12 as work registers. Must be saved/restored by caller!
3627  */
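// Rough structure of the code emitted below (illustrative only):
//   crc = ~crc;
//   process bytes one at a time until buf is 8-byte aligned (short buffers go straight to the tail);
//   main loop: fold 8 bytes per iteration as two 4-byte table lookups (update_1word_crc32);
//   tail: process the remaining bytes one at a time (update_byteLoop_crc32);
//   crc = ~crc;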
3628 void MacroAssembler::kernel_crc32_2word(Register crc, Register buf, Register len, Register table,
3629                                         Register t0,  Register t1,  Register t2,  Register t3,
3630                                         Register tc0, Register tc1, Register tc2, Register tc3) {
3631   assert_different_registers(crc, buf, len, table);
3632 
3633   Label L_mainLoop, L_tail;
3634   Register  tmp  = t0;
3635   Register  data = t0;
3636   Register  tmp2 = t1;
3637   const int mainLoop_stepping  = 8;
3638   const int tailLoop_stepping  = 1;
3639   const int log_stepping       = exact_log2(mainLoop_stepping);
3640   const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
3641   const int complexThreshold   = 2*mainLoop_stepping;
3642 
3643   // Don't test for len <= 0 here. This pathological case should not occur anyway.
3644   // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles.
3645   // The situation itself is detected and handled correctly by the conditional branches
3646   // following the subsequent length checks and adjustments.
3647   assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
3648 
3649   BLOCK_COMMENT("kernel_crc32_2word {");
3650 
3651   nand(crc, crc, crc);                           // ~c
3652 
3653   // Check for short (<mainLoop_stepping) buffer.
3654   cmpdi(CCR0, len, complexThreshold);
3655   blt(CCR0, L_tail);
3656 
3657   // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
3658   // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
3659   {
3660     // Align buf addr to mainLoop_stepping boundary.
3661     neg(tmp2, buf);                           // Calculate # preLoop iterations for alignment.
3662     rldicl(tmp2, tmp2, 0, 64-log_stepping);   // Keep only the low log_stepping bits (i.e. tmp2 &= mainLoop_stepping-1).
3663 
3664     if (complexThreshold > mainLoop_stepping) {
3665       sub(len, len, tmp2);                       // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
3666     } else {
3667       sub(tmp, len, tmp2);                       // Remaining bytes for main loop.
3668       cmpdi(CCR0, tmp, mainLoop_stepping);
3669       blt(CCR0, L_tail);                         // For less than one mainloop_stepping left, do only tail processing
3670       mr(len, tmp);                              // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
3671     }
3672     update_byteLoop_crc32(crc, buf, tmp2, table, data, false, false);
3673   }
3674 
3675   srdi(tmp2, len, log_stepping);                 // #iterations for mainLoop
3676   andi(len, len, mainLoop_stepping-1);           // remaining bytes for tailLoop
3677   mtctr(tmp2);
3678 
3679 #ifdef VM_LITTLE_ENDIAN
3680   Register crc_rv = crc;
3681 #else
3682   Register crc_rv = tmp;                         // Load_reverse needs separate registers to work on.
3683                                                  // Occupies tmp, but frees up crc.
3684   load_reverse_32(crc_rv, crc);                  // Revert byte order because we are dealing with big-endian data.
3685   tmp = crc;
3686 #endif
3687 
3688   int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
3689 
3690   align(mainLoop_alignment);                     // Octoword-aligned loop address. Shows 2% improvement.
3691   BIND(L_mainLoop);
3692     update_1word_crc32(crc_rv, buf, table, 0, 0, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
3693     update_1word_crc32(crc_rv, buf, table, 4, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
3694     bdnz(L_mainLoop);
3695 
3696 #ifndef VM_LITTLE_ENDIAN
3697   load_reverse_32(crc, crc_rv);                  // Revert byte order because we are dealing with big-endian data.
3698   tmp = crc_rv;                                  // Tmp uses its original register again.
3699 #endif
3700 
3701   // Restore original table address for tailLoop.
3702   if (reconstructTableOffset != 0) {
3703     addi(table, table, -reconstructTableOffset);
3704   }
3705 
3706   // Process last few (<complexThreshold) bytes of buffer.
3707   BIND(L_tail);
3708   update_byteLoop_crc32(crc, buf, len, table, data, false, false);
3709 
3710   nand(crc, crc, crc);                           // ~c
3711   BLOCK_COMMENT("} kernel_crc32_2word");
3712 }
3713 
3714 /**
3715  * @param crc   register containing existing CRC (32-bit)
3716  * @param buf   register pointing to input byte buffer (byte*)
3717  * @param len   register containing number of bytes
3718  * @param table register pointing to CRC table
3719  *
3720  * Uses R9..R12 as work registers. Must be saved/restored by caller!
3721  */
3722 void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table,
3723                                         Register t0,  Register t1,  Register t2,  Register t3,
3724                                         Register tc0, Register tc1, Register tc2, Register tc3) {
3725   assert_different_registers(crc, buf, len, table);
3726 
3727   Label L_mainLoop, L_tail;
3728   Register  tmp          = t0;
3729   Register  data         = t0;
3730   Register  tmp2         = t1;
3731   const int mainLoop_stepping  = 4;
3732   const int tailLoop_stepping  = 1;
3733   const int log_stepping       = exact_log2(mainLoop_stepping);
3734   const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
3735   const int complexThreshold   = 2*mainLoop_stepping;
3736 
3737   // Don't test for len <= 0 here. This pathological case should not occur anyway.
3738   // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles.
3739   // The situation itself is detected and handled correctly by the conditional branches
3740   // following the subsequent length checks and adjustments.
3741   assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
3742 
3743   BLOCK_COMMENT("kernel_crc32_1word {");
3744 
3745   nand(crc, crc, crc);                           // ~c
3746 
3747   // Check for short (<mainLoop_stepping) buffer.
3748   cmpdi(CCR0, len, complexThreshold);
3749   blt(CCR0, L_tail);
3750 
3751   // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
3752   // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
3753   {
3754     // Align buf addr to mainLoop_stepping boundary.
3755     neg(tmp2, buf);                              // Calculate # preLoop iterations for alignment.
3756     rldicl(tmp2, tmp2, 0, 64-log_stepping);      // Rotate tmp2 0 bits, insert into tmp2, anding with mask with 1s from 62..63.
3757 
3758     if (complexThreshold > mainLoop_stepping) {
3759       sub(len, len, tmp2);                       // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
3760     } else {
3761       sub(tmp, len, tmp2);                       // Remaining bytes for main loop.
3762       cmpdi(CCR0, tmp, mainLoop_stepping);
3763       blt(CCR0, L_tail);                         // For less than one mainloop_stepping left, do only tail processing
3764       mr(len, tmp);                              // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
3765     }
3766     update_byteLoop_crc32(crc, buf, tmp2, table, data, false, false);
3767   }
3768 
3769   srdi(tmp2, len, log_stepping);                 // #iterations for mainLoop
3770   andi(len, len, mainLoop_stepping-1);           // remaining bytes for tailLoop
3771   mtctr(tmp2);
3772 
3773 #ifdef VM_LITTLE_ENDIAN
3774   Register crc_rv = crc;
3775 #else
3776   Register crc_rv = tmp;                         // Load_reverse needs separate registers to work on.
3777                                                  // Occupies tmp, but frees up crc.
3778   load_reverse_32(crc_rv, crc);                  // Revert byte order because we are dealing with big-endian data.
3779   tmp = crc;
3780 #endif
3781 
3782   int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
3783 
3784   align(mainLoop_alignment);                     // Octoword-aligned loop address. Shows 2% improvement.
3785   BIND(L_mainLoop);
3786     update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
3787     bdnz(L_mainLoop);
3788 
3789 #ifndef VM_LITTLE_ENDIAN
3790   load_reverse_32(crc, crc_rv);                  // Revert byte order because we are dealing with big-endian data.
3791   tmp = crc_rv;                                  // Tmp uses its original register again.
3792 #endif
3793 
3794   // Restore original table address for tailLoop.
3795   if (reconstructTableOffset != 0) {
3796     addi(table, table, -reconstructTableOffset);
3797   }
3798 
3799   // Process last few (<complexThreshold) bytes of buffer.
3800   BIND(L_tail);
3801   update_byteLoop_crc32(crc, buf, len, table, data, false, false);
3802 
3803   nand(crc, crc, crc);                           // ~c
3804   BLOCK_COMMENT("} kernel_crc32_1word");
3805 }
3806 
3807 /**
3808  * @param crc   register containing existing CRC (32-bit)
3809  * @param buf   register pointing to input byte buffer (byte*)
3810  * @param len   register containing number of bytes
3811  * @param table register pointing to CRC table
3812  *
3813  * Uses R7_ARG5, R8_ARG6 as work registers.
3814  */
3815 void MacroAssembler::kernel_crc32_1byte(Register crc, Register buf, Register len, Register table,
3816                                         Register t0,  Register t1,  Register t2,  Register t3) {
3817   assert_different_registers(crc, buf, len, table);
3818 
3819   Register  data = t0;                   // Holds the current byte to be folded into crc.
3820 
3821   BLOCK_COMMENT("kernel_crc32_1byte {");
3822 
3823   // Process all bytes in a single-byte loop.
3824   update_byteLoop_crc32(crc, buf, len, table, data, true, true);
3825 
3826   BLOCK_COMMENT("} kernel_crc32_1byte");
3827 }
3828 
3829 void MacroAssembler::kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp) {
3830   assert_different_registers(crc, buf, /* len,  not used!! */ table, tmp);
3831 
3832   BLOCK_COMMENT("kernel_crc32_singleByte:");
3833   nand(crc, crc, crc);       // ~c
3834 
3835   lbz(tmp, 0, buf);          // Byte from buffer, zero-extended.
3836   update_byte_crc32(crc, tmp, table);
3837 
3838   nand(crc, crc, crc);       // ~c
3839 }
3840 
3841 // dest_lo += src1 + src2
3842 // dest_hi += carry1 + carry2
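// Viewed as 128-bit arithmetic, this is roughly (illustrative only):
//   (dest_hi:dest_lo) += (unsigned __int128)src1;
//   (dest_hi:dest_lo) += (unsigned __int128)src2;
// i.e. each carry out of the low doubleword is propagated into dest_hi.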
3843 void MacroAssembler::add2_with_carry(Register dest_hi,
3844                                      Register dest_lo,
3845                                      Register src1, Register src2) {
3846   li(R0, 0);
3847   addc(dest_lo, dest_lo, src1);
3848   adde(dest_hi, dest_hi, R0);
3849   addc(dest_lo, dest_lo, src2);
3850   adde(dest_hi, dest_hi, R0);
3851 }
3852 
3853 // Multiply 64 bit by 64 bit first loop.
3854 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart,
3855                                            Register x_xstart,
3856                                            Register y, Register y_idx,
3857                                            Register z,
3858                                            Register carry,
3859                                            Register product_high, Register product,
3860                                            Register idx, Register kdx,
3861                                            Register tmp) {
3862   //  jlong carry, x[], y[], z[];
3863   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
3864   //    huge_128 product = y[idx] * x[xstart] + carry;
3865   //    z[kdx] = (jlong)product;
3866   //    carry  = (jlong)(product >>> 64);
3867   //  }
3868   //  z[xstart] = carry;
3869 
3870   Label L_first_loop, L_first_loop_exit;
3871   Label L_one_x, L_one_y, L_multiply;
3872 
3873   addic_(xstart, xstart, -1);
3874   blt(CCR0, L_one_x);   // Special case: length of x is 1.
3875 
3876   // Load next two integers of x.
3877   sldi(tmp, xstart, LogBytesPerInt);
3878   ldx(x_xstart, x, tmp);
3879 #ifdef VM_LITTLE_ENDIAN
3880   rldicl(x_xstart, x_xstart, 32, 0);
3881 #endif
3882 
3883   align(32, 16);
3884   bind(L_first_loop);
3885 
3886   cmpdi(CCR0, idx, 1);
3887   blt(CCR0, L_first_loop_exit);
3888   addi(idx, idx, -2);
3889   beq(CCR0, L_one_y);
3890 
3891   // Load next two integers of y.
3892   sldi(tmp, idx, LogBytesPerInt);
3893   ldx(y_idx, y, tmp);
3894 #ifdef VM_LITTLE_ENDIAN
3895   rldicl(y_idx, y_idx, 32, 0);
3896 #endif
3897 
3898 
3899   bind(L_multiply);
3900   multiply64(product_high, product, x_xstart, y_idx);
3901 
3902   li(tmp, 0);
3903   addc(product, product, carry);         // Add carry to result.
3904   adde(product_high, product_high, tmp); // Add carry of the last addition.
3905   addi(kdx, kdx, -2);
3906 
3907   // Store result.
3908 #ifdef VM_LITTLE_ENDIAN
3909   rldicl(product, product, 32, 0);
3910 #endif
3911   sldi(tmp, kdx, LogBytesPerInt);
3912   stdx(product, z, tmp);
3913   mr_if_needed(carry, product_high);
3914   b(L_first_loop);
3915 
3916 
3917   bind(L_one_y); // Load one 32 bit portion of y as (0,value).
3918 
3919   lwz(y_idx, 0, y);
3920   b(L_multiply);
3921 
3922 
3923   bind(L_one_x); // Load one 32 bit portion of x as (0,value).
3924 
3925   lwz(x_xstart, 0, x);
3926   b(L_first_loop);
3927 
3928   bind(L_first_loop_exit);
3929 }
3930 
3931 // Multiply 64-bit by 64-bit and add 128-bit.
3932 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y,
3933                                             Register z, Register yz_idx,
3934                                             Register idx, Register carry,
3935                                             Register product_high, Register product,
3936                                             Register tmp, int offset) {
3937 
3938   //  huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
3939   //  z[kdx] = (jlong)product;
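       //  The new carry (product >>> 64) is left in product_high for the caller.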
3940 
3941   sldi(tmp, idx, LogBytesPerInt);
3942   if (offset) {
3943     addi(tmp, tmp, offset);
3944   }
3945   ldx(yz_idx, y, tmp);
3946 #ifdef VM_LITTLE_ENDIAN
3947   rldicl(yz_idx, yz_idx, 32, 0);
3948 #endif
3949 
3950   multiply64(product_high, product, x_xstart, yz_idx);
3951   ldx(yz_idx, z, tmp);
3952 #ifdef VM_LITTLE_ENDIAN
3953   rldicl(yz_idx, yz_idx, 32, 0);
3954 #endif
3955 
3956   add2_with_carry(product_high, product, carry, yz_idx);
3957 
3958   sldi(tmp, idx, LogBytesPerInt);
3959   if (offset) {
3960     addi(tmp, tmp, offset);
3961   }
3962 #ifdef VM_LITTLE_ENDIAN
3963   rldicl(product, product, 32, 0);
3964 #endif
3965   stdx(product, z, tmp);
3966 }
3967 
3968 // Multiply 128-bit by 128-bit. Unrolled inner loop.
3969 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart,
3970                                              Register y, Register z,
3971                                              Register yz_idx, Register idx, Register carry,
3972                                              Register product_high, Register product,
3973                                              Register carry2, Register tmp) {
3974 
3975   //  jlong carry, x[], y[], z[];
3976   //  int kdx = ystart+1;
3977   //  for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
3978   //    huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
3979   //    z[kdx+idx+1] = (jlong)product;
3980   //    jlong carry2 = (jlong)(product >>> 64);
3981   //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
3982   //    z[kdx+idx] = (jlong)product;
3983   //    carry = (jlong)(product >>> 64);
3984   //  }
3985   //  idx += 2;
3986   //  if (idx > 0) {
3987   //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
3988   //    z[kdx+idx] = (jlong)product;
3989   //    carry = (jlong)(product >>> 64);
3990   //  }
3991 
3992   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
3993   const Register jdx = R0;
3994 
3995   // jdx = idx / 4; the unrolled loop below consumes four 32-bit digits per iteration.
3996   srdi_(jdx, idx, 2);
3997   beq(CCR0, L_third_loop_exit);
3998   mtctr(jdx);
3999 
4000   align(32, 16);
4001   bind(L_third_loop);
4002 
4003   addi(idx, idx, -4);
4004 
4005   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8);
4006   mr_if_needed(carry2, product_high);
4007 
4008   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0);
4009   mr_if_needed(carry, product_high);
4010   bdnz(L_third_loop);
4011 
4012   bind(L_third_loop_exit);  // Handle any left-over operand parts.
4013 
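       // idx &= 3: number of 32-bit digits not covered by the unrolled loop (0..3).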
4014   andi_(idx, idx, 0x3);
4015   beq(CCR0, L_post_third_loop_done);
4016 
4017   Label L_check_1;
4018 
4019   addic_(idx, idx, -2);
4020   blt(CCR0, L_check_1);
4021 
4022   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0);
4023   mr_if_needed(carry, product_high);
4024 
4025   bind(L_check_1);
4026 
4027   addi(idx, idx, 0x2);
4028   andi_(idx, idx, 0x1);
4029   addic_(idx, idx, -1);
4030   blt(CCR0, L_post_third_loop_done);
4031 
4032   sldi(tmp, idx, LogBytesPerInt);
4033   lwzx(yz_idx, y, tmp);
4034   multiply64(product_high, product, x_xstart, yz_idx);
4035   lwzx(yz_idx, z, tmp);
4036 
4037   add2_with_carry(product_high, product, yz_idx, carry);
4038 
4039   sldi(tmp, idx, LogBytesPerInt);
4040   stwx(product, z, tmp);
4041   srdi(product, product, 32);
4042 
4043   sldi(product_high, product_high, 32);
4044   orr(product, product, product_high);
4045   mr_if_needed(carry, product);
4046 
4047   bind(L_post_third_loop_done);
4048 }   // multiply_128_x_128_loop
4049 
4050 void MacroAssembler::multiply_to_len(Register x, Register xlen,
4051                                      Register y, Register ylen,
4052                                      Register z, Register zlen,
4053                                      Register tmp1, Register tmp2,
4054                                      Register tmp3, Register tmp4,
4055                                      Register tmp5, Register tmp6,
4056                                      Register tmp7, Register tmp8,
4057                                      Register tmp9, Register tmp10,
4058                                      Register tmp11, Register tmp12,
4059                                      Register tmp13) {
4060 
4061   ShortBranchVerifier sbv(this);
4062 
4063   assert_different_registers(x, xlen, y, ylen, z, zlen,
4064                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
4065   assert_different_registers(x, xlen, y, ylen, z, zlen,
4066                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp7);
4067   assert_different_registers(x, xlen, y, ylen, z, zlen,
4068                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp8);
4069 
4070   const Register idx = tmp1;
4071   const Register kdx = tmp2;
4072   const Register xstart = tmp3;
4073 
4074   const Register y_idx = tmp4;
4075   const Register carry = tmp5;
4076   const Register product = tmp6;
4077   const Register product_high = tmp7;
4078   const Register x_xstart = tmp8;
4079   const Register tmp = tmp9;
4080 
4081   // First Loop.
4082   //
4083   //  final static long LONG_MASK = 0xffffffffL;
4084   //  int xstart = xlen - 1;
4085   //  int ystart = ylen - 1;
4086   //  long carry = 0;
4087   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
4088   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
4089   //    z[kdx] = (int)product;
4090   //    carry = product >>> 32;
4091   //  }
4092   //  z[xstart] = (int)carry;
4093 
4094   mr_if_needed(idx, ylen);        // idx = ylen
4095   mr_if_needed(kdx, zlen);        // kdx = zlen = xlen + ylen
4096   li(carry, 0);                   // carry = 0
4097 
4098   Label L_done;
4099 
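       // xstart = xlen - 1; if xlen is zero there is nothing to multiply.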
4100   addic_(xstart, xlen, -1);
4101   blt(CCR0, L_done);
4102 
4103   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z,
4104                         carry, product_high, product, idx, kdx, tmp);
4105 
4106   Label L_second_loop;
4107 
4108   cmpdi(CCR0, kdx, 0);
4109   beq(CCR0, L_second_loop);
4110 
4111   Label L_carry;
4112 
4113   addic_(kdx, kdx, -1);
4114   beq(CCR0, L_carry);
4115 
4116   // Store lower 32 bits of carry.
4117   sldi(tmp, kdx, LogBytesPerInt);
4118   stwx(carry, z, tmp);
4119   srdi(carry, carry, 32);
4120   addi(kdx, kdx, -1);
4121 
4122 
4123   bind(L_carry);
4124 
4125   // Store upper 32 bits of carry.
4126   sldi(tmp, kdx, LogBytesPerInt);
4127   stwx(carry, z, tmp);
4128 
4129   // Second and third (nested) loops.
4130   //
4131   //  for (int i = xstart-1; i >= 0; i--) { // Second loop
4132   //    carry = 0;
4133   //    for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
4134   //      long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
4135   //                     (z[k] & LONG_MASK) + carry;
4136   //      z[k] = (int)product;
4137   //      carry = product >>> 32;
4138   //    }
4139   //    z[i] = (int)carry;
4140   //  }
4141   //
4142   //  i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = x_xstart
4143 
4144   bind(L_second_loop);
4145 
4146   li(carry, 0);                   // carry = 0;
4147 
4148   addic_(xstart, xstart, -1);     // i = xstart-1;
4149   blt(CCR0, L_done);
4150 
4151   Register zsave = tmp10;
4152 
4153   mr(zsave, z);
4154 
4155 
4156   Label L_last_x;
4157 
4158   sldi(tmp, xstart, LogBytesPerInt);
4159   add(z, z, tmp);                 // z = z + k - j
4160   addi(z, z, 4);
4161   addic_(xstart, xstart, -1);     // i = xstart-1;
4162   blt(CCR0, L_last_x);
4163 
4164   sldi(tmp, xstart, LogBytesPerInt);
4165   ldx(x_xstart, x, tmp);
4166 #ifdef VM_LITTLE_ENDIAN
4167   rldicl(x_xstart, x_xstart, 32, 0);
4168 #endif
4169 
4170 
4171   Label L_third_loop_prologue;
4172 
4173   bind(L_third_loop_prologue);
4174 
4175   Register xsave = tmp11;
4176   Register xlensave = tmp12;
4177   Register ylensave = tmp13;
4178 
4179   mr(xsave, x);
4180   mr(xlensave, xstart);
4181   mr(ylensave, ylen);
4182 
4183 
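       // The call below clobbers ylen (used as the running index) and x (used
       // as carry2); together with the advanced z they are restored from the
       // saved copies afterwards.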
4184   multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen,
4185                           carry, product_high, product, x, tmp);
4186 
4187   mr(z, zsave);
4188   mr(x, xsave);
4189   mr(xlen, xlensave);   // This is the decrement of the loop counter!
4190   mr(ylen, ylensave);
4191 
4192   addi(tmp3, xlen, 1);
4193   sldi(tmp, tmp3, LogBytesPerInt);
4194   stwx(carry, z, tmp);
4195   addic_(tmp3, tmp3, -1);
4196   blt(CCR0, L_done);
4197 
4198   srdi(carry, carry, 32);
4199   sldi(tmp, tmp3, LogBytesPerInt);
4200   stwx(carry, z, tmp);
4201   b(L_second_loop);
4202 
4203   // The following infrequently executed code is placed outside the loops.
4204   bind(L_last_x);
4205 
4206   lwz(x_xstart, 0, x);
4207   b(L_third_loop_prologue);
4208 
4209   bind(L_done);
4210 }   // multiply_to_len
4211 
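     // Expects CCR0 to have been set by a preceding compare. In ASSERT builds,
     // stops with 'msg' unless the expected condition (equal / not equal) holds.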
4212 void MacroAssembler::asm_assert(bool check_equal, const char *msg, int id) {
4213 #ifdef ASSERT
4214   Label ok;
4215   if (check_equal) {
4216     beq(CCR0, ok);
4217   } else {
4218     bne(CCR0, ok);
4219   }
4220   stop(msg, id);
4221   bind(ok);
4222 #endif
4223 }
4224 
4225 void MacroAssembler::asm_assert_mems_zero(bool check_equal, int size, int mem_offset,
4226                                           Register mem_base, const char* msg, int id) {
4227 #ifdef ASSERT
4228   switch (size) {
4229     case 4:
4230       lwz(R0, mem_offset, mem_base);
4231       cmpwi(CCR0, R0, 0);
4232       break;
4233     case 8:
4234       ld(R0, mem_offset, mem_base);
4235       cmpdi(CCR0, R0, 0);
4236       break;
4237     default:
4238       ShouldNotReachHere();
4239   }
4240   asm_assert(check_equal, msg, id);
4241 #endif // ASSERT
4242 }
4243 
4244 void MacroAssembler::verify_thread() {
4245   if (VerifyThread) {
4246     unimplemented("'VerifyThread' currently not implemented on PPC");
4247   }
4248 }
4249 
4250 // Reads: oop. Kills: R0 and possibly the volatile float registers.
4251 void MacroAssembler::verify_oop(Register oop, const char* msg) {
4252   if (!VerifyOops) {
4253     return;
4254   }
4255 
4256   address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
4257   const Register tmp = R11; // Will be preserved.
4258   const int nbytes_save = 11*8; // Volatile gprs except R0.
4259   save_volatile_gprs(R1_SP, -nbytes_save); // except R0
4260 
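       // If the oop happens to be in tmp (R11), copy it into the argument
       // register now, before tmp gets clobbered below.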
4261   if (oop == tmp) mr(R4_ARG2, oop);
4262   save_LR_CR(tmp); // save in old frame
4263   push_frame_reg_args(nbytes_save, tmp);
4264   // load FunctionDescriptor** / entry_address *
4265   load_const_optimized(tmp, fd, R0);
4266   // load FunctionDescriptor* / entry_address
4267   ld(tmp, 0, tmp);
4268   if (oop != tmp) mr_if_needed(R4_ARG2, oop);
4269   load_const_optimized(R3_ARG1, (address)msg, R0);
4270   // Call destination for its side effect.
4271   call_c(tmp);
4272 
4273   pop_frame();
4274   restore_LR_CR(tmp);
4275   restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
4276 }
4277 
4278 const char* stop_types[] = {
4279   "stop",
4280   "untested",
4281   "unimplemented",
4282   "shouldnotreachhere"
4283 };
4284 
4285 static void stop_on_request(int tp, const char* msg) {
4286   tty->print("PPC assembly code requires stop: (%s) %s\n", stop_types[tp%/*stop_end*/4], msg);
4287   guarantee(false, err_msg("PPC assembly code requires stop: %s", msg));
4288 }
4289 
4290 // Call a C-function that prints output.
4291 void MacroAssembler::stop(int type, const char* msg, int id) {
4292 #ifndef PRODUCT
4293   block_comment(err_msg("stop: %s %s {", stop_types[type%stop_end], msg));
4294 #else
4295   block_comment("stop {");
4296 #endif
4297 
4298   // setup arguments
4299   load_const_optimized(R3_ARG1, type);
4300   load_const_optimized(R4_ARG2, (void *)msg, /*tmp=*/R0);
4301   call_VM_leaf(CAST_FROM_FN_PTR(address, stop_on_request), R3_ARG1, R4_ARG2);
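       // Leave an illegal instruction and embed the id in the code stream so it
       // can be recovered at the trap site.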
4302   illtrap();
4303   emit_int32(id);
4304   block_comment("} stop;");
4305 }
4306 
4307 #ifndef PRODUCT
4308 // Write pattern 0x0101010101010101 in memory region [low-before, high+after].
4309 // Val, addr are temp registers.
4310 // If low == addr, addr is killed.
4311 // High is preserved.
4312 void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) {
4313   if (!ZapMemory) return;
4314 
4315   assert_different_registers(low, val);
4316 
4317   BLOCK_COMMENT("zap memory region {");
4318   load_const_optimized(val, 0x0101010101010101);
4319   int size = before + after;
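       // Small region relative to a single base register: emit up to four
       // individual stores instead of a loop.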
4320   if (low == high && size < 5 && size > 0) {
4321     int offset = -before*BytesPerWord;
4322     for (int i = 0; i < size; ++i) {
4323       std(val, offset, low);
4324       offset += (1*BytesPerWord);
4325     }
4326   } else {
4327     addi(addr, low, -before*BytesPerWord);
4328     assert_different_registers(high, val);
4329     if (after) addi(high, high, after * BytesPerWord);
4330     Label loop;
4331     bind(loop);
4332     std(val, 0, addr);
4333     addi(addr, addr, 8);
4334     cmpd(CCR6, addr, high);
4335     ble(CCR6, loop);
4336     if (after) addi(high, high, -after * BytesPerWord);  // Correct back to old value.
4337   }
4338   BLOCK_COMMENT("} zap memory region");
4339 }
4340 
4341 #endif // !PRODUCT
4342 
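     // Skip the code emitted between construction and destruction of this object
     // when the byte-sized flag at flag_addr is zero: the constructor emits the
     // compare and conditional branch, the destructor binds the skip target.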
4343 SkipIfEqualZero::SkipIfEqualZero(MacroAssembler* masm, Register temp, const bool* flag_addr) : _masm(masm), _label() {
4344   int simm16_offset = masm->load_const_optimized(temp, (address)flag_addr, R0, true);
4345   assert(sizeof(bool) == 1, "PowerPC ABI");
4346   masm->lbz(temp, simm16_offset, temp);
4347   masm->cmpwi(CCR0, temp, 0);
4348   masm->beq(CCR0, _label);
4349 }
4350 
4351 SkipIfEqualZero::~SkipIfEqualZero() {
4352   _masm->bind(_label);
4353 }