1 /*
   2  * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright 2012, 2015 SAP AG. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.inline.hpp"
  28 #include "compiler/disassembler.hpp"
  29 #include "gc/shared/cardTableModRefBS.hpp"
  30 #include "gc/shared/collectedHeap.inline.hpp"
  31 #include "interpreter/interpreter.hpp"
  32 #include "memory/resourceArea.hpp"
  33 #include "prims/methodHandles.hpp"
  34 #include "runtime/biasedLocking.hpp"
  35 #include "runtime/icache.hpp"
  36 #include "runtime/interfaceSupport.hpp"
  37 #include "runtime/objectMonitor.hpp"
  38 #include "runtime/os.hpp"
  39 #include "runtime/sharedRuntime.hpp"
  40 #include "runtime/stubRoutines.hpp"
  41 #include "utilities/macros.hpp"
  42 #if INCLUDE_ALL_GCS
  43 #include "gc/g1/g1CollectedHeap.inline.hpp"
  44 #include "gc/g1/g1SATBCardTableModRefBS.hpp"
  45 #include "gc/g1/heapRegion.hpp"
  46 #endif // INCLUDE_ALL_GCS
  47 
  48 #ifdef PRODUCT
  49 #define BLOCK_COMMENT(str) // nothing
  50 #else
  51 #define BLOCK_COMMENT(str) block_comment(str)
  52 #endif
  53 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  54 
  55 #ifdef ASSERT
  56 // On RISC, there's no benefit to verifying instruction boundaries.
  57 bool AbstractAssembler::pd_check_instruction_mark() { return false; }
  58 #endif
  59 
  60 void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) {
  61   assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range");
  62   if (Assembler::is_simm(si31, 16)) {
  63     ld(d, si31, a);
  64     if (emit_filler_nop) nop();
  65   } else {
  66     const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31);
  67     const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31);
  68     addis(d, a, hi);
  69     ld(d, lo, d);
  70   }
  71 }
  72 
  73 void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) {
  74   assert_different_registers(d, a);
  75   ld_largeoffset_unchecked(d, si31, a, emit_filler_nop);
  76 }
  77 
  78 void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base,
  79                                       size_t size_in_bytes, bool is_signed) {
  80   switch (size_in_bytes) {
  81   case  8:              ld(dst, offs, base);                         break;
  82   case  4:  is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break;
  83   case  2:  is_signed ? lha(dst, offs, base) : lhz(dst, offs, base); break;
  84   case  1:  lbz(dst, offs, base); if (is_signed) extsb(dst, dst);    break; // lba doesn't exist :(
  85   default:  ShouldNotReachHere();
  86   }
  87 }
  88 
  89 void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base,
  90                                        size_t size_in_bytes) {
  91   switch (size_in_bytes) {
  92   case  8:  std(dst, offs, base); break;
  93   case  4:  stw(dst, offs, base); break;
  94   case  2:  sth(dst, offs, base); break;
  95   case  1:  stb(dst, offs, base); break;
  96   default:  ShouldNotReachHere();
  97   }
  98 }
  99 
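// Align the code to the next offset that is congruent to 'rem' modulo 'modulus'
// by emitting nops, but only if at most 'max' bytes of padding are needed.
// E.g. align(32, 12, 8) pads until (offset() % 32) == 8, unless more than
// 12 bytes of nops would be required.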
 100 void MacroAssembler::align(int modulus, int max, int rem) {
 101   int padding = (rem + modulus - (offset() % modulus)) % modulus;
 102   if (padding > max) return;
 103   for (int c = (padding >> 2); c > 0; --c) { nop(); }
 104 }
 105 
 106 // Issue instructions that calculate given TOC from global TOC.
 107 void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16,
 108                                                        bool add_relocation, bool emit_dummy_addr) {
 109   int offset = -1;
 110   if (emit_dummy_addr) {
 111     offset = -128; // dummy address
 112   } else if (addr != (address)(intptr_t)-1) {
 113     offset = MacroAssembler::offset_to_global_toc(addr);
 114   }
 115 
 116   if (hi16) {
 117     addis(dst, R29, MacroAssembler::largeoffset_si16_si16_hi(offset));
 118   }
 119   if (lo16) {
 120     if (add_relocation) {
 121       // Relocate at the addi to avoid confusion with a load from the method's TOC.
 122       relocate(internal_word_Relocation::spec(addr));
 123     }
 124     addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset));
 125   }
 126 }
 127 
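// Patch the addis/addi pair emitted by calculate_address_from_global_toc() so
// that it computes 'addr' relative to the global TOC. 'a' points at the addi
// (the relocated instruction); the matching addis is searched backwards, but
// not beyond 'bound'. Returns the byte distance between 'addr' and the addis.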
 128 int MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) {
 129   const int offset = MacroAssembler::offset_to_global_toc(addr);
 130 
 131   const address inst2_addr = a;
 132   const int inst2 = *(int *)inst2_addr;
 133 
 134   // The relocation points to the second instruction, the addi,
 135   // and the addi reads and writes the same register dst.
 136   const int dst = inv_rt_field(inst2);
 137   assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");
 138 
 139   // Now, find the preceding addis which writes to dst.
 140   int inst1 = 0;
 141   address inst1_addr = inst2_addr - BytesPerInstWord;
 142   while (inst1_addr >= bound) {
 143     inst1 = *(int *) inst1_addr;
 144     if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
 145       // Stop, found the addis which writes dst.
 146       break;
 147     }
 148     inst1_addr -= BytesPerInstWord;
 149   }
 150 
 151   assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
 152   set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset));
 153   set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset));
 154   return (int)((intptr_t)addr - (intptr_t)inst1_addr);
 155 }
 156 
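// Inverse of the above: decode the addis/addi pair at 'a' (searching backwards
// for the addis, but not beyond 'bound') and return the absolute address the
// sequence computes relative to the global TOC; (address)-1 is returned for
// the special offset -1.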
 157 address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) {
 158   const address inst2_addr = a;
 159   const int inst2 = *(int *)inst2_addr;
 160 
 161   // The relocation points to the second instruction, the addi,
 162   // and the addi reads and writes the same register dst.
 163   const int dst = inv_rt_field(inst2);
 164   assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");
 165 
 166   // Now, find the preceding addis which writes to dst.
 167   int inst1 = 0;
 168   address inst1_addr = inst2_addr - BytesPerInstWord;
 169   while (inst1_addr >= bound) {
 170     inst1 = *(int *) inst1_addr;
 171     if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
 172       // stop, found the addis which writes dst
 173       break;
 174     }
 175     inst1_addr -= BytesPerInstWord;
 176   }
 177 
 178   assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
 179 
 180   int offset = (get_imm(inst1_addr, 0) << 16) + get_imm(inst2_addr, 0);
 181   // -1 is a special case
 182   if (offset == -1) {
 183     return (address)(intptr_t)-1;
 184   } else {
 185     return global_toc() + offset;
 186   }
 187 }
 188 
 189 #ifdef _LP64
 190 // Patch compressed oops or klass constants.
 191 // Assembler sequence is
 192 // 1) compressed oops:
 193 //    lis  rx = const.hi
 194 //    ori rx = rx | const.lo
 195 // 2) compressed klass:
 196 //    lis  rx = const.hi
 197 //    clrldi rx = rx & 0xFFFFffff // clearMS32b, optional
 198 //    ori rx = rx | const.lo
 199 // The clrldi, if present, is skipped over when patching.
 200 int MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) {
 201   assert(UseCompressedOops, "Should only patch compressed oops");
 202 
 203   const address inst2_addr = a;
 204   const int inst2 = *(int *)inst2_addr;
 205 
 206   // The relocation points to the second instruction, the ori,
 207   // and the ori reads and writes the same register dst.
 208   const int dst = inv_rta_field(inst2);
 209   assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
 210   // Now, find the preceding addis which writes to dst.
 211   int inst1 = 0;
 212   address inst1_addr = inst2_addr - BytesPerInstWord;
 213   bool inst1_found = false;
 214   while (inst1_addr >= bound) {
 215     inst1 = *(int *)inst1_addr;
 216     if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
 217     inst1_addr -= BytesPerInstWord;
 218   }
 219   assert(inst1_found, "inst is not lis");
 220 
 221   int xc = (data >> 16) & 0xffff;
 222   int xd = (data >>  0) & 0xffff;
 223 
 224   set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo
 225   set_imm((int *)inst2_addr,        (xd)); // unsigned int
 226   return (int)((intptr_t)inst2_addr - (intptr_t)inst1_addr);
 227 }
 228 
 229 // Get compressed oop or klass constant.
 230 narrowOop MacroAssembler::get_narrow_oop(address a, address bound) {
 231   assert(UseCompressedOops, "Should only patch compressed oops");
 232 
 233   const address inst2_addr = a;
 234   const int inst2 = *(int *)inst2_addr;
 235 
 236   // The relocation points to the second instruction, the ori,
 237   // and the ori reads and writes the same register dst.
 238   const int dst = inv_rta_field(inst2);
 239   assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
 240   // Now, find the preceding lis which writes to dst.
 241   int inst1 = 0;
 242   address inst1_addr = inst2_addr - BytesPerInstWord;
 243   bool inst1_found = false;
 244 
 245   while (inst1_addr >= bound) {
 246     inst1 = *(int *) inst1_addr;
 247     if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break;}
 248     inst1_addr -= BytesPerInstWord;
 249   }
 250   assert(inst1_found, "inst is not lis");
 251 
 252   uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff));
 253   uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16);
 254 
 255   return (int) (xl | xh);
 256 }
 257 #endif // _LP64
 258 
 259 void MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a, Register toc) {
 260   int toc_offset = 0;
 261   // Use RelocationHolder::none for the constant pool entry, otherwise
 262   // we will end up with a failing NativeCall::verify(x) where x is
 263   // the address of the constant pool entry.
 264   // FIXME: We should insert relocation information for oops at the constant
 265   // pool entries instead of inserting it at the loads; patching of a constant
 266   // pool entry should be less expensive.
 267   address oop_address = address_constant((address)a.value(), RelocationHolder::none);
 268   // Relocate at the pc of the load.
 269   relocate(a.rspec());
 270   toc_offset = (int)(oop_address - code()->consts()->start());
 271   ld_largeoffset_unchecked(dst, toc_offset, toc, true);
 272 }
 273 
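// Returns true if the instruction at 'a' can belong to a
// load_const_from_method_toc sequence, i.e. it is either the ld itself or an
// addis with a non-zero RA (the TOC register).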
 274 bool MacroAssembler::is_load_const_from_method_toc_at(address a) {
 275   const address inst1_addr = a;
 276   const int inst1 = *(int *)inst1_addr;
 277 
 278   // The relocation points to the ld or the addis.
 279   return (is_ld(inst1)) ||
 280          (is_addis(inst1) && inv_ra_field(inst1) != 0);
 281 }
 282 
 283 int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) {
 284   assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc");
 285 
 286   const address inst1_addr = a;
 287   const int inst1 = *(int *)inst1_addr;
 288 
 289   if (is_ld(inst1)) {
 290     return inv_d1_field(inst1);
 291   } else if (is_addis(inst1)) {
 292     const int dst = inv_rt_field(inst1);
 293 
 294     // Now, find the succeeding ld which reads and writes to dst.
 295     address inst2_addr = inst1_addr + BytesPerInstWord;
 296     int inst2 = 0;
 297     while (true) {
 298       inst2 = *(int *) inst2_addr;
 299       if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) {
 300         // Stop, found the ld which reads and writes dst.
 301         break;
 302       }
 303       inst2_addr += BytesPerInstWord;
 304     }
 305     return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2);
 306   }
 307   ShouldNotReachHere();
 308   return 0;
 309 }
 310 
 311 // Get the constant from a `load_const' sequence.
 312 long MacroAssembler::get_const(address a) {
 313   assert(is_load_const_at(a), "not a load of a constant");
 314   const int *p = (const int*) a;
 315   unsigned long x = (((unsigned long) (get_imm(a,0) & 0xffff)) << 48);
 316   if (is_ori(*(p+1))) {
 317     x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 32);
 318     x |= (((unsigned long) (get_imm(a,3) & 0xffff)) << 16);
 319     x |= (((unsigned long) (get_imm(a,4) & 0xffff)));
 320   } else if (is_lis(*(p+1))) {
 321     x |= (((unsigned long) (get_imm(a,2) & 0xffff)) << 32);
 322     x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 16);
 323     x |= (((unsigned long) (get_imm(a,3) & 0xffff)));
 324   } else {
 325     ShouldNotReachHere();
 326     return (long) 0;
 327   }
 328   return (long) x;
 329 }
 330 
 331 // Patch the 64 bit constant of a `load_const' sequence. This is a
 332 // low-level procedure: it neither flushes the instruction cache nor
 333 // is mt-safe.
 334 void MacroAssembler::patch_const(address a, long x) {
 335   assert(is_load_const_at(a), "not a load of a constant");
 336   int *p = (int*) a;
 337   if (is_ori(*(p+1))) {
 338     set_imm(0 + p, (x >> 48) & 0xffff);
 339     set_imm(1 + p, (x >> 32) & 0xffff);
 340     set_imm(3 + p, (x >> 16) & 0xffff);
 341     set_imm(4 + p, x & 0xffff);
 342   } else if (is_lis(*(p+1))) {
 343     set_imm(0 + p, (x >> 48) & 0xffff);
 344     set_imm(2 + p, (x >> 32) & 0xffff);
 345     set_imm(1 + p, (x >> 16) & 0xffff);
 346     set_imm(3 + p, x & 0xffff);
 347   } else {
 348     ShouldNotReachHere();
 349   }
 350 }
 351 
 352 AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) {
 353   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
 354   int index = oop_recorder()->allocate_metadata_index(obj);
 355   RelocationHolder rspec = metadata_Relocation::spec(index);
 356   return AddressLiteral((address)obj, rspec);
 357 }
 358 
 359 AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) {
 360   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
 361   int index = oop_recorder()->find_index(obj);
 362   RelocationHolder rspec = metadata_Relocation::spec(index);
 363   return AddressLiteral((address)obj, rspec);
 364 }
 365 
 366 AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) {
 367   assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
 368   int oop_index = oop_recorder()->allocate_oop_index(obj);
 369   return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
 370 }
 371 
 372 AddressLiteral MacroAssembler::constant_oop_address(jobject obj) {
 373   assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
 374   int oop_index = oop_recorder()->find_index(obj);
 375   return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
 376 }
 377 
 378 RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
 379                                                       Register tmp, int offset) {
 380   intptr_t value = *delayed_value_addr;
 381   if (value != 0) {
 382     return RegisterOrConstant(value + offset);
 383   }
 384 
 385   // Load indirectly to solve generation ordering problem.
 386   // static address, no relocation
 387   int simm16_offset = load_const_optimized(tmp, delayed_value_addr, noreg, true);
 388   ld(tmp, simm16_offset, tmp); // must be aligned ((xa & 3) == 0)
 389 
 390   if (offset != 0) {
 391     addi(tmp, tmp, offset);
 392   }
 393 
 394   return RegisterOrConstant(tmp);
 395 }
 396 
 397 #ifndef PRODUCT
 398 void MacroAssembler::pd_print_patched_instruction(address branch) {
 399   Unimplemented(); // TODO: PPC port
 400 }
 401 #endif // ndef PRODUCT
 402 
 403 // Conditional far branch for destinations encodable in 24+2 bits.
 404 void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) {
 405 
 406   // If requested by flag optimize, relocate the bc_far as a
 407   // runtime_call and prepare for optimizing it when the code gets
 408   // relocated.
 409   if (optimize == bc_far_optimize_on_relocate) {
 410     relocate(relocInfo::runtime_call_type);
 411   }
 412 
 413   // variant 2:
 414   //
 415   //    b!cxx SKIP
 416   //    bxx   DEST
 417   //  SKIP:
 418   //
 419 
 420   const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
 421                                                 opposite_bcond(inv_boint_bcond(boint)));
 422 
 423   // We emit two branches.
 424   // First, a conditional branch which jumps around the far branch.
 425   const address not_taken_pc = pc() + 2 * BytesPerInstWord;
 426   const address bc_pc        = pc();
 427   bc(opposite_boint, biint, not_taken_pc);
 428 
 429   const int bc_instr = *(int*)bc_pc;
 430   assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition");
 431   assert(opposite_boint == inv_bo_field(bc_instr), "postcondition");
 432   assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))),
 433                                      opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))),
 434          "postcondition");
 435   assert(biint == inv_bi_field(bc_instr), "postcondition");
 436 
 437   // Second, an unconditional far branch which jumps to dest.
 438   // Note: target(dest) remembers the current pc (see CodeSection::target)
 439   //       and returns the current pc if the label is not bound yet; when
 440   //       the label gets bound, the unconditional far branch will be patched.
 441   const address target_pc = target(dest);
 442   const address b_pc  = pc();
 443   b(target_pc);
 444 
 445   assert(not_taken_pc == pc(),                     "postcondition");
 446   assert(dest.is_bound() || target_pc == b_pc, "postcondition");
 447 }
 448 
 449 bool MacroAssembler::is_bc_far_at(address instruction_addr) {
 450   return is_bc_far_variant1_at(instruction_addr) ||
 451          is_bc_far_variant2_at(instruction_addr) ||
 452          is_bc_far_variant3_at(instruction_addr);
 453 }
 454 
 455 address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) {
 456   if (is_bc_far_variant1_at(instruction_addr)) {
 457     const address instruction_1_addr = instruction_addr;
 458     const int instruction_1 = *(int*)instruction_1_addr;
 459     return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr);
 460   } else if (is_bc_far_variant2_at(instruction_addr)) {
 461     const address instruction_2_addr = instruction_addr + 4;
 462     return bxx_destination(instruction_2_addr);
 463   } else if (is_bc_far_variant3_at(instruction_addr)) {
 464     return instruction_addr + 8;
 465   }
 466   // variant 4 ???
 467   ShouldNotReachHere();
 468   return NULL;
 469 }
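
// Re-target the bc_far at 'instruction_addr' to 'dest'. If the branch has
// already been optimized to nops (variant 3) nothing is done; otherwise the
// most compact variant that reaches 'dest' is re-emitted in place and the
// icache is flushed.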
 470 void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) {
 471 
 472   if (is_bc_far_variant3_at(instruction_addr)) {
 473     // variant 3, far cond branch to the next instruction, already patched to nops:
 474     //
 475     //    nop
 476     //    endgroup
 477     //  SKIP/DEST:
 478     //
 479     return;
 480   }
 481 
 482   // first, extract boint and biint from the current branch
 483   int boint = 0;
 484   int biint = 0;
 485 
 486   ResourceMark rm;
 487   const int code_size = 2 * BytesPerInstWord;
 488   CodeBuffer buf(instruction_addr, code_size);
 489   MacroAssembler masm(&buf);
 490   if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) {
 491     // Far branch to next instruction: Optimize it by patching nops (produce variant 3).
 492     masm.nop();
 493     masm.endgroup();
 494   } else {
 495     if (is_bc_far_variant1_at(instruction_addr)) {
 496       // variant 1, the 1st instruction contains the destination address:
 497       //
 498       //    bcxx  DEST
 499       //    endgroup
 500       //
 501       const int instruction_1 = *(int*)(instruction_addr);
 502       boint = inv_bo_field(instruction_1);
 503       biint = inv_bi_field(instruction_1);
 504     } else if (is_bc_far_variant2_at(instruction_addr)) {
 505       // variant 2, the 2nd instruction contains the destination address:
 506       //
 507       //    b!cxx SKIP
 508       //    bxx   DEST
 509       //  SKIP:
 510       //
 511       const int instruction_1 = *(int*)(instruction_addr);
 512       boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))),
 513           opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1))));
 514       biint = inv_bi_field(instruction_1);
 515     } else {
 516       // variant 4???
 517       ShouldNotReachHere();
 518     }
 519 
 520     // second, set the new branch destination and optimize the code
 521     if (dest != instruction_addr + 4 && // the bc_far is still unbound!
 522         masm.is_within_range_of_bcxx(dest, instruction_addr)) {
 523       // variant 1:
 524       //
 525       //    bcxx  DEST
 526       //    endgroup
 527       //
 528       masm.bc(boint, biint, dest);
 529       masm.endgroup();
 530     } else {
 531       // variant 2:
 532       //
 533       //    b!cxx SKIP
 534       //    bxx   DEST
 535       //  SKIP:
 536       //
 537       const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
 538                                                     opposite_bcond(inv_boint_bcond(boint)));
 539       const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord;
 540       masm.bc(opposite_boint, biint, not_taken_pc);
 541       masm.b(dest);
 542     }
 543   }
 544   ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
 545 }
 546 
 547 // Emit a patchable (not mt-safe) 64-bit absolute call/jump.
 548 void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) {
 549   // get current pc
 550   uint64_t start_pc = (uint64_t) pc();
 551 
 552   const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last
 553   const address pc_of_b  = (address) (start_pc + (0*BytesPerInstWord)); // b is first
 554 
 555   // relocate here
 556   if (rt != relocInfo::none) {
 557     relocate(rt);
 558   }
 559 
 560   if ( ReoptimizeCallSequences &&
 561        (( link && is_within_range_of_b(dest, pc_of_bl)) ||
 562         (!link && is_within_range_of_b(dest, pc_of_b)))) {
 563     // variant 2:
 564     // Emit an optimized, pc-relative call/jump.
 565 
 566     if (link) {
 567       // some padding
 568       nop();
 569       nop();
 570       nop();
 571       nop();
 572       nop();
 573       nop();
 574 
 575       // do the call
 576       assert(pc() == pc_of_bl, "just checking");
 577       bl(dest, relocInfo::none);
 578     } else {
 579       // do the jump
 580       assert(pc() == pc_of_b, "just checking");
 581       b(dest, relocInfo::none);
 582 
 583       // some padding
 584       nop();
 585       nop();
 586       nop();
 587       nop();
 588       nop();
 589       nop();
 590     }
 591 
 592     // Assert that we can identify the emitted call/jump.
 593     assert(is_bxx64_patchable_variant2_at((address)start_pc, link),
 594            "can't identify emitted call");
 595   } else {
 596     // variant 1:
 597 #if defined(ABI_ELFv2)
 598     nop();
 599     calculate_address_from_global_toc(R12, dest, true, true, false);
 600     mtctr(R12);
 601     nop();
 602     nop();
 603 #else
 604     mr(R0, R11);  // spill R11 -> R0.
 605 
 606     // Load the destination address into CTR,
 607     // calculate destination relative to global toc.
 608     calculate_address_from_global_toc(R11, dest, true, true, false);
 609 
 610     mtctr(R11);
 611     mr(R11, R0);  // spill R11 <- R0.
 612     nop();
 613 #endif
 614 
 615     // do the call/jump
 616     if (link) {
 617       bctrl();
 618     } else {
 619       bctr();
 620     }
 621     // Assert that we can identify the emitted call/jump.
 622     assert(is_bxx64_patchable_variant1b_at((address)start_pc, link),
 623            "can't identify emitted call");
 624   }
 625 
 626   // Assert that we can identify the emitted call/jump.
 627   assert(is_bxx64_patchable_at((address)start_pc, link),
 628          "can't identify emitted call");
 629   assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest,
 630          "wrong encoding of dest address");
 631 }
 632 
 633 // Identify a bxx64_patchable instruction.
 634 bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) {
 635   return is_bxx64_patchable_variant1b_at(instruction_addr, link)
 636     //|| is_bxx64_patchable_variant1_at(instruction_addr, link)
 637       || is_bxx64_patchable_variant2_at(instruction_addr, link);
 638 }
 639 
 640 // Does the bxx64_patchable instruction use a pc-relative encoding of
 641 // the call destination?
 642 bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) {
 643   // variant 2 is pc-relative
 644   return is_bxx64_patchable_variant2_at(instruction_addr, link);
 645 }
 646 
 647 // Identify variant 1.
 648 bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) {
 649   unsigned int* instr = (unsigned int*) instruction_addr;
 650   return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
 651     && is_mtctr(instr[5]) // mtctr
 652     && is_load_const_at(instruction_addr);
 653 }
 654 
 655 // Identify variant 1b: load destination relative to global toc.
 656 bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) {
 657   unsigned int* instr = (unsigned int*) instruction_addr;
 658   return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
 659     && is_mtctr(instr[3]) // mtctr
 660     && is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr);
 661 }
 662 
 663 // Identify variant 2.
 664 bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) {
 665   unsigned int* instr = (unsigned int*) instruction_addr;
 666   if (link) {
 667     return is_bl (instr[6])  // bl dest is last
 668       && is_nop(instr[0])  // nop
 669       && is_nop(instr[1])  // nop
 670       && is_nop(instr[2])  // nop
 671       && is_nop(instr[3])  // nop
 672       && is_nop(instr[4])  // nop
 673       && is_nop(instr[5]); // nop
 674   } else {
 675     return is_b  (instr[0])  // b  dest is first
 676       && is_nop(instr[1])  // nop
 677       && is_nop(instr[2])  // nop
 678       && is_nop(instr[3])  // nop
 679       && is_nop(instr[4])  // nop
 680       && is_nop(instr[5])  // nop
 681       && is_nop(instr[6]); // nop
 682   }
 683 }
 684 
 685 // Set dest address of a bxx64_patchable instruction.
 686 void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) {
 687   ResourceMark rm;
 688   int code_size = MacroAssembler::bxx64_patchable_size;
 689   CodeBuffer buf(instruction_addr, code_size);
 690   MacroAssembler masm(&buf);
 691   masm.bxx64_patchable(dest, relocInfo::none, link);
 692   ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
 693 }
 694 
 695 // Get dest address of a bxx64_patchable instruction.
 696 address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) {
 697   if (is_bxx64_patchable_variant1_at(instruction_addr, link)) {
 698     return (address) (unsigned long) get_const(instruction_addr);
 699   } else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) {
 700     unsigned int* instr = (unsigned int*) instruction_addr;
 701     if (link) {
 702       const int instr_idx = 6; // bl is last
 703       int branchoffset = branch_destination(instr[instr_idx], 0);
 704       return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
 705     } else {
 706       const int instr_idx = 0; // b is first
 707       int branchoffset = branch_destination(instr[instr_idx], 0);
 708       return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
 709     }
 710   // Load dest relative to global toc.
 711   } else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) {
 712     return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord,
 713                                                                instruction_addr);
 714   } else {
 715     ShouldNotReachHere();
 716     return NULL;
 717   }
 718 }
 719 
 720 // Uses ordering which corresponds to ABI:
 721 //    _savegpr0_14:  std  r14,-144(r1)
 722 //    _savegpr0_15:  std  r15,-136(r1)
 723 //    _savegpr0_16:  std  r16,-128(r1)
 724 void MacroAssembler::save_nonvolatile_gprs(Register dst, int offset) {
 725   std(R14, offset, dst);   offset += 8;
 726   std(R15, offset, dst);   offset += 8;
 727   std(R16, offset, dst);   offset += 8;
 728   std(R17, offset, dst);   offset += 8;
 729   std(R18, offset, dst);   offset += 8;
 730   std(R19, offset, dst);   offset += 8;
 731   std(R20, offset, dst);   offset += 8;
 732   std(R21, offset, dst);   offset += 8;
 733   std(R22, offset, dst);   offset += 8;
 734   std(R23, offset, dst);   offset += 8;
 735   std(R24, offset, dst);   offset += 8;
 736   std(R25, offset, dst);   offset += 8;
 737   std(R26, offset, dst);   offset += 8;
 738   std(R27, offset, dst);   offset += 8;
 739   std(R28, offset, dst);   offset += 8;
 740   std(R29, offset, dst);   offset += 8;
 741   std(R30, offset, dst);   offset += 8;
 742   std(R31, offset, dst);   offset += 8;
 743 
 744   stfd(F14, offset, dst);   offset += 8;
 745   stfd(F15, offset, dst);   offset += 8;
 746   stfd(F16, offset, dst);   offset += 8;
 747   stfd(F17, offset, dst);   offset += 8;
 748   stfd(F18, offset, dst);   offset += 8;
 749   stfd(F19, offset, dst);   offset += 8;
 750   stfd(F20, offset, dst);   offset += 8;
 751   stfd(F21, offset, dst);   offset += 8;
 752   stfd(F22, offset, dst);   offset += 8;
 753   stfd(F23, offset, dst);   offset += 8;
 754   stfd(F24, offset, dst);   offset += 8;
 755   stfd(F25, offset, dst);   offset += 8;
 756   stfd(F26, offset, dst);   offset += 8;
 757   stfd(F27, offset, dst);   offset += 8;
 758   stfd(F28, offset, dst);   offset += 8;
 759   stfd(F29, offset, dst);   offset += 8;
 760   stfd(F30, offset, dst);   offset += 8;
 761   stfd(F31, offset, dst);
 762 }
 763 
 764 // Uses ordering which corresponds to ABI:
 765 //    _restgpr0_14:  ld   r14,-144(r1)
 766 //    _restgpr0_15:  ld   r15,-136(r1)
 767 //    _restgpr0_16:  ld   r16,-128(r1)
 768 void MacroAssembler::restore_nonvolatile_gprs(Register src, int offset) {
 769   ld(R14, offset, src);   offset += 8;
 770   ld(R15, offset, src);   offset += 8;
 771   ld(R16, offset, src);   offset += 8;
 772   ld(R17, offset, src);   offset += 8;
 773   ld(R18, offset, src);   offset += 8;
 774   ld(R19, offset, src);   offset += 8;
 775   ld(R20, offset, src);   offset += 8;
 776   ld(R21, offset, src);   offset += 8;
 777   ld(R22, offset, src);   offset += 8;
 778   ld(R23, offset, src);   offset += 8;
 779   ld(R24, offset, src);   offset += 8;
 780   ld(R25, offset, src);   offset += 8;
 781   ld(R26, offset, src);   offset += 8;
 782   ld(R27, offset, src);   offset += 8;
 783   ld(R28, offset, src);   offset += 8;
 784   ld(R29, offset, src);   offset += 8;
 785   ld(R30, offset, src);   offset += 8;
 786   ld(R31, offset, src);   offset += 8;
 787 
 788   // FP registers
 789   lfd(F14, offset, src);   offset += 8;
 790   lfd(F15, offset, src);   offset += 8;
 791   lfd(F16, offset, src);   offset += 8;
 792   lfd(F17, offset, src);   offset += 8;
 793   lfd(F18, offset, src);   offset += 8;
 794   lfd(F19, offset, src);   offset += 8;
 795   lfd(F20, offset, src);   offset += 8;
 796   lfd(F21, offset, src);   offset += 8;
 797   lfd(F22, offset, src);   offset += 8;
 798   lfd(F23, offset, src);   offset += 8;
 799   lfd(F24, offset, src);   offset += 8;
 800   lfd(F25, offset, src);   offset += 8;
 801   lfd(F26, offset, src);   offset += 8;
 802   lfd(F27, offset, src);   offset += 8;
 803   lfd(F28, offset, src);   offset += 8;
 804   lfd(F29, offset, src);   offset += 8;
 805   lfd(F30, offset, src);   offset += 8;
 806   lfd(F31, offset, src);
 807 }
 808 
 809 // For verify_oops.
 810 void MacroAssembler::save_volatile_gprs(Register dst, int offset) {
 811   std(R2,  offset, dst);   offset += 8;
 812   std(R3,  offset, dst);   offset += 8;
 813   std(R4,  offset, dst);   offset += 8;
 814   std(R5,  offset, dst);   offset += 8;
 815   std(R6,  offset, dst);   offset += 8;
 816   std(R7,  offset, dst);   offset += 8;
 817   std(R8,  offset, dst);   offset += 8;
 818   std(R9,  offset, dst);   offset += 8;
 819   std(R10, offset, dst);   offset += 8;
 820   std(R11, offset, dst);   offset += 8;
 821   std(R12, offset, dst);
 822 }
 823 
 824 // For verify_oops.
 825 void MacroAssembler::restore_volatile_gprs(Register src, int offset) {
 826   ld(R2,  offset, src);   offset += 8;
 827   ld(R3,  offset, src);   offset += 8;
 828   ld(R4,  offset, src);   offset += 8;
 829   ld(R5,  offset, src);   offset += 8;
 830   ld(R6,  offset, src);   offset += 8;
 831   ld(R7,  offset, src);   offset += 8;
 832   ld(R8,  offset, src);   offset += 8;
 833   ld(R9,  offset, src);   offset += 8;
 834   ld(R10, offset, src);   offset += 8;
 835   ld(R11, offset, src);   offset += 8;
 836   ld(R12, offset, src);
 837 }
 838 
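// Save the condition and link registers into the ABI save slots of the frame
// R1_SP currently points to (typically done before pushing a new frame).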
 839 void MacroAssembler::save_LR_CR(Register tmp) {
 840   mfcr(tmp);
 841   std(tmp, _abi(cr), R1_SP);
 842   mflr(tmp);
 843   std(tmp, _abi(lr), R1_SP);
 844   // Tmp must contain lr on exit! (see return_addr and prolog in ppc64.ad)
 845 }
 846 
 847 void MacroAssembler::restore_LR_CR(Register tmp) {
 848   assert(tmp != R1_SP, "must be distinct");
 849   ld(tmp, _abi(lr), R1_SP);
 850   mtlr(tmp);
 851   ld(tmp, _abi(cr), R1_SP);
 852   mtcr(tmp);
 853 }
 854 
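// Materialize the current PC: branch-and-link to the next instruction and read
// it back from LR. Returns the address that ends up in 'result'; LR is
// clobbered.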
 855 address MacroAssembler::get_PC_trash_LR(Register result) {
 856   Label L;
 857   bl(L);
 858   bind(L);
 859   address lr_pc = pc();
 860   mflr(result);
 861   return lr_pc;
 862 }
 863 
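// Resize the current frame by 'offset' bytes (register form). The back link is
// reloaded and re-stored with a single stdux, so the stack stays walkable at
// every point.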
 864 void MacroAssembler::resize_frame(Register offset, Register tmp) {
 865 #ifdef ASSERT
 866   assert_different_registers(offset, tmp, R1_SP);
 867   andi_(tmp, offset, frame::alignment_in_bytes-1);
 868   asm_assert_eq("resize_frame: unaligned", 0x204);
 869 #endif
 870 
 871   // tmp <- *(SP)
 872   ld(tmp, _abi(callers_sp), R1_SP);
 873   // addr <- SP + offset;
 874   // *(addr) <- tmp;
 875   // SP <- addr
 876   stdux(tmp, R1_SP, offset);
 877 }
 878 
 879 void MacroAssembler::resize_frame(int offset, Register tmp) {
 880   assert(is_simm(offset, 16), "too big an offset");
 881   assert_different_registers(tmp, R1_SP);
 882   assert((offset & (frame::alignment_in_bytes-1))==0, "resize_frame: unaligned");
 883   // tmp <- *(SP)
 884   ld(tmp, _abi(callers_sp), R1_SP);
 885   // addr <- SP + offset;
 886   // *(addr) <- tmp;
 887   // SP <- addr
 888   stdu(tmp, offset, R1_SP);
 889 }
 890 
 891 void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) {
 892   // (addr == tmp1) || (addr == tmp2) is allowed here!
 893   assert(tmp1 != tmp2, "must be distinct");
 894 
 895   // compute offset w.r.t. current stack pointer
 896   // tmp_1 <- addr - SP (!)
 897   subf(tmp1, R1_SP, addr);
 898 
 899   // atomically update SP keeping back link.
 900   resize_frame(tmp1/* offset */, tmp2/* tmp */);
 901 }
 902 
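// Push a frame of size 'bytes' (register form, must be aligned). The old SP is
// stored as back link at the new top of stack.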
 903 void MacroAssembler::push_frame(Register bytes, Register tmp) {
 904 #ifdef ASSERT
 905   assert(bytes != R0, "r0 not allowed here");
 906   andi_(R0, bytes, frame::alignment_in_bytes-1);
 907   asm_assert_eq("push_frame(Reg, Reg): unaligned", 0x203);
 908 #endif
 909   neg(tmp, bytes);
 910   stdux(R1_SP, R1_SP, tmp);
 911 }
 912 
 913 // Push a frame of size `bytes'.
 914 void MacroAssembler::push_frame(unsigned int bytes, Register tmp) {
 915   long offset = align_addr(bytes, frame::alignment_in_bytes);
 916   if (is_simm(-offset, 16)) {
 917     stdu(R1_SP, -offset, R1_SP);
 918   } else {
 919     load_const(tmp, -offset);
 920     stdux(R1_SP, R1_SP, tmp);
 921   }
 922 }
 923 
 924 // Push a frame of size `bytes' plus abi_reg_args on top.
 925 void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) {
 926   push_frame(bytes + frame::abi_reg_args_size, tmp);
 927 }
 928 
 929 // Set up a new C frame with a spill area for non-volatile GPRs and
 930 // additional space for local variables.
 931 void MacroAssembler::push_frame_reg_args_nonvolatiles(unsigned int bytes,
 932                                                       Register tmp) {
 933   push_frame(bytes + frame::abi_reg_args_size + frame::spill_nonvolatiles_size, tmp);
 934 }
 935 
 936 // Pop current C frame.
 937 void MacroAssembler::pop_frame() {
 938   ld(R1_SP, _abi(callers_sp), R1_SP);
 939 }
 940 
 941 #if defined(ABI_ELFv2)
 942 address MacroAssembler::branch_to(Register r_function_entry, bool and_link) {
 943   // TODO(asmundak): make sure the caller uses R12 as function descriptor
 944   // most of the time.
 945   if (R12 != r_function_entry) {
 946     mr(R12, r_function_entry);
 947   }
 948   mtctr(R12);
 949   // Do a call or a branch.
 950   if (and_link) {
 951     bctrl();
 952   } else {
 953     bctr();
 954   }
 955   _last_calls_return_pc = pc();
 956 
 957   return _last_calls_return_pc;
 958 }
 959 
 960 // Call a C function via a function descriptor and use full C
 961 // calling conventions. Updates and returns _last_calls_return_pc.
 962 address MacroAssembler::call_c(Register r_function_entry) {
 963   return branch_to(r_function_entry, /*and_link=*/true);
 964 }
 965 
 966 // For tail calls: only branch, don't link, so callee returns to caller of this function.
 967 address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) {
 968   return branch_to(r_function_entry, /*and_link=*/false);
 969 }
 970 
 971 address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) {
 972   load_const(R12, function_entry, R0);
 973   return branch_to(R12,  /*and_link=*/true);
 974 }
 975 
 976 #else
 977 // Generic version of a call to C function via a function descriptor
 978 // with variable support for C calling conventions (TOC, ENV, etc.).
 979 // Updates and returns _last_calls_return_pc.
 980 address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call,
 981                                   bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) {
 982   // we emit standard ptrgl glue code here
 983   assert((function_descriptor != R0), "function_descriptor cannot be R0");
 984 
 985   // retrieve necessary entries from the function descriptor
 986   ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor);
 987   mtctr(R0);
 988 
 989   if (load_toc_of_callee) {
 990     ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor);
 991   }
 992   if (load_env_of_callee) {
 993     ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor);
 994   } else if (load_toc_of_callee) {
 995     li(R11, 0);
 996   }
 997 
 998   // do a call or a branch
 999   if (and_link) {
1000     bctrl();
1001   } else {
1002     bctr();
1003   }
1004   _last_calls_return_pc = pc();
1005 
1006   return _last_calls_return_pc;
1007 }
1008 
1009 // Call a C function via a function descriptor and use full C calling
1010 // conventions.
1011 // We don't use the TOC in generated code, so there is no need to save
1012 // and restore its value.
1013 address MacroAssembler::call_c(Register fd) {
1014   return branch_to(fd, /*and_link=*/true,
1015                        /*save toc=*/false,
1016                        /*restore toc=*/false,
1017                        /*load toc=*/true,
1018                        /*load env=*/true);
1019 }
1020 
1021 address MacroAssembler::call_c_and_return_to_caller(Register fd) {
1022   return branch_to(fd, /*and_link=*/false,
1023                        /*save toc=*/false,
1024                        /*restore toc=*/false,
1025                        /*load toc=*/true,
1026                        /*load env=*/true);
1027 }
1028 
1029 address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) {
1030   if (rt != relocInfo::none) {
1031     // this call needs to be relocatable
1032     if (!ReoptimizeCallSequences
1033         || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
1034         || fd == NULL   // support code-size estimation
1035         || !fd->is_friend_function()
1036         || fd->entry() == NULL) {
1037       // it's not a friend function as defined by class FunctionDescriptor,
1038       // so do a full call-c here.
1039       load_const(R11, (address)fd, R0);
1040 
1041       bool has_env = (fd != NULL && fd->env() != NULL);
1042       return branch_to(R11, /*and_link=*/true,
1043                             /*save toc=*/false,
1044                             /*restore toc=*/false,
1045                             /*load toc=*/true,
1046                             /*load env=*/has_env);
1047     } else {
1048       // It's a friend function. Load the entry point and don't care about
1049       // toc and env. Use an optimizable call instruction, but ensure the
1050       // same code-size as in the case of a non-friend function.
1051       nop();
1052       nop();
1053       nop();
1054       bl64_patchable(fd->entry(), rt);
1055       _last_calls_return_pc = pc();
1056       return _last_calls_return_pc;
1057     }
1058   } else {
1059     // This call does not need to be relocatable, do more aggressive
1060     // optimizations.
1061     if (!ReoptimizeCallSequences
1062       || !fd->is_friend_function()) {
1063       // It's not a friend function as defined by class FunctionDescriptor,
1064       // so do a full call-c here.
1065       load_const(R11, (address)fd, R0);
1066       return branch_to(R11, /*and_link=*/true,
1067                             /*save toc=*/false,
1068                             /*restore toc=*/false,
1069                             /*load toc=*/true,
1070                             /*load env=*/true);
1071     } else {
1072       // it's a friend function, load the entry point and don't care about
1073       // toc and env.
1074       address dest = fd->entry();
1075       if (is_within_range_of_b(dest, pc())) {
1076         bl(dest);
1077       } else {
1078         bl64_patchable(dest, rt);
1079       }
1080       _last_calls_return_pc = pc();
1081       return _last_calls_return_pc;
1082     }
1083   }
1084 }
1085 
1086 // Call a C function.  All constants needed reside in TOC.
1087 //
1088 // Read the address to call from the TOC.
1089 // Read env from TOC, if fd specifies an env.
1090 // Read new TOC from TOC.
1091 address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd,
1092                                          relocInfo::relocType rt, Register toc) {
1093   if (!ReoptimizeCallSequences
1094     || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
1095     || !fd->is_friend_function()) {
1096     // It's not a friend function as defined by class FunctionDescriptor,
1097     // so do a full call-c here.
1098     assert(fd->entry() != NULL, "function must be linked");
1099 
1100     AddressLiteral fd_entry(fd->entry());
1101     load_const_from_method_toc(R11, fd_entry, toc);
1102     mtctr(R11);
1103     if (fd->env() == NULL) {
1104       li(R11, 0);
1105       nop();
1106     } else {
1107       AddressLiteral fd_env(fd->env());
1108       load_const_from_method_toc(R11, fd_env, toc);
1109     }
1110     AddressLiteral fd_toc(fd->toc());
1111     load_toc_from_toc(R2_TOC, fd_toc, toc);
1112     // R2_TOC is killed.
1113     bctrl();
1114     _last_calls_return_pc = pc();
1115   } else {
1116     // It's a friend function, load the entry point and don't care about
1117     // toc and env. Use an optimizable call instruction, but ensure the
1118     // same code-size as in the case of a non-friend function.
1119     nop();
1120     bl64_patchable(fd->entry(), rt);
1121     _last_calls_return_pc = pc();
1122   }
1123   return _last_calls_return_pc;
1124 }
1125 #endif // ABI_ELFv2
1126 
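// Call a VM runtime entry. The current thread is passed as the first argument,
// the last Java frame is set up before and reset after the call, and the oop
// result is fetched from the thread if requested.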
1127 void MacroAssembler::call_VM_base(Register oop_result,
1128                                   Register last_java_sp,
1129                                   address  entry_point,
1130                                   bool     check_exceptions) {
1131   BLOCK_COMMENT("call_VM {");
1132   // Determine last_java_sp register.
1133   if (!last_java_sp->is_valid()) {
1134     last_java_sp = R1_SP;
1135   }
1136   set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1);
1137 
1138   // ARG1 must hold thread address.
1139   mr(R3_ARG1, R16_thread);
1140 #if defined(ABI_ELFv2)
1141   address return_pc = call_c(entry_point, relocInfo::none);
1142 #else
1143   address return_pc = call_c((FunctionDescriptor*)entry_point, relocInfo::none);
1144 #endif
1145 
1146   reset_last_Java_frame();
1147 
1148   // Check for pending exceptions.
1149   if (check_exceptions) {
1150     // Exception checking is not supported by this call_VM variant.
1151     ShouldNotReachHere();
1152   }
1153 
1154   // Get oop result if there is one and reset the value in the thread.
1155   if (oop_result->is_valid()) {
1156     get_vm_result(oop_result);
1157   }
1158 
1159   _last_calls_return_pc = return_pc;
1160   BLOCK_COMMENT("} call_VM");
1161 }
1162 
1163 void MacroAssembler::call_VM_leaf_base(address entry_point) {
1164   BLOCK_COMMENT("call_VM_leaf {");
1165 #if defined(ABI_ELFv2)
1166   call_c(entry_point, relocInfo::none);
1167 #else
1168   call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, entry_point), relocInfo::none);
1169 #endif
1170   BLOCK_COMMENT("} call_VM_leaf");
1171 }
1172 
1173 void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions) {
1174   call_VM_base(oop_result, noreg, entry_point, check_exceptions);
1175 }
1176 
1177 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1,
1178                              bool check_exceptions) {
1179   // R3_ARG1 is reserved for the thread.
1180   mr_if_needed(R4_ARG2, arg_1);
1181   call_VM(oop_result, entry_point, check_exceptions);
1182 }
1183 
1184 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2,
1185                              bool check_exceptions) {
1186   // R3_ARG1 is reserved for the thread
1187   mr_if_needed(R4_ARG2, arg_1);
1188   assert(arg_2 != R4_ARG2, "smashed argument");
1189   mr_if_needed(R5_ARG3, arg_2);
1190   call_VM(oop_result, entry_point, check_exceptions);
1191 }
1192 
1193 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3,
1194                              bool check_exceptions) {
1195   // R3_ARG1 is reserved for the thread
1196   mr_if_needed(R4_ARG2, arg_1);
1197   assert(arg_2 != R4_ARG2, "smashed argument");
1198   mr_if_needed(R5_ARG3, arg_2);
1199   mr_if_needed(R6_ARG4, arg_3);
1200   call_VM(oop_result, entry_point, check_exceptions);
1201 }
1202 
1203 void MacroAssembler::call_VM_leaf(address entry_point) {
1204   call_VM_leaf_base(entry_point);
1205 }
1206 
1207 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) {
1208   mr_if_needed(R3_ARG1, arg_1);
1209   call_VM_leaf(entry_point);
1210 }
1211 
1212 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) {
1213   mr_if_needed(R3_ARG1, arg_1);
1214   assert(arg_2 != R3_ARG1, "smashed argument");
1215   mr_if_needed(R4_ARG2, arg_2);
1216   call_VM_leaf(entry_point);
1217 }
1218 
1219 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) {
1220   mr_if_needed(R3_ARG1, arg_1);
1221   assert(arg_2 != R3_ARG1, "smashed argument");
1222   mr_if_needed(R4_ARG2, arg_2);
1223   assert(arg_3 != R3_ARG1 && arg_3 != R4_ARG2, "smashed argument");
1224   mr_if_needed(R5_ARG3, arg_3);
1225   call_VM_leaf(entry_point);
1226 }
1227 
1228 // Check whether instruction is a read access to the polling page
1229 // which was emitted by load_from_polling_page(..).
1230 bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext,
1231                                                address* polling_address_ptr) {
1232   if (!is_ld(instruction))
1233     return false; // It's not a ld. Fail.
1234 
1235   int rt = inv_rt_field(instruction);
1236   int ra = inv_ra_field(instruction);
1237   int ds = inv_ds_field(instruction);
1238   if (!(ds == 0 && ra != 0 && rt == 0)) {
1239     return false; // It's not a ld(r0, X, ra). Fail.
1240   }
1241 
1242   if (!ucontext) {
1243     // Set polling address.
1244     if (polling_address_ptr != NULL) {
1245       *polling_address_ptr = NULL;
1246     }
1247     return true; // No ucontext given. Can't check value of ra. Assume true.
1248   }
1249 
1250 #ifdef LINUX
1251   // Ucontext given. Check that register ra contains the address of
1252   // the safepoint polling page.
1253   ucontext_t* uc = (ucontext_t*) ucontext;
1254   // Set polling address.
1255   address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds;
1256   if (polling_address_ptr != NULL) {
1257     *polling_address_ptr = addr;
1258   }
1259   return os::is_poll_address(addr);
1260 #else
1261   // Not on Linux, ucontext must be NULL.
1262   ShouldNotReachHere();
1263   return false;
1264 #endif
1265 }
1266 
1267 bool MacroAssembler::is_memory_serialization(int instruction, JavaThread* thread, void* ucontext) {
1268 #ifdef LINUX
1269   ucontext_t* uc = (ucontext_t*) ucontext;
1270 
1271   if (is_stwx(instruction) || is_stwux(instruction)) {
1272     int ra = inv_ra_field(instruction);
1273     int rb = inv_rb_field(instruction);
1274 
1275     // look up content of ra and rb in ucontext
1276     address ra_val = (address)uc->uc_mcontext.regs->gpr[ra];
1277     long rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
1278     return os::is_memory_serialize_page(thread, ra_val+rb_val);
1279   } else if (is_stw(instruction) || is_stwu(instruction)) {
1280     int ra = inv_ra_field(instruction);
1281     int d1 = inv_d1_field(instruction);
1282 
1283     // look up content of ra in ucontext
1284     address ra_val = (address)uc->uc_mcontext.regs->gpr[ra];
1285     return os::is_memory_serialize_page(thread, ra_val+d1);
1286   } else {
1287     return false;
1288   }
1289 #else
1290   // workaround not needed on !LINUX :-)
1291   ShouldNotCallThis();
1292   return false;
1293 #endif
1294 }
1295 
1296 void MacroAssembler::bang_stack_with_offset(int offset) {
1297   // When increasing the stack, the old stack pointer will be written
1298   // to the new top of stack according to the PPC64 abi.
1299   // Therefore, stack banging is not necessary when increasing
1300   // the stack by <= os::vm_page_size() bytes.
1301   // When increasing the stack by a larger amount, this method is
1302   // called repeatedly to bang the intermediate pages.
1303 
1304   // Stack grows down, caller passes positive offset.
1305   assert(offset > 0, "must bang with positive offset");
1306 
1307   long stdoffset = -offset;
1308 
1309   if (is_simm(stdoffset, 16)) {
1310     // Signed 16 bit offset, a simple std is ok.
1311     if (UseLoadInstructionsForStackBangingPPC64) {
1312       ld(R0, (int)(signed short)stdoffset, R1_SP);
1313     } else {
1314       std(R0,(int)(signed short)stdoffset, R1_SP);
1315     }
1316   } else if (is_simm(stdoffset, 31)) {
1317     const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset);
1318     const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset);
1319 
1320     Register tmp = R11;
1321     addis(tmp, R1_SP, hi);
1322     if (UseLoadInstructionsForStackBangingPPC64) {
1323       ld(R0,  lo, tmp);
1324     } else {
1325       std(R0, lo, tmp);
1326     }
1327   } else {
1328     ShouldNotReachHere();
1329   }
1330 }
1331 
1332 // If instruction is a stack bang of the form
1333 //    std    R0,    x(Ry),       (see bang_stack_with_offset())
1334 //    stdu   R1_SP, x(R1_SP),    (see push_frame(), resize_frame())
1335 // or stdux  R1_SP, Rx, R1_SP    (see push_frame(), resize_frame())
1336 // return the banged address. Otherwise, return 0.
1337 address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) {
1338 #ifdef LINUX
1339   ucontext_t* uc = (ucontext_t*) ucontext;
1340   int rs = inv_rs_field(instruction);
1341   int ra = inv_ra_field(instruction);
1342   if (   (is_ld(instruction)   && rs == 0 &&  UseLoadInstructionsForStackBangingPPC64)
1343       || (is_std(instruction)  && rs == 0 && !UseLoadInstructionsForStackBangingPPC64)
1344       || (is_stdu(instruction) && rs == 1)) {
1345     int ds = inv_ds_field(instruction);
1346     // return banged address
1347     return ds+(address)uc->uc_mcontext.regs->gpr[ra];
1348   } else if (is_stdux(instruction) && rs == 1) {
1349     int rb = inv_rb_field(instruction);
1350     address sp = (address)uc->uc_mcontext.regs->gpr[1];
1351     long rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
1352     return ra != 1 || rb_val >= 0 ? NULL         // not a stack bang
1353                                   : sp + rb_val; // banged address
1354   }
1355   return NULL; // not a stack bang
1356 #else
1357   // workaround not needed on !LINUX :-)
1358   ShouldNotCallThis();
1359   return NULL;
1360 #endif
1361 }
1362 
1363 // CmpxchgX sets condition register to cmpX(current, compare).
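// 32-bit compare-and-exchange on *addr_base implemented with a lwarx/stwcx_
// retry loop. 'flag' (and, if given, 'int_flag_success') report whether
// 'exchange_value' was stored; see the 64-bit variant below for the detailed
// parameter description.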
1364 void MacroAssembler::cmpxchgw(ConditionRegister flag, Register dest_current_value,
1365                               Register compare_value, Register exchange_value,
1366                               Register addr_base, int semantics, bool cmpxchgx_hint,
1367                               Register int_flag_success, bool contention_hint) {
1368   Label retry;
1369   Label failed;
1370   Label done;
1371 
1372   // Save one branch if result is returned via register and
1373   // result register is different from the other ones.
1374   bool use_result_reg    = (int_flag_success != noreg);
1375   bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value &&
1376                             int_flag_success != exchange_value && int_flag_success != addr_base);
1377 
1378   // release/fence semantics
1379   if (semantics & MemBarRel) {
1380     release();
1381   }
1382 
1383   if (use_result_reg && preset_result_reg) {
1384     li(int_flag_success, 0); // preset (assume cas failed)
1385   }
1386 
1387   // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1388   if (contention_hint) { // Don't try to reserve if cmp fails.
1389     lwz(dest_current_value, 0, addr_base);
1390     cmpw(flag, dest_current_value, compare_value);
1391     bne(flag, failed);
1392   }
1393 
1394   // atomic emulation loop
1395   bind(retry);
1396 
1397   lwarx(dest_current_value, addr_base, cmpxchgx_hint);
1398   cmpw(flag, dest_current_value, compare_value);
1399   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1400     bne_predict_not_taken(flag, failed);
1401   } else {
1402     bne(                  flag, failed);
1403   }
1404   // branch to failed  => (flag == ne), (dest_current_value != compare_value)
1405   // fall through      => (flag == eq), (dest_current_value == compare_value)
1406 
1407   stwcx_(exchange_value, addr_base);
1408   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1409     bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1410   } else {
1411     bne(                  CCR0, retry); // StXcx_ sets CCR0.
1412   }
1413   // fall through    => (flag == eq), (dest_current_value == compare_value), (swapped)
1414 
1415   // Result in register (must do this at the end because int_flag_success can be the
1416   // same register as one above).
1417   if (use_result_reg) {
1418     li(int_flag_success, 1);
1419   }
1420 
1421   if (semantics & MemBarFenceAfter) {
1422     fence();
1423   } else if (semantics & MemBarAcq) {
1424     isync();
1425   }
1426 
1427   if (use_result_reg && !preset_result_reg) {
1428     b(done);
1429   }
1430 
1431   bind(failed);
1432   if (use_result_reg && !preset_result_reg) {
1433     li(int_flag_success, 0);
1434   }
1435 
1436   bind(done);
1437   // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1438   // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1439 }
1440 
1441 // Performs an atomic compare-and-exchange:
1442 //   if (compare_value == *addr_base)
1443 //     *addr_base = exchange_value
1444 //     int_flag_success = 1;
1445 //   else
1446 //     int_flag_success = 0;
1447 //
1448 // ConditionRegister flag       = cmp(compare_value, *addr_base)
1449 // Register dest_current_value  = *addr_base
1450 // Register compare_value       Used to compare with value in memory
1451 // Register exchange_value      Written to memory if compare_value == *addr_base
1452 // Register addr_base           The memory location to compareXChange
1453 // Register int_flag_success    Set to 1 if exchange_value was written to *addr_base
1454 //
1455 // To avoid the costly compare-and-exchange, the value can be tested beforehand (contention_hint).
1456 // Several special cases exist to avoid emitting unnecessary code.
1457 //
1458 void MacroAssembler::cmpxchgd(ConditionRegister flag,
1459                               Register dest_current_value, RegisterOrConstant compare_value, Register exchange_value,
1460                               Register addr_base, int semantics, bool cmpxchgx_hint,
1461                               Register int_flag_success, Label* failed_ext, bool contention_hint) {
1462   Label retry;
1463   Label failed_int;
1464   Label& failed = (failed_ext != NULL) ? *failed_ext : failed_int;
1465   Label done;
1466 
1467   // Save one branch if result is returned via register and result register is different from the other ones.
1468   bool use_result_reg    = (int_flag_success!=noreg);
1469   bool preset_result_reg = (int_flag_success!=dest_current_value && int_flag_success!=compare_value.register_or_noreg() &&
1470                             int_flag_success!=exchange_value && int_flag_success!=addr_base);
1471   assert(int_flag_success == noreg || failed_ext == NULL, "cannot have both");
1472 
1473   // release/fence semantics
1474   if (semantics & MemBarRel) {
1475     release();
1476   }
1477 
1478   if (use_result_reg && preset_result_reg) {
1479     li(int_flag_success, 0); // preset (assume cas failed)
1480   }
1481 
1482   // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1483   if (contention_hint) { // Don't try to reserve if cmp fails.
1484     ld(dest_current_value, 0, addr_base);
1485     cmpd(flag, compare_value, dest_current_value);
1486     bne(flag, failed);
1487   }
1488 
1489   // atomic emulation loop
1490   bind(retry);
1491 
1492   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1493   cmpd(flag, compare_value, dest_current_value);
1494   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1495     bne_predict_not_taken(flag, failed);
1496   } else {
1497     bne(                  flag, failed);
1498   }
1499 
1500   stdcx_(exchange_value, addr_base);
1501   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1502     bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
1503   } else {
1504     bne(                  CCR0, retry); // stXcx_ sets CCR0
1505   }
1506 
1507   // result in register (must do this at the end because int_flag_success can be the same register as one above)
1508   if (use_result_reg) {
1509     li(int_flag_success, 1);
1510   }
1511 
1512   // POWER6 doesn't need isync in CAS.
1513   // Always emit isync to be on the safe side.
1514   if (semantics & MemBarFenceAfter) {
1515     fence();
1516   } else if (semantics & MemBarAcq) {
1517     isync();
1518   }
1519 
1520   if (use_result_reg && !preset_result_reg) {
1521     b(done);
1522   }
1523 
1524   bind(failed_int);
1525   if (use_result_reg && !preset_result_reg) {
1526     li(int_flag_success, 0);
1527   }
1528 
1529   bind(done);
1530   // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1531   // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1532 }
1533 
1534 // Look up the method for a megamorphic invokeinterface call.
1535 // The target method is determined by <intf_klass, itable_index>.
1536 // The receiver klass is in recv_klass.
1537 // On success, the result will be in method_result, and execution falls through.
1538 // On failure, execution transfers to the given label.
1539 void MacroAssembler::lookup_interface_method(Register recv_klass,
1540                                              Register intf_klass,
1541                                              RegisterOrConstant itable_index,
1542                                              Register method_result,
1543                                              Register scan_temp,
1544                                              Register sethi_temp,
1545                                              Label& L_no_such_interface) {
1546   assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);
1547   assert(itable_index.is_constant() || itable_index.as_register() == method_result,
1548          "caller must use same register for non-constant itable index as for method");
1549 
1550   // Compute start of first itableOffsetEntry (which is at the end of the vtable).
1551   int vtable_base = InstanceKlass::vtable_start_offset() * wordSize;
1552   int itentry_off = itableMethodEntry::method_offset_in_bytes();
1553   int logMEsize   = exact_log2(itableMethodEntry::size() * wordSize);
1554   int scan_step   = itableOffsetEntry::size() * wordSize;
1555   int log_vte_size= exact_log2(vtableEntry::size() * wordSize);
1556 
1557   lwz(scan_temp, InstanceKlass::vtable_length_offset() * wordSize, recv_klass);
1558   // %%% We should store the aligned, prescaled offset in the klassoop.
1559   // Then the next several instructions would fold away.
1560 
1561   sldi(scan_temp, scan_temp, log_vte_size);
1562   addi(scan_temp, scan_temp, vtable_base);
1563   add(scan_temp, recv_klass, scan_temp);
1564 
1565   // Adjust recv_klass by scaled itable_index, so we can free itable_index.
1566   if (itable_index.is_register()) {
1567     Register itable_offset = itable_index.as_register();
1568     sldi(itable_offset, itable_offset, logMEsize);
1569     if (itentry_off) addi(itable_offset, itable_offset, itentry_off);
1570     add(recv_klass, itable_offset, recv_klass);
1571   } else {
1572     long itable_offset = (long)itable_index.as_constant();
1573     load_const_optimized(sethi_temp, (itable_offset<<logMEsize)+itentry_off); // static address, no relocation
1574     add(recv_klass, sethi_temp, recv_klass);
1575   }
1576 
1577   // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
1578   //   if (scan->interface() == intf) {
1579   //     result = (klass + scan->offset() + itable_index);
1580   //   }
1581   // }
1582   Label search, found_method;
1583 
1584   for (int peel = 1; peel >= 0; peel--) {
1585     // %%%% Could load both offset and interface in one ldx, if they were
1586     // in the opposite order. This would save a load.
1587     ld(method_result, itableOffsetEntry::interface_offset_in_bytes(), scan_temp);
1588 
1589     // Check that this entry is non-null. A null entry means that
1590     // the receiver class doesn't implement the interface, and wasn't the
1591     // same as when the caller was compiled.
1592     cmpd(CCR0, method_result, intf_klass);
1593 
1594     if (peel) {
1595       beq(CCR0, found_method);
1596     } else {
1597       bne(CCR0, search);
1598       // (invert the test to fall through to found_method...)
1599     }
1600 
1601     if (!peel) break;
1602 
1603     bind(search);
1604 
1605     cmpdi(CCR0, method_result, 0);
1606     beq(CCR0, L_no_such_interface);
1607     addi(scan_temp, scan_temp, scan_step);
1608   }
1609 
1610   bind(found_method);
1611 
1612   // Got a hit.
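       // scan_temp points at the matching itableOffsetEntry; its offset field
       // gives the start of the interface's method block within the klass.
       // recv_klass was already advanced by the scaled itable_index above, so
       // a single indexed load yields the Method*.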
1613   int ito_offset = itableOffsetEntry::offset_offset_in_bytes();
1614   lwz(scan_temp, ito_offset, scan_temp);
1615   ldx(method_result, scan_temp, recv_klass);
1616 }
1617 
1618 // virtual method calling
1619 void MacroAssembler::lookup_virtual_method(Register recv_klass,
1620                                            RegisterOrConstant vtable_index,
1621                                            Register method_result) {
1622 
1623   assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg());
1624 
1625   const int base = InstanceKlass::vtable_start_offset() * wordSize;
1626   assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
1627 
1628   if (vtable_index.is_register()) {
1629     sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord);
1630     add(recv_klass, vtable_index.as_register(), recv_klass);
1631   } else {
1632     addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord);
1633   }
1634   ld(R19_method, base + vtableEntry::method_offset_in_bytes(), recv_klass);
1635 }
1636 
1637 /////////////////////////////////////////// subtype checking ////////////////////////////////////////////
1638 
1639 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
1640                                                    Register super_klass,
1641                                                    Register temp1_reg,
1642                                                    Register temp2_reg,
1643                                                    Label& L_success,
1644                                                    Label& L_failure) {
1645 
1646   const Register check_cache_offset = temp1_reg;
1647   const Register cached_super       = temp2_reg;
1648 
1649   assert_different_registers(sub_klass, super_klass, check_cache_offset, cached_super);
1650 
1651   int sco_offset = in_bytes(Klass::super_check_offset_offset());
1652   int sc_offset  = in_bytes(Klass::secondary_super_cache_offset());
1653 
1654   // If the pointers are equal, we are done (e.g., String[] elements).
1655   // This self-check enables sharing of secondary supertype arrays among
1656   // non-primary types such as array-of-interface. Otherwise, each such
1657   // type would need its own customized SSA.
1658   // We move this check to the front of the fast path because many
1659   // type checks are in fact trivially successful in this manner,
1660   // so we get a nicely predicted branch right at the start of the check.
1661   cmpd(CCR0, sub_klass, super_klass);
1662   beq(CCR0, L_success);
1663 
1664   // Check the supertype display:
1665   lwz(check_cache_offset, sco_offset, super_klass);
1666   // The loaded value is the offset from KlassOopDesc.
1667 
1668   ldx(cached_super, check_cache_offset, sub_klass);
1669   cmpd(CCR0, cached_super, super_klass);
1670   beq(CCR0, L_success);
1671 
1672   // This check has worked decisively for primary supers.
1673   // Secondary supers are sought in the super_cache ('super_cache_addr').
1674   // (Secondary supers are interfaces and very deeply nested subtypes.)
1675   // This works in the same check above because of a tricky aliasing
1676   // between the super_cache and the primary super display elements.
1677   // (The 'super_check_addr' can address either, as the case requires.)
1678   // Note that the cache is updated below if it does not help us find
1679   // what we need immediately.
1680   // So if it was a primary super, we can just fail immediately.
1681   // Otherwise, it's the slow path for us (no success at this point).
1682 
1683   cmpwi(CCR0, check_cache_offset, sc_offset);
1684   bne(CCR0, L_failure);
1685   // bind(slow_path); // fallthru
1686 }
1687 
1688 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
1689                                                    Register super_klass,
1690                                                    Register temp1_reg,
1691                                                    Register temp2_reg,
1692                                                    Label* L_success,
1693                                                    Register result_reg) {
1694   const Register array_ptr = temp1_reg; // current value from cache array
1695   const Register temp      = temp2_reg;
1696 
1697   assert_different_registers(sub_klass, super_klass, array_ptr, temp);
1698 
1699   int source_offset = in_bytes(Klass::secondary_supers_offset());
1700   int target_offset = in_bytes(Klass::secondary_super_cache_offset());
1701 
1702   int length_offset = Array<Klass*>::length_offset_in_bytes();
1703   int base_offset   = Array<Klass*>::base_offset_in_bytes();
1704 
1705   Label hit, loop, failure, fallthru;
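       // Linear scan of the secondary-supers array, using CTR as the loop
       // counter. On a hit, super_klass is stored into the secondary super
       // cache so that the fast path succeeds the next time this pair is
       // checked.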
1706 
1707   ld(array_ptr, source_offset, sub_klass);
1708 
1709   //assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated.");
1710   lwz(temp, length_offset, array_ptr);
1711   cmpwi(CCR0, temp, 0);
1712   beq(CCR0, result_reg!=noreg ? failure : fallthru); // length 0
1713 
1714   mtctr(temp); // load ctr
1715 
1716   bind(loop);
1717   // Klass pointers in the table are no longer compressed.
1718   ld(temp, base_offset, array_ptr);
1719   cmpd(CCR0, temp, super_klass);
1720   beq(CCR0, hit);
1721   addi(array_ptr, array_ptr, BytesPerWord);
1722   bdnz(loop);
1723 
1724   bind(failure);
1725   if (result_reg!=noreg) li(result_reg, 1); // load non-zero result (indicates a miss)
1726   b(fallthru);
1727 
1728   bind(hit);
1729   std(super_klass, target_offset, sub_klass); // save result to cache
1730   if (result_reg != noreg) li(result_reg, 0); // load zero result (indicates a hit)
1731   if (L_success != NULL) b(*L_success);
1732 
1733   bind(fallthru);
1734 }
1735 
1736 // Try fast path, then go to slow one if not successful
1737 void MacroAssembler::check_klass_subtype(Register sub_klass,
1738                          Register super_klass,
1739                          Register temp1_reg,
1740                          Register temp2_reg,
1741                          Label& L_success) {
1742   Label L_failure;
1743   check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, L_success, L_failure);
1744   check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success);
1745   bind(L_failure); // Fallthru if not successful.
1746 }
1747 
1748 void MacroAssembler::check_method_handle_type(Register mtype_reg, Register mh_reg,
1749                                               Register temp_reg,
1750                                               Label& wrong_method_type) {
1751   assert_different_registers(mtype_reg, mh_reg, temp_reg);
1752   // Compare method type against that of the receiver.
1753   load_heap_oop_not_null(temp_reg, delayed_value(java_lang_invoke_MethodHandle::type_offset_in_bytes, temp_reg), mh_reg);
1754   cmpd(CCR0, temp_reg, mtype_reg);
1755   bne(CCR0, wrong_method_type);
1756 }
1757 
1758 RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot,
1759                                                    Register temp_reg,
1760                                                    int extra_slot_offset) {
1761   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
1762   int stackElementSize = Interpreter::stackElementSize;
1763   int offset = extra_slot_offset * stackElementSize;
1764   if (arg_slot.is_constant()) {
1765     offset += arg_slot.as_constant() * stackElementSize;
1766     return offset;
1767   } else {
1768     assert(temp_reg != noreg, "must specify");
1769     sldi(temp_reg, arg_slot.as_register(), exact_log2(stackElementSize));
1770     if (offset != 0)
1771       addi(temp_reg, temp_reg, offset);
1772     return temp_reg;
1773   }
1774 }
1775 
1776 void MacroAssembler::biased_locking_enter(ConditionRegister cr_reg, Register obj_reg,
1777                                           Register mark_reg, Register temp_reg,
1778                                           Register temp2_reg, Label& done, Label* slow_case) {
1779   assert(UseBiasedLocking, "why call this otherwise?");
1780 
1781 #ifdef ASSERT
1782   assert_different_registers(obj_reg, mark_reg, temp_reg, temp2_reg);
1783 #endif
1784 
1785   Label cas_label;
1786 
1787   // Branch to done if fast path fails and no slow_case provided.
1788   Label *slow_case_int = (slow_case != NULL) ? slow_case : &done;
1789 
1790   // Biased locking
1791   // See whether the lock is currently biased toward our thread and
1792   // whether the epoch is still valid
1793   // Note that the runtime guarantees sufficient alignment of JavaThread
1794   // pointers to allow age to be placed into low bits
1795   assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits,
1796          "biased locking makes assumptions about bit layout");
1797 
1798   if (PrintBiasedLockingStatistics) {
1799     load_const(temp_reg, (address) BiasedLocking::total_entry_count_addr(), temp2_reg);
1800     lwz(temp2_reg, 0, temp_reg);
1801     addi(temp2_reg, temp2_reg, 1);
1802     stw(temp2_reg, 0, temp_reg);
1803   }
1804 
1805   andi(temp_reg, mark_reg, markOopDesc::biased_lock_mask_in_place);
1806   cmpwi(cr_reg, temp_reg, markOopDesc::biased_lock_pattern);
1807   bne(cr_reg, cas_label);
1808 
1809   load_klass(temp_reg, obj_reg);
1810 
1811   load_const_optimized(temp2_reg, ~((int) markOopDesc::age_mask_in_place));
1812   ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
1813   orr(temp_reg, R16_thread, temp_reg);
1814   xorr(temp_reg, mark_reg, temp_reg);
1815   andr(temp_reg, temp_reg, temp2_reg);
1816   cmpdi(cr_reg, temp_reg, 0);
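       // temp_reg is now zero iff the mark word matches (prototype header |
       // thread) apart from the age bits, i.e. the object is already biased
       // toward this thread in the current epoch; in that case we are done.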
1817   if (PrintBiasedLockingStatistics) {
1818     Label l;
1819     bne(cr_reg, l);
1820     load_const(mark_reg, (address) BiasedLocking::biased_lock_entry_count_addr());
1821     lwz(temp2_reg, 0, mark_reg);
1822     addi(temp2_reg, temp2_reg, 1);
1823     stw(temp2_reg, 0, mark_reg);
1824     // restore mark_reg
1825     ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg);
1826     bind(l);
1827   }
1828   beq(cr_reg, done);
1829 
1830   Label try_revoke_bias;
1831   Label try_rebias;
1832 
1833   // At this point we know that the header has the bias pattern and
1834   // that we are not the bias owner in the current epoch. We need to
1835   // figure out more details about the state of the header in order to
1836   // know what operations can be legally performed on the object's
1837   // header.
1838 
1839   // If the low three bits in the xor result aren't clear, that means
1840   // the prototype header is no longer biased and we have to revoke
1841   // the bias on this object.
1842   andi(temp2_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
1843   cmpwi(cr_reg, temp2_reg, 0);
1844   bne(cr_reg, try_revoke_bias);
1845 
1846   // Biasing is still enabled for this data type. See whether the
1847   // epoch of the current bias is still valid, meaning that the epoch
1848   // bits of the mark word are equal to the epoch bits of the
1849   // prototype header. (Note that the prototype header's epoch bits
1850   // only change at a safepoint.) If not, attempt to rebias the object
1851   // toward the current thread. Note that we must be absolutely sure
1852   // that the current epoch is invalid in order to do this because
1853   // otherwise the manipulations it performs on the mark word are
1854   // illegal.
1855 
1856   int shift_amount = 64 - markOopDesc::epoch_shift;
1857   // rotate epoch bits to right (little) end and set other bits to 0
1858   // [ big part | epoch | little part ] -> [ 0..0 | epoch ]
1859   rldicl_(temp2_reg, temp_reg, shift_amount, 64 - markOopDesc::epoch_bits);
1860   // branch if epoch bits are != 0, i.e. they differ, because the epoch has been incremented
1861   bne(CCR0, try_rebias);
1862 
1863   // The epoch of the current bias is still valid but we know nothing
1864   // about the owner; it might be set or it might be clear. Try to
1865   // acquire the bias of the object using an atomic operation. If this
1866   // fails we will go in to the runtime to revoke the object's bias.
1867   // Note that we first construct the presumed unbiased header so we
1868   // don't accidentally blow away another thread's valid bias.
1869   andi(mark_reg, mark_reg, (markOopDesc::biased_lock_mask_in_place |
1870                                 markOopDesc::age_mask_in_place |
1871                                 markOopDesc::epoch_mask_in_place));
1872   orr(temp_reg, R16_thread, mark_reg);
1873 
1874   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
1875 
1876   // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg).
1877   cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
1878            /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
1879            /*where=*/obj_reg,
1880            MacroAssembler::MemBarAcq,
1881            MacroAssembler::cmpxchgx_hint_acquire_lock(),
1882            noreg, slow_case_int); // bail out if failed
1883 
1884   // If the biasing toward our thread failed, this means that
1885   // another thread succeeded in biasing it toward itself and we
1886   // need to revoke that bias. The revocation will occur in the
1887   // interpreter runtime in the slow case.
1888   if (PrintBiasedLockingStatistics) {
1889     load_const(temp_reg, (address) BiasedLocking::anonymously_biased_lock_entry_count_addr(), temp2_reg);
1890     lwz(temp2_reg, 0, temp_reg);
1891     addi(temp2_reg, temp2_reg, 1);
1892     stw(temp2_reg, 0, temp_reg);
1893   }
1894   b(done);
1895 
1896   bind(try_rebias);
1897   // At this point we know the epoch has expired, meaning that the
1898   // current "bias owner", if any, is actually invalid. Under these
1899   // circumstances _only_, we are allowed to use the current header's
1900   // value as the comparison value when doing the cas to acquire the
1901   // bias in the current epoch. In other words, we allow transfer of
1902   // the bias from one thread to another directly in this situation.
1903   andi(temp_reg, mark_reg, markOopDesc::age_mask_in_place);
1904   orr(temp_reg, R16_thread, temp_reg);
1905   load_klass(temp2_reg, obj_reg);
1906   ld(temp2_reg, in_bytes(Klass::prototype_header_offset()), temp2_reg);
1907   orr(temp_reg, temp_reg, temp2_reg);
1908 
1909   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
1910 
1911   // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg).
1912   cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
1913                  /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
1914                  /*where=*/obj_reg,
1915                  MacroAssembler::MemBarAcq,
1916                  MacroAssembler::cmpxchgx_hint_acquire_lock(),
1917                  noreg, slow_case_int); // bail out if failed
1918 
1919   // If the biasing toward our thread failed, this means that
1920   // another thread succeeded in biasing it toward itself and we
1921   // need to revoke that bias. The revocation will occur in the
1922   // interpreter runtime in the slow case.
1923   if (PrintBiasedLockingStatistics) {
1924     load_const(temp_reg, (address) BiasedLocking::rebiased_lock_entry_count_addr(), temp2_reg);
1925     lwz(temp2_reg, 0, temp_reg);
1926     addi(temp2_reg, temp2_reg, 1);
1927     stw(temp2_reg, 0, temp_reg);
1928   }
1929   b(done);
1930 
1931   bind(try_revoke_bias);
1932   // The prototype mark in the klass doesn't have the bias bit set any
1933   // more, indicating that objects of this data type are not supposed
1934   // to be biased any more. We are going to try to reset the mark of
1935   // this object to the prototype value and fall through to the
1936   // CAS-based locking scheme. Note that if our CAS fails, it means
1937   // that another thread raced us for the privilege of revoking the
1938   // bias of this particular object, so it's okay to continue in the
1939   // normal locking code.
1940   load_klass(temp_reg, obj_reg);
1941   ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
1942   andi(temp2_reg, mark_reg, markOopDesc::age_mask_in_place);
1943   orr(temp_reg, temp_reg, temp2_reg);
1944 
1945   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
1946 
1947   // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg).
1948   cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
1949                  /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
1950                  /*where=*/obj_reg,
1951                  MacroAssembler::MemBarAcq,
1952                  MacroAssembler::cmpxchgx_hint_acquire_lock());
1953 
1954   // reload markOop in mark_reg before continuing with lightweight locking
1955   ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg);
1956 
1957   // Fall through to the normal CAS-based lock, because no matter what
1958   // the result of the above CAS, some thread must have succeeded in
1959   // removing the bias bit from the object's header.
1960   if (PrintBiasedLockingStatistics) {
1961     Label l;
1962     bne(cr_reg, l);
1963     load_const(temp_reg, (address) BiasedLocking::revoked_lock_entry_count_addr(), temp2_reg);
1964     lwz(temp2_reg, 0, temp_reg);
1965     addi(temp2_reg, temp2_reg, 1);
1966     stw(temp2_reg, 0, temp_reg);
1967     bind(l);
1968   }
1969 
1970   bind(cas_label);
1971 }
1972 
1973 void MacroAssembler::biased_locking_exit (ConditionRegister cr_reg, Register mark_addr, Register temp_reg, Label& done) {
1974   // Check for biased locking unlock case, which is a no-op
1975   // Note: we do not have to check the thread ID for two reasons.
1976   // First, the interpreter checks for IllegalMonitorStateException at
1977   // a higher level. Second, if the bias was revoked while we held the
1978   // lock, the object could not be rebiased toward another thread, so
1979   // the bias bit would be clear.
1980 
1981   ld(temp_reg, 0, mark_addr);
1982   andi(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
1983 
1984   cmpwi(cr_reg, temp_reg, markOopDesc::biased_lock_pattern);
1985   beq(cr_reg, done);
1986 }
1987 
1988 // TM on PPC64.
1989 void MacroAssembler::atomic_inc_ptr(Register addr, Register result, int simm16) {
1990   Label retry;
1991   bind(retry);
1992   ldarx(result, addr, /*hint*/ false);
1993   addi(result, result, simm16);
1994   stdcx_(result, addr);
1995   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1996     bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
1997   } else {
1998     bne(                  CCR0, retry); // stXcx_ sets CCR0
1999   }
2000 }
2001 
2002 void MacroAssembler::atomic_ori_int(Register addr, Register result, int uimm16) {
2003   Label retry;
2004   bind(retry);
2005   lwarx(result, addr, /*hint*/ false);
2006   ori(result, result, uimm16);
2007   stwcx_(result, addr);
2008   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
2009     bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
2010   } else {
2011     bne(                  CCR0, retry); // stXcx_ sets CCR0
2012   }
2013 }
2014 
2015 #if INCLUDE_RTM_OPT
2016 
2017 // Update rtm_counters based on abort status
2018 // input: abort_status
2019 //        rtm_counters (RTMLockingCounters*)
2020 void MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters_Reg) {
2021   // Mapping to keep PreciseRTMLockingStatistics similar to x86.
2022   // x86 ppc (! means inverted, ? means not the same)
2023   //  0   31  Set if abort caused by XABORT instruction.
2024   //  1  ! 7  If set, the transaction may succeed on a retry. This bit is always clear if bit 0 is set.
2025   //  2   13  Set if another logical processor conflicted with a memory address that was part of the transaction that aborted.
2026   //  3   10  Set if an internal buffer overflowed.
2027   //  4  ?12  Set if a debug breakpoint was hit.
2028   //  5  ?32  Set if an abort occurred during execution of a nested transaction.
2029   const  int tm_failure_bit[] = {Assembler::tm_tabort, // Note: Seems like signal handler sets this, too.
2030                                  Assembler::tm_failure_persistent, // inverted: transient
2031                                  Assembler::tm_trans_cf,
2032                                  Assembler::tm_footprint_of,
2033                                  Assembler::tm_non_trans_cf,
2034                                  Assembler::tm_suspended};
2035   const bool tm_failure_inv[] = {false, true, false, false, false, false};
2036   assert(sizeof(tm_failure_bit)/sizeof(int) == RTMLockingCounters::ABORT_STATUS_LIMIT, "adapt mapping!");
2037 
2038   const Register addr_Reg = R0;
2039   // Keep track of offset to where rtm_counters_Reg had pointed to.
2040   int counters_offs = RTMLockingCounters::abort_count_offset();
2041   addi(addr_Reg, rtm_counters_Reg, counters_offs);
2042   const Register temp_Reg = rtm_counters_Reg;
2043 
2044   //atomic_inc_ptr(addr_Reg, temp_Reg); We don't increment atomically
2045   ldx(temp_Reg, addr_Reg);
2046   addi(temp_Reg, temp_Reg, 1);
2047   stdx(temp_Reg, addr_Reg);
2048 
2049   if (PrintPreciseRTMLockingStatistics) {
2050     int counters_offs_delta = RTMLockingCounters::abortX_count_offset() - counters_offs;
2051 
2052     //mftexasr(abort_status); done by caller
2053     for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
2054       counters_offs += counters_offs_delta;
2055       li(temp_Reg, counters_offs_delta); // can't use addi with R0
2056       add(addr_Reg, addr_Reg, temp_Reg); // point to next counter
2057       counters_offs_delta = sizeof(uintx);
2058 
2059       Label check_abort;
2060       rldicr_(temp_Reg, abort_status, tm_failure_bit[i], 0);
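           // Rotate the failure bit of interest into the MSB and clear all
           // other bits; CCR0.eq is then set iff the bit was clear. Together
           // with the branch below this skips the increment unless the abort
           // cause applies (taking the inverted bits into account).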
2061       if (tm_failure_inv[i]) {
2062         bne(CCR0, check_abort);
2063       } else {
2064         beq(CCR0, check_abort);
2065       }
2066       //atomic_inc_ptr(addr_Reg, temp_Reg); We don't increment atomically
2067       ldx(temp_Reg, addr_Reg);
2068       addi(temp_Reg, temp_Reg, 1);
2069       stdx(temp_Reg, addr_Reg);
2070       bind(check_abort);
2071     }
2072   }
2073   li(temp_Reg, -counters_offs); // can't use addi with R0
2074   add(rtm_counters_Reg, addr_Reg, temp_Reg); // restore
2075 }
2076 
2077 // Branch if (random & (count-1) != 0), count is 2^n
2078 // tmp and CR0 are killed
2079 void MacroAssembler::branch_on_random_using_tb(Register tmp, int count, Label& brLabel) {
2080   mftb(tmp);
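       // The time-base register provides a cheap pseudo-random value; masking
       // its low bits lets us sample only about every count-th event.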
2081   andi_(tmp, tmp, count-1);
2082   bne(CCR0, brLabel);
2083 }
2084 
2085 // Perform abort ratio calculation, set no_rtm bit if high ratio.
2086 // input:  rtm_counters_Reg (RTMLockingCounters* address) - KILLED
2087 void MacroAssembler::rtm_abort_ratio_calculation(Register rtm_counters_Reg,
2088                                                  RTMLockingCounters* rtm_counters,
2089                                                  Metadata* method_data) {
2090   Label L_done, L_check_always_rtm1, L_check_always_rtm2;
2091 
2092   if (RTMLockingCalculationDelay > 0) {
2093     // Delay calculation.
2094     ld(rtm_counters_Reg, (RegisterOrConstant)(intptr_t)RTMLockingCounters::rtm_calculation_flag_addr());
2095     cmpdi(CCR0, rtm_counters_Reg, 0);
2096     beq(CCR0, L_done);
2097     load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload
2098   }
2099   // Abort ratio calculation only if abort_count > RTMAbortThreshold.
2100   //   Aborted transactions = abort_count * 100
2101   //   All transactions = total_count *  RTMTotalCountIncrRate
2102   //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
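       // Example (with assumed flag values): for RTMAbortRatio = 50 and
       // RTMTotalCountIncrRate = 1, the no_rtm bit gets set once
       // abort_count * 100 >= total_count * 50, i.e. once at least half of the
       // counted transactions have aborted.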
2103   ld(R0, RTMLockingCounters::abort_count_offset(), rtm_counters_Reg);
2104   cmpdi(CCR0, R0, RTMAbortThreshold);
2105   blt(CCR0, L_check_always_rtm2);
2106   mulli(R0, R0, 100);
2107 
2108   const Register tmpReg = rtm_counters_Reg;
2109   ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg);
2110   mulli(tmpReg, tmpReg, RTMTotalCountIncrRate);
2111   mulli(tmpReg, tmpReg, RTMAbortRatio);
2112   cmpd(CCR0, R0, tmpReg);
2113   blt(CCR0, L_check_always_rtm1); // jump to reload
2114   if (method_data != NULL) {
2115     // Set rtm_state to "no rtm" in MDO.
2116     // Not using a metadata relocation. Method and Class Loader are kept alive anyway.
2117     // (See nmethod::metadata_do and CodeBuffer::finalize_oop_references.)
2118     load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg);
2119     atomic_ori_int(R0, tmpReg, NoRTM);
2120   }
2121   b(L_done);
2122 
2123   bind(L_check_always_rtm1);
2124   load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload
2125   bind(L_check_always_rtm2);
2126   ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg);
2127   cmpdi(CCR0, tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
2128   blt(CCR0, L_done);
2129   if (method_data != NULL) {
2130     // Set rtm_state to "always rtm" in MDO.
2131     // Not using a metadata relocation. See above.
2132     load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg);
2133     atomic_ori_int(R0, tmpReg, UseRTM);
2134   }
2135   bind(L_done);
2136 }
2137 
2138 // Update counters and perform abort ratio calculation.
2139 // input: abort_status_Reg
2140 void MacroAssembler::rtm_profiling(Register abort_status_Reg, Register temp_Reg,
2141                                    RTMLockingCounters* rtm_counters,
2142                                    Metadata* method_data,
2143                                    bool profile_rtm) {
2144 
2145   assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2146   // Update rtm counters based on state at abort.
2147   // Reads abort_status_Reg, updates flags.
2148   assert_different_registers(abort_status_Reg, temp_Reg);
2149   load_const_optimized(temp_Reg, (address)rtm_counters, R0);
2150   rtm_counters_update(abort_status_Reg, temp_Reg);
2151   if (profile_rtm) {
2152     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2153     rtm_abort_ratio_calculation(temp_Reg, rtm_counters, method_data);
2154   }
2155 }
2156 
2157 // Retry on abort if abort's status indicates non-persistent failure.
2158 // inputs: retry_count_Reg
2159 //       : abort_status_Reg
2160 // output: retry_count_Reg decremented by 1
2161 void MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg,
2162                                              Label& retryLabel, Label* checkRetry) {
2163   Label doneRetry;
2164   rldicr_(R0, abort_status_Reg, tm_failure_persistent, 0);
2165   bne(CCR0, doneRetry);
2166   if (checkRetry) { bind(*checkRetry); }
2167   addic_(retry_count_Reg, retry_count_Reg, -1);
2168   blt(CCR0, doneRetry);
2169   smt_yield(); // Can't use wait(). No permission (SIGILL).
2170   b(retryLabel);
2171   bind(doneRetry);
2172 }
2173 
2174 // Spin and retry if lock is busy.
2175 // inputs: box_Reg (monitor address)
2176 //       : retry_count_Reg
2177 // output: retry_count_Reg decremented by 1
2178 // CTR is killed
2179 void MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register owner_addr_Reg, Label& retryLabel) {
2180   Label SpinLoop, doneRetry;
2181   addic_(retry_count_Reg, retry_count_Reg, -1);
2182   blt(CCR0, doneRetry);
2183   li(R0, RTMSpinLoopCount);
2184   mtctr(R0);
2185 
2186   bind(SpinLoop);
2187   smt_yield(); // Can't use waitrsv(). No permission (SIGILL).
2188   bdz(retryLabel);
2189   ld(R0, 0, owner_addr_Reg);
2190   cmpdi(CCR0, R0, 0);
2191   bne(CCR0, SpinLoop);
2192   b(retryLabel);
2193 
2194   bind(doneRetry);
2195 }
2196 
2197 // Use RTM for normal stack locks.
2198 // Input: objReg (object to lock)
2199 void MacroAssembler::rtm_stack_locking(ConditionRegister flag,
2200                                        Register obj, Register mark_word, Register tmp,
2201                                        Register retry_on_abort_count_Reg,
2202                                        RTMLockingCounters* stack_rtm_counters,
2203                                        Metadata* method_data, bool profile_rtm,
2204                                        Label& DONE_LABEL, Label& IsInflated) {
2205   assert(UseRTMForStackLocks, "why call this otherwise?");
2206   assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
2207   Label L_rtm_retry, L_decrement_retry, L_on_abort;
2208 
2209   if (RTMRetryCount > 0) {
2210     load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
2211     bind(L_rtm_retry);
2212   }
2213   andi_(R0, mark_word, markOopDesc::monitor_value);  // inflated vs stack-locked|neutral|biased
2214   bne(CCR0, IsInflated);
2215 
2216   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2217     Label L_noincrement;
2218     if (RTMTotalCountIncrRate > 1) {
2219       branch_on_random_using_tb(tmp, (int)RTMTotalCountIncrRate, L_noincrement);
2220     }
2221     assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
2222     load_const_optimized(tmp, (address)stack_rtm_counters->total_count_addr(), R0);
2223     //atomic_inc_ptr(tmp, /*temp, will be reloaded*/mark_word); We don't increment atomically
2224     ldx(mark_word, tmp);
2225     addi(mark_word, mark_word, 1);
2226     stdx(mark_word, tmp);
2227     bind(L_noincrement);
2228   }
2229   tbegin_();
2230   beq(CCR0, L_on_abort);
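       // Execution only falls through here inside a successfully started
       // transaction; if the start fails or the transaction later aborts,
       // control resumes after tbegin_ with CCR0.eq set and the branch above
       // takes the abort path.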
2231   ld(mark_word, oopDesc::mark_offset_in_bytes(), obj);         // Reload in transaction, conflicts need to be tracked.
2232   andi(R0, mark_word, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
2233   cmpwi(flag, R0, markOopDesc::unlocked_value);                // bits = 001 unlocked
2234   beq(flag, DONE_LABEL);                                       // all done if unlocked
2235 
2236   if (UseRTMXendForLockBusy) {
2237     tend_();
2238     b(L_decrement_retry);
2239   } else {
2240     tabort_();
2241   }
2242   bind(L_on_abort);
2243   const Register abort_status_Reg = tmp;
2244   mftexasr(abort_status_Reg);
2245   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2246     rtm_profiling(abort_status_Reg, /*temp*/mark_word, stack_rtm_counters, method_data, profile_rtm);
2247   }
2248   ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // reload
2249   if (RTMRetryCount > 0) {
2250     // Retry on lock abort if abort status is not permanent.
2251     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry, &L_decrement_retry);
2252   } else {
2253     bind(L_decrement_retry);
2254   }
2255 }
2256 
2257 // Use RTM for inflating locks
2258 // inputs: obj       (object to lock)
2259 //         mark_word (current header - KILLED)
2260 //         boxReg    (on-stack box address (displaced header location) - KILLED)
2261 void MacroAssembler::rtm_inflated_locking(ConditionRegister flag,
2262                                           Register obj, Register mark_word, Register boxReg,
2263                                           Register retry_on_busy_count_Reg, Register retry_on_abort_count_Reg,
2264                                           RTMLockingCounters* rtm_counters,
2265                                           Metadata* method_data, bool profile_rtm,
2266                                           Label& DONE_LABEL) {
2267   assert(UseRTMLocking, "why call this otherwise?");
2268   Label L_rtm_retry, L_decrement_retry, L_on_abort;
2269   // Clean monitor_value bit to get valid pointer.
2270   int owner_offset = ObjectMonitor::owner_offset_in_bytes() - markOopDesc::monitor_value;
2271 
2272   // Store non-null, using boxReg instead of (intptr_t)markOopDesc::unused_mark().
2273   std(boxReg, BasicLock::displaced_header_offset_in_bytes(), boxReg);
2274   const Register tmpReg = boxReg;
2275   const Register owner_addr_Reg = mark_word;
2276   addi(owner_addr_Reg, mark_word, owner_offset);
2277 
2278   if (RTMRetryCount > 0) {
2279     load_const_optimized(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy.
2280     load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort.
2281     bind(L_rtm_retry);
2282   }
2283   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2284     Label L_noincrement;
2285     if (RTMTotalCountIncrRate > 1) {
2286       branch_on_random_using_tb(R0, (int)RTMTotalCountIncrRate, L_noincrement);
2287     }
2288     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2289     load_const(R0, (address)rtm_counters->total_count_addr(), tmpReg);
2290     //atomic_inc_ptr(R0, tmpReg); We don't increment atomically
2291     ldx(tmpReg, R0);
2292     addi(tmpReg, tmpReg, 1);
2293     stdx(tmpReg, R0);
2294     bind(L_noincrement);
2295   }
2296   tbegin_();
2297   beq(CCR0, L_on_abort);
2298   // We don't reload mark word. Will only be reset at safepoint.
2299   ld(R0, 0, owner_addr_Reg); // Load in transaction, conflicts need to be tracked.
2300   cmpdi(flag, R0, 0);
2301   beq(flag, DONE_LABEL);
2302 
2303   if (UseRTMXendForLockBusy) {
2304     tend_();
2305     b(L_decrement_retry);
2306   } else {
2307     tabort_();
2308   }
2309   bind(L_on_abort);
2310   const Register abort_status_Reg = tmpReg;
2311   mftexasr(abort_status_Reg);
2312   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2313     rtm_profiling(abort_status_Reg, /*temp*/ owner_addr_Reg, rtm_counters, method_data, profile_rtm);
2314     // Restore owner_addr_Reg
2315     ld(mark_word, oopDesc::mark_offset_in_bytes(), obj);
2316 #ifdef ASSERT
2317     andi_(R0, mark_word, markOopDesc::monitor_value);
2318     asm_assert_ne("must be inflated", 0xa754); // Deflating only allowed at safepoint.
2319 #endif
2320     addi(owner_addr_Reg, mark_word, owner_offset);
2321   }
2322   if (RTMRetryCount > 0) {
2323     // Retry on lock abort if abort status is not permanent.
2324     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
2325   }
2326 
2327   // Appears unlocked - try to swing _owner from null to non-null.
2328   cmpxchgd(flag, /*current val*/ R0, (intptr_t)0, /*new val*/ R16_thread, owner_addr_Reg,
2329            MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2330            MacroAssembler::cmpxchgx_hint_acquire_lock(), noreg, &L_decrement_retry, true);
2331 
2332   if (RTMRetryCount > 0) {
2333     // success done else retry
2334     b(DONE_LABEL);
2335     bind(L_decrement_retry);
2336     // Spin and retry if lock is busy.
2337     rtm_retry_lock_on_busy(retry_on_busy_count_Reg, owner_addr_Reg, L_rtm_retry);
2338   } else {
2339     bind(L_decrement_retry);
2340   }
2341 }
2342 
2343 #endif //  INCLUDE_RTM_OPT
2344 
2345 // "The box" is the space on the stack where we copy the object mark.
2346 void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register oop, Register box,
2347                                                Register temp, Register displaced_header, Register current_header,
2348                                                bool try_bias,
2349                                                RTMLockingCounters* rtm_counters,
2350                                                RTMLockingCounters* stack_rtm_counters,
2351                                                Metadata* method_data,
2352                                                bool use_rtm, bool profile_rtm) {
2353   assert_different_registers(oop, box, temp, displaced_header, current_header);
2354   assert(flag != CCR0, "bad condition register");
2355   Label cont;
2356   Label object_has_monitor;
2357   Label cas_failed;
2358 
2359   // Load markOop from object into displaced_header.
2360   ld(displaced_header, oopDesc::mark_offset_in_bytes(), oop);
2361 
2362 
2363   // Always do locking in runtime.
2364   if (EmitSync & 0x01) {
2365     cmpdi(flag, oop, 0); // Oop can't be 0 here => always false.
2366     return;
2367   }
2368 
2369   if (try_bias) {
2370     biased_locking_enter(flag, oop, displaced_header, temp, current_header, cont);
2371   }
2372 
2373 #if INCLUDE_RTM_OPT
2374   if (UseRTMForStackLocks && use_rtm) {
2375     rtm_stack_locking(flag, oop, displaced_header, temp, /*temp*/ current_header,
2376                       stack_rtm_counters, method_data, profile_rtm,
2377                       cont, object_has_monitor);
2378   }
2379 #endif // INCLUDE_RTM_OPT
2380 
2381   // Handle existing monitor.
2382   if ((EmitSync & 0x02) == 0) {
2383     // The object has an existing monitor iff (mark & monitor_value) != 0.
2384     andi_(temp, displaced_header, markOopDesc::monitor_value);
2385     bne(CCR0, object_has_monitor);
2386   }
2387 
2388   // Set displaced_header to be (markOop of object | UNLOCK_VALUE).
2389   ori(displaced_header, displaced_header, markOopDesc::unlocked_value);
2390 
2391   // Load Compare Value application register.
2392 
2393   // Initialize the box. (Must happen before we update the object mark!)
2394   std(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2395 
2396   // Must fence, otherwise, preceding store(s) may float below cmpxchg.
2397   // Compare object markOop with mark and if equal exchange scratch1 with object markOop.
2398   // CmpxchgX sets cr_reg to cmpX(current, displaced).
2399   membar(Assembler::StoreStore);
2400   cmpxchgd(/*flag=*/flag,
2401            /*current_value=*/current_header,
2402            /*compare_value=*/displaced_header,
2403            /*exchange_value=*/box,
2404            /*where=*/oop,
2405            MacroAssembler::MemBarAcq,
2406            MacroAssembler::cmpxchgx_hint_acquire_lock(),
2407            noreg,
2408            &cas_failed);
2409   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2410 
2411   // If the compare-and-exchange succeeded, then we found an unlocked
2412   // object and we have now locked it.
2413   b(cont);
2414 
2415   bind(cas_failed);
2416   // We did not see an unlocked object so try the fast recursive case.
2417 
2418   // Check if the owner is self by comparing the value in the markOop of object
2419   // (current_header) with the stack pointer.
2420   sub(current_header, current_header, R1_SP);
2421   load_const_optimized(temp, (address) (~(os::vm_page_size()-1) |
2422                                         markOopDesc::lock_mask_in_place));
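       // (current_header - SP) & (~(page_size-1) | lock_mask) is zero iff the
       // displaced mark is a stack address within one page of our SP with
       // clear lock bits, i.e. this thread already owns the lock (recursive
       // stack lock).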
2423 
2424   and_(R0/*==0?*/, current_header, temp);
2425   // If the condition is true we take the cont path and can store 0 as the
2426   // displaced header in the box, which indicates that it is a recursive lock.
2427   mcrf(flag,CCR0);
2428   std(R0/*==0, perhaps*/, BasicLock::displaced_header_offset_in_bytes(), box);
2429 
2430   // Handle existing monitor.
2431   if ((EmitSync & 0x02) == 0) {
2432     b(cont);
2433 
2434     bind(object_has_monitor);
2435     // The object's monitor m is unlocked iff m->owner == NULL,
2436     // otherwise m->owner may contain a thread or a stack address.
2437 
2438 #if INCLUDE_RTM_OPT
2439     // Use the same RTM locking code in 32- and 64-bit VM.
2440     if (use_rtm) {
2441       rtm_inflated_locking(flag, oop, displaced_header, box, temp, /*temp*/ current_header,
2442                            rtm_counters, method_data, profile_rtm, cont);
2443     } else {
2444 #endif // INCLUDE_RTM_OPT
2445 
2446     // Try to CAS m->owner from NULL to current thread.
2447     addi(temp, displaced_header, ObjectMonitor::owner_offset_in_bytes()-markOopDesc::monitor_value);
2448     li(displaced_header, 0);
2449     // CmpxchgX sets flag to cmpX(current, displaced).
2450     cmpxchgd(/*flag=*/flag,
2451              /*current_value=*/current_header,
2452              /*compare_value=*/(intptr_t)0,
2453              /*exchange_value=*/R16_thread,
2454              /*where=*/temp,
2455              MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2456              MacroAssembler::cmpxchgx_hint_acquire_lock());
2457 
2458     // Store a non-null value into the box.
2459     std(box, BasicLock::displaced_header_offset_in_bytes(), box);
2460 
2461 #   ifdef ASSERT
2462     bne(flag, cont);
2463     // We have acquired the monitor, check some invariants.
2464     addi(/*monitor=*/temp, temp, -ObjectMonitor::owner_offset_in_bytes());
2465     // Invariant 1: _recursions should be 0.
2466     //assert(ObjectMonitor::recursions_size_in_bytes() == 8, "unexpected size");
2467     asm_assert_mem8_is_zero(ObjectMonitor::recursions_offset_in_bytes(), temp,
2468                             "monitor->_recursions should be 0", -1);
2469     // Invariant 2: OwnerIsThread shouldn't be 0.
2470     //assert(ObjectMonitor::OwnerIsThread_size_in_bytes() == 4, "unexpected size");
2471     //asm_assert_mem4_isnot_zero(ObjectMonitor::OwnerIsThread_offset_in_bytes(), temp,
2472     //                           "monitor->OwnerIsThread shouldn't be 0", -1);
2473 #   endif
2474 
2475 #if INCLUDE_RTM_OPT
2476     } // use_rtm()
2477 #endif
2478   }
2479 
2480   bind(cont);
2481   // flag == EQ indicates success
2482   // flag == NE indicates failure
2483 }
2484 
2485 void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register oop, Register box,
2486                                                  Register temp, Register displaced_header, Register current_header,
2487                                                  bool try_bias, bool use_rtm) {
2488   assert_different_registers(oop, box, temp, displaced_header, current_header);
2489   assert(flag != CCR0, "bad condition register");
2490   Label cont;
2491   Label object_has_monitor;
2492 
2493   // Always do locking in runtime.
2494   if (EmitSync & 0x01) {
2495     cmpdi(flag, oop, 0); // Oop can't be 0 here => always false.
2496     return;
2497   }
2498 
2499   if (try_bias) {
2500     biased_locking_exit(flag, oop, current_header, cont);
2501   }
2502 
2503 #if INCLUDE_RTM_OPT
2504   if (UseRTMForStackLocks && use_rtm) {
2505     assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
2506     Label L_regular_unlock;
2507     ld(current_header, oopDesc::mark_offset_in_bytes(), oop);         // fetch markword
2508     andi(R0, current_header, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
2509     cmpwi(flag, R0, markOopDesc::unlocked_value);                     // bits = 001 unlocked
2510     bne(flag, L_regular_unlock);                                      // else RegularLock
2511     tend_();                                                          // otherwise end...
2512     b(cont);                                                          // ... and we're done
2513     bind(L_regular_unlock);
2514   }
2515 #endif
2516 
2517   // Find the lock address and load the displaced header from the stack.
2518   ld(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2519 
2520   // If the displaced header is 0, we have a recursive unlock.
2521   cmpdi(flag, displaced_header, 0);
2522   beq(flag, cont);
2523 
2524   // Handle existing monitor.
2525   if ((EmitSync & 0x02) == 0) {
2526     // The object has an existing monitor iff (mark & monitor_value) != 0.
2527     RTM_OPT_ONLY( if (!(UseRTMForStackLocks && use_rtm)) ) // skip load if already done
2528     ld(current_header, oopDesc::mark_offset_in_bytes(), oop);
2529     andi_(R0, current_header, markOopDesc::monitor_value);
2530     bne(CCR0, object_has_monitor);
2531   }
2532 
2533   // Check if it is still a lightweight lock; this is true if we see
2534   // the stack address of the basicLock in the markOop of the object.
2535   // Cmpxchg sets flag to cmpd(current_header, box).
2536   cmpxchgd(/*flag=*/flag,
2537            /*current_value=*/current_header,
2538            /*compare_value=*/box,
2539            /*exchange_value=*/displaced_header,
2540            /*where=*/oop,
2541            MacroAssembler::MemBarRel,
2542            MacroAssembler::cmpxchgx_hint_release_lock(),
2543            noreg,
2544            &cont);
2545 
2546   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2547 
2548   // Handle existing monitor.
2549   if ((EmitSync & 0x02) == 0) {
2550     b(cont);
2551 
2552     bind(object_has_monitor);
2553     addi(current_header, current_header, -markOopDesc::monitor_value); // monitor
2554     ld(temp,             ObjectMonitor::owner_offset_in_bytes(), current_header);
2555 
2556     // It's inflated.
2557 #if INCLUDE_RTM_OPT
2558     if (use_rtm) {
2559       Label L_regular_inflated_unlock;
2560       // Clean monitor_value bit to get valid pointer
2561       cmpdi(flag, temp, 0);
2562       bne(flag, L_regular_inflated_unlock);
2563       tend_();
2564       b(cont);
2565       bind(L_regular_inflated_unlock);
2566     }
2567 #endif
2568 
2569     ld(displaced_header, ObjectMonitor::recursions_offset_in_bytes(), current_header);
2570     xorr(temp, R16_thread, temp);      // Will be 0 if we are the owner.
2571     orr(temp, temp, displaced_header); // Will be 0 if there are 0 recursions.
2572     cmpdi(flag, temp, 0);
2573     bne(flag, cont);
2574 
2575     ld(temp,             ObjectMonitor::EntryList_offset_in_bytes(), current_header);
2576     ld(displaced_header, ObjectMonitor::cxq_offset_in_bytes(), current_header);
2577     orr(temp, temp, displaced_header); // Will be 0 if both are 0.
2578     cmpdi(flag, temp, 0);
2579     bne(flag, cont);
2580     release();
2581     std(temp, ObjectMonitor::owner_offset_in_bytes(), current_header);
2582   }
2583 
2584   bind(cont);
2585   // flag == EQ indicates success
2586   // flag == NE indicates failure
2587 }
2588 
2589 // Write serialization page so VM thread can do a pseudo remote membar.
2590 // We use the current thread pointer to calculate a thread-specific
2591 // offset to write to within the page. This minimizes bus traffic
2592 // due to cache line collisions.
2593 void MacroAssembler::serialize_memory(Register thread, Register tmp1, Register tmp2) {
2594   srdi(tmp2, thread, os::get_serialize_page_shift_count());
2595 
2596   int mask = os::vm_page_size() - sizeof(int);
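       // The per-thread offset is (thread >> shift) masked to stay inside the
       // page and aligned to sizeof(int), so concurrent threads tend to hit
       // different cache lines.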
2597   if (Assembler::is_simm(mask, 16)) {
2598     andi(tmp2, tmp2, mask);
2599   } else {
2600     lis(tmp1, (int)((signed short) (mask >> 16)));
2601     ori(tmp1, tmp1, mask & 0x0000ffff);
2602     andr(tmp2, tmp2, tmp1);
2603   }
2604 
2605   load_const(tmp1, (long) os::get_memory_serialize_page());
2606   release();
2607   stwx(R0, tmp1, tmp2);
2608 }
2609 
2610 
2611 // GC barrier helper macros
2612 
2613 // Write the card table byte if needed.
2614 void MacroAssembler::card_write_barrier_post(Register Rstore_addr, Register Rnew_val, Register Rtmp) {
2615   CardTableModRefBS* bs =
2616     barrier_set_cast<CardTableModRefBS>(Universe::heap()->barrier_set());
2617   assert(bs->kind() == BarrierSet::CardTableForRS ||
2618          bs->kind() == BarrierSet::CardTableExtension, "wrong barrier");
2619 #ifdef ASSERT
2620   cmpdi(CCR0, Rnew_val, 0);
2621   asm_assert_ne("null oop not allowed", 0x321);
2622 #endif
2623   card_table_write(bs->byte_map_base, Rtmp, Rstore_addr);
2624 }
2625 
2626 // Write the card table byte.
2627 void MacroAssembler::card_table_write(jbyte* byte_map_base, Register Rtmp, Register Robj) {
2628   assert_different_registers(Robj, Rtmp, R0);
2629   load_const_optimized(Rtmp, (address)byte_map_base, R0);
2630   srdi(Robj, Robj, CardTableModRefBS::card_shift);
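       // Robj now holds the card index; the zero byte stored below at
       // byte_map_base + index marks the card as dirty.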
2631   li(R0, 0); // dirty
2632   if (UseConcMarkSweepGC) membar(Assembler::StoreStore);
2633   stbx(R0, Rtmp, Robj);
2634 }
2635 
2636 #if INCLUDE_ALL_GCS
2637 // General G1 pre-barrier generator.
2638 // Goal: record the previous value if it is not null.
2639 void MacroAssembler::g1_write_barrier_pre(Register Robj, RegisterOrConstant offset, Register Rpre_val,
2640                                           Register Rtmp1, Register Rtmp2, bool needs_frame) {
2641   Label runtime, filtered;
2642 
2643   // Is marking active?
2644   if (in_bytes(PtrQueue::byte_width_of_active()) == 4) {
2645     lwz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + PtrQueue::byte_offset_of_active()), R16_thread);
2646   } else {
2647     guarantee(in_bytes(PtrQueue::byte_width_of_active()) == 1, "Assumption");
2648     lbz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + PtrQueue::byte_offset_of_active()), R16_thread);
2649   }
2650   cmpdi(CCR0, Rtmp1, 0);
2651   beq(CCR0, filtered);
2652 
2653   // Do we need to load the previous value?
2654   if (Robj != noreg) {
2655     // Load the previous value...
2656     if (UseCompressedOops) {
2657       lwz(Rpre_val, offset, Robj);
2658     } else {
2659       ld(Rpre_val, offset, Robj);
2660     }
2661     // Previous value has been loaded into Rpre_val.
2662   }
2663   assert(Rpre_val != noreg, "must have a real register");
2664 
2665   // Is the previous value null?
2666   cmpdi(CCR0, Rpre_val, 0);
2667   beq(CCR0, filtered);
2668 
2669   if (Robj != noreg && UseCompressedOops) {
2670     decode_heap_oop_not_null(Rpre_val);
2671   }
2672 
2673   // OK, it's not filtered, so we'll need to call enqueue. In the normal
2674   // case, pre_val will be a scratch G-reg, but there are some cases in
2675   // which it's an O-reg. In the former case, do a normal call; in the
2676   // latter, save here and call the frameless version.
2677 
2678   // Can we store original value in the thread's buffer?
2679   // Is index == 0?
2680   // (The index field is typed as size_t.)
2681   const Register Rbuffer = Rtmp1, Rindex = Rtmp2;
2682 
2683   ld(Rindex, in_bytes(JavaThread::satb_mark_queue_offset() + PtrQueue::byte_offset_of_index()), R16_thread);
2684   cmpdi(CCR0, Rindex, 0);
2685   beq(CCR0, runtime); // If index == 0, goto runtime.
2686   ld(Rbuffer, in_bytes(JavaThread::satb_mark_queue_offset() + PtrQueue::byte_offset_of_buf()), R16_thread);
2687 
2688   addi(Rindex, Rindex, -wordSize); // Decrement index.
2689   std(Rindex, in_bytes(JavaThread::satb_mark_queue_offset() + PtrQueue::byte_offset_of_index()), R16_thread);
2690 
2691   // Record the previous value.
2692   stdx(Rpre_val, Rbuffer, Rindex);
2693   b(filtered);
2694 
2695   bind(runtime);
2696 
2697   // The VM call needs a frame to access (write) the O register.
2698   if (needs_frame) {
2699     save_LR_CR(Rtmp1);
2700     push_frame_reg_args(0, Rtmp2);
2701   }
2702 
2703   if (Rpre_val->is_volatile() && Robj == noreg) mr(R31, Rpre_val); // Save pre_val across C call if it was preloaded.
2704   call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), Rpre_val, R16_thread);
2705   if (Rpre_val->is_volatile() && Robj == noreg) mr(Rpre_val, R31); // restore
2706 
2707   if (needs_frame) {
2708     pop_frame();
2709     restore_LR_CR(Rtmp1);
2710   }
2711 
2712   bind(filtered);
2713 }
2714 
2715 // General G1 post-barrier generator
2716 // Store cross-region card.
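//
// Sketch of the post-barrier emitted below (pseudo code):
//   if (((store_addr ^ new_val) >> LogOfHRGrainBytes) == 0) goto filtered;  // same region
//   card_addr = byte_map_base + (store_addr >> card_shift);
//   if (*card_addr == g1_young_card_val()) goto filtered;
//   StoreLoad;                                             // then re-read the card
//   if (*card_addr == dirty_card_val()) goto filtered;
//   *card_addr = dirty_card_val();
//   enqueue card_addr in thread->dirty_card_queue, calling g1_wb_post(card_addr, thread)
//   when the thread-local buffer is full.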
2717 void MacroAssembler::g1_write_barrier_post(Register Rstore_addr, Register Rnew_val, Register Rtmp1, Register Rtmp2, Register Rtmp3, Label *filtered_ext) {
2718   Label runtime, filtered_int;
2719   Label& filtered = (filtered_ext != NULL) ? *filtered_ext : filtered_int;
2720   assert_different_registers(Rstore_addr, Rnew_val, Rtmp1, Rtmp2);
2721 
2722   G1SATBCardTableLoggingModRefBS* bs =
2723     barrier_set_cast<G1SATBCardTableLoggingModRefBS>(Universe::heap()->barrier_set());
2724 
2725   // Does store cross heap regions?
2726   if (G1RSBarrierRegionFilter) {
2727     xorr(Rtmp1, Rstore_addr, Rnew_val);
2728     srdi_(Rtmp1, Rtmp1, HeapRegion::LogOfHRGrainBytes);
2729     beq(CCR0, filtered);
2730   }
2731 
2732   // Crosses regions, storing NULL?
2733 #ifdef ASSERT
2734   cmpdi(CCR0, Rnew_val, 0);
2735   asm_assert_ne("null oop not allowed (G1)", 0x322); // Checked by caller on PPC64, so following branch is obsolete:
2736   //beq(CCR0, filtered);
2737 #endif
2738 
2739   // Storing region crossing non-NULL, is card already dirty?
2740   assert(sizeof(*bs->byte_map_base) == sizeof(jbyte), "adjust this code");
2741   const Register Rcard_addr = Rtmp1;
2742   Register Rbase = Rtmp2;
2743   load_const_optimized(Rbase, (address)bs->byte_map_base, /*temp*/ Rtmp3);
2744 
2745   srdi(Rcard_addr, Rstore_addr, CardTableModRefBS::card_shift);
2746 
2747   // Get the address of the card.
2748   lbzx(/*card value*/ Rtmp3, Rbase, Rcard_addr);
2749   cmpwi(CCR0, Rtmp3, (int)G1SATBCardTableModRefBS::g1_young_card_val());
2750   beq(CCR0, filtered);
2751 
2752   membar(Assembler::StoreLoad);
2753   lbzx(/*card value*/ Rtmp3, Rbase, Rcard_addr);  // Reload after membar.
2754   cmpwi(CCR0, Rtmp3 /* card value */, CardTableModRefBS::dirty_card_val());
2755   beq(CCR0, filtered);
2756 
2757   // Storing a region crossing, non-NULL oop, card is clean.
2758   // Dirty card and log.
2759   li(Rtmp3, CardTableModRefBS::dirty_card_val());
2760   //release(); // G1: oops are allowed to get visible after dirty marking.
2761   stbx(Rtmp3, Rbase, Rcard_addr);
2762 
2763   add(Rcard_addr, Rbase, Rcard_addr); // This is the address which needs to get enqueued.
2764   Rbase = noreg; // end of lifetime
2765 
2766   const Register Rqueue_index = Rtmp2,
2767                  Rqueue_buf   = Rtmp3;
2768   ld(Rqueue_index, in_bytes(JavaThread::dirty_card_queue_offset() + PtrQueue::byte_offset_of_index()), R16_thread);
2769   cmpdi(CCR0, Rqueue_index, 0);
2770   beq(CCR0, runtime); // index == 0 then jump to runtime
2771   ld(Rqueue_buf, in_bytes(JavaThread::dirty_card_queue_offset() + PtrQueue::byte_offset_of_buf()), R16_thread);
2772 
2773   addi(Rqueue_index, Rqueue_index, -wordSize); // decrement index
2774   std(Rqueue_index, in_bytes(JavaThread::dirty_card_queue_offset() + PtrQueue::byte_offset_of_index()), R16_thread);
2775 
2776   stdx(Rcard_addr, Rqueue_buf, Rqueue_index); // store card
2777   b(filtered);
2778 
2779   bind(runtime);
2780 
2781   // Save the live input values.
2782   call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), Rcard_addr, R16_thread);
2783 
2784   bind(filtered_int);
2785 }
2786 #endif // INCLUDE_ALL_GCS
2787 
2788 // Values for last_Java_pc and last_Java_sp must comply with the rules
2789 // in frame_ppc.hpp.
2790 void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) {
2791   // Always set last_Java_pc and flags first because once last_Java_sp
2792   // is visible, has_last_Java_frame is true and users will look at the
2793   // rest of the fields. (Note: flags should always be zero before we
2794   // get here, so they don't need to be set.)
2795 
2796   // Verify that last_Java_pc was zeroed on return to Java
2797   asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread,
2798                           "last_Java_pc not zeroed before leaving Java", 0x200);
2799 
2800   // When returning from a call out of Java, the frame anchor's
2801   // last_Java_pc is always set back to NULL. It is set here so that,
2802   // if we are doing a call to native code (not to the VM), we capture the
2803   // known pc and don't have to rely on the native call having a
2804   // standard frame linkage from which the pc could be found.
2805   if (last_Java_pc != noreg)
2806     std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
2807 
2808   // Set last_Java_sp last.
2809   std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
2810 }
2811 
2812 void MacroAssembler::reset_last_Java_frame(void) {
2813   asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()),
2814                              R16_thread, "SP was not set, still zero", 0x202);
2815 
2816   BLOCK_COMMENT("reset_last_Java_frame {");
2817   li(R0, 0);
2818 
2819   // _last_Java_sp = 0
2820   std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
2821 
2822   // _last_Java_pc = 0
2823   std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
2824   BLOCK_COMMENT("} reset_last_Java_frame");
2825 }
2826 
2827 void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1) {
2828   assert_different_registers(sp, tmp1);
2829 
2830   // sp points to a TOP_IJAVA_FRAME, retrieve frame's PC via
2831   // TOP_IJAVA_FRAME_ABI.
2832   // FIXME: assert that we really have a TOP_IJAVA_FRAME here!
2833 #ifdef CC_INTERP
2834   ld(tmp1/*pc*/, _top_ijava_frame_abi(frame_manager_lr), sp);
2835 #else
2836   address entry = pc();
2837   load_const_optimized(tmp1, entry);
2838 #endif
2839 
2840   set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1);
2841 }
2842 
2843 void MacroAssembler::get_vm_result(Register oop_result) {
2844   // Read:
2845   //   R16_thread
2846   //   R16_thread->in_bytes(JavaThread::vm_result_offset())
2847   //
2848   // Updated:
2849   //   oop_result
2850   //   R16_thread->in_bytes(JavaThread::vm_result_offset())
2851 
2852   verify_thread();
2853 
2854   ld(oop_result, in_bytes(JavaThread::vm_result_offset()), R16_thread);
2855   li(R0, 0);
2856   std(R0, in_bytes(JavaThread::vm_result_offset()), R16_thread);
2857 
2858   verify_oop(oop_result);
2859 }
2860 
2861 void MacroAssembler::get_vm_result_2(Register metadata_result) {
2862   // Read:
2863   //   R16_thread
2864   //   R16_thread->in_bytes(JavaThread::vm_result_2_offset())
2865   //
2866   // Updated:
2867   //   metadata_result
2868   //   R16_thread->in_bytes(JavaThread::vm_result_2_offset())
2869 
2870   ld(metadata_result, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
2871   li(R0, 0);
2872   std(R0, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
2873 }
2874 
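// Compressed class pointer encoding used by the routines below (sketch; base and
// shift are Universe::narrow_klass_base() and Universe::narrow_klass_shift()):
//   narrow_klass = (klass - base) >> shift;          // encode_klass_not_null
//   klass        = (narrow_klass << shift) + base;   // decode_klass_not_null
// Either step is skipped when the corresponding base/shift is zero.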
2875 Register MacroAssembler::encode_klass_not_null(Register dst, Register src) {
2876   Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided.
2877   if (Universe::narrow_klass_base() != 0) {
2878     // Use dst as temp if it is free.
2879     sub_const_optimized(dst, current, Universe::narrow_klass_base(), R0);
2880     current = dst;
2881   }
2882   if (Universe::narrow_klass_shift() != 0) {
2883     srdi(dst, current, Universe::narrow_klass_shift());
2884     current = dst;
2885   }
2886   return current;
2887 }
2888 
2889 void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) {
2890   if (UseCompressedClassPointers) {
2891     Register compressedKlass = encode_klass_not_null(ck, klass);
2892     stw(compressedKlass, oopDesc::klass_offset_in_bytes(), dst_oop);
2893   } else {
2894     std(klass, oopDesc::klass_offset_in_bytes(), dst_oop);
2895   }
2896 }
2897 
2898 void MacroAssembler::store_klass_gap(Register dst_oop, Register val) {
2899   if (UseCompressedClassPointers) {
2900     if (val == noreg) {
2901       val = R0;
2902       li(val, 0);
2903     }
2904     stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop); // klass gap if compressed
2905   }
2906 }
2907 
2908 int MacroAssembler::instr_size_for_decode_klass_not_null() {
2909   if (!UseCompressedClassPointers) return 0;
2910   int num_instrs = 1;  // shift or move
2911   if (Universe::narrow_klass_base() != 0) num_instrs = 7;  // shift + load const + add
2912   return num_instrs * BytesPerInstWord;
2913 }
2914 
2915 void MacroAssembler::decode_klass_not_null(Register dst, Register src) {
2916   assert(dst != R0, "Dst reg may not be R0, as R0 is used here.");
2917   if (src == noreg) src = dst;
2918   Register shifted_src = src;
2919   if (Universe::narrow_klass_shift() != 0 ||
2920       (Universe::narrow_klass_base() == 0 && src != dst)) {  // Move required.
2921     shifted_src = dst;
2922     sldi(shifted_src, src, Universe::narrow_klass_shift());
2923   }
2924   if (Universe::narrow_klass_base() != 0) {
2925     add_const_optimized(dst, shifted_src, Universe::narrow_klass_base(), R0);
2926   }
2927 }
2928 
2929 void MacroAssembler::load_klass(Register dst, Register src) {
2930   if (UseCompressedClassPointers) {
2931     lwz(dst, oopDesc::klass_offset_in_bytes(), src);
2932     // Attention: no null check here!
2933     decode_klass_not_null(dst, dst);
2934   } else {
2935     ld(dst, oopDesc::klass_offset_in_bytes(), src);
2936   }
2937 }
2938 
2939 void MacroAssembler::load_klass_with_trap_null_check(Register dst, Register src) {
2940   if (!os::zero_page_read_protected()) {
2941     if (TrapBasedNullChecks) {
2942       trap_null_check(src);
2943     }
2944   }
2945   load_klass(dst, src);
2946 }
2947 
2948 void MacroAssembler::reinit_heapbase(Register d, Register tmp) {
2949   if (Universe::heap() != NULL) {
2950     load_const_optimized(R30, Universe::narrow_ptrs_base(), tmp);
2951   } else {
2952     // Heap not yet allocated. Load indirectly.
2953     int simm16_offset = load_const_optimized(R30, Universe::narrow_ptrs_base_addr(), tmp, true);
2954     ld(R30, simm16_offset, R30);
2955   }
2956 }
2957 
2958 // Clear Array
2959 // Kills both input registers. tmp == R0 is allowed.
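// Roughly equivalent C (sketch; counts too small for at least dcbz_min cache lines
// fall straight through to the tail loop):
//   while (base_ptr & (cl_size - 1)) { *base_ptr = 0; base_ptr += 8; cnt_dwords--; }  // align
//   for (i = cnt_dwords >> cl_dw_addr_bits; i > 0; i--) { dcbz(base_ptr); base_ptr += cl_size; }
//   for (i = cnt_dwords & (cl_dwords - 1); i > 0; i--)  { *base_ptr = 0; base_ptr += 8; }  // tail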
2960 void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp) {
2961   // Procedure for large arrays (uses data cache block zero instruction).
2962     Label startloop, fast, fastloop, small_rest, restloop, done;
2963     const int cl_size         = VM_Version::get_cache_line_size(),
2964               cl_dwords       = cl_size>>3,
2965               cl_dw_addr_bits = exact_log2(cl_dwords),
2966               dcbz_min        = 1;                     // Min count of dcbz executions, needs to be >0.
2967 
2968 //2:
2969     cmpdi(CCR1, cnt_dwords, ((dcbz_min+1)<<cl_dw_addr_bits)-1); // Big enough? (ensure >=dcbz_min lines included).
2970     blt(CCR1, small_rest);                                      // Too small.
2971     rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits);           // Extract dword offset within first cache line.
2972     beq(CCR0, fast);                                            // Already 128byte aligned.
2973 
2974     subfic(tmp, tmp, cl_dwords);
2975     mtctr(tmp);                        // Set ctr to hit 128byte boundary (0<ctr<cl_dwords).
2976     subf(cnt_dwords, tmp, cnt_dwords); // rest.
2977     li(tmp, 0);
2978 //10:
2979   bind(startloop);                     // Clear at the beginning to reach 128byte boundary.
2980     std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
2981     addi(base_ptr, base_ptr, 8);
2982     bdnz(startloop);
2983 //13:
2984   bind(fast);                                  // Clear 128byte blocks.
2985     srdi(tmp, cnt_dwords, cl_dw_addr_bits);    // Loop count for 128byte loop (>0).
2986     andi(cnt_dwords, cnt_dwords, cl_dwords-1); // Rest in dwords.
2987     mtctr(tmp);                                // Load counter.
2988 //16:
2989   bind(fastloop);
2990     dcbz(base_ptr);                    // Clear 128byte aligned block.
2991     addi(base_ptr, base_ptr, cl_size);
2992     bdnz(fastloop);
2993     if (InsertEndGroupPPC64) { endgroup(); } else { nop(); }
2994 //20:
2995   bind(small_rest);
2996     cmpdi(CCR0, cnt_dwords, 0);        // size 0?
2997     beq(CCR0, done);                   // rest == 0
2998     li(tmp, 0);
2999     mtctr(cnt_dwords);                 // Load counter.
3000 //24:
3001   bind(restloop);                      // Clear rest.
3002     std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
3003     addi(base_ptr, base_ptr, 8);
3004     bdnz(restloop);
3005 //27:
3006   bind(done);
3007 }
3008 
3009 /////////////////////////////////////////// String intrinsics ////////////////////////////////////////////
3010 
3011 // Search for a single jchar in an jchar[].
3012 //
3013 // Assumes that result differs from all other registers.
3014 //
3015 // Haystack, needle are the addresses of jchar-arrays.
3016 // NeedleChar is needle[0] if it is known at compile time.
3017 // Haycnt is the length of the haystack. We assume haycnt >=1.
3018 //
3019 // Preserves haystack, haycnt, kills all other registers.
3020 //
3021 // If needle == R0, we search for the constant needleChar.
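//
// In Java terms the routine computes (sketch):
//   for (int i = 0; i < haycnt; i++) {
//     if (haystack[i] == ch) return i;   // ch is needle[0], or needleChar if needle == R0
//   }
//   return -1;
// The emitted loop is 2x unrolled; an odd trailing character is checked separately.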
3022 void MacroAssembler::string_indexof_1(Register result, Register haystack, Register haycnt,
3023                                       Register needle, jchar needleChar,
3024                                       Register tmp1, Register tmp2) {
3025 
3026   assert_different_registers(result, haystack, haycnt, needle, tmp1, tmp2);
3027 
3028   Label L_InnerLoop, L_FinalCheck, L_Found1, L_Found2, L_Found3, L_NotFound, L_End;
3029   Register needle0 = needle, // Contains needle[0].
3030            addr = tmp1,
3031            ch1 = tmp2,
3032            ch2 = R0;
3033 
3034 //2 (variable) or 3 (const):
3035    if (needle != R0) lhz(needle0, 0, needle); // Preload needle character, needle has len==1.
3036    dcbtct(haystack, 0x00);                        // Indicate R/O access to haystack.
3037 
3038    srwi_(tmp2, haycnt, 1);   // Shift right by exact_log2(UNROLL_FACTOR).
3039    mr(addr, haystack);
3040    beq(CCR0, L_FinalCheck);
3041    mtctr(tmp2);              // Move to count register.
3042 //8:
3043   bind(L_InnerLoop);             // Main work horse (2x unrolled search loop).
3044    lhz(ch1, 0, addr);        // Load characters from haystack.
3045    lhz(ch2, 2, addr);
3046    (needle != R0) ? cmpw(CCR0, ch1, needle0) : cmplwi(CCR0, ch1, needleChar);
3047    (needle != R0) ? cmpw(CCR1, ch2, needle0) : cmplwi(CCR1, ch2, needleChar);
3048    beq(CCR0, L_Found1);   // Did we find the needle?
3049    beq(CCR1, L_Found2);
3050    addi(addr, addr, 4);
3051    bdnz(L_InnerLoop);
3052 //16:
3053   bind(L_FinalCheck);
3054    andi_(R0, haycnt, 1);
3055    beq(CCR0, L_NotFound);
3056    lhz(ch1, 0, addr);        // One position left at which we have to compare.
3057    (needle != R0) ? cmpw(CCR1, ch1, needle0) : cmplwi(CCR1, ch1, needleChar);
3058    beq(CCR1, L_Found3);
3059 //21:
3060   bind(L_NotFound);
3061    li(result, -1);           // Not found.
3062    b(L_End);
3063 
3064   bind(L_Found2);
3065    addi(addr, addr, 2);
3066 //24:
3067   bind(L_Found1);
3068   bind(L_Found3);                  // Return index ...
3069    subf(addr, haystack, addr); // relative to haystack,
3070    srdi(result, addr, 1);      // in characters.
3071   bind(L_End);
3072 }
3073 
3074 
3075 // Implementation of IndexOf for jchar arrays.
3076 //
3077 // The lengths of haystack and needle are not constant, i.e. they are passed in registers.
3078 //
3079 // Preserves registers haystack, needle.
3080 // Kills registers haycnt, needlecnt.
3081 // Assumes that result differs from all other registers.
3082 // Haystack, needle are the addresses of jchar-arrays.
3083 // Haycnt, needlecnt are the lengths of them, respectively.
3084 //
3085 // Needlecntval must be zero or a 15-bit unsigned immediate that is > 1.
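//
// Sketch of the algorithm (Java-like pseudo code):
//   outer:
//   for (int i = 0; i <= haycnt - needlecnt; i++) {
//     if (haystack[i] != needle[0] || haystack[i+1] != needle[1]) continue;  // 2-char prefix filter
//     for (int j = 2; j < needlecnt; j++) {
//       if (haystack[i+j] != needle[j]) continue outer;
//     }
//     return i;
//   }
//   return -1;
// A variable needlecnt < 2 is handled by a separate single-character loop (L_TooShort).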
3086 void MacroAssembler::string_indexof(Register result, Register haystack, Register haycnt,
3087                                     Register needle, ciTypeArray* needle_values, Register needlecnt, int needlecntval,
3088                                     Register tmp1, Register tmp2, Register tmp3, Register tmp4) {
3089 
3090   // Ensure 0<needlecnt<=haycnt in ideal graph as prerequisite!
3091   Label L_TooShort, L_Found, L_NotFound, L_End;
3092   Register last_addr = haycnt, // Kill haycnt at the beginning.
3093            addr      = tmp1,
3094            n_start   = tmp2,
3095            ch1       = tmp3,
3096            ch2       = R0;
3097 
3098   // **************************************************************************************************
3099   // Prepare for main loop: optimized for needle count >=2, bail out otherwise.
3100   // **************************************************************************************************
3101 
3102 //1 (variable) or 3 (const):
3103    dcbtct(needle, 0x00);    // Indicate R/O access to str1.
3104    dcbtct(haystack, 0x00);  // Indicate R/O access to str2.
3105 
3106   // Compute last haystack addr to use if no match gets found.
3107   if (needlecntval == 0) { // variable needlecnt
3108 //3:
3109    subf(ch1, needlecnt, haycnt);      // Last character index to compare is haycnt-needlecnt.
3110    addi(addr, haystack, -2);          // Accesses use pre-increment.
3111    cmpwi(CCR6, needlecnt, 2);
3112    blt(CCR6, L_TooShort);          // Variable needlecnt: handle short needle separately.
3113    slwi(ch1, ch1, 1);                 // Scale to number of bytes.
3114    lwz(n_start, 0, needle);           // Load first 2 characters of needle.
3115    add(last_addr, haystack, ch1);     // Point to last address to compare (haystack+2*(haycnt-needlecnt)).
3116    addi(needlecnt, needlecnt, -2);    // Rest of needle.
3117   } else { // constant needlecnt
3118    guarantee(needlecntval != 1, "IndexOf with single-character needle must be handled separately");
3119    assert((needlecntval & 0x7fff) == needlecntval, "wrong immediate");
3120 //5:
3121    addi(ch1, haycnt, -needlecntval);  // Last character index to compare is haycnt-needlecnt.
3122    lwz(n_start, 0, needle);           // Load first 2 characters of needle.
3123    addi(addr, haystack, -2);          // Accesses use pre-increment.
3124    slwi(ch1, ch1, 1);                 // Scale to number of bytes.
3125    add(last_addr, haystack, ch1);     // Point to last address to compare (haystack+2*(haycnt-needlecnt)).
3126    li(needlecnt, needlecntval-2);     // Rest of needle.
3127   }
3128 
3129   // Main Loop (now we have at least 3 characters).
3130 //11:
3131   Label L_OuterLoop, L_InnerLoop, L_FinalCheck, L_Comp1, L_Comp2, L_Comp3;
3132   bind(L_OuterLoop); // Search for 1st 2 characters.
3133   Register addr_diff = tmp4;
3134    subf(addr_diff, addr, last_addr); // Difference between already checked address and last address to check.
3135    addi(addr, addr, 2);              // This is the new address we want to use for comparing.
3136    srdi_(ch2, addr_diff, 2);
3137    beq(CCR0, L_FinalCheck);       // 2 characters left?
3138    mtctr(ch2);                       // addr_diff/4
3139 //16:
3140   bind(L_InnerLoop);                // Main work horse (2x unrolled search loop)
3141    lwz(ch1, 0, addr);           // Load 2 characters of haystack (ignore alignment).
3142    lwz(ch2, 2, addr);
3143    cmpw(CCR0, ch1, n_start); // Compare 2 characters (1 would be sufficient but try to reduce branches to CompLoop).
3144    cmpw(CCR1, ch2, n_start);
3145    beq(CCR0, L_Comp1);       // Did we find the needle start?
3146    beq(CCR1, L_Comp2);
3147    addi(addr, addr, 4);
3148    bdnz(L_InnerLoop);
3149 //24:
3150   bind(L_FinalCheck);
3151    rldicl_(addr_diff, addr_diff, 64-1, 63); // Remaining characters not covered by InnerLoop: (addr_diff>>1)&1.
3152    beq(CCR0, L_NotFound);
3153    lwz(ch1, 0, addr);                       // One position left at which we have to compare.
3154    cmpw(CCR1, ch1, n_start);
3155    beq(CCR1, L_Comp3);
3156 //29:
3157   bind(L_NotFound);
3158    li(result, -1); // not found
3159    b(L_End);
3160 
3161 
3162    // **************************************************************************************************
3163    // Special Case: unfortunately, the variable needle case can be called with needlecnt<2
3164    // **************************************************************************************************
3165 //31:
3166  if ((needlecntval >> 1) != 1) { // Const needlecnt is 2 or 3? Reduce code size.
3167   int nopcnt = 5;
3168   if (needlecntval != 0) ++nopcnt; // Balance alignment (other case: see below).
3169   if (needlecntval == 0) {         // We have to handle these cases separately.
3170   Label L_OneCharLoop;
3171   bind(L_TooShort);
3172    mtctr(haycnt);
3173    lhz(n_start, 0, needle);    // First character of needle
3174   bind(L_OneCharLoop);
3175    lhzu(ch1, 2, addr);
3176    cmpw(CCR1, ch1, n_start);
3177    beq(CCR1, L_Found);      // Did we find the one character needle?
3178    bdnz(L_OneCharLoop);
3179    li(result, -1);             // Not found.
3180    b(L_End);
3181   } // 8 instructions, so no impact on alignment.
3182   for (int x = 0; x < nopcnt; ++x) nop();
3183  }
3184 
3185   // **************************************************************************************************
3186   // Regular Case Part II: compare rest of needle (first 2 characters have been compared already)
3187   // **************************************************************************************************
3188 
3189   // Compare the rest
3190 //36 if needlecntval==0, else 37:
3191   bind(L_Comp2);
3192    addi(addr, addr, 2); // First comparison has failed, 2nd one hit.
3193   bind(L_Comp1);            // Addr points to possible needle start.
3194   bind(L_Comp3);            // Could have created a copy and used a different return address, but we save code size here.
3195   if (needlecntval != 2) {  // Const needlecnt==2?
3196    if (needlecntval != 3) {
3197     if (needlecntval == 0) beq(CCR6, L_Found); // Variable needlecnt==2?
3198     Register ind_reg = tmp4;
3199     li(ind_reg, 2*2);   // First 2 characters are already compared, use index 2.
3200     mtctr(needlecnt);   // Decremented by 2, still > 0.
3201 //40:
3202    Label L_CompLoop;
3203    bind(L_CompLoop);
3204     lhzx(ch2, needle, ind_reg);
3205     lhzx(ch1, addr, ind_reg);
3206     cmpw(CCR1, ch1, ch2);
3207     bne(CCR1, L_OuterLoop);
3208     addi(ind_reg, ind_reg, 2);
3209     bdnz(L_CompLoop);
3210    } else { // No loop required if there's only one needle character left.
3211     lhz(ch2, 2*2, needle);
3212     lhz(ch1, 2*2, addr);
3213     cmpw(CCR1, ch1, ch2);
3214     bne(CCR1, L_OuterLoop);
3215    }
3216   }
3217   // Return index ...
3218 //46:
3219   bind(L_Found);
3220    subf(addr, haystack, addr); // relative to haystack, ...
3221    srdi(result, addr, 1);      // in characters.
3222 //48:
3223   bind(L_End);
3224 }
3225 
3226 // Implementation of Compare for jchar arrays.
3227 //
3228 // Kills the registers str1, str2, cnt1, cnt2.
3229 // Kills cr0, ctr.
3230 // Assumes that result differs from the input registers.
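//
// Semantics (sketch, analogous to String.compareTo on char arrays):
//   for (int i = 0; i < min(cnt1, cnt2); i++) {
//     if (str1[i] != str2[i]) return str1[i] - str2[i];
//   }
//   return cnt1 - cnt2;
// The fast loop compares 4 characters (one doubleword) per iteration and falls back
// to a per-character loop to find the exact mismatch position.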
3231 void MacroAssembler::string_compare(Register str1_reg, Register str2_reg, Register cnt1_reg, Register cnt2_reg,
3232                                     Register result_reg, Register tmp_reg) {
3233    assert_different_registers(result_reg, str1_reg, str2_reg, cnt1_reg, cnt2_reg, tmp_reg);
3234 
3235    Label Ldone, Lslow_case, Lslow_loop, Lfast_loop;
3236    Register cnt_diff = R0,
3237             limit_reg = cnt1_reg,
3238             chr1_reg = result_reg,
3239             chr2_reg = cnt2_reg,
3240             addr_diff = str2_reg;
3241 
3242    // Offset 0 should be 32 byte aligned.
3243 //-4:
3244     dcbtct(str1_reg, 0x00);  // Indicate R/O access to str1.
3245     dcbtct(str2_reg, 0x00);  // Indicate R/O access to str2.
3246 //-2:
3247    // Compute min(cnt1, cnt2) and check if 0 (bail out if we don't need to compare characters).
3248     subf(result_reg, cnt2_reg, cnt1_reg);  // difference between cnt1/2
3249     subf_(addr_diff, str1_reg, str2_reg);  // alias?
3250     beq(CCR0, Ldone);                   // return cnt difference if both strings are identical (same address)
3251     srawi(limit_reg, result_reg, 31);      // generate signmask (cnt1/2 must be non-negative so cnt_diff can't overflow)
3252     mr(cnt_diff, result_reg);
3253     andr(limit_reg, result_reg, limit_reg); // difference or zero (negative): cnt1<cnt2 ? cnt1-cnt2 : 0
3254     add_(limit_reg, cnt2_reg, limit_reg);  // min(cnt1, cnt2)==0?
3255     beq(CCR0, Ldone);                   // return cnt difference if one has 0 length
3256 
3257     lhz(chr1_reg, 0, str1_reg);            // optional: early out if first characters mismatch
3258     lhzx(chr2_reg, str1_reg, addr_diff);   // optional: early out if first characters mismatch
3259     addi(tmp_reg, limit_reg, -1);          // min(cnt1, cnt2)-1
3260     subf_(result_reg, chr2_reg, chr1_reg); // optional: early out if first characters mismatch
3261     bne(CCR0, Ldone);                   // optional: early out if first characters mismatch
3262 
3263    // Set loop counter by scaling down tmp_reg
3264     srawi_(chr2_reg, tmp_reg, exact_log2(4)); // (min(cnt1, cnt2)-1)/4
3265     ble(CCR0, Lslow_case);                 // need >4 characters for fast loop
3266     andi(limit_reg, tmp_reg, 4-1);            // remaining characters
3267 
3268    // Adapt str1_reg str2_reg for the first loop iteration
3269     mtctr(chr2_reg);                 // (min(cnt1, cnt2)-1)/4
3270     addi(limit_reg, limit_reg, 4+1); // compare last 5-8 characters in slow_case if mismatch found in fast_loop
3271 //16:
3272    // Compare the rest of the characters
3273    bind(Lfast_loop);
3274     ld(chr1_reg, 0, str1_reg);
3275     ldx(chr2_reg, str1_reg, addr_diff);
3276     cmpd(CCR0, chr2_reg, chr1_reg);
3277     bne(CCR0, Lslow_case); // return chr1_reg
3278     addi(str1_reg, str1_reg, 4*2);
3279     bdnz(Lfast_loop);
3280     addi(limit_reg, limit_reg, -4); // no mismatch found in fast_loop, only 1-4 characters missing
3281 //23:
3282    bind(Lslow_case);
3283     mtctr(limit_reg);
3284 //24:
3285    bind(Lslow_loop);
3286     lhz(chr1_reg, 0, str1_reg);
3287     lhzx(chr2_reg, str1_reg, addr_diff);
3288     subf_(result_reg, chr2_reg, chr1_reg);
3289     bne(CCR0, Ldone); // return chr1_reg
3290     addi(str1_reg, str1_reg, 1*2);
3291     bdnz(Lslow_loop);
3292 //30:
3293    // If strings are equal up to min length, return the length difference.
3294     mr(result_reg, cnt_diff);
3295     nop(); // alignment
3296 //32:
3297    // Otherwise, return the difference between the first mismatched chars.
3298    bind(Ldone);
3299 }
3300 
3301 
3302 // Compare char[] arrays.
3303 //
3304 // str1_reg   USE only
3305 // str2_reg   USE only
3306 // cnt_reg    USE_DEF, due to tmp reg shortage
3307 // result_reg DEF only, might compromise USE only registers
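//
// Sketch of the comparison performed below:
//   for (int i = 0; i < cnt; i++) {
//     if (str1[i] != str2[i]) return 0;
//   }
//   return 1;
// The main loop compares 4 jchars (one doubleword) per iteration; the remaining
// 0..3 characters are handled by the Lcbc loop.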
3308 void MacroAssembler::char_arrays_equals(Register str1_reg, Register str2_reg, Register cnt_reg, Register result_reg,
3309                                         Register tmp1_reg, Register tmp2_reg, Register tmp3_reg, Register tmp4_reg,
3310                                         Register tmp5_reg) {
3311 
3312   // Str1 may be the same register as str2, which can occur e.g. after scalar replacement.
3313   assert_different_registers(result_reg, str1_reg, cnt_reg, tmp1_reg, tmp2_reg, tmp3_reg, tmp4_reg, tmp5_reg);
3314   assert_different_registers(result_reg, str2_reg, cnt_reg, tmp1_reg, tmp2_reg, tmp3_reg, tmp4_reg, tmp5_reg);
3315 
3316   // Offset 0 should be 32 byte aligned.
3317   Label Linit_cbc, Lcbc, Lloop, Ldone_true, Ldone_false;
3318   Register index_reg = tmp5_reg;
3319   Register cbc_iter  = tmp4_reg;
3320 
3321 //-1:
3322   dcbtct(str1_reg, 0x00);  // Indicate R/O access to str1.
3323   dcbtct(str2_reg, 0x00);  // Indicate R/O access to str2.
3324 //1:
3325   andi(cbc_iter, cnt_reg, 4-1);            // Remaining iterations after 4 java characters per iteration loop.
3326   li(index_reg, 0); // init
3327   li(result_reg, 0); // assume false
3328   srwi_(tmp2_reg, cnt_reg, exact_log2(4)); // Div: 4 java characters per iteration (main loop).
3329 
3330   cmpwi(CCR1, cbc_iter, 0);             // CCR1 = (cbc_iter==0)
3331   beq(CCR0, Linit_cbc);                 // too short
3332     mtctr(tmp2_reg);
3333 //8:
3334     bind(Lloop);
3335       ldx(tmp1_reg, str1_reg, index_reg);
3336       ldx(tmp2_reg, str2_reg, index_reg);
3337       cmpd(CCR0, tmp1_reg, tmp2_reg);
3338       bne(CCR0, Ldone_false);  // Unequal char pair found -> done.
3339       addi(index_reg, index_reg, 4*sizeof(jchar));
3340       bdnz(Lloop);
3341 //14:
3342   bind(Linit_cbc);
3343   beq(CCR1, Ldone_true);
3344     mtctr(cbc_iter);
3345 //16:
3346     bind(Lcbc);
3347       lhzx(tmp1_reg, str1_reg, index_reg);
3348       lhzx(tmp2_reg, str2_reg, index_reg);
3349       cmpw(CCR0, tmp1_reg, tmp2_reg);
3350       bne(CCR0, Ldone_false);  // Unequal char pair found -> done.
3351       addi(index_reg, index_reg, 1*sizeof(jchar));
3352       bdnz(Lcbc);
3353     nop();
3354   bind(Ldone_true);
3355   li(result_reg, 1);
3356 //24:
3357   bind(Ldone_false);
3358 }
3359 
3360 
3361 void MacroAssembler::char_arrays_equalsImm(Register str1_reg, Register str2_reg, int cntval, Register result_reg,
3362                                            Register tmp1_reg, Register tmp2_reg) {
3363   // Str1 may be the same register as str2, which can occur e.g. after scalar replacement.
3364   assert_different_registers(result_reg, str1_reg, tmp1_reg, tmp2_reg);
3365   assert_different_registers(result_reg, str2_reg, tmp1_reg, tmp2_reg);
3366   assert(sizeof(jchar) == 2, "must be");
3367   assert(cntval >= 0 && ((cntval & 0x7fff) == cntval), "wrong immediate");
3368 
3369   Label Ldone_false;
3370 
3371   if (cntval < 16) { // short case
3372     if (cntval != 0) li(result_reg, 0); // assume false
3373 
3374     const int num_bytes = cntval*sizeof(jchar);
3375     int index = 0;
3376     for (int next_index; (next_index = index + 8) <= num_bytes; index = next_index) {
3377       ld(tmp1_reg, index, str1_reg);
3378       ld(tmp2_reg, index, str2_reg);
3379       cmpd(CCR0, tmp1_reg, tmp2_reg);
3380       bne(CCR0, Ldone_false);
3381     }
3382     if (cntval & 2) {
3383       lwz(tmp1_reg, index, str1_reg);
3384       lwz(tmp2_reg, index, str2_reg);
3385       cmpw(CCR0, tmp1_reg, tmp2_reg);
3386       bne(CCR0, Ldone_false);
3387       index += 4;
3388     }
3389     if (cntval & 1) {
3390       lhz(tmp1_reg, index, str1_reg);
3391       lhz(tmp2_reg, index, str2_reg);
3392       cmpw(CCR0, tmp1_reg, tmp2_reg);
3393       bne(CCR0, Ldone_false);
3394     }
3395     // fallthrough: true
3396   } else {
3397     Label Lloop;
3398     Register index_reg = tmp1_reg;
3399     const int loopcnt = cntval/4;
3400     assert(loopcnt > 0, "must be");
3401     // Offset 0 should be 32 byte aligned.
3402     //2:
3403     dcbtct(str1_reg, 0x00);  // Indicate R/O access to str1.
3404     dcbtct(str2_reg, 0x00);  // Indicate R/O access to str2.
3405     li(tmp2_reg, loopcnt);
3406     li(index_reg, 0); // init
3407     li(result_reg, 0); // assume false
3408     mtctr(tmp2_reg);
3409     //8:
3410     bind(Lloop);
3411     ldx(R0, str1_reg, index_reg);
3412     ldx(tmp2_reg, str2_reg, index_reg);
3413     cmpd(CCR0, R0, tmp2_reg);
3414     bne(CCR0, Ldone_false);  // Unequal char pair found -> done.
3415     addi(index_reg, index_reg, 4*sizeof(jchar));
3416     bdnz(Lloop);
3417     //14:
3418     if (cntval & 2) {
3419       lwzx(R0, str1_reg, index_reg);
3420       lwzx(tmp2_reg, str2_reg, index_reg);
3421       cmpw(CCR0, R0, tmp2_reg);
3422       bne(CCR0, Ldone_false);
3423       if (cntval & 1) addi(index_reg, index_reg, 2*sizeof(jchar));
3424     }
3425     if (cntval & 1) {
3426       lhzx(R0, str1_reg, index_reg);
3427       lhzx(tmp2_reg, str2_reg, index_reg);
3428       cmpw(CCR0, R0, tmp2_reg);
3429       bne(CCR0, Ldone_false);
3430     }
3431     // fallthru: true
3432   }
3433   li(result_reg, 1);
3434   bind(Ldone_false);
3435 }
3436 
3437 // Helpers for Intrinsic Emitters
3438 //
3439 // Revert the byte order of a 32bit value in a register
3440 //   src: 0x44556677
3441 //   dst: 0x77665544
3442 // Three steps to obtain the result:
3443 //  1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word
3444 //     into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared.
3445 //     This value initializes dst.
3446 //  2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost
3447 //     byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go.
3448 //     This value is mask inserted into dst with a [0..23] mask of 1s.
3449 //  3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position.
3450 //     This value is mask inserted into dst with a [8..15] mask of 1s.
3451 void MacroAssembler::load_reverse_32(Register dst, Register src) {
3452   assert_different_registers(dst, src);
3453 
3454   rldicl(dst, src, (4+1)*8, 56);       // Rotate byte 4 into position 7 (rightmost), clear all to the left.
3455   rlwimi(dst, src,     3*8,  0, 23);   // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone.
3456   rlwimi(dst, src,     1*8,  8, 15);   // Insert byte 6 into position 5, leave the rest alone.
3457 }
3458 
3459 // Calculate the column addresses of the crc32 lookup table into distinct registers.
3460 // This loop-invariant calculation is moved out of the loop body, reducing the loop
3461 // body size from 20 to 16 instructions.
3462 // Returns the offset that was used to calculate the address of column tc3.
3463 // Due to register shortage, setting tc3 may overwrite table. With the return offset
3464 // at hand, the original table address can be easily reconstructed.
3465 int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) {
3466 
3467 #ifdef VM_LITTLE_ENDIAN
3468   // This is what we implement (the DOLIT4 part):
3469   // ========================================================================= */
3470   // #define DOLIT4 c ^= *buf4++; \
3471   //         c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^ \
3472   //             crc_table[1][(c >> 16) & 0xff] ^ crc_table[0][c >> 24]
3473   // #define DOLIT32 DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4
3474   // ========================================================================= */
3475   const int ix0 = 3*(4*CRC32_COLUMN_SIZE);
3476   const int ix1 = 2*(4*CRC32_COLUMN_SIZE);
3477   const int ix2 = 1*(4*CRC32_COLUMN_SIZE);
3478   const int ix3 = 0*(4*CRC32_COLUMN_SIZE);
3479 #else
3480   // This is what we implement (the DOBIG4 part):
3481   // =========================================================================
3482   // #define DOBIG4 c ^= *++buf4; \
3483   //         c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \
3484   //             crc_table[6][(c >> 16) & 0xff] ^ crc_table[7][c >> 24]
3485   // #define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4
3486   // =========================================================================
3487   const int ix0 = 4*(4*CRC32_COLUMN_SIZE);
3488   const int ix1 = 5*(4*CRC32_COLUMN_SIZE);
3489   const int ix2 = 6*(4*CRC32_COLUMN_SIZE);
3490   const int ix3 = 7*(4*CRC32_COLUMN_SIZE);
3491 #endif
3492   assert_different_registers(table, tc0, tc1, tc2);
3493   assert(table == tc3, "must be!");
3494 
3495   if (ix0 != 0) addi(tc0, table, ix0);
3496   if (ix1 != 0) addi(tc1, table, ix1);
3497   if (ix2 != 0) addi(tc2, table, ix2);
3498   if (ix3 != 0) addi(tc3, table, ix3);
3499 
3500   return ix3;
3501 }
3502 
3503 /**
3504  * uint32_t crc;
3505  * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
3506  */
3507 void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) {
3508   assert_different_registers(crc, table, tmp);
3509   assert_different_registers(val, table);
3510 
3511   if (crc == val) {                   // Must rotate first to use the unmodified value.
3512     rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
3513                                       // As we use a word (4-byte) instruction, we have to adapt the mask bit positions.
3514     srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
3515   } else {
3516     srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
3517     rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
3518   }
3519   lwzx(tmp, table, tmp);
3520   xorr(crc, crc, tmp);
3521 }
3522 
3523 /**
3524  * uint32_t crc;
3525  * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
3526  */
3527 void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
3528   fold_byte_crc32(crc, crc, table, tmp);
3529 }
3530 
3531 /**
3532  * Emits code to update CRC-32 with a byte value according to constants in table.
3533  *
3534  * @param [in,out]crc   Register containing the crc.
3535  * @param [in]val       Register containing the byte to fold into the CRC.
3536  * @param [in]table     Register containing the table of crc constants.
3537  *
3538  * uint32_t crc;
3539  * val = crc_table[(val ^ crc) & 0xFF];
3540  * crc = val ^ (crc >> 8);
3541  */
3542 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
3543   BLOCK_COMMENT("update_byte_crc32:");
3544   xorr(val, val, crc);
3545   fold_byte_crc32(crc, val, table, val);
3546 }
3547 
3548 /**
3549  * @param crc   register containing existing CRC (32-bit)
3550  * @param buf   register pointing to input byte buffer (byte*)
3551  * @param len   register containing number of bytes
3552  * @param table register pointing to CRC table
3553  */
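// Per-byte update performed by the loop below (sketch; table is the standard
// zlib-style crc table, first column):
//   if (invertCRC) crc = ~crc;
//   while (len-- > 0) {
//     crc = table[(crc ^ *buf++) & 0xff] ^ (crc >> 8);
//   }
//   if (invertCRC) crc = ~crc;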
3554 void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table,
3555                                            Register data, bool loopAlignment, bool invertCRC) {
3556   assert_different_registers(crc, buf, len, table, data);
3557 
3558   Label L_mainLoop, L_done;
3559   const int mainLoop_stepping  = 1;
3560   const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4;
3561 
3562   // Process all bytes in a single-byte loop.
3563   cmpdi(CCR0, len, 0);                           // Anything to do?
3564   mtctr(len);
3565   beq(CCR0, L_done);
3566 
3567   if (invertCRC) {
3568     nand(crc, crc, crc);                         // ~c
3569   }
3570 
3571   align(mainLoop_alignment);
3572   BIND(L_mainLoop);
3573     lbz(data, 0, buf);                           // Byte from buffer, zero-extended.
3574     addi(buf, buf, mainLoop_stepping);           // Advance buffer position.
3575     update_byte_crc32(crc, data, table);
3576     bdnz(L_mainLoop);                            // Iterate.
3577 
3578   if (invertCRC) {
3579     nand(crc, crc, crc);                         // ~c
3580   }
3581 
3582   bind(L_done);
3583 }
3584 
3585 /**
3586  * Emits code to update CRC-32 with a 4-byte value according to constants in table
3587  * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c
3588  */
3589 // A note on the lookup table address(es):
3590 // The lookup table consists of two sets of four columns each.
3591 // The columns {0..3} are used for little-endian machines.
3592 // The columns {4..7} are used for big-endian machines.
3593 // To save the effort of adding the column offset to the table address each time
3594 // a table element is looked up, it is possible to pass the pre-calculated
3595 // column addresses.
3596 // Uses R9..R12 as work registers. Must be saved/restored by caller, if necessary.
3597 void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc,
3598                                         Register t0,  Register t1,  Register t2,  Register t3,
3599                                         Register tc0, Register tc1, Register tc2, Register tc3) {
3600   assert_different_registers(crc, t3);
3601 
3602   // XOR crc with next four bytes of buffer.
3603   lwz(t3, bufDisp, buf);
3604   if (bufInc != 0) {
3605     addi(buf, buf, bufInc);
3606   }
3607   xorr(t3, t3, crc);
3608 
3609   // Chop crc into 4 single-byte pieces, shifted left 2 bits, to form the table indices.
3610   rlwinm(t0, t3,  2,         24-2, 31-2);  // ((t1 >>  0) & 0xff) << 2
3611   rlwinm(t1, t3,  32+(2- 8), 24-2, 31-2);  // ((t1 >>  8) & 0xff) << 2
3612   rlwinm(t2, t3,  32+(2-16), 24-2, 31-2);  // ((t1 >> 16) & 0xff) << 2
3613   rlwinm(t3, t3,  32+(2-24), 24-2, 31-2);  // ((t1 >> 24) & 0xff) << 2
3614 
3615   // Use the pre-calculated column addresses.
3616   // Load pre-calculated table values.
3617   lwzx(t0, tc0, t0);
3618   lwzx(t1, tc1, t1);
3619   lwzx(t2, tc2, t2);
3620   lwzx(t3, tc3, t3);
3621 
3622   // Calculate new crc from table values.
3623   xorr(t0,  t0, t1);
3624   xorr(t2,  t2, t3);
3625   xorr(crc, t0, t2);  // Now crc contains the final checksum value.
3626 }
3627 
3628 /**
3629  * @param crc   register containing existing CRC (32-bit)
3630  * @param buf   register pointing to input byte buffer (byte*)
3631  * @param len   register containing number of bytes
3632  * @param table register pointing to CRC table
3633  *
3634  * Uses R9..R12 as work registers. Must be saved/restored by caller!
3635  */
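// Processing proceeds in three phases (sketch):
//   1) a byte loop that advances buf to a mainLoop_stepping (8-byte) boundary,
//   2) a main loop folding two 4-byte words per iteration via update_1word_crc32,
//   3) a byte loop for the remaining tail bytes.
// On big-endian machines the crc is byte-reversed around the main loop (load_reverse_32)
// because the buffer words are processed in big-endian order there.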
3636 void MacroAssembler::kernel_crc32_2word(Register crc, Register buf, Register len, Register table,
3637                                         Register t0,  Register t1,  Register t2,  Register t3,
3638                                         Register tc0, Register tc1, Register tc2, Register tc3) {
3639   assert_different_registers(crc, buf, len, table);
3640 
3641   Label L_mainLoop, L_tail;
3642   Register  tmp  = t0;
3643   Register  data = t0;
3644   Register  tmp2 = t1;
3645   const int mainLoop_stepping  = 8;
3646   const int tailLoop_stepping  = 1;
3647   const int log_stepping       = exact_log2(mainLoop_stepping);
3648   const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
3649   const int complexThreshold   = 2*mainLoop_stepping;
3650 
3651   // Don't test for len <= 0 here. This pathological case should not occur anyway.
3652   // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles.
3653   // The situation itself is detected and handled correctly by the conditional branches
3654   // following the adjustments of len by -stepping and +stepping.
3655   assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
3656 
3657   BLOCK_COMMENT("kernel_crc32_2word {");
3658 
3659   nand(crc, crc, crc);                           // ~c
3660 
3661   // Check for short (<mainLoop_stepping) buffer.
3662   cmpdi(CCR0, len, complexThreshold);
3663   blt(CCR0, L_tail);
3664 
3665   // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
3666   // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
3667   {
3668     // Align buf addr to mainLoop_stepping boundary.
3669     neg(tmp2, buf);                           // Calculate # preLoop iterations for alignment.
3670     rldicl(tmp2, tmp2, 0, 64-log_stepping);   // Rotate tmp2 0 bits, insert into tmp2, anding with mask with 1s from 62..63.
3671 
3672     if (complexThreshold > mainLoop_stepping) {
3673       sub(len, len, tmp2);                       // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
3674     } else {
3675       sub(tmp, len, tmp2);                       // Remaining bytes for main loop.
3676       cmpdi(CCR0, tmp, mainLoop_stepping);
3677       blt(CCR0, L_tail);                         // For less than one mainloop_stepping left, do only tail processing
3678       mr(len, tmp);                              // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
3679     }
3680     update_byteLoop_crc32(crc, buf, tmp2, table, data, false, false);
3681   }
3682 
3683   srdi(tmp2, len, log_stepping);                 // #iterations for mainLoop
3684   andi(len, len, mainLoop_stepping-1);           // remaining bytes for tailLoop
3685   mtctr(tmp2);
3686 
3687 #ifdef VM_LITTLE_ENDIAN
3688   Register crc_rv = crc;
3689 #else
3690   Register crc_rv = tmp;                         // Load_reverse needs separate registers to work on.
3691                                                  // Occupies tmp, but frees up crc.
3692   load_reverse_32(crc_rv, crc);                  // Revert byte order because we are dealing with big-endian data.
3693   tmp = crc;
3694 #endif
3695 
3696   int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
3697 
3698   align(mainLoop_alignment);                     // Octoword-aligned loop address. Shows 2% improvement.
3699   BIND(L_mainLoop);
3700     update_1word_crc32(crc_rv, buf, table, 0, 0, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
3701     update_1word_crc32(crc_rv, buf, table, 4, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
3702     bdnz(L_mainLoop);
3703 
3704 #ifndef VM_LITTLE_ENDIAN
3705   load_reverse_32(crc, crc_rv);                  // Revert byte order because we are dealing with big-endian data.
3706   tmp = crc_rv;                                  // Tmp uses its original register again.
3707 #endif
3708 
3709   // Restore original table address for tailLoop.
3710   if (reconstructTableOffset != 0) {
3711     addi(table, table, -reconstructTableOffset);
3712   }
3713 
3714   // Process last few (<complexThreshold) bytes of buffer.
3715   BIND(L_tail);
3716   update_byteLoop_crc32(crc, buf, len, table, data, false, false);
3717 
3718   nand(crc, crc, crc);                           // ~c
3719   BLOCK_COMMENT("} kernel_crc32_2word");
3720 }
3721 
3722 /**
3723  * @param crc   register containing existing CRC (32-bit)
3724  * @param buf   register pointing to input byte buffer (byte*)
3725  * @param len   register containing number of bytes
3726  * @param table register pointing to CRC table
3727  *
3728  * Uses R9..R12 as work registers. Must be saved/restored by caller!
3729  */
3730 void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table,
3731                                         Register t0,  Register t1,  Register t2,  Register t3,
3732                                         Register tc0, Register tc1, Register tc2, Register tc3) {
3733   assert_different_registers(crc, buf, len, table);
3734 
3735   Label L_mainLoop, L_tail;
3736   Register  tmp          = t0;
3737   Register  data         = t0;
3738   Register  tmp2         = t1;
3739   const int mainLoop_stepping  = 4;
3740   const int tailLoop_stepping  = 1;
3741   const int log_stepping       = exact_log2(mainLoop_stepping);
3742   const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
3743   const int complexThreshold   = 2*mainLoop_stepping;
3744 
3745   // Don't test for len <= 0 here. This pathological case should not occur anyway.
3746   // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles.
3747   // The situation itself is detected and handled correctly by the conditional branches
3748   // following the adjustments of len by -stepping and +stepping.
3749   assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
3750 
3751   BLOCK_COMMENT("kernel_crc32_1word {");
3752 
3753   nand(crc, crc, crc);                           // ~c
3754 
3755   // Check for short (<mainLoop_stepping) buffer.
3756   cmpdi(CCR0, len, complexThreshold);
3757   blt(CCR0, L_tail);
3758 
3759   // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
3760   // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
3761   {
3762     // Align buf addr to mainLoop_stepping boundary.
3763     neg(tmp2, buf);                              // Calculate # preLoop iterations for alignment.
3764     rldicl(tmp2, tmp2, 0, 64-log_stepping);      // Rotate tmp2 0 bits, insert into tmp2, anding with mask with 1s from 62..63.
3765 
3766     if (complexThreshold > mainLoop_stepping) {
3767       sub(len, len, tmp2);                       // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
3768     } else {
3769       sub(tmp, len, tmp2);                       // Remaining bytes for main loop.
3770       cmpdi(CCR0, tmp, mainLoop_stepping);
3771       blt(CCR0, L_tail);                         // For less than one mainloop_stepping left, do only tail processing
3772       mr(len, tmp);                              // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
3773     }
3774     update_byteLoop_crc32(crc, buf, tmp2, table, data, false, false);
3775   }
3776 
3777   srdi(tmp2, len, log_stepping);                 // #iterations for mainLoop
3778   andi(len, len, mainLoop_stepping-1);           // remaining bytes for tailLoop
3779   mtctr(tmp2);
3780 
3781 #ifdef VM_LITTLE_ENDIAN
3782   Register crc_rv = crc;
3783 #else
3784   Register crc_rv = tmp;                         // Load_reverse needs separate registers to work on.
3785                                                  // Occupies tmp, but frees up crc.
3786   load_reverse_32(crc_rv, crc);                  // Revert byte order because we are dealing with big-endian data.
3787   tmp = crc;
3788 #endif
3789 
3790   int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
3791 
3792   align(mainLoop_alignment);                     // Octoword-aligned loop address. Shows 2% improvement.
3793   BIND(L_mainLoop);
3794     update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
3795     bdnz(L_mainLoop);
3796 
3797 #ifndef VM_LITTLE_ENDIAN
3798   load_reverse_32(crc, crc_rv);                  // Revert byte order because we are dealing with big-endian data.
3799   tmp = crc_rv;                                  // Tmp uses its original register again.
3800 #endif
3801 
3802   // Restore original table address for tailLoop.
3803   if (reconstructTableOffset != 0) {
3804     addi(table, table, -reconstructTableOffset);
3805   }
3806 
3807   // Process last few (<complexThreshold) bytes of buffer.
3808   BIND(L_tail);
3809   update_byteLoop_crc32(crc, buf, len, table, data, false, false);
3810 
3811   nand(crc, crc, crc);                           // ~c
3812   BLOCK_COMMENT("} kernel_crc32_1word");
3813 }
3814 
3815 /**
3816  * @param crc   register containing existing CRC (32-bit)
3817  * @param buf   register pointing to input byte buffer (byte*)
3818  * @param len   register containing number of bytes
3819  * @param table register pointing to CRC table
3820  *
3821  * Uses R7_ARG5, R8_ARG6 as work registers.
3822  */
3823 void MacroAssembler::kernel_crc32_1byte(Register crc, Register buf, Register len, Register table,
3824                                         Register t0,  Register t1,  Register t2,  Register t3) {
3825   assert_different_registers(crc, buf, len, table);
3826 
3827   Register  data = t0;                   // Holds the current byte to be folded into crc.
3828 
3829   BLOCK_COMMENT("kernel_crc32_1byte {");
3830 
3831   // Process all bytes in a single-byte loop.
3832   update_byteLoop_crc32(crc, buf, len, table, data, true, true);
3833 
3834   BLOCK_COMMENT("} kernel_crc32_1byte");
3835 }
3836 
3837 void MacroAssembler::kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp) {
3838   assert_different_registers(crc, buf, /* len,  not used!! */ table, tmp);
3839 
3840   BLOCK_COMMENT("kernel_crc32_singleByte:");
3841   nand(crc, crc, crc);       // ~c
3842 
3843   lbz(tmp, 0, buf);          // Byte from buffer, zero-extended.
3844   update_byte_crc32(crc, tmp, table);
3845 
3846   nand(crc, crc, crc);       // ~c
3847 }
3848 
3849 // dest_lo += src1 + src2
3850 // dest_hi += carry1 + carry2
3851 void MacroAssembler::add2_with_carry(Register dest_hi,
3852                                      Register dest_lo,
3853                                      Register src1, Register src2) {
3854   li(R0, 0);
3855   addc(dest_lo, dest_lo, src1);
3856   adde(dest_hi, dest_hi, R0);
3857   addc(dest_lo, dest_lo, src2);
3858   adde(dest_hi, dest_hi, R0);
3859 }
3860 
3861 // Multiply 64 bit by 64 bit first loop.
3862 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart,
3863                                            Register x_xstart,
3864                                            Register y, Register y_idx,
3865                                            Register z,
3866                                            Register carry,
3867                                            Register product_high, Register product,
3868                                            Register idx, Register kdx,
3869                                            Register tmp) {
3870   //  jlong carry, x[], y[], z[];
3871   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
3872   //    huge_128 product = y[idx] * x[xstart] + carry;
3873   //    z[kdx] = (jlong)product;
3874   //    carry  = (jlong)(product >>> 64);
3875   //  }
3876   //  z[xstart] = carry;
3877 
3878   Label L_first_loop, L_first_loop_exit;
3879   Label L_one_x, L_one_y, L_multiply;
3880 
3881   addic_(xstart, xstart, -1);
3882   blt(CCR0, L_one_x);   // Special case: length of x is 1.
3883 
3884   // Load next two integers of x.
3885   sldi(tmp, xstart, LogBytesPerInt);
3886   ldx(x_xstart, x, tmp);
3887 #ifdef VM_LITTLE_ENDIAN
3888   rldicl(x_xstart, x_xstart, 32, 0);
3889 #endif
3890 
3891   align(32, 16);
3892   bind(L_first_loop);
3893 
3894   cmpdi(CCR0, idx, 1);
3895   blt(CCR0, L_first_loop_exit);
3896   addi(idx, idx, -2);
3897   beq(CCR0, L_one_y);
3898 
3899   // Load next two integers of y.
3900   sldi(tmp, idx, LogBytesPerInt);
3901   ldx(y_idx, y, tmp);
3902 #ifdef VM_LITTLE_ENDIAN
3903   rldicl(y_idx, y_idx, 32, 0);
3904 #endif
3905 
3906 
3907   bind(L_multiply);
3908   multiply64(product_high, product, x_xstart, y_idx);
3909 
3910   li(tmp, 0);
3911   addc(product, product, carry);         // Add carry to result.
3912   adde(product_high, product_high, tmp); // Add carry of the last addition.
3913   addi(kdx, kdx, -2);
3914 
3915   // Store result.
3916 #ifdef VM_LITTLE_ENDIAN
3917   rldicl(product, product, 32, 0);
3918 #endif
3919   sldi(tmp, kdx, LogBytesPerInt);
3920   stdx(product, z, tmp);
3921   mr_if_needed(carry, product_high);
3922   b(L_first_loop);
3923 
3924 
3925   bind(L_one_y); // Load one 32 bit portion of y as (0,value).
3926 
3927   lwz(y_idx, 0, y);
3928   b(L_multiply);
3929 
3930 
3931   bind(L_one_x); // Load one 32 bit portion of x as (0,value).
3932 
3933   lwz(x_xstart, 0, x);
3934   b(L_first_loop);
3935 
3936   bind(L_first_loop_exit);
3937 }
3938 
3939 // Multiply 64 bit by 64 bit and add 128 bit.
3940 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y,
3941                                             Register z, Register yz_idx,
3942                                             Register idx, Register carry,
3943                                             Register product_high, Register product,
3944                                             Register tmp, int offset) {
3945 
3946   //  huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
3947   //  z[kdx] = (jlong)product;
3948 
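  // 'offset' is a byte offset added to the scaled index; it lets the caller
  // address either 64-bit half of the current 128-bit chunk of y and z.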
3949   sldi(tmp, idx, LogBytesPerInt);
3950   if (offset) {
3951     addi(tmp, tmp, offset);
3952   }
3953   ldx(yz_idx, y, tmp);
3954 #ifdef VM_LITTLE_ENDIAN
3955   rldicl(yz_idx, yz_idx, 32, 0);
3956 #endif
3957 
3958   multiply64(product_high, product, x_xstart, yz_idx);
3959   ldx(yz_idx, z, tmp);
3960 #ifdef VM_LITTLE_ENDIAN
3961   rldicl(yz_idx, yz_idx, 32, 0);
3962 #endif
3963 
3964   add2_with_carry(product_high, product, carry, yz_idx);
3965 
3966   sldi(tmp, idx, LogBytesPerInt);
3967   if (offset) {
3968     addi(tmp, tmp, offset);
3969   }
3970 #ifdef VM_LITTLE_ENDIAN
3971   rldicl(product, product, 32, 0);
3972 #endif
3973   stdx(product, z, tmp);
3974 }
3975 
3976 // Multiply 128 bit by 128 bit. Unrolled inner loop.
3977 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart,
3978                                              Register y, Register z,
3979                                              Register yz_idx, Register idx, Register carry,
3980                                              Register product_high, Register product,
3981                                              Register carry2, Register tmp) {
3982 
3983   //  jlong carry, x[], y[], z[];
3984   //  int kdx = ystart+1;
3985   //  for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
3986   //    huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
3987   //    z[kdx+idx+1] = (jlong)product;
3988   //    jlong carry2 = (jlong)(product >>> 64);
3989   //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
3990   //    z[kdx+idx] = (jlong)product;
3991   //    carry = (jlong)(product >>> 64);
3992   //  }
3993   //  idx += 2;
3994   //  if (idx > 0) {
3995   //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
3996   //    z[kdx+idx] = (jlong)product;
3997   //    carry = (jlong)(product >>> 64);
3998   //  }
3999 
4000   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
4001   const Register jdx = R0;
4002 
4003   // Trip count of the unrolled loop: jdx = idx / 4 (four ints of y per iteration).
4004   srdi_(jdx, idx, 2);
4005   beq(CCR0, L_third_loop_exit);
4006   mtctr(jdx);
4007 
4008   align(32, 16);
4009   bind(L_third_loop);
4010 
4011   addi(idx, idx, -4);
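  // Each unrolled iteration consumes four ints of y: first the pair at byte
  // offset 8, then the pair at offset 0, threading the carry through carry2.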
4012 
4013   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8);
4014   mr_if_needed(carry2, product_high);
4015 
4016   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0);
4017   mr_if_needed(carry, product_high);
4018   bdnz(L_third_loop);
4019 
4020   bind(L_third_loop_exit);  // Handle any left-over operand parts.
4021 
4022   andi_(idx, idx, 0x3);
4023   beq(CCR0, L_post_third_loop_done);
4024 
4025   Label L_check_1;
4026 
4027   addic_(idx, idx, -2);
4028   blt(CCR0, L_check_1);
4029 
4030   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0);
4031   mr_if_needed(carry, product_high);
4032 
4033   bind(L_check_1);
4034 
4035   addi(idx, idx, 0x2);
4036   andi_(idx, idx, 0x1);
4037   addic_(idx, idx, -1);
4038   blt(CCR0, L_post_third_loop_done);
4039 
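  // Exactly one 32-bit word remains: multiply it with x_xstart, add the
  // matching z word and the carry, store the low 32 bits of the result and
  // rebuild the 64-bit carry from the remaining upper bits.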
4040   sldi(tmp, idx, LogBytesPerInt);
4041   lwzx(yz_idx, y, tmp);
4042   multiply64(product_high, product, x_xstart, yz_idx);
4043   lwzx(yz_idx, z, tmp);
4044 
4045   add2_with_carry(product_high, product, yz_idx, carry);
4046 
4047   sldi(tmp, idx, LogBytesPerInt);
4048   stwx(product, z, tmp);
4049   srdi(product, product, 32);
4050 
4051   sldi(product_high, product_high, 32);
4052   orr(product, product, product_high);
4053   mr_if_needed(carry, product);
4054 
4055   bind(L_post_third_loop_done);
4056 }   // multiply_128_x_128_loop
4057 
4058 void MacroAssembler::multiply_to_len(Register x, Register xlen,
4059                                      Register y, Register ylen,
4060                                      Register z, Register zlen,
4061                                      Register tmp1, Register tmp2,
4062                                      Register tmp3, Register tmp4,
4063                                      Register tmp5, Register tmp6,
4064                                      Register tmp7, Register tmp8,
4065                                      Register tmp9, Register tmp10,
4066                                      Register tmp11, Register tmp12,
4067                                      Register tmp13) {
4068 
4069   ShortBranchVerifier sbv(this);
4070 
4071   assert_different_registers(x, xlen, y, ylen, z, zlen,
4072                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
4073   assert_different_registers(x, xlen, y, ylen, z, zlen,
4074                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp7);
4075   assert_different_registers(x, xlen, y, ylen, z, zlen,
4076                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp8);
4077 
4078   const Register idx = tmp1;
4079   const Register kdx = tmp2;
4080   const Register xstart = tmp3;
4081 
4082   const Register y_idx = tmp4;
4083   const Register carry = tmp5;
4084   const Register product = tmp6;
4085   const Register product_high = tmp7;
4086   const Register x_xstart = tmp8;
4087   const Register tmp = tmp9;
4088 
4089   // First Loop.
4090   //
4091   //  final static long LONG_MASK = 0xffffffffL;
4092   //  int xstart = xlen - 1;
4093   //  int ystart = ylen - 1;
4094   //  long carry = 0;
4095   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
4096   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
4097   //    z[kdx] = (int)product;
4098   //    carry = product >>> 32;
4099   //  }
4100   //  z[xstart] = (int)carry;
4101 
4102   mr_if_needed(idx, ylen);        // idx = ylen
4103   mr_if_needed(kdx, zlen);        // kdx = zlen (== xlen + ylen)
4104   li(carry, 0);                   // carry = 0
4105 
4106   Label L_done;
4107 
4108   addic_(xstart, xlen, -1);
4109   blt(CCR0, L_done);
4110 
4111   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z,
4112                         carry, product_high, product, idx, kdx, tmp);
4113 
4114   Label L_second_loop;
4115 
4116   cmpdi(CCR0, kdx, 0);
4117   beq(CCR0, L_second_loop);
4118 
4119   Label L_carry;
4120 
4121   addic_(kdx, kdx, -1);
4122   beq(CCR0, L_carry);
4123 
4124   // Store lower 32 bits of carry.
4125   sldi(tmp, kdx, LogBytesPerInt);
4126   stwx(carry, z, tmp);
4127   srdi(carry, carry, 32);
4128   addi(kdx, kdx, -1);
4129 
4130 
4131   bind(L_carry);
4132 
4133   // Store upper 32 bits of carry.
4134   sldi(tmp, kdx, LogBytesPerInt);
4135   stwx(carry, z, tmp);
4136 
4137   // Second and third (nested) loops.
4138   //
4139   //  for (int i = xstart-1; i >= 0; i--) { // Second loop
4140   //    carry = 0;
4141   //    for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
4142   //      long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
4143   //                     (z[k] & LONG_MASK) + carry;
4144   //      z[k] = (int)product;
4145   //      carry = product >>> 32;
4146   //    }
4147   //    z[i] = (int)carry;
4148   //  }
4149   //
4150   //  Register mapping in the code below: idx = tmp1, kdx = tmp2, xstart = tmp3, carry = tmp5, x[xstart] = x_xstart (tmp8).
4151 
4152   bind(L_second_loop);
4153 
4154   li(carry, 0);                   // carry = 0;
4155 
4156   addic_(xstart, xstart, -1);     // i = xstart-1;
4157   blt(CCR0, L_done);
4158 
4159   Register zsave = tmp10;
4160 
4161   mr(zsave, z);
4162 
4163 
4164   Label L_last_x;
4165 
4166   sldi(tmp, xstart, LogBytesPerInt);
4167   add(z, z, tmp);                 // z += xstart * BytesPerInt
4168   addi(z, z, 4);                  // z now points to &z[xstart + 1].
4169   addic_(xstart, xstart, -1);     // i = xstart-1;
4170   blt(CCR0, L_last_x);
4171 
4172   sldi(tmp, xstart, LogBytesPerInt);
4173   ldx(x_xstart, x, tmp);
4174 #ifdef VM_LITTLE_ENDIAN
4175   rldicl(x_xstart, x_xstart, 32, 0);
4176 #endif
4177 
4178 
4179   Label L_third_loop_prologue;
4180 
4181   bind(L_third_loop_prologue);
4182 
4183   Register xsave = tmp11;
4184   Register xlensave = tmp12;
4185   Register ylensave = tmp13;
4186 
4187   mr(xsave, x);
4188   mr(xlensave, xstart);
4189   mr(ylensave, ylen);
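  // multiply_128_x_128_loop reuses x as its second carry register and ylen as
  // its running index, so x, xstart and ylen are preserved across the call.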
4190 
4191 
4192   multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen,
4193                           carry, product_high, product, x, tmp);
4194 
4195   mr(z, zsave);
4196   mr(x, xsave);
4197   mr(xlen, xlensave);   // Restores xlen from the saved xstart, carrying the decremented outer loop counter forward.
4198   mr(ylen, ylensave);
4199 
4200   addi(tmp3, xlen, 1);
4201   sldi(tmp, tmp3, LogBytesPerInt);
4202   stwx(carry, z, tmp);
4203   addic_(tmp3, tmp3, -1);
4204   blt(CCR0, L_done);
4205 
4206   srdi(carry, carry, 32);
4207   sldi(tmp, tmp3, LogBytesPerInt);
4208   stwx(carry, z, tmp);
4209   b(L_second_loop);
4210 
4211   // Infrequently executed code is placed outside the loops.
4212   bind(L_last_x);
4213 
4214   lwz(x_xstart, 0, x);
4215   b(L_third_loop_prologue);
4216 
4217   bind(L_done);
4218 }   // multiply_to_len
4219 
4220 void MacroAssembler::asm_assert(bool check_equal, const char *msg, int id) {
4221 #ifdef ASSERT
4222   Label ok;
4223   if (check_equal) {
4224     beq(CCR0, ok);
4225   } else {
4226     bne(CCR0, ok);
4227   }
4228   stop(msg, id);
4229   bind(ok);
4230 #endif
4231 }
4232 
4233 void MacroAssembler::asm_assert_mems_zero(bool check_equal, int size, int mem_offset,
4234                                           Register mem_base, const char* msg, int id) {
4235 #ifdef ASSERT
4236   switch (size) {
4237     case 4:
4238       lwz(R0, mem_offset, mem_base);
4239       cmpwi(CCR0, R0, 0);
4240       break;
4241     case 8:
4242       ld(R0, mem_offset, mem_base);
4243       cmpdi(CCR0, R0, 0);
4244       break;
4245     default:
4246       ShouldNotReachHere();
4247   }
4248   asm_assert(check_equal, msg, id);
4249 #endif // ASSERT
4250 }
4251 
4252 void MacroAssembler::verify_thread() {
4253   if (VerifyThread) {
4254     unimplemented("'VerifyThread' currently not implemented on PPC");
4255   }
4256 }
4257 
4258 // Reads the oop register. Kills R0 and possibly volatile floating-point registers.
4259 void MacroAssembler::verify_oop(Register oop, const char* msg) {
4260   if (!VerifyOops) {
4261     return;
4262   }
4263 
4264   address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
4265   const Register tmp = R11; // Will be preserved.
4266   const int nbytes_save = 11*8; // Volatile gprs except R0.
4267   save_volatile_gprs(R1_SP, -nbytes_save); // except R0
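  // The volatile GPRs are saved below SP; the frame pushed below covers that
  // area, so they survive the C call.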
4268 
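  // If the oop lives in tmp (R11), move it to the argument register before tmp
  // is clobbered as a scratch register below.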
4269   if (oop == tmp) mr(R4_ARG2, oop);
4270   save_LR_CR(tmp); // save in old frame
4271   push_frame_reg_args(nbytes_save, tmp);
4272   // load FunctionDescriptor** / entry_address *
4273   load_const_optimized(tmp, fd, R0);
4274   // load FunctionDescriptor* / entry_address
4275   ld(tmp, 0, tmp);
4276   if (oop != tmp) mr_if_needed(R4_ARG2, oop);
4277   load_const_optimized(R3_ARG1, (address)msg, R0);
4278   // Call destination for its side effect.
4279   call_c(tmp);
4280 
4281   pop_frame();
4282   restore_LR_CR(tmp);
4283   restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
4284 }
4285 
4286 const char* stop_types[] = {
4287   "stop",
4288   "untested",
4289   "unimplemented",
4290   "shouldnotreachhere"
4291 };
4292 
4293 static void stop_on_request(int tp, const char* msg) {
4294   tty->print("PPC assembly code requires stop: (%s) %s\n", stop_types[tp%/*stop_end*/4], msg);
4295   guarantee(false, "PPC assembly code requires stop: %s", msg);
4296 }
4297 
4298 // Call a C-function that prints output.
4299 void MacroAssembler::stop(int type, const char* msg, int id) {
4300 #ifndef PRODUCT
4301   block_comment(err_msg("stop: %s %s {", stop_types[type%stop_end], msg));
4302 #else
4303   block_comment("stop {");
4304 #endif
4305 
4306   // setup arguments
4307   load_const_optimized(R3_ARG1, type);
4308   load_const_optimized(R4_ARG2, (void *)msg, /*tmp=*/R0);
4309   call_VM_leaf(CAST_FROM_FN_PTR(address, stop_on_request), R3_ARG1, R4_ARG2);
4310   illtrap();
4311   emit_int32(id);
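  // The trap is followed by the 32-bit id in the code stream so the stop site
  // can be identified when the trap is decoded.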
4312   block_comment("} stop;");
4313 }
4314 
4315 #ifndef PRODUCT
4316 // Write pattern 0x0101010101010101 in memory region [low-before, high+after].
4317 // Val, addr are temp registers.
4318 // If low == addr, addr is killed.
4319 // High is preserved.
4320 void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) {
4321   if (!ZapMemory) return;
4322 
4323   assert_different_registers(low, val);
4324 
4325   BLOCK_COMMENT("zap memory region {");
4326   load_const_optimized(val, 0x0101010101010101);
4327   int size = before + after;
4328   if (low == high && size < 5 && size > 0) {
4329     int offset = -before*BytesPerWord;
4330     for (int i = 0; i < size; ++i) {
4331       std(val, offset, low);
4332       offset += (1*BytesPerWord);
4333     }
4334   } else {
4335     addi(addr, low, -before*BytesPerWord);
4336     assert_different_registers(high, val);
4337     if (after) addi(high, high, after * BytesPerWord);
4338     Label loop;
4339     bind(loop);
4340     std(val, 0, addr);
4341     addi(addr, addr, 8);
4342     cmpd(CCR6, addr, high);
4343     ble(CCR6, loop);
4344     if (after) addi(high, high, -after * BytesPerWord);  // Correct back to old value.
4345   }
4346   BLOCK_COMMENT("} zap memory region");
4347 }
4348 
4349 #endif // !PRODUCT
4350 
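// SkipIfEqualZero is an RAII helper: the constructor loads the byte-sized flag
// at flag_addr and branches over the guarded code when it is zero; the
// destructor binds the branch target at the end of the guarded region.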
4351 SkipIfEqualZero::SkipIfEqualZero(MacroAssembler* masm, Register temp, const bool* flag_addr) : _masm(masm), _label() {
4352   int simm16_offset = masm->load_const_optimized(temp, (address)flag_addr, R0, true);
4353   assert(sizeof(bool) == 1, "PowerPC ABI");
4354   masm->lbz(temp, simm16_offset, temp);
4355   masm->cmpwi(CCR0, temp, 0);
4356   masm->beq(CCR0, _label);
4357 }
4358 
4359 SkipIfEqualZero::~SkipIfEqualZero() {
4360   _masm->bind(_label);
4361 }