1 /*
   2  * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright 2012, 2015 SAP AG. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.inline.hpp"
  28 #include "compiler/disassembler.hpp"
  29 #include "gc/shared/cardTableModRefBS.hpp"
  30 #include "gc/shared/collectedHeap.inline.hpp"
  31 #include "interpreter/interpreter.hpp"
  32 #include "memory/resourceArea.hpp"
  33 #include "prims/methodHandles.hpp"
  34 #include "runtime/biasedLocking.hpp"
  35 #include "runtime/icache.hpp"
  36 #include "runtime/interfaceSupport.hpp"
  37 #include "runtime/objectMonitor.hpp"
  38 #include "runtime/os.hpp"
  39 #include "runtime/sharedRuntime.hpp"
  40 #include "runtime/stubRoutines.hpp"
  41 #include "utilities/macros.hpp"
  42 #if INCLUDE_ALL_GCS
  43 #include "gc/g1/g1CollectedHeap.inline.hpp"
  44 #include "gc/g1/g1SATBCardTableModRefBS.hpp"
  45 #include "gc/g1/heapRegion.hpp"
  46 #endif // INCLUDE_ALL_GCS
  47 
  48 #ifdef PRODUCT
  49 #define BLOCK_COMMENT(str) // nothing
  50 #else
  51 #define BLOCK_COMMENT(str) block_comment(str)
  52 #endif
  53 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  54 
  55 #ifdef ASSERT
  56 // On RISC, there's no benefit to verifying instruction boundaries.
  57 bool AbstractAssembler::pd_check_instruction_mark() { return false; }
  58 #endif
  59 
  60 void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) {
  61   assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range");
  62   if (Assembler::is_simm(si31, 16)) {
  63     ld(d, si31, a);
  64     if (emit_filler_nop) nop();
  65   } else {
  66     const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31);
  67     const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31);
  68     addis(d, a, hi);
  69     ld(d, lo, d);
  70   }
  71 }
  72 
  73 void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) {
  74   assert_different_registers(d, a);
  75   ld_largeoffset_unchecked(d, si31, a, emit_filler_nop);
  76 }
  77 
  78 void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base,
  79                                       size_t size_in_bytes, bool is_signed) {
  80   switch (size_in_bytes) {
  81   case  8:              ld(dst, offs, base);                         break;
  82   case  4:  is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break;
  83   case  2:  is_signed ? lha(dst, offs, base) : lhz(dst, offs, base); break;
  84   case  1:  lbz(dst, offs, base); if (is_signed) extsb(dst, dst);    break; // lba doesn't exist :(
  85   default:  ShouldNotReachHere();
  86   }
  87 }
  88 
  89 void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base,
  90                                        size_t size_in_bytes) {
  91   switch (size_in_bytes) {
  92   case  8:  std(dst, offs, base); break;
  93   case  4:  stw(dst, offs, base); break;
  94   case  2:  sth(dst, offs, base); break;
  95   case  1:  stb(dst, offs, base); break;
  96   default:  ShouldNotReachHere();
  97   }
  98 }
  99 
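      // Pad with nops until offset() % modulus == rem, but only if at most
      // `max' padding bytes are required. E.g. with modulus = 8, rem = 0 and
      // offset() % 8 == 4, the padding is (0 + 8 - 4) % 8 = 4 bytes, i.e.
      // a single nop is emitted.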
 100 void MacroAssembler::align(int modulus, int max, int rem) {
 101   int padding = (rem + modulus - (offset() % modulus)) % modulus;
 102   if (padding > max) return;
 103   for (int c = (padding >> 2); c > 0; --c) { nop(); }
 104 }
 105 
 106 // Issue instructions that calculate given TOC from global TOC.
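      // With hi16 and lo16 both set, the emitted sequence is (illustrative):
      //   addis dst, R29, hi16(offset)   // R29 holds the global TOC
      //   addi  dst, dst, lo16(offset)
      // where offset = addr - global TOC (or a dummy value if requested).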
 107 void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16,
 108                                                        bool add_relocation, bool emit_dummy_addr) {
 109   int offset = -1;
 110   if (emit_dummy_addr) {
 111     offset = -128; // dummy address
 112   } else if (addr != (address)(intptr_t)-1) {
 113     offset = MacroAssembler::offset_to_global_toc(addr);
 114   }
 115 
 116   if (hi16) {
 117     addis(dst, R29, MacroAssembler::largeoffset_si16_si16_hi(offset));
 118   }
 119   if (lo16) {
 120     if (add_relocation) {
 121       // Relocate at the addi to avoid confusion with a load from the method's TOC.
 122       relocate(internal_word_Relocation::spec(addr));
 123     }
 124     addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset));
 125   }
 126 }
 127 
 128 int MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) {
 129   const int offset = MacroAssembler::offset_to_global_toc(addr);
 130 
 131   const address inst2_addr = a;
 132   const int inst2 = *(int *)inst2_addr;
 133 
 134   // The relocation points to the second instruction, the addi,
 135   // and the addi reads and writes the same register dst.
 136   const int dst = inv_rt_field(inst2);
 137   assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");
 138 
 139   // Now, find the preceding addis which writes to dst.
 140   int inst1 = 0;
 141   address inst1_addr = inst2_addr - BytesPerInstWord;
 142   while (inst1_addr >= bound) {
 143     inst1 = *(int *) inst1_addr;
 144     if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
 145       // Stop, found the addis which writes dst.
 146       break;
 147     }
 148     inst1_addr -= BytesPerInstWord;
 149   }
 150 
 151   assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
 152   set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset));
 153   set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset));
 154   return (int)((intptr_t)addr - (intptr_t)inst1_addr);
 155 }
 156 
 157 address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) {
 158   const address inst2_addr = a;
 159   const int inst2 = *(int *)inst2_addr;
 160 
 161   // The relocation points to the second instruction, the addi,
 162   // and the addi reads and writes the same register dst.
 163   const int dst = inv_rt_field(inst2);
 164   assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");
 165 
 166   // Now, find the preceding addis which writes to dst.
 167   int inst1 = 0;
 168   address inst1_addr = inst2_addr - BytesPerInstWord;
 169   while (inst1_addr >= bound) {
 170     inst1 = *(int *) inst1_addr;
 171     if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
 172       // stop, found the addis which writes dst
 173       break;
 174     }
 175     inst1_addr -= BytesPerInstWord;
 176   }
 177 
 178   assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
 179 
 180   int offset = (get_imm(inst1_addr, 0) << 16) + get_imm(inst2_addr, 0);
 181   // -1 is a special case
 182   if (offset == -1) {
 183     return (address)(intptr_t)-1;
 184   } else {
 185     return global_toc() + offset;
 186   }
 187 }
 188 
 189 #ifdef _LP64
 190 // Patch compressed oops or klass constants.
 191 // Assembler sequence is
 192 // 1) compressed oops:
 193 //    lis  rx = const.hi
 194 //    ori rx = rx | const.lo
 195 // 2) compressed klass:
 196 //    lis  rx = const.hi
 197 //    clrldi rx = rx & 0xFFFFffff // clearMS32b, optional
 198 //    ori rx = rx | const.lo
  199 // A clrldi, if present, is simply skipped over when patching.
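      // For example, patching data = 0x12345678 sets the lis immediate to
      // 0x1234 and the ori immediate to 0x5678 (xc and xd below).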
 200 int MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) {
 201   assert(UseCompressedOops, "Should only patch compressed oops");
 202 
 203   const address inst2_addr = a;
 204   const int inst2 = *(int *)inst2_addr;
 205 
 206   // The relocation points to the second instruction, the ori,
 207   // and the ori reads and writes the same register dst.
 208   const int dst = inv_rta_field(inst2);
 209   assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
 210   // Now, find the preceding addis which writes to dst.
 211   int inst1 = 0;
 212   address inst1_addr = inst2_addr - BytesPerInstWord;
 213   bool inst1_found = false;
 214   while (inst1_addr >= bound) {
 215     inst1 = *(int *)inst1_addr;
 216     if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
 217     inst1_addr -= BytesPerInstWord;
 218   }
 219   assert(inst1_found, "inst is not lis");
 220 
 221   int xc = (data >> 16) & 0xffff;
 222   int xd = (data >>  0) & 0xffff;
 223 
 224   set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo
 225   set_imm((int *)inst2_addr,        (xd)); // unsigned int
 226   return (int)((intptr_t)inst2_addr - (intptr_t)inst1_addr);
 227 }
 228 
 229 // Get compressed oop or klass constant.
 230 narrowOop MacroAssembler::get_narrow_oop(address a, address bound) {
 231   assert(UseCompressedOops, "Should only patch compressed oops");
 232 
 233   const address inst2_addr = a;
 234   const int inst2 = *(int *)inst2_addr;
 235 
 236   // The relocation points to the second instruction, the ori,
 237   // and the ori reads and writes the same register dst.
 238   const int dst = inv_rta_field(inst2);
 239   assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
 240   // Now, find the preceding lis which writes to dst.
 241   int inst1 = 0;
 242   address inst1_addr = inst2_addr - BytesPerInstWord;
 243   bool inst1_found = false;
 244 
 245   while (inst1_addr >= bound) {
 246     inst1 = *(int *) inst1_addr;
  247     if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
 248     inst1_addr -= BytesPerInstWord;
 249   }
 250   assert(inst1_found, "inst is not lis");
 251 
 252   uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff));
 253   uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16);
 254 
 255   return (int) (xl | xh);
 256 }
 257 #endif // _LP64
 258 
 259 void MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a, Register toc) {
 260   int toc_offset = 0;
 261   // Use RelocationHolder::none for the constant pool entry, otherwise
 262   // we will end up with a failing NativeCall::verify(x) where x is
 263   // the address of the constant pool entry.
 264   // FIXME: We should insert relocation information for oops at the constant
 265   // pool entries instead of inserting it at the loads; patching of a constant
 266   // pool entry should be less expensive.
 267   address oop_address = address_constant((address)a.value(), RelocationHolder::none);
 268   // Relocate at the pc of the load.
 269   relocate(a.rspec());
 270   toc_offset = (int)(oop_address - code()->consts()->start());
 271   ld_largeoffset_unchecked(dst, toc_offset, toc, true);
 272 }
 273 
 274 bool MacroAssembler::is_load_const_from_method_toc_at(address a) {
 275   const address inst1_addr = a;
 276   const int inst1 = *(int *)inst1_addr;
 277 
  278   // The relocation points to the ld or the addis.
  279   return (is_ld(inst1)) ||
  280          (is_addis(inst1) && inv_ra_field(inst1) != 0);
 281 }
 282 
 283 int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) {
 284   assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc");
 285 
 286   const address inst1_addr = a;
 287   const int inst1 = *(int *)inst1_addr;
 288 
 289   if (is_ld(inst1)) {
 290     return inv_d1_field(inst1);
 291   } else if (is_addis(inst1)) {
 292     const int dst = inv_rt_field(inst1);
 293 
 294     // Now, find the succeeding ld which reads and writes to dst.
 295     address inst2_addr = inst1_addr + BytesPerInstWord;
 296     int inst2 = 0;
 297     while (true) {
 298       inst2 = *(int *) inst2_addr;
 299       if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) {
 300         // Stop, found the ld which reads and writes dst.
 301         break;
 302       }
 303       inst2_addr += BytesPerInstWord;
 304     }
 305     return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2);
 306   }
 307   ShouldNotReachHere();
 308   return 0;
 309 }
 310 
 311 // Get the constant from a `load_const' sequence.
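      // The 64-bit value is reassembled from the 16-bit immediates of the
      // sequence: with an ori as second instruction, bits 63..48 come from
      // instruction 0, bits 47..32 from instruction 1, bits 31..16 from
      // instruction 3 and bits 15..0 from instruction 4; with a lis as
      // second instruction, the two middle halves swap places (instructions
      // 2 and 1).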
 312 long MacroAssembler::get_const(address a) {
 313   assert(is_load_const_at(a), "not a load of a constant");
 314   const int *p = (const int*) a;
 315   unsigned long x = (((unsigned long) (get_imm(a,0) & 0xffff)) << 48);
 316   if (is_ori(*(p+1))) {
 317     x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 32);
 318     x |= (((unsigned long) (get_imm(a,3) & 0xffff)) << 16);
 319     x |= (((unsigned long) (get_imm(a,4) & 0xffff)));
 320   } else if (is_lis(*(p+1))) {
 321     x |= (((unsigned long) (get_imm(a,2) & 0xffff)) << 32);
 322     x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 16);
 323     x |= (((unsigned long) (get_imm(a,3) & 0xffff)));
 324   } else {
 325     ShouldNotReachHere();
 326     return (long) 0;
 327   }
 328   return (long) x;
 329 }
 330 
  331 // Patch the 64-bit constant of a `load_const' sequence. This is a
  332 // low-level procedure; it neither flushes the instruction cache nor
  333 // is it MT-safe.
 334 void MacroAssembler::patch_const(address a, long x) {
 335   assert(is_load_const_at(a), "not a load of a constant");
 336   int *p = (int*) a;
 337   if (is_ori(*(p+1))) {
 338     set_imm(0 + p, (x >> 48) & 0xffff);
 339     set_imm(1 + p, (x >> 32) & 0xffff);
 340     set_imm(3 + p, (x >> 16) & 0xffff);
 341     set_imm(4 + p, x & 0xffff);
 342   } else if (is_lis(*(p+1))) {
 343     set_imm(0 + p, (x >> 48) & 0xffff);
 344     set_imm(2 + p, (x >> 32) & 0xffff);
 345     set_imm(1 + p, (x >> 16) & 0xffff);
 346     set_imm(3 + p, x & 0xffff);
 347   } else {
 348     ShouldNotReachHere();
 349   }
 350 }
 351 
 352 AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) {
 353   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
 354   int index = oop_recorder()->allocate_metadata_index(obj);
 355   RelocationHolder rspec = metadata_Relocation::spec(index);
 356   return AddressLiteral((address)obj, rspec);
 357 }
 358 
 359 AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) {
 360   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
 361   int index = oop_recorder()->find_index(obj);
 362   RelocationHolder rspec = metadata_Relocation::spec(index);
 363   return AddressLiteral((address)obj, rspec);
 364 }
 365 
 366 AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) {
 367   assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
 368   int oop_index = oop_recorder()->allocate_oop_index(obj);
 369   return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
 370 }
 371 
 372 AddressLiteral MacroAssembler::constant_oop_address(jobject obj) {
 373   assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
 374   int oop_index = oop_recorder()->find_index(obj);
 375   return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
 376 }
 377 
 378 RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
 379                                                       Register tmp, int offset) {
 380   intptr_t value = *delayed_value_addr;
 381   if (value != 0) {
 382     return RegisterOrConstant(value + offset);
 383   }
 384 
 385   // Load indirectly to solve generation ordering problem.
 386   // static address, no relocation
 387   int simm16_offset = load_const_optimized(tmp, delayed_value_addr, noreg, true);
 388   ld(tmp, simm16_offset, tmp); // must be aligned ((xa & 3) == 0)
 389 
 390   if (offset != 0) {
 391     addi(tmp, tmp, offset);
 392   }
 393 
 394   return RegisterOrConstant(tmp);
 395 }
 396 
 397 #ifndef PRODUCT
 398 void MacroAssembler::pd_print_patched_instruction(address branch) {
 399   Unimplemented(); // TODO: PPC port
 400 }
 401 #endif // ndef PRODUCT
 402 
 403 // Conditional far branch for destinations encodable in 24+2 bits.
 404 void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) {
 405 
 406   // If requested by flag optimize, relocate the bc_far as a
 407   // runtime_call and prepare for optimizing it when the code gets
 408   // relocated.
 409   if (optimize == bc_far_optimize_on_relocate) {
 410     relocate(relocInfo::runtime_call_type);
 411   }
 412 
 413   // variant 2:
 414   //
 415   //    b!cxx SKIP
 416   //    bxx   DEST
 417   //  SKIP:
 418   //
 419 
 420   const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
 421                                                 opposite_bcond(inv_boint_bcond(boint)));
 422 
 423   // We emit two branches.
 424   // First, a conditional branch which jumps around the far branch.
 425   const address not_taken_pc = pc() + 2 * BytesPerInstWord;
 426   const address bc_pc        = pc();
 427   bc(opposite_boint, biint, not_taken_pc);
 428 
 429   const int bc_instr = *(int*)bc_pc;
 430   assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition");
 431   assert(opposite_boint == inv_bo_field(bc_instr), "postcondition");
 432   assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))),
 433                                      opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))),
 434          "postcondition");
 435   assert(biint == inv_bi_field(bc_instr), "postcondition");
 436 
 437   // Second, an unconditional far branch which jumps to dest.
 438   // Note: target(dest) remembers the current pc (see CodeSection::target)
 439   //       and returns the current pc if the label is not bound yet; when
 440   //       the label gets bound, the unconditional far branch will be patched.
 441   const address target_pc = target(dest);
 442   const address b_pc  = pc();
 443   b(target_pc);
 444 
 445   assert(not_taken_pc == pc(),                     "postcondition");
 446   assert(dest.is_bound() || target_pc == b_pc, "postcondition");
 447 }
 448 
 449 bool MacroAssembler::is_bc_far_at(address instruction_addr) {
 450   return is_bc_far_variant1_at(instruction_addr) ||
 451          is_bc_far_variant2_at(instruction_addr) ||
 452          is_bc_far_variant3_at(instruction_addr);
 453 }
 454 
 455 address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) {
 456   if (is_bc_far_variant1_at(instruction_addr)) {
 457     const address instruction_1_addr = instruction_addr;
 458     const int instruction_1 = *(int*)instruction_1_addr;
 459     return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr);
 460   } else if (is_bc_far_variant2_at(instruction_addr)) {
 461     const address instruction_2_addr = instruction_addr + 4;
 462     return bxx_destination(instruction_2_addr);
 463   } else if (is_bc_far_variant3_at(instruction_addr)) {
 464     return instruction_addr + 8;
 465   }
 466   // variant 4 ???
 467   ShouldNotReachHere();
 468   return NULL;
 469 }
 470 void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) {
 471 
 472   if (is_bc_far_variant3_at(instruction_addr)) {
 473     // variant 3, far cond branch to the next instruction, already patched to nops:
 474     //
 475     //    nop
 476     //    endgroup
 477     //  SKIP/DEST:
 478     //
 479     return;
 480   }
 481 
 482   // first, extract boint and biint from the current branch
 483   int boint = 0;
 484   int biint = 0;
 485 
 486   ResourceMark rm;
 487   const int code_size = 2 * BytesPerInstWord;
 488   CodeBuffer buf(instruction_addr, code_size);
 489   MacroAssembler masm(&buf);
 490   if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) {
 491     // Far branch to next instruction: Optimize it by patching nops (produce variant 3).
 492     masm.nop();
 493     masm.endgroup();
 494   } else {
 495     if (is_bc_far_variant1_at(instruction_addr)) {
 496       // variant 1, the 1st instruction contains the destination address:
 497       //
 498       //    bcxx  DEST
 499       //    endgroup
 500       //
 501       const int instruction_1 = *(int*)(instruction_addr);
 502       boint = inv_bo_field(instruction_1);
 503       biint = inv_bi_field(instruction_1);
 504     } else if (is_bc_far_variant2_at(instruction_addr)) {
 505       // variant 2, the 2nd instruction contains the destination address:
 506       //
 507       //    b!cxx SKIP
 508       //    bxx   DEST
 509       //  SKIP:
 510       //
 511       const int instruction_1 = *(int*)(instruction_addr);
 512       boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))),
 513           opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1))));
 514       biint = inv_bi_field(instruction_1);
 515     } else {
 516       // variant 4???
 517       ShouldNotReachHere();
 518     }
 519 
 520     // second, set the new branch destination and optimize the code
 521     if (dest != instruction_addr + 4 && // the bc_far is still unbound!
 522         masm.is_within_range_of_bcxx(dest, instruction_addr)) {
 523       // variant 1:
 524       //
 525       //    bcxx  DEST
 526       //    endgroup
 527       //
 528       masm.bc(boint, biint, dest);
 529       masm.endgroup();
 530     } else {
 531       // variant 2:
 532       //
 533       //    b!cxx SKIP
 534       //    bxx   DEST
 535       //  SKIP:
 536       //
 537       const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
 538                                                     opposite_bcond(inv_boint_bcond(boint)));
 539       const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord;
 540       masm.bc(opposite_boint, biint, not_taken_pc);
 541       masm.b(dest);
 542     }
 543   }
 544   ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
 545 }
 546 
  547 // Emit a NOT MT-safe patchable 64-bit absolute call/jump.
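      // The emitted sequence is always 7 instructions long. Depending on the
      // distance to dest it is either pc-relative (variant 2)
      //   nop x6; bl dest     (call)      or      b dest; nop x6     (jump)
      // or TOC-relative (recognized as variant 1b)
      //   mr R0, R11; addis/addi R11 relative to the global TOC; mtctr R11;
      //   mr R11, R0; nop; bctr[l]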
 548 void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) {
 549   // get current pc
 550   uint64_t start_pc = (uint64_t) pc();
 551 
 552   const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last
 553   const address pc_of_b  = (address) (start_pc + (0*BytesPerInstWord)); // b is first
 554 
 555   // relocate here
 556   if (rt != relocInfo::none) {
 557     relocate(rt);
 558   }
 559 
 560   if ( ReoptimizeCallSequences &&
 561        (( link && is_within_range_of_b(dest, pc_of_bl)) ||
 562         (!link && is_within_range_of_b(dest, pc_of_b)))) {
 563     // variant 2:
 564     // Emit an optimized, pc-relative call/jump.
 565 
 566     if (link) {
 567       // some padding
 568       nop();
 569       nop();
 570       nop();
 571       nop();
 572       nop();
 573       nop();
 574 
 575       // do the call
 576       assert(pc() == pc_of_bl, "just checking");
 577       bl(dest, relocInfo::none);
 578     } else {
 579       // do the jump
 580       assert(pc() == pc_of_b, "just checking");
 581       b(dest, relocInfo::none);
 582 
 583       // some padding
 584       nop();
 585       nop();
 586       nop();
 587       nop();
 588       nop();
 589       nop();
 590     }
 591 
 592     // Assert that we can identify the emitted call/jump.
 593     assert(is_bxx64_patchable_variant2_at((address)start_pc, link),
 594            "can't identify emitted call");
 595   } else {
 596     // variant 1:
 597     mr(R0, R11);  // spill R11 -> R0.
 598 
 599     // Load the destination address into CTR,
 600     // calculate destination relative to global toc.
 601     calculate_address_from_global_toc(R11, dest, true, true, false);
 602 
 603     mtctr(R11);
 604     mr(R11, R0);  // spill R11 <- R0.
 605     nop();
 606 
 607     // do the call/jump
 608     if (link) {
 609       bctrl();
  610     } else {
 611       bctr();
 612     }
 613     // Assert that we can identify the emitted call/jump.
 614     assert(is_bxx64_patchable_variant1b_at((address)start_pc, link),
 615            "can't identify emitted call");
 616   }
 617 
 618   // Assert that we can identify the emitted call/jump.
 619   assert(is_bxx64_patchable_at((address)start_pc, link),
 620          "can't identify emitted call");
 621   assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest,
 622          "wrong encoding of dest address");
 623 }
 624 
 625 // Identify a bxx64_patchable instruction.
 626 bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) {
 627   return is_bxx64_patchable_variant1b_at(instruction_addr, link)
 628     //|| is_bxx64_patchable_variant1_at(instruction_addr, link)
 629       || is_bxx64_patchable_variant2_at(instruction_addr, link);
 630 }
 631 
 632 // Does the call64_patchable instruction use a pc-relative encoding of
 633 // the call destination?
 634 bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) {
 635   // variant 2 is pc-relative
 636   return is_bxx64_patchable_variant2_at(instruction_addr, link);
 637 }
 638 
 639 // Identify variant 1.
 640 bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) {
 641   unsigned int* instr = (unsigned int*) instruction_addr;
  642   return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
  643     && is_mtctr(instr[5]) // mtctr
  644     && is_load_const_at(instruction_addr);
 645 }
 646 
 647 // Identify variant 1b: load destination relative to global toc.
 648 bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) {
 649   unsigned int* instr = (unsigned int*) instruction_addr;
 650   return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
 651     && is_mtctr(instr[3]) // mtctr
 652     && is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr);
 653 }
 654 
 655 // Identify variant 2.
 656 bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) {
 657   unsigned int* instr = (unsigned int*) instruction_addr;
 658   if (link) {
 659     return is_bl (instr[6])  // bl dest is last
 660       && is_nop(instr[0])  // nop
 661       && is_nop(instr[1])  // nop
 662       && is_nop(instr[2])  // nop
 663       && is_nop(instr[3])  // nop
 664       && is_nop(instr[4])  // nop
 665       && is_nop(instr[5]); // nop
 666   } else {
 667     return is_b  (instr[0])  // b  dest is first
 668       && is_nop(instr[1])  // nop
 669       && is_nop(instr[2])  // nop
 670       && is_nop(instr[3])  // nop
 671       && is_nop(instr[4])  // nop
 672       && is_nop(instr[5])  // nop
 673       && is_nop(instr[6]); // nop
 674   }
 675 }
 676 
 677 // Set dest address of a bxx64_patchable instruction.
 678 void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) {
 679   ResourceMark rm;
 680   int code_size = MacroAssembler::bxx64_patchable_size;
 681   CodeBuffer buf(instruction_addr, code_size);
 682   MacroAssembler masm(&buf);
 683   masm.bxx64_patchable(dest, relocInfo::none, link);
 684   ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
 685 }
 686 
 687 // Get dest address of a bxx64_patchable instruction.
 688 address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) {
 689   if (is_bxx64_patchable_variant1_at(instruction_addr, link)) {
 690     return (address) (unsigned long) get_const(instruction_addr);
 691   } else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) {
 692     unsigned int* instr = (unsigned int*) instruction_addr;
 693     if (link) {
 694       const int instr_idx = 6; // bl is last
 695       int branchoffset = branch_destination(instr[instr_idx], 0);
 696       return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
 697     } else {
 698       const int instr_idx = 0; // b is first
 699       int branchoffset = branch_destination(instr[instr_idx], 0);
 700       return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
 701     }
 702   // Load dest relative to global toc.
 703   } else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) {
 704     return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord,
 705                                                                instruction_addr);
 706   } else {
 707     ShouldNotReachHere();
 708     return NULL;
 709   }
 710 }
 711 
 712 // Uses ordering which corresponds to ABI:
 713 //    _savegpr0_14:  std  r14,-144(r1)
 714 //    _savegpr0_15:  std  r15,-136(r1)
 715 //    _savegpr0_16:  std  r16,-128(r1)
 716 void MacroAssembler::save_nonvolatile_gprs(Register dst, int offset) {
 717   std(R14, offset, dst);   offset += 8;
 718   std(R15, offset, dst);   offset += 8;
 719   std(R16, offset, dst);   offset += 8;
 720   std(R17, offset, dst);   offset += 8;
 721   std(R18, offset, dst);   offset += 8;
 722   std(R19, offset, dst);   offset += 8;
 723   std(R20, offset, dst);   offset += 8;
 724   std(R21, offset, dst);   offset += 8;
 725   std(R22, offset, dst);   offset += 8;
 726   std(R23, offset, dst);   offset += 8;
 727   std(R24, offset, dst);   offset += 8;
 728   std(R25, offset, dst);   offset += 8;
 729   std(R26, offset, dst);   offset += 8;
 730   std(R27, offset, dst);   offset += 8;
 731   std(R28, offset, dst);   offset += 8;
 732   std(R29, offset, dst);   offset += 8;
 733   std(R30, offset, dst);   offset += 8;
 734   std(R31, offset, dst);   offset += 8;
 735 
 736   stfd(F14, offset, dst);   offset += 8;
 737   stfd(F15, offset, dst);   offset += 8;
 738   stfd(F16, offset, dst);   offset += 8;
 739   stfd(F17, offset, dst);   offset += 8;
 740   stfd(F18, offset, dst);   offset += 8;
 741   stfd(F19, offset, dst);   offset += 8;
 742   stfd(F20, offset, dst);   offset += 8;
 743   stfd(F21, offset, dst);   offset += 8;
 744   stfd(F22, offset, dst);   offset += 8;
 745   stfd(F23, offset, dst);   offset += 8;
 746   stfd(F24, offset, dst);   offset += 8;
 747   stfd(F25, offset, dst);   offset += 8;
 748   stfd(F26, offset, dst);   offset += 8;
 749   stfd(F27, offset, dst);   offset += 8;
 750   stfd(F28, offset, dst);   offset += 8;
 751   stfd(F29, offset, dst);   offset += 8;
 752   stfd(F30, offset, dst);   offset += 8;
 753   stfd(F31, offset, dst);
 754 }
 755 
 756 // Uses ordering which corresponds to ABI:
 757 //    _restgpr0_14:  ld   r14,-144(r1)
 758 //    _restgpr0_15:  ld   r15,-136(r1)
 759 //    _restgpr0_16:  ld   r16,-128(r1)
 760 void MacroAssembler::restore_nonvolatile_gprs(Register src, int offset) {
 761   ld(R14, offset, src);   offset += 8;
 762   ld(R15, offset, src);   offset += 8;
 763   ld(R16, offset, src);   offset += 8;
 764   ld(R17, offset, src);   offset += 8;
 765   ld(R18, offset, src);   offset += 8;
 766   ld(R19, offset, src);   offset += 8;
 767   ld(R20, offset, src);   offset += 8;
 768   ld(R21, offset, src);   offset += 8;
 769   ld(R22, offset, src);   offset += 8;
 770   ld(R23, offset, src);   offset += 8;
 771   ld(R24, offset, src);   offset += 8;
 772   ld(R25, offset, src);   offset += 8;
 773   ld(R26, offset, src);   offset += 8;
 774   ld(R27, offset, src);   offset += 8;
 775   ld(R28, offset, src);   offset += 8;
 776   ld(R29, offset, src);   offset += 8;
 777   ld(R30, offset, src);   offset += 8;
 778   ld(R31, offset, src);   offset += 8;
 779 
 780   // FP registers
 781   lfd(F14, offset, src);   offset += 8;
 782   lfd(F15, offset, src);   offset += 8;
 783   lfd(F16, offset, src);   offset += 8;
 784   lfd(F17, offset, src);   offset += 8;
 785   lfd(F18, offset, src);   offset += 8;
 786   lfd(F19, offset, src);   offset += 8;
 787   lfd(F20, offset, src);   offset += 8;
 788   lfd(F21, offset, src);   offset += 8;
 789   lfd(F22, offset, src);   offset += 8;
 790   lfd(F23, offset, src);   offset += 8;
 791   lfd(F24, offset, src);   offset += 8;
 792   lfd(F25, offset, src);   offset += 8;
 793   lfd(F26, offset, src);   offset += 8;
 794   lfd(F27, offset, src);   offset += 8;
 795   lfd(F28, offset, src);   offset += 8;
 796   lfd(F29, offset, src);   offset += 8;
 797   lfd(F30, offset, src);   offset += 8;
 798   lfd(F31, offset, src);
 799 }
 800 
 801 // For verify_oops.
 802 void MacroAssembler::save_volatile_gprs(Register dst, int offset) {
 803   std(R2,  offset, dst);   offset += 8;
 804   std(R3,  offset, dst);   offset += 8;
 805   std(R4,  offset, dst);   offset += 8;
 806   std(R5,  offset, dst);   offset += 8;
 807   std(R6,  offset, dst);   offset += 8;
 808   std(R7,  offset, dst);   offset += 8;
 809   std(R8,  offset, dst);   offset += 8;
 810   std(R9,  offset, dst);   offset += 8;
 811   std(R10, offset, dst);   offset += 8;
 812   std(R11, offset, dst);   offset += 8;
 813   std(R12, offset, dst);
 814 }
 815 
 816 // For verify_oops.
 817 void MacroAssembler::restore_volatile_gprs(Register src, int offset) {
 818   ld(R2,  offset, src);   offset += 8;
 819   ld(R3,  offset, src);   offset += 8;
 820   ld(R4,  offset, src);   offset += 8;
 821   ld(R5,  offset, src);   offset += 8;
 822   ld(R6,  offset, src);   offset += 8;
 823   ld(R7,  offset, src);   offset += 8;
 824   ld(R8,  offset, src);   offset += 8;
 825   ld(R9,  offset, src);   offset += 8;
 826   ld(R10, offset, src);   offset += 8;
 827   ld(R11, offset, src);   offset += 8;
 828   ld(R12, offset, src);
 829 }
 830 
 831 void MacroAssembler::save_LR_CR(Register tmp) {
 832   mfcr(tmp);
 833   std(tmp, _abi(cr), R1_SP);
 834   mflr(tmp);
 835   std(tmp, _abi(lr), R1_SP);
 836   // Tmp must contain lr on exit! (see return_addr and prolog in ppc64.ad)
 837 }
 838 
 839 void MacroAssembler::restore_LR_CR(Register tmp) {
 840   assert(tmp != R1_SP, "must be distinct");
 841   ld(tmp, _abi(lr), R1_SP);
 842   mtlr(tmp);
 843   ld(tmp, _abi(cr), R1_SP);
 844   mtcr(tmp);
 845 }
 846 
 847 address MacroAssembler::get_PC_trash_LR(Register result) {
 848   Label L;
 849   bl(L);
 850   bind(L);
 851   address lr_pc = pc();
 852   mflr(result);
 853   return lr_pc;
 854 }
 855 
 856 void MacroAssembler::resize_frame(Register offset, Register tmp) {
 857 #ifdef ASSERT
 858   assert_different_registers(offset, tmp, R1_SP);
 859   andi_(tmp, offset, frame::alignment_in_bytes-1);
 860   asm_assert_eq("resize_frame: unaligned", 0x204);
 861 #endif
 862 
 863   // tmp <- *(SP)
 864   ld(tmp, _abi(callers_sp), R1_SP);
 865   // addr <- SP + offset;
 866   // *(addr) <- tmp;
 867   // SP <- addr
 868   stdux(tmp, R1_SP, offset);
 869 }
 870 
 871 void MacroAssembler::resize_frame(int offset, Register tmp) {
 872   assert(is_simm(offset, 16), "too big an offset");
 873   assert_different_registers(tmp, R1_SP);
 874   assert((offset & (frame::alignment_in_bytes-1))==0, "resize_frame: unaligned");
 875   // tmp <- *(SP)
 876   ld(tmp, _abi(callers_sp), R1_SP);
 877   // addr <- SP + offset;
 878   // *(addr) <- tmp;
 879   // SP <- addr
 880   stdu(tmp, offset, R1_SP);
 881 }
 882 
 883 void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) {
 884   // (addr == tmp1) || (addr == tmp2) is allowed here!
 885   assert(tmp1 != tmp2, "must be distinct");
 886 
 887   // compute offset w.r.t. current stack pointer
 888   // tmp_1 <- addr - SP (!)
 889   subf(tmp1, R1_SP, addr);
 890 
 891   // atomically update SP keeping back link.
 892   resize_frame(tmp1/* offset */, tmp2/* tmp */);
 893 }
 894 
 895 void MacroAssembler::push_frame(Register bytes, Register tmp) {
 896 #ifdef ASSERT
 897   assert(bytes != R0, "r0 not allowed here");
 898   andi_(R0, bytes, frame::alignment_in_bytes-1);
 899   asm_assert_eq("push_frame(Reg, Reg): unaligned", 0x203);
 900 #endif
 901   neg(tmp, bytes);
 902   stdux(R1_SP, R1_SP, tmp);
 903 }
 904 
 905 // Push a frame of size `bytes'.
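      // The size is aligned up to frame::alignment_in_bytes and the old SP is
      // stored as back link at the new top of stack:
      //   SP <- SP - align(bytes);  *(SP) <- old SP   (via stdu/stdux)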
 906 void MacroAssembler::push_frame(unsigned int bytes, Register tmp) {
 907   long offset = align_addr(bytes, frame::alignment_in_bytes);
 908   if (is_simm(-offset, 16)) {
 909     stdu(R1_SP, -offset, R1_SP);
 910   } else {
 911     load_const(tmp, -offset);
 912     stdux(R1_SP, R1_SP, tmp);
 913   }
 914 }
 915 
 916 // Push a frame of size `bytes' plus abi_reg_args on top.
 917 void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) {
 918   push_frame(bytes + frame::abi_reg_args_size, tmp);
 919 }
 920 
  921 // Set up a new C frame with a spill area for non-volatile GPRs and
 922 // additional space for local variables.
 923 void MacroAssembler::push_frame_reg_args_nonvolatiles(unsigned int bytes,
 924                                                       Register tmp) {
 925   push_frame(bytes + frame::abi_reg_args_size + frame::spill_nonvolatiles_size, tmp);
 926 }
 927 
 928 // Pop current C frame.
 929 void MacroAssembler::pop_frame() {
 930   ld(R1_SP, _abi(callers_sp), R1_SP);
 931 }
 932 
 933 #if defined(ABI_ELFv2)
 934 address MacroAssembler::branch_to(Register r_function_entry, bool and_link) {
 935   // TODO(asmundak): make sure the caller uses R12 as function descriptor
  936   // most of the time.
 937   if (R12 != r_function_entry) {
 938     mr(R12, r_function_entry);
 939   }
 940   mtctr(R12);
 941   // Do a call or a branch.
 942   if (and_link) {
 943     bctrl();
 944   } else {
 945     bctr();
 946   }
 947   _last_calls_return_pc = pc();
 948 
 949   return _last_calls_return_pc;
 950 }
 951 
 952 // Call a C function via a function descriptor and use full C
 953 // calling conventions. Updates and returns _last_calls_return_pc.
 954 address MacroAssembler::call_c(Register r_function_entry) {
 955   return branch_to(r_function_entry, /*and_link=*/true);
 956 }
 957 
 958 // For tail calls: only branch, don't link, so callee returns to caller of this function.
 959 address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) {
 960   return branch_to(r_function_entry, /*and_link=*/false);
 961 }
 962 
 963 address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) {
 964   load_const(R12, function_entry, R0);
 965   return branch_to(R12,  /*and_link=*/true);
 966 }
 967 
 968 #else
 969 // Generic version of a call to C function via a function descriptor
 970 // with variable support for C calling conventions (TOC, ENV, etc.).
 971 // Updates and returns _last_calls_return_pc.
 972 address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call,
 973                                   bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) {
 974   // we emit standard ptrgl glue code here
 975   assert((function_descriptor != R0), "function_descriptor cannot be R0");
 976 
 977   // retrieve necessary entries from the function descriptor
 978   ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor);
 979   mtctr(R0);
 980 
 981   if (load_toc_of_callee) {
 982     ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor);
 983   }
 984   if (load_env_of_callee) {
 985     ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor);
 986   } else if (load_toc_of_callee) {
 987     li(R11, 0);
 988   }
 989 
 990   // do a call or a branch
 991   if (and_link) {
 992     bctrl();
 993   } else {
 994     bctr();
 995   }
 996   _last_calls_return_pc = pc();
 997 
 998   return _last_calls_return_pc;
 999 }
1000 
1001 // Call a C function via a function descriptor and use full C calling
1002 // conventions.
1003 // We don't use the TOC in generated code, so there is no need to save
1004 // and restore its value.
1005 address MacroAssembler::call_c(Register fd) {
1006   return branch_to(fd, /*and_link=*/true,
1007                        /*save toc=*/false,
1008                        /*restore toc=*/false,
1009                        /*load toc=*/true,
1010                        /*load env=*/true);
1011 }
1012 
1013 address MacroAssembler::call_c_and_return_to_caller(Register fd) {
1014   return branch_to(fd, /*and_link=*/false,
1015                        /*save toc=*/false,
1016                        /*restore toc=*/false,
1017                        /*load toc=*/true,
1018                        /*load env=*/true);
1019 }
1020 
1021 address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) {
1022   if (rt != relocInfo::none) {
1023     // this call needs to be relocatable
1024     if (!ReoptimizeCallSequences
1025         || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
1026         || fd == NULL   // support code-size estimation
1027         || !fd->is_friend_function()
1028         || fd->entry() == NULL) {
 1029       // It's not a friend function as defined by class FunctionDescriptor,
1030       // so do a full call-c here.
1031       load_const(R11, (address)fd, R0);
1032 
1033       bool has_env = (fd != NULL && fd->env() != NULL);
1034       return branch_to(R11, /*and_link=*/true,
1035                             /*save toc=*/false,
1036                             /*restore toc=*/false,
1037                             /*load toc=*/true,
1038                             /*load env=*/has_env);
1039     } else {
1040       // It's a friend function. Load the entry point and don't care about
1041       // toc and env. Use an optimizable call instruction, but ensure the
1042       // same code-size as in the case of a non-friend function.
1043       nop();
1044       nop();
1045       nop();
1046       bl64_patchable(fd->entry(), rt);
1047       _last_calls_return_pc = pc();
1048       return _last_calls_return_pc;
1049     }
1050   } else {
1051     // This call does not need to be relocatable, do more aggressive
1052     // optimizations.
1053     if (!ReoptimizeCallSequences
1054       || !fd->is_friend_function()) {
1055       // It's not a friend function as defined by class FunctionDescriptor,
1056       // so do a full call-c here.
1057       load_const(R11, (address)fd, R0);
1058       return branch_to(R11, /*and_link=*/true,
1059                             /*save toc=*/false,
1060                             /*restore toc=*/false,
1061                             /*load toc=*/true,
1062                             /*load env=*/true);
1063     } else {
 1064       // It's a friend function. Load the entry point and don't care about
1065       // toc and env.
1066       address dest = fd->entry();
1067       if (is_within_range_of_b(dest, pc())) {
1068         bl(dest);
1069       } else {
1070         bl64_patchable(dest, rt);
1071       }
1072       _last_calls_return_pc = pc();
1073       return _last_calls_return_pc;
1074     }
1075   }
1076 }
1077 
1078 // Call a C function.  All constants needed reside in TOC.
1079 //
1080 // Read the address to call from the TOC.
1081 // Read env from TOC, if fd specifies an env.
1082 // Read new TOC from TOC.
1083 address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd,
1084                                          relocInfo::relocType rt, Register toc) {
1085   if (!ReoptimizeCallSequences
1086     || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
1087     || !fd->is_friend_function()) {
1088     // It's not a friend function as defined by class FunctionDescriptor,
1089     // so do a full call-c here.
1090     assert(fd->entry() != NULL, "function must be linked");
1091 
1092     AddressLiteral fd_entry(fd->entry());
1093     load_const_from_method_toc(R11, fd_entry, toc);
1094     mtctr(R11);
1095     if (fd->env() == NULL) {
1096       li(R11, 0);
1097       nop();
1098     } else {
1099       AddressLiteral fd_env(fd->env());
1100       load_const_from_method_toc(R11, fd_env, toc);
1101     }
1102     AddressLiteral fd_toc(fd->toc());
1103     load_toc_from_toc(R2_TOC, fd_toc, toc);
1104     // R2_TOC is killed.
1105     bctrl();
1106     _last_calls_return_pc = pc();
1107   } else {
1108     // It's a friend function, load the entry point and don't care about
1109     // toc and env. Use an optimizable call instruction, but ensure the
1110     // same code-size as in the case of a non-friend function.
1111     nop();
1112     bl64_patchable(fd->entry(), rt);
1113     _last_calls_return_pc = pc();
1114   }
1115   return _last_calls_return_pc;
1116 }
1117 #endif // ABI_ELFv2
1118 
1119 void MacroAssembler::call_VM_base(Register oop_result,
1120                                   Register last_java_sp,
1121                                   address  entry_point,
1122                                   bool     check_exceptions) {
1123   BLOCK_COMMENT("call_VM {");
1124   // Determine last_java_sp register.
1125   if (!last_java_sp->is_valid()) {
1126     last_java_sp = R1_SP;
1127   }
1128   set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1);
1129 
1130   // ARG1 must hold thread address.
1131   mr(R3_ARG1, R16_thread);
1132 #if defined(ABI_ELFv2)
1133   address return_pc = call_c(entry_point, relocInfo::none);
1134 #else
1135   address return_pc = call_c((FunctionDescriptor*)entry_point, relocInfo::none);
1136 #endif
1137 
1138   reset_last_Java_frame();
1139 
1140   // Check for pending exceptions.
1141   if (check_exceptions) {
1142     // We don't check for exceptions here.
1143     ShouldNotReachHere();
1144   }
1145 
1146   // Get oop result if there is one and reset the value in the thread.
1147   if (oop_result->is_valid()) {
1148     get_vm_result(oop_result);
1149   }
1150 
1151   _last_calls_return_pc = return_pc;
1152   BLOCK_COMMENT("} call_VM");
1153 }
1154 
1155 void MacroAssembler::call_VM_leaf_base(address entry_point) {
1156   BLOCK_COMMENT("call_VM_leaf {");
1157 #if defined(ABI_ELFv2)
1158   call_c(entry_point, relocInfo::none);
1159 #else
1160   call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, entry_point), relocInfo::none);
1161 #endif
1162   BLOCK_COMMENT("} call_VM_leaf");
1163 }
1164 
1165 void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions) {
1166   call_VM_base(oop_result, noreg, entry_point, check_exceptions);
1167 }
1168 
1169 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1,
1170                              bool check_exceptions) {
1171   // R3_ARG1 is reserved for the thread.
1172   mr_if_needed(R4_ARG2, arg_1);
1173   call_VM(oop_result, entry_point, check_exceptions);
1174 }
1175 
1176 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2,
1177                              bool check_exceptions) {
1178   // R3_ARG1 is reserved for the thread
1179   mr_if_needed(R4_ARG2, arg_1);
1180   assert(arg_2 != R4_ARG2, "smashed argument");
1181   mr_if_needed(R5_ARG3, arg_2);
1182   call_VM(oop_result, entry_point, check_exceptions);
1183 }
1184 
1185 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3,
1186                              bool check_exceptions) {
1187   // R3_ARG1 is reserved for the thread
1188   mr_if_needed(R4_ARG2, arg_1);
1189   assert(arg_2 != R4_ARG2, "smashed argument");
1190   mr_if_needed(R5_ARG3, arg_2);
1191   mr_if_needed(R6_ARG4, arg_3);
1192   call_VM(oop_result, entry_point, check_exceptions);
1193 }
1194 
1195 void MacroAssembler::call_VM_leaf(address entry_point) {
1196   call_VM_leaf_base(entry_point);
1197 }
1198 
1199 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) {
1200   mr_if_needed(R3_ARG1, arg_1);
1201   call_VM_leaf(entry_point);
1202 }
1203 
1204 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) {
1205   mr_if_needed(R3_ARG1, arg_1);
1206   assert(arg_2 != R3_ARG1, "smashed argument");
1207   mr_if_needed(R4_ARG2, arg_2);
1208   call_VM_leaf(entry_point);
1209 }
1210 
1211 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) {
1212   mr_if_needed(R3_ARG1, arg_1);
1213   assert(arg_2 != R3_ARG1, "smashed argument");
1214   mr_if_needed(R4_ARG2, arg_2);
1215   assert(arg_3 != R3_ARG1 && arg_3 != R4_ARG2, "smashed argument");
1216   mr_if_needed(R5_ARG3, arg_3);
1217   call_VM_leaf(entry_point);
1218 }
1219 
1220 // Check whether instruction is a read access to the polling page
1221 // which was emitted by load_from_polling_page(..).
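      // The recognized form is `ld R0, 0(Rx)' with Rx != R0, where Rx is
      // expected to hold the polling page address (verified against the
      // ucontext if one is given).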
1222 bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext,
1223                                                address* polling_address_ptr) {
1224   if (!is_ld(instruction))
1225     return false; // It's not a ld. Fail.
1226 
1227   int rt = inv_rt_field(instruction);
1228   int ra = inv_ra_field(instruction);
1229   int ds = inv_ds_field(instruction);
1230   if (!(ds == 0 && ra != 0 && rt == 0)) {
1231     return false; // It's not a ld(r0, X, ra). Fail.
1232   }
1233 
1234   if (!ucontext) {
1235     // Set polling address.
1236     if (polling_address_ptr != NULL) {
1237       *polling_address_ptr = NULL;
1238     }
1239     return true; // No ucontext given. Can't check value of ra. Assume true.
1240   }
1241 
1242 #ifdef LINUX
1243   // Ucontext given. Check that register ra contains the address of
 1244   // the safepoint polling page.
1245   ucontext_t* uc = (ucontext_t*) ucontext;
1246   // Set polling address.
1247   address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds;
1248   if (polling_address_ptr != NULL) {
1249     *polling_address_ptr = addr;
1250   }
1251   return os::is_poll_address(addr);
1252 #else
1253   // Not on Linux, ucontext must be NULL.
1254   ShouldNotReachHere();
1255   return false;
1256 #endif
1257 }
1258 
1259 bool MacroAssembler::is_memory_serialization(int instruction, JavaThread* thread, void* ucontext) {
1260 #ifdef LINUX
1261   ucontext_t* uc = (ucontext_t*) ucontext;
1262 
1263   if (is_stwx(instruction) || is_stwux(instruction)) {
1264     int ra = inv_ra_field(instruction);
1265     int rb = inv_rb_field(instruction);
1266 
1267     // look up content of ra and rb in ucontext
1268     address ra_val=(address)uc->uc_mcontext.regs->gpr[ra];
1269     long rb_val=(long)uc->uc_mcontext.regs->gpr[rb];
1270     return os::is_memory_serialize_page(thread, ra_val+rb_val);
1271   } else if (is_stw(instruction) || is_stwu(instruction)) {
1272     int ra = inv_ra_field(instruction);
1273     int d1 = inv_d1_field(instruction);
1274 
1275     // look up content of ra in ucontext
1276     address ra_val=(address)uc->uc_mcontext.regs->gpr[ra];
1277     return os::is_memory_serialize_page(thread, ra_val+d1);
1278   } else {
1279     return false;
1280   }
1281 #else
1282   // workaround not needed on !LINUX :-)
1283   ShouldNotCallThis();
1284   return false;
1285 #endif
1286 }
1287 
1288 void MacroAssembler::bang_stack_with_offset(int offset) {
1289   // When increasing the stack, the old stack pointer will be written
 1290   // to the new top of stack according to the PPC64 ABI.
1291   // Therefore, stack banging is not necessary when increasing
1292   // the stack by <= os::vm_page_size() bytes.
1293   // When increasing the stack by a larger amount, this method is
1294   // called repeatedly to bang the intermediate pages.
1295 
1296   // Stack grows down, caller passes positive offset.
1297   assert(offset > 0, "must bang with positive offset");
1298 
1299   long stdoffset = -offset;
1300 
1301   if (is_simm(stdoffset, 16)) {
1302     // Signed 16 bit offset, a simple std is ok.
1303     if (UseLoadInstructionsForStackBangingPPC64) {
1304       ld(R0, (int)(signed short)stdoffset, R1_SP);
1305     } else {
1306       std(R0,(int)(signed short)stdoffset, R1_SP);
1307     }
1308   } else if (is_simm(stdoffset, 31)) {
1309     const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset);
1310     const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset);
1311 
1312     Register tmp = R11;
1313     addis(tmp, R1_SP, hi);
1314     if (UseLoadInstructionsForStackBangingPPC64) {
1315       ld(R0,  lo, tmp);
1316     } else {
1317       std(R0, lo, tmp);
1318     }
1319   } else {
1320     ShouldNotReachHere();
1321   }
1322 }
1323 
1324 // If instruction is a stack bang of the form
1325 //    std    R0,    x(Ry),       (see bang_stack_with_offset())
1326 //    stdu   R1_SP, x(R1_SP),    (see push_frame(), resize_frame())
1327 // or stdux  R1_SP, Rx, R1_SP    (see push_frame(), resize_frame())
1328 // return the banged address. Otherwise, return 0.
1329 address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) {
1330 #ifdef LINUX
1331   ucontext_t* uc = (ucontext_t*) ucontext;
1332   int rs = inv_rs_field(instruction);
1333   int ra = inv_ra_field(instruction);
1334   if (   (is_ld(instruction)   && rs == 0 &&  UseLoadInstructionsForStackBangingPPC64)
1335       || (is_std(instruction)  && rs == 0 && !UseLoadInstructionsForStackBangingPPC64)
1336       || (is_stdu(instruction) && rs == 1)) {
1337     int ds = inv_ds_field(instruction);
1338     // return banged address
1339     return ds+(address)uc->uc_mcontext.regs->gpr[ra];
1340   } else if (is_stdux(instruction) && rs == 1) {
1341     int rb = inv_rb_field(instruction);
1342     address sp = (address)uc->uc_mcontext.regs->gpr[1];
1343     long rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
1344     return ra != 1 || rb_val >= 0 ? NULL         // not a stack bang
1345                                   : sp + rb_val; // banged address
1346   }
1347   return NULL; // not a stack bang
1348 #else
1349   // workaround not needed on !LINUX :-)
1350   ShouldNotCallThis();
1351   return NULL;
1352 #endif
1353 }
1354 
1355 // CmpxchgX sets condition register to cmpX(current, compare).
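      // cmpxchgw performs an atomic compare-exchange on a 32-bit word using a
      // lwarx/stwcx. retry loop:
      //   if (*addr_base == compare_value) { *addr_base = exchange_value; }
      // flag (and, if given, int_flag_success) indicates whether the swap
      // succeeded.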
1356 void MacroAssembler::cmpxchgw(ConditionRegister flag, Register dest_current_value,
1357                               Register compare_value, Register exchange_value,
1358                               Register addr_base, int semantics, bool cmpxchgx_hint,
1359                               Register int_flag_success, bool contention_hint) {
1360   Label retry;
1361   Label failed;
1362   Label done;
1363 
1364   // Save one branch if result is returned via register and
1365   // result register is different from the other ones.
1366   bool use_result_reg    = (int_flag_success != noreg);
1367   bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value &&
1368                             int_flag_success != exchange_value && int_flag_success != addr_base);
1369 
1370   // release/fence semantics
1371   if (semantics & MemBarRel) {
1372     release();
1373   }
1374 
1375   if (use_result_reg && preset_result_reg) {
1376     li(int_flag_success, 0); // preset (assume cas failed)
1377   }
1378 
1379   // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1380   if (contention_hint) { // Don't try to reserve if cmp fails.
1381     lwz(dest_current_value, 0, addr_base);
1382     cmpw(flag, dest_current_value, compare_value);
1383     bne(flag, failed);
1384   }
1385 
1386   // atomic emulation loop
1387   bind(retry);
1388 
1389   lwarx(dest_current_value, addr_base, cmpxchgx_hint);
1390   cmpw(flag, dest_current_value, compare_value);
1391   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1392     bne_predict_not_taken(flag, failed);
1393   } else {
1394     bne(                  flag, failed);
1395   }
1396   // branch to done  => (flag == ne), (dest_current_value != compare_value)
1397   // fall through    => (flag == eq), (dest_current_value == compare_value)
1398 
1399   stwcx_(exchange_value, addr_base);
1400   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1401     bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1402   } else {
1403     bne(                  CCR0, retry); // StXcx_ sets CCR0.
1404   }
1405   // fall through    => (flag == eq), (dest_current_value == compare_value), (swapped)
1406 
1407   // Result in register (must do this at the end because int_flag_success can be the
1408   // same register as one above).
1409   if (use_result_reg) {
1410     li(int_flag_success, 1);
1411   }
1412 
1413   if (semantics & MemBarFenceAfter) {
1414     fence();
1415   } else if (semantics & MemBarAcq) {
1416     isync();
1417   }
1418 
1419   if (use_result_reg && !preset_result_reg) {
1420     b(done);
1421   }
1422 
1423   bind(failed);
1424   if (use_result_reg && !preset_result_reg) {
1425     li(int_flag_success, 0);
1426   }
1427 
1428   bind(done);
1429   // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1430   // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1431 }
1432 
1433 // Performs atomic compare exchange:
1434 //   if (compare_value == *addr_base)
1435 //     *addr_base = exchange_value
1436 //     int_flag_success = 1;
1437 //   else
1438 //     int_flag_success = 0;
1439 //
1440 // ConditionRegister flag       = cmp(compare_value, *addr_base)
1441 // Register dest_current_value  = *addr_base
1442 // Register compare_value       Used to compare with value in memory
1443 // Register exchange_value      Written to memory if compare_value == *addr_base
1444 // Register addr_base           The memory location to compareXChange
1445 // Register int_flag_success    Set to 1 if exchange_value was written to *addr_base
1446 //
1447 // To avoid the costly compare exchange, the value can be tested beforehand (contention_hint).
1448 // Several special cases are handled to avoid emitting unnecessary instructions.
1449 //
1450 void MacroAssembler::cmpxchgd(ConditionRegister flag,
1451                               Register dest_current_value, RegisterOrConstant compare_value, Register exchange_value,
1452                               Register addr_base, int semantics, bool cmpxchgx_hint,
1453                               Register int_flag_success, Label* failed_ext, bool contention_hint) {
1454   Label retry;
1455   Label failed_int;
1456   Label& failed = (failed_ext != NULL) ? *failed_ext : failed_int;
1457   Label done;
1458 
1459   // Save one branch if result is returned via register and result register is different from the other ones.
1460   bool use_result_reg    = (int_flag_success!=noreg);
1461   bool preset_result_reg = (int_flag_success!=dest_current_value && int_flag_success!=compare_value.register_or_noreg() &&
1462                             int_flag_success!=exchange_value && int_flag_success!=addr_base);
1463   assert(int_flag_success == noreg || failed_ext == NULL, "cannot have both");
1464 
1465   // release/fence semantics
1466   if (semantics & MemBarRel) {
1467     release();
1468   }
1469 
1470   if (use_result_reg && preset_result_reg) {
1471     li(int_flag_success, 0); // preset (assume cas failed)
1472   }
1473 
1474   // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1475   if (contention_hint) { // Don't try to reserve if cmp fails.
1476     ld(dest_current_value, 0, addr_base);
1477     cmpd(flag, compare_value, dest_current_value);
1478     bne(flag, failed);
1479   }
1480 
1481   // atomic emulation loop
1482   bind(retry);
1483 
1484   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1485   cmpd(flag, compare_value, dest_current_value);
1486   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1487     bne_predict_not_taken(flag, failed);
1488   } else {
1489     bne(                  flag, failed);
1490   }
1491 
1492   stdcx_(exchange_value, addr_base);
1493   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1494     bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
1495   } else {
1496     bne(                  CCR0, retry); // stXcx_ sets CCR0
1497   }
1498 
1499   // result in register (must do this at the end because int_flag_success can be the same register as one above)
1500   if (use_result_reg) {
1501     li(int_flag_success, 1);
1502   }
1503 
1504   // POWER6 doesn't need isync in CAS.
1505   // Always emit isync to be on the safe side.
1506   if (semantics & MemBarFenceAfter) {
1507     fence();
1508   } else if (semantics & MemBarAcq) {
1509     isync();
1510   }
1511 
1512   if (use_result_reg && !preset_result_reg) {
1513     b(done);
1514   }
1515 
1516   bind(failed_int);
1517   if (use_result_reg && !preset_result_reg) {
1518     li(int_flag_success, 0);
1519   }
1520 
1521   bind(done);
1522   // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1523   // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1524 }
1525 
1526 // Look up the method for a megamorphic invokeinterface call.
1527 // The target method is determined by <intf_klass, itable_index>.
1528 // The receiver klass is in recv_klass.
1529 // On success, the result will be in method_result, and execution falls through.
1530 // On failure, execution transfers to the given label.
1531 void MacroAssembler::lookup_interface_method(Register recv_klass,
1532                                              Register intf_klass,
1533                                              RegisterOrConstant itable_index,
1534                                              Register method_result,
1535                                              Register scan_temp,
1536                                              Register sethi_temp,
1537                                              Label& L_no_such_interface) {
1538   assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);
1539   assert(itable_index.is_constant() || itable_index.as_register() == method_result,
1540          "caller must use same register for non-constant itable index as for method");
1541 
1542   // Compute start of first itableOffsetEntry (which is at the end of the vtable).
1543   int vtable_base = InstanceKlass::vtable_start_offset() * wordSize;
1544   int itentry_off = itableMethodEntry::method_offset_in_bytes();
1545   int logMEsize   = exact_log2(itableMethodEntry::size() * wordSize);
1546   int scan_step   = itableOffsetEntry::size() * wordSize;
1547   int log_vte_size= exact_log2(vtableEntry::size() * wordSize);
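       // Itable layout assumed here (sketch): after the embedded vtable comes an
       // array of itableOffsetEntry { interface klass, offset } pairs terminated by
       // a NULL interface, followed by per-interface arrays of itableMethodEntry.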
1548 
1549   lwz(scan_temp, InstanceKlass::vtable_length_offset() * wordSize, recv_klass);
1550   // %%% We should store the aligned, prescaled offset in the klassoop.
1551   // Then the next several instructions would fold away.
1552 
1553   sldi(scan_temp, scan_temp, log_vte_size);
1554   addi(scan_temp, scan_temp, vtable_base);
1555   add(scan_temp, recv_klass, scan_temp);
1556 
1557   // Adjust recv_klass by scaled itable_index, so we can free itable_index.
1558   if (itable_index.is_register()) {
1559     Register itable_offset = itable_index.as_register();
1560     sldi(itable_offset, itable_offset, logMEsize);
1561     if (itentry_off) addi(itable_offset, itable_offset, itentry_off);
1562     add(recv_klass, itable_offset, recv_klass);
1563   } else {
1564     long itable_offset = (long)itable_index.as_constant();
1565     load_const_optimized(sethi_temp, (itable_offset<<logMEsize)+itentry_off); // static address, no relocation
1566     add(recv_klass, sethi_temp, recv_klass);
1567   }
1568 
1569   // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
1570   //   if (scan->interface() == intf) {
1571   //     result = (klass + scan->offset() + itable_index);
1572   //   }
1573   // }
1574   Label search, found_method;
1575 
1576   for (int peel = 1; peel >= 0; peel--) {
1577     // %%%% Could load both offset and interface in one ldx, if they were
1578     // in the opposite order. This would save a load.
1579     ld(method_result, itableOffsetEntry::interface_offset_in_bytes(), scan_temp);
1580 
1581     // Check that this entry is non-null. A null entry means that
1582     // the receiver class doesn't implement the interface, and wasn't the
1583     // same as when the caller was compiled.
1584     cmpd(CCR0, method_result, intf_klass);
1585 
1586     if (peel) {
1587       beq(CCR0, found_method);
1588     } else {
1589       bne(CCR0, search);
1590       // (invert the test to fall through to found_method...)
1591     }
1592 
1593     if (!peel) break;
1594 
1595     bind(search);
1596 
1597     cmpdi(CCR0, method_result, 0);
1598     beq(CCR0, L_no_such_interface);
1599     addi(scan_temp, scan_temp, scan_step);
1600   }
1601 
1602   bind(found_method);
1603 
1604   // Got a hit.
1605   int ito_offset = itableOffsetEntry::offset_offset_in_bytes();
1606   lwz(scan_temp, ito_offset, scan_temp);
1607   ldx(method_result, scan_temp, recv_klass);
1608 }
1609 
1610 // virtual method calling
1611 void MacroAssembler::lookup_virtual_method(Register recv_klass,
1612                                            RegisterOrConstant vtable_index,
1613                                            Register method_result) {
1614 
1615   assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg());
1616 
1617   const int base = InstanceKlass::vtable_start_offset() * wordSize;
1618   assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
1619 
1620   if (vtable_index.is_register()) {
1621     sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord);
1622     add(recv_klass, vtable_index.as_register(), recv_klass);
1623   } else {
1624     addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord);
1625   }
1626   ld(R19_method, base + vtableEntry::method_offset_in_bytes(), recv_klass);
1627 }
1628 
1629 /////////////////////////////////////////// subtype checking ////////////////////////////////////////////
1630 
1631 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
1632                                                    Register super_klass,
1633                                                    Register temp1_reg,
1634                                                    Register temp2_reg,
1635                                                    Label& L_success,
1636                                                    Label& L_failure) {
1637 
1638   const Register check_cache_offset = temp1_reg;
1639   const Register cached_super       = temp2_reg;
1640 
1641   assert_different_registers(sub_klass, super_klass, check_cache_offset, cached_super);
1642 
1643   int sco_offset = in_bytes(Klass::super_check_offset_offset());
1644   int sc_offset  = in_bytes(Klass::secondary_super_cache_offset());
1645 
1646   // If the pointers are equal, we are done (e.g., String[] elements).
1647   // This self-check enables sharing of secondary supertype arrays among
1648   // non-primary types such as array-of-interface. Otherwise, each such
1649   // type would need its own customized SSA.
1650   // We move this check to the front of the fast path because many
1651   // type checks are in fact trivially successful in this manner,
1652   // so we get a nicely predicted branch right at the start of the check.
1653   cmpd(CCR0, sub_klass, super_klass);
1654   beq(CCR0, L_success);
1655 
1656   // Check the supertype display:
1657   lwz(check_cache_offset, sco_offset, super_klass);
1658   // The loaded value is the offset from KlassOopDesc.
1659 
1660   ldx(cached_super, check_cache_offset, sub_klass);
1661   cmpd(CCR0, cached_super, super_klass);
1662   beq(CCR0, L_success);
1663 
1664   // This check has worked decisively for primary supers.
1665   // Secondary supers are sought in the super_cache ('super_cache_addr').
1666   // (Secondary supers are interfaces and very deeply nested subtypes.)
1667   // This works in the same check above because of a tricky aliasing
1668   // between the super_cache and the primary super display elements.
1669   // (The 'super_check_addr' can address either, as the case requires.)
1670   // Note that the cache is updated below if it does not help us find
1671   // what we need immediately.
1672   // So if it was a primary super, we can just fail immediately.
1673   // Otherwise, it's the slow path for us (no success at this point).
1674 
1675   cmpwi(CCR0, check_cache_offset, sc_offset);
1676   bne(CCR0, L_failure);
1677   // bind(slow_path); // fallthru
1678 }
1679 
1680 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
1681                                                    Register super_klass,
1682                                                    Register temp1_reg,
1683                                                    Register temp2_reg,
1684                                                    Label* L_success,
1685                                                    Register result_reg) {
1686   const Register array_ptr = temp1_reg; // current value from cache array
1687   const Register temp      = temp2_reg;
1688 
1689   assert_different_registers(sub_klass, super_klass, array_ptr, temp);
1690 
1691   int source_offset = in_bytes(Klass::secondary_supers_offset());
1692   int target_offset = in_bytes(Klass::secondary_super_cache_offset());
1693 
1694   int length_offset = Array<Klass*>::length_offset_in_bytes();
1695   int base_offset   = Array<Klass*>::base_offset_in_bytes();
1696 
1697   Label hit, loop, failure, fallthru;
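       // In outline, the scan below performs:
       //   for (i = 0; i < secondary_supers->length(); i++) {
       //     if (secondary_supers->at(i) == super_klass) goto hit; // update cache, success
       //   }
       //   goto failure;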
1698 
1699   ld(array_ptr, source_offset, sub_klass);
1700 
1701   //assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated.");
1702   lwz(temp, length_offset, array_ptr);
1703   cmpwi(CCR0, temp, 0);
1704   beq(CCR0, result_reg!=noreg ? failure : fallthru); // length 0
1705 
1706   mtctr(temp); // load ctr
1707 
1708   bind(loop);
1709   // Oops in the table are no longer compressed.
1710   ld(temp, base_offset, array_ptr);
1711   cmpd(CCR0, temp, super_klass);
1712   beq(CCR0, hit);
1713   addi(array_ptr, array_ptr, BytesPerWord);
1714   bdnz(loop);
1715 
1716   bind(failure);
1717   if (result_reg!=noreg) li(result_reg, 1); // load non-zero result (indicates a miss)
1718   b(fallthru);
1719 
1720   bind(hit);
1721   std(super_klass, target_offset, sub_klass); // save result to cache
1722   if (result_reg != noreg) li(result_reg, 0); // load zero result (indicates a hit)
1723   if (L_success != NULL) b(*L_success);
1724 
1725   bind(fallthru);
1726 }
1727 
1728 // Try fast path, then go to slow one if not successful
1729 void MacroAssembler::check_klass_subtype(Register sub_klass,
1730                          Register super_klass,
1731                          Register temp1_reg,
1732                          Register temp2_reg,
1733                          Label& L_success) {
1734   Label L_failure;
1735   check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, L_success, L_failure);
1736   check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success);
1737   bind(L_failure); // Fallthru if not successful.
1738 }
1739 
1740 void MacroAssembler::check_method_handle_type(Register mtype_reg, Register mh_reg,
1741                                               Register temp_reg,
1742                                               Label& wrong_method_type) {
1743   assert_different_registers(mtype_reg, mh_reg, temp_reg);
1744   // Compare method type against that of the receiver.
1745   load_heap_oop_not_null(temp_reg, delayed_value(java_lang_invoke_MethodHandle::type_offset_in_bytes, temp_reg), mh_reg);
1746   cmpd(CCR0, temp_reg, mtype_reg);
1747   bne(CCR0, wrong_method_type);
1748 }
1749 
1750 RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot,
1751                                                    Register temp_reg,
1752                                                    int extra_slot_offset) {
1753   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
1754   int stackElementSize = Interpreter::stackElementSize;
1755   int offset = extra_slot_offset * stackElementSize;
1756   if (arg_slot.is_constant()) {
1757     offset += arg_slot.as_constant() * stackElementSize;
1758     return offset;
1759   } else {
1760     assert(temp_reg != noreg, "must specify");
1761     sldi(temp_reg, arg_slot.as_register(), exact_log2(stackElementSize));
1762     if (offset != 0)
1763       addi(temp_reg, temp_reg, offset);
1764     return temp_reg;
1765   }
1766 }
1767 
1768 void MacroAssembler::biased_locking_enter(ConditionRegister cr_reg, Register obj_reg,
1769                                           Register mark_reg, Register temp_reg,
1770                                           Register temp2_reg, Label& done, Label* slow_case) {
1771   assert(UseBiasedLocking, "why call this otherwise?");
1772 
1773 #ifdef ASSERT
1774   assert_different_registers(obj_reg, mark_reg, temp_reg, temp2_reg);
1775 #endif
1776 
1777   Label cas_label;
1778 
1779   // Branch to done if fast path fails and no slow_case provided.
1780   Label *slow_case_int = (slow_case != NULL) ? slow_case : &done;
1781 
1782   // Biased locking
1783   // See whether the lock is currently biased toward our thread and
1784   // whether the epoch is still valid
1785   // Note that the runtime guarantees sufficient alignment of JavaThread
1786   // pointers to allow age to be placed into low bits
1787   assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits,
1788          "biased locking makes assumptions about bit layout");
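       //
       // Rough sketch of the 64-bit mark word layouts relevant here (bit widths omitted):
       //   biased:   [ JavaThread* | epoch | age | 1 | 01 ]
       //   unlocked: [ header bits         | age | 0 | 01 ]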
1789 
1790   if (PrintBiasedLockingStatistics) {
1791     load_const(temp_reg, (address) BiasedLocking::total_entry_count_addr(), temp2_reg);
1792     lwz(temp2_reg, 0, temp_reg);
1793     addi(temp2_reg, temp2_reg, 1);
1794     stw(temp2_reg, 0, temp_reg);
1795   }
1796 
1797   andi(temp_reg, mark_reg, markOopDesc::biased_lock_mask_in_place);
1798   cmpwi(cr_reg, temp_reg, markOopDesc::biased_lock_pattern);
1799   bne(cr_reg, cas_label);
1800 
1801   load_klass(temp_reg, obj_reg);
1802 
1803   load_const_optimized(temp2_reg, ~((int) markOopDesc::age_mask_in_place));
1804   ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
1805   orr(temp_reg, R16_thread, temp_reg);
1806   xorr(temp_reg, mark_reg, temp_reg);
1807   andr(temp_reg, temp_reg, temp2_reg);
1808   cmpdi(cr_reg, temp_reg, 0);
1809   if (PrintBiasedLockingStatistics) {
1810     Label l;
1811     bne(cr_reg, l);
1812     load_const(mark_reg, (address) BiasedLocking::biased_lock_entry_count_addr());
1813     lwz(temp2_reg, 0, mark_reg);
1814     addi(temp2_reg, temp2_reg, 1);
1815     stw(temp2_reg, 0, mark_reg);
1816     // restore mark_reg
1817     ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg);
1818     bind(l);
1819   }
1820   beq(cr_reg, done);
1821 
1822   Label try_revoke_bias;
1823   Label try_rebias;
1824 
1825   // At this point we know that the header has the bias pattern and
1826   // that we are not the bias owner in the current epoch. We need to
1827   // figure out more details about the state of the header in order to
1828   // know what operations can be legally performed on the object's
1829   // header.
1830 
1831   // If the low three bits in the xor result aren't clear, that means
1832   // the prototype header is no longer biased and we have to revoke
1833   // the bias on this object.
1834   andi(temp2_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
1835   cmpwi(cr_reg, temp2_reg, 0);
1836   bne(cr_reg, try_revoke_bias);
1837 
1838   // Biasing is still enabled for this data type. See whether the
1839   // epoch of the current bias is still valid, meaning that the epoch
1840   // bits of the mark word are equal to the epoch bits of the
1841   // prototype header. (Note that the prototype header's epoch bits
1842   // only change at a safepoint.) If not, attempt to rebias the object
1843   // toward the current thread. Note that we must be absolutely sure
1844   // that the current epoch is invalid in order to do this because
1845   // otherwise the manipulations it performs on the mark word are
1846   // illegal.
1847 
1848   int shift_amount = 64 - markOopDesc::epoch_shift;
1849   // rotate epoch bits to right (little) end and set other bits to 0
1850   // [ big part | epoch | little part ] -> [ 0..0 | epoch ]
1851   rldicl_(temp2_reg, temp_reg, shift_amount, 64 - markOopDesc::epoch_bits);
1852   // branch if epoch bits are != 0, i.e. they differ, because the epoch has been incremented
1853   bne(CCR0, try_rebias);
1854 
1855   // The epoch of the current bias is still valid but we know nothing
1856   // about the owner; it might be set or it might be clear. Try to
1857   // acquire the bias of the object using an atomic operation. If this
1858   // fails we will go in to the runtime to revoke the object's bias.
1859   // Note that we first construct the presumed unbiased header so we
1860   // don't accidentally blow away another thread's valid bias.
1861   andi(mark_reg, mark_reg, (markOopDesc::biased_lock_mask_in_place |
1862                                 markOopDesc::age_mask_in_place |
1863                                 markOopDesc::epoch_mask_in_place));
1864   orr(temp_reg, R16_thread, mark_reg);
1865 
1866   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
1867 
1868   // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg).
1869   cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
1870            /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
1871            /*where=*/obj_reg,
1872            MacroAssembler::MemBarAcq,
1873            MacroAssembler::cmpxchgx_hint_acquire_lock(),
1874            noreg, slow_case_int); // bail out if failed
1875 
1876   // If the biasing toward our thread failed, this means that
1877   // another thread succeeded in biasing it toward itself and we
1878   // need to revoke that bias. The revocation will occur in the
1879   // interpreter runtime in the slow case.
1880   if (PrintBiasedLockingStatistics) {
1881     load_const(temp_reg, (address) BiasedLocking::anonymously_biased_lock_entry_count_addr(), temp2_reg);
1882     lwz(temp2_reg, 0, temp_reg);
1883     addi(temp2_reg, temp2_reg, 1);
1884     stw(temp2_reg, 0, temp_reg);
1885   }
1886   b(done);
1887 
1888   bind(try_rebias);
1889   // At this point we know the epoch has expired, meaning that the
1890   // current "bias owner", if any, is actually invalid. Under these
1891   // circumstances _only_, we are allowed to use the current header's
1892   // value as the comparison value when doing the cas to acquire the
1893   // bias in the current epoch. In other words, we allow transfer of
1894   // the bias from one thread to another directly in this situation.
1895   andi(temp_reg, mark_reg, markOopDesc::age_mask_in_place);
1896   orr(temp_reg, R16_thread, temp_reg);
1897   load_klass(temp2_reg, obj_reg);
1898   ld(temp2_reg, in_bytes(Klass::prototype_header_offset()), temp2_reg);
1899   orr(temp_reg, temp_reg, temp2_reg);
1900 
1901   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
1902 
1903   // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg).
1904   cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
1905                  /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
1906                  /*where=*/obj_reg,
1907                  MacroAssembler::MemBarAcq,
1908                  MacroAssembler::cmpxchgx_hint_acquire_lock(),
1909                  noreg, slow_case_int); // bail out if failed
1910 
1911   // If the biasing toward our thread failed, this means that
1912   // another thread succeeded in biasing it toward itself and we
1913   // need to revoke that bias. The revocation will occur in the
1914   // interpreter runtime in the slow case.
1915   if (PrintBiasedLockingStatistics) {
1916     load_const(temp_reg, (address) BiasedLocking::rebiased_lock_entry_count_addr(), temp2_reg);
1917     lwz(temp2_reg, 0, temp_reg);
1918     addi(temp2_reg, temp2_reg, 1);
1919     stw(temp2_reg, 0, temp_reg);
1920   }
1921   b(done);
1922 
1923   bind(try_revoke_bias);
1924   // The prototype mark in the klass doesn't have the bias bit set any
1925   // more, indicating that objects of this data type are not supposed
1926   // to be biased any more. We are going to try to reset the mark of
1927   // this object to the prototype value and fall through to the
1928   // CAS-based locking scheme. Note that if our CAS fails, it means
1929   // that another thread raced us for the privilege of revoking the
1930   // bias of this particular object, so it's okay to continue in the
1931   // normal locking code.
1932   load_klass(temp_reg, obj_reg);
1933   ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
1934   andi(temp2_reg, mark_reg, markOopDesc::age_mask_in_place);
1935   orr(temp_reg, temp_reg, temp2_reg);
1936 
1937   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
1938 
1939   // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg).
1940   cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
1941                  /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
1942                  /*where=*/obj_reg,
1943                  MacroAssembler::MemBarAcq,
1944                  MacroAssembler::cmpxchgx_hint_acquire_lock());
1945 
1946   // reload markOop in mark_reg before continuing with lightweight locking
1947   ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg);
1948 
1949   // Fall through to the normal CAS-based lock, because no matter what
1950   // the result of the above CAS, some thread must have succeeded in
1951   // removing the bias bit from the object's header.
1952   if (PrintBiasedLockingStatistics) {
1953     Label l;
1954     bne(cr_reg, l);
1955     load_const(temp_reg, (address) BiasedLocking::revoked_lock_entry_count_addr(), temp2_reg);
1956     lwz(temp2_reg, 0, temp_reg);
1957     addi(temp2_reg, temp2_reg, 1);
1958     stw(temp2_reg, 0, temp_reg);
1959     bind(l);
1960   }
1961 
1962   bind(cas_label);
1963 }
1964 
1965 void MacroAssembler::biased_locking_exit (ConditionRegister cr_reg, Register mark_addr, Register temp_reg, Label& done) {
1966   // Check for biased locking unlock case, which is a no-op
1967   // Note: we do not have to check the thread ID for two reasons.
1968   // First, the interpreter checks for IllegalMonitorStateException at
1969   // a higher level. Second, if the bias was revoked while we held the
1970   // lock, the object could not be rebiased toward another thread, so
1971   // the bias bit would be clear.
1972 
1973   ld(temp_reg, 0, mark_addr);
1974   andi(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
1975 
1976   cmpwi(cr_reg, temp_reg, markOopDesc::biased_lock_pattern);
1977   beq(cr_reg, done);
1978 }
1979 
1980 // TM on PPC64.
1981 void MacroAssembler::atomic_inc_ptr(Register addr, Register result, int simm16) {
1982   Label retry;
1983   bind(retry);
1984   ldarx(result, addr, /*hint*/ false);
1985   addi(result, result, simm16);
1986   stdcx_(result, addr);
1987   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1988     bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
1989   } else {
1990     bne(                  CCR0, retry); // stXcx_ sets CCR0
1991   }
1992 }
1993 
1994 void MacroAssembler::atomic_ori_int(Register addr, Register result, int uimm16) {
1995   Label retry;
1996   bind(retry);
1997   lwarx(result, addr, /*hint*/ false);
1998   ori(result, result, uimm16);
1999   stwcx_(result, addr);
2000   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
2001     bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
2002   } else {
2003     bne(                  CCR0, retry); // stXcx_ sets CCR0
2004   }
2005 }
2006 
2007 #if INCLUDE_RTM_OPT
2008 
2009 // Update rtm_counters based on abort status
2010 // input: abort_status
2011 //        rtm_counters (RTMLockingCounters*)
2012 void MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters_Reg) {
2013   // Mapping to keep PreciseRTMLockingStatistics similar to x86.
2014   // x86 ppc (! means inverted, ? means not the same)
2015   //  0   31  Set if abort caused by XABORT instruction.
2016   //  1  ! 7  If set, the transaction may succeed on a retry. This bit is always clear if bit 0 is set.
2017   //  2   13  Set if another logical processor conflicted with a memory address that was part of the transaction that aborted.
2018   //  3   10  Set if an internal buffer overflowed.
2019   //  4  ?12  Set if a debug breakpoint was hit.
2020   //  5  ?32  Set if an abort occurred during execution of a nested transaction.
2021   const  int tm_failure_bit[] = {Assembler::tm_tabort, // Note: Seems like signal handler sets this, too.
2022                                  Assembler::tm_failure_persistent, // inverted: transient
2023                                  Assembler::tm_trans_cf,
2024                                  Assembler::tm_footprint_of,
2025                                  Assembler::tm_non_trans_cf,
2026                                  Assembler::tm_suspended};
2027   const bool tm_failure_inv[] = {false, true, false, false, false, false};
2028   assert(sizeof(tm_failure_bit)/sizeof(int) == RTMLockingCounters::ABORT_STATUS_LIMIT, "adapt mapping!");
2029 
2030   const Register addr_Reg = R0;
2031   // Keep track of offset to where rtm_counters_Reg had pointed to.
2032   int counters_offs = RTMLockingCounters::abort_count_offset();
2033   addi(addr_Reg, rtm_counters_Reg, counters_offs);
2034   const Register temp_Reg = rtm_counters_Reg;
2035 
2036   //atomic_inc_ptr(addr_Reg, temp_Reg); We don't increment atomically
2037   ldx(temp_Reg, addr_Reg);
2038   addi(temp_Reg, temp_Reg, 1);
2039   stdx(temp_Reg, addr_Reg);
2040 
2041   if (PrintPreciseRTMLockingStatistics) {
2042     int counters_offs_delta = RTMLockingCounters::abortX_count_offset() - counters_offs;
2043 
2044     //mftexasr(abort_status); done by caller
2045     for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
2046       counters_offs += counters_offs_delta;
2047       li(temp_Reg, counters_offs_delta); // can't use addi with R0
2048       add(addr_Reg, addr_Reg, temp_Reg); // point to next counter
2049       counters_offs_delta = sizeof(uintx);
2050 
2051       Label check_abort;
2052       rldicr_(temp_Reg, abort_status, tm_failure_bit[i], 0);
2053       if (tm_failure_inv[i]) {
2054         bne(CCR0, check_abort);
2055       } else {
2056         beq(CCR0, check_abort);
2057       }
2058       //atomic_inc_ptr(addr_Reg, temp_Reg); We don't increment atomically
2059       ldx(temp_Reg, addr_Reg);
2060       addi(temp_Reg, temp_Reg, 1);
2061       stdx(temp_Reg, addr_Reg);
2062       bind(check_abort);
2063     }
2064   }
2065   li(temp_Reg, -counters_offs); // can't use addi with R0
2066   add(rtm_counters_Reg, addr_Reg, temp_Reg); // restore
2067 }
2068 
2069 // Branch if ((random & (count-1)) != 0); count must be a power of 2.
2070 // tmp and CR0 are killed
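     // The low-order bits of the time base register (mftb) serve as a cheap
     // pseudo-random value here.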
2071 void MacroAssembler::branch_on_random_using_tb(Register tmp, int count, Label& brLabel) {
2072   mftb(tmp);
2073   andi_(tmp, tmp, count-1);
2074   bne(CCR0, brLabel);
2075 }
2076 
2077 // Perform abort ratio calculation, set no_rtm bit if high ratio.
2078 // input:  rtm_counters_Reg (RTMLockingCounters* address) - KILLED
2079 void MacroAssembler::rtm_abort_ratio_calculation(Register rtm_counters_Reg,
2080                                                  RTMLockingCounters* rtm_counters,
2081                                                  Metadata* method_data) {
2082   Label L_done, L_check_always_rtm1, L_check_always_rtm2;
2083 
2084   if (RTMLockingCalculationDelay > 0) {
2085     // Delay calculation.
2086     ld(rtm_counters_Reg, (RegisterOrConstant)(intptr_t)RTMLockingCounters::rtm_calculation_flag_addr());
2087     cmpdi(CCR0, rtm_counters_Reg, 0);
2088     beq(CCR0, L_done);
2089     load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload
2090   }
2091   // Abort ratio calculation only if abort_count > RTMAbortThreshold.
2092   //   Aborted transactions = abort_count * 100
2093   //   All transactions = total_count *  RTMTotalCountIncrRate
2094   //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
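       //   I.e., no_rtm is set when abort_count * 100 >= total_count * RTMTotalCountIncrRate * RTMAbortRatio.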
2095   ld(R0, RTMLockingCounters::abort_count_offset(), rtm_counters_Reg);
2096   cmpdi(CCR0, R0, RTMAbortThreshold);
2097   blt(CCR0, L_check_always_rtm2);
2098   mulli(R0, R0, 100);
2099 
2100   const Register tmpReg = rtm_counters_Reg;
2101   ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg);
2102   mulli(tmpReg, tmpReg, RTMTotalCountIncrRate);
2103   mulli(tmpReg, tmpReg, RTMAbortRatio);
2104   cmpd(CCR0, R0, tmpReg);
2105   blt(CCR0, L_check_always_rtm1); // jump to reload
2106   if (method_data != NULL) {
2107     // Set rtm_state to "no rtm" in MDO.
2108     // Not using a metadata relocation. Method and Class Loader are kept alive anyway.
2109     // (See nmethod::metadata_do and CodeBuffer::finalize_oop_references.)
2110     load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg);
2111     atomic_ori_int(R0, tmpReg, NoRTM);
2112   }
2113   b(L_done);
2114 
2115   bind(L_check_always_rtm1);
2116   load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload
2117   bind(L_check_always_rtm2);
2118   ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg);
2119   cmpdi(CCR0, tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
2120   blt(CCR0, L_done);
2121   if (method_data != NULL) {
2122     // Set rtm_state to "always rtm" in MDO.
2123     // Not using a metadata relocation. See above.
2124     load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg);
2125     atomic_ori_int(R0, tmpReg, UseRTM);
2126   }
2127   bind(L_done);
2128 }
2129 
2130 // Update counters and perform abort ratio calculation.
2131 // input: abort_status_Reg
2132 void MacroAssembler::rtm_profiling(Register abort_status_Reg, Register temp_Reg,
2133                                    RTMLockingCounters* rtm_counters,
2134                                    Metadata* method_data,
2135                                    bool profile_rtm) {
2136 
2137   assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2138   // Update rtm counters based on state at abort.
2139   // Reads abort_status_Reg, updates flags.
2140   assert_different_registers(abort_status_Reg, temp_Reg);
2141   load_const_optimized(temp_Reg, (address)rtm_counters, R0);
2142   rtm_counters_update(abort_status_Reg, temp_Reg);
2143   if (profile_rtm) {
2144     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2145     rtm_abort_ratio_calculation(temp_Reg, rtm_counters, method_data);
2146   }
2147 }
2148 
2149 // Retry on abort if abort's status indicates non-persistent failure.
2150 // inputs: retry_count_Reg
2151 //       : abort_status_Reg
2152 // output: retry_count_Reg decremented by 1
2153 void MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg,
2154                                              Label& retryLabel, Label* checkRetry) {
2155   Label doneRetry;
2156   rldicr_(R0, abort_status_Reg, tm_failure_persistent, 0);
2157   bne(CCR0, doneRetry);
2158   if (checkRetry) { bind(*checkRetry); }
2159   addic_(retry_count_Reg, retry_count_Reg, -1);
2160   blt(CCR0, doneRetry);
2161   smt_yield(); // Can't use wait(). No permission (SIGILL).
2162   b(retryLabel);
2163   bind(doneRetry);
2164 }
2165 
2166 // Spin and retry if lock is busy.
2167 // inputs: box_Reg (monitor address)
2168 //       : retry_count_Reg
2169 // output: retry_count_Reg decremented by 1
2170 // CTR is killed
2171 void MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register owner_addr_Reg, Label& retryLabel) {
2172   Label SpinLoop, doneRetry;
2173   addic_(retry_count_Reg, retry_count_Reg, -1);
2174   blt(CCR0, doneRetry);
2175   li(R0, RTMSpinLoopCount);
2176   mtctr(R0);
2177 
2178   bind(SpinLoop);
2179   smt_yield(); // Can't use waitrsv(). No permission (SIGILL).
2180   bdz(retryLabel);
2181   ld(R0, 0, owner_addr_Reg);
2182   cmpdi(CCR0, R0, 0);
2183   bne(CCR0, SpinLoop);
2184   b(retryLabel);
2185 
2186   bind(doneRetry);
2187 }
2188 
2189 // Use RTM for normal stack locks.
2190 // Input: objReg (object to lock)
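     //
     // In outline: start a transaction (tbegin), re-check inside the transaction
     // that the object is still unlocked and, if so, leave the transaction open
     // and fall through to DONE_LABEL (the lock is elided). Otherwise end or abort
     // the transaction and either retry or fall back to the regular locking path.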
2191 void MacroAssembler::rtm_stack_locking(ConditionRegister flag,
2192                                        Register obj, Register mark_word, Register tmp,
2193                                        Register retry_on_abort_count_Reg,
2194                                        RTMLockingCounters* stack_rtm_counters,
2195                                        Metadata* method_data, bool profile_rtm,
2196                                        Label& DONE_LABEL, Label& IsInflated) {
2197   assert(UseRTMForStackLocks, "why call this otherwise?");
2198   assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
2199   Label L_rtm_retry, L_decrement_retry, L_on_abort;
2200 
2201   if (RTMRetryCount > 0) {
2202     load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
2203     bind(L_rtm_retry);
2204   }
2205   andi_(R0, mark_word, markOopDesc::monitor_value);  // inflated vs stack-locked|neutral|biased
2206   bne(CCR0, IsInflated);
2207 
2208   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2209     Label L_noincrement;
2210     if (RTMTotalCountIncrRate > 1) {
2211       branch_on_random_using_tb(tmp, (int)RTMTotalCountIncrRate, L_noincrement);
2212     }
2213     assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
2214     load_const_optimized(tmp, (address)stack_rtm_counters->total_count_addr(), R0);
2215     //atomic_inc_ptr(tmp, /*temp, will be reloaded*/mark_word); We don't increment atomically
2216     ldx(mark_word, tmp);
2217     addi(mark_word, mark_word, 1);
2218     stdx(mark_word, tmp);
2219     bind(L_noincrement);
2220   }
2221   tbegin_();
2222   beq(CCR0, L_on_abort);
2223   ld(mark_word, oopDesc::mark_offset_in_bytes(), obj);         // Reload in transaction, conflicts need to be tracked.
2224   andi(R0, mark_word, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
2225   cmpwi(flag, R0, markOopDesc::unlocked_value);                // bits = 001 unlocked
2226   beq(flag, DONE_LABEL);                                       // all done if unlocked
2227 
2228   if (UseRTMXendForLockBusy) {
2229     tend_();
2230     b(L_decrement_retry);
2231   } else {
2232     tabort_();
2233   }
2234   bind(L_on_abort);
2235   const Register abort_status_Reg = tmp;
2236   mftexasr(abort_status_Reg);
2237   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2238     rtm_profiling(abort_status_Reg, /*temp*/mark_word, stack_rtm_counters, method_data, profile_rtm);
2239   }
2240   ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // reload
2241   if (RTMRetryCount > 0) {
2242     // Retry on lock abort if abort status is not permanent.
2243     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry, &L_decrement_retry);
2244   } else {
2245     bind(L_decrement_retry);
2246   }
2247 }
2248 
2249 // Use RTM for inflating locks
2250 // inputs: obj       (object to lock)
2251 //         mark_word (current header - KILLED)
2252 //         boxReg    (on-stack box address (displaced header location) - KILLED)
2253 void MacroAssembler::rtm_inflated_locking(ConditionRegister flag,
2254                                           Register obj, Register mark_word, Register boxReg,
2255                                           Register retry_on_busy_count_Reg, Register retry_on_abort_count_Reg,
2256                                           RTMLockingCounters* rtm_counters,
2257                                           Metadata* method_data, bool profile_rtm,
2258                                           Label& DONE_LABEL) {
2259   assert(UseRTMLocking, "why call this otherwise?");
2260   Label L_rtm_retry, L_decrement_retry, L_on_abort;
2261   // Clean monitor_value bit to get valid pointer.
2262   int owner_offset = ObjectMonitor::owner_offset_in_bytes() - markOopDesc::monitor_value;
2263 
2264   // Store non-null, using boxReg instead of (intptr_t)markOopDesc::unused_mark().
2265   std(boxReg, BasicLock::displaced_header_offset_in_bytes(), boxReg);
2266   const Register tmpReg = boxReg;
2267   const Register owner_addr_Reg = mark_word;
2268   addi(owner_addr_Reg, mark_word, owner_offset);
2269 
2270   if (RTMRetryCount > 0) {
2271     load_const_optimized(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy.
2272     load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort.
2273     bind(L_rtm_retry);
2274   }
2275   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2276     Label L_noincrement;
2277     if (RTMTotalCountIncrRate > 1) {
2278       branch_on_random_using_tb(R0, (int)RTMTotalCountIncrRate, L_noincrement);
2279     }
2280     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2281     load_const(R0, (address)rtm_counters->total_count_addr(), tmpReg);
2282     //atomic_inc_ptr(R0, tmpReg); We don't increment atomically
2283     ldx(tmpReg, R0);
2284     addi(tmpReg, tmpReg, 1);
2285     stdx(tmpReg, R0);
2286     bind(L_noincrement);
2287   }
2288   tbegin_();
2289   beq(CCR0, L_on_abort);
2290   // We don't reload mark word. Will only be reset at safepoint.
2291   ld(R0, 0, owner_addr_Reg); // Load in transaction, conflicts need to be tracked.
2292   cmpdi(flag, R0, 0);
2293   beq(flag, DONE_LABEL);
2294 
2295   if (UseRTMXendForLockBusy) {
2296     tend_();
2297     b(L_decrement_retry);
2298   } else {
2299     tabort_();
2300   }
2301   bind(L_on_abort);
2302   const Register abort_status_Reg = tmpReg;
2303   mftexasr(abort_status_Reg);
2304   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2305     rtm_profiling(abort_status_Reg, /*temp*/ owner_addr_Reg, rtm_counters, method_data, profile_rtm);
2306     // Restore owner_addr_Reg
2307     ld(mark_word, oopDesc::mark_offset_in_bytes(), obj);
2308 #ifdef ASSERT
2309     andi_(R0, mark_word, markOopDesc::monitor_value);
2310     asm_assert_ne("must be inflated", 0xa754); // Deflating only allowed at safepoint.
2311 #endif
2312     addi(owner_addr_Reg, mark_word, owner_offset);
2313   }
2314   if (RTMRetryCount > 0) {
2315     // Retry on lock abort if abort status is not permanent.
2316     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
2317   }
2318 
2319   // Appears unlocked - try to swing _owner from null to non-null.
2320   cmpxchgd(flag, /*current val*/ R0, (intptr_t)0, /*new val*/ R16_thread, owner_addr_Reg,
2321            MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2322            MacroAssembler::cmpxchgx_hint_acquire_lock(), noreg, &L_decrement_retry, true);
2323 
2324   if (RTMRetryCount > 0) {
2325     // On success we are done, otherwise retry.
2326     b(DONE_LABEL);
2327     bind(L_decrement_retry);
2328     // Spin and retry if lock is busy.
2329     rtm_retry_lock_on_busy(retry_on_busy_count_Reg, owner_addr_Reg, L_rtm_retry);
2330   } else {
2331     bind(L_decrement_retry);
2332   }
2333 }
2334 
2335 #endif //  INCLUDE_RTM_OPT
2336 
2337 // "The box" is the space on the stack where we copy the object mark.
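     //
     // In outline: try to CAS the box address into the object's mark word (stack
     // locking). If the CAS fails, check whether the lock is already owned by the
     // current thread (recursive case). If the mark word has the monitor bit set,
     // take the inflated (ObjectMonitor) path instead.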
2338 void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register oop, Register box,
2339                                                Register temp, Register displaced_header, Register current_header,
2340                                                bool try_bias,
2341                                                RTMLockingCounters* rtm_counters,
2342                                                RTMLockingCounters* stack_rtm_counters,
2343                                                Metadata* method_data,
2344                                                bool use_rtm, bool profile_rtm) {
2345   assert_different_registers(oop, box, temp, displaced_header, current_header);
2346   assert(flag != CCR0, "bad condition register");
2347   Label cont;
2348   Label object_has_monitor;
2349   Label cas_failed;
2350 
2351   // Load markOop from object into displaced_header.
2352   ld(displaced_header, oopDesc::mark_offset_in_bytes(), oop);
2353 
2354 
2355   // Always do locking in runtime.
2356   if (EmitSync & 0x01) {
2357     cmpdi(flag, oop, 0); // Oop can't be 0 here => always false.
2358     return;
2359   }
2360 
2361   if (try_bias) {
2362     biased_locking_enter(flag, oop, displaced_header, temp, current_header, cont);
2363   }
2364 
2365 #if INCLUDE_RTM_OPT
2366   if (UseRTMForStackLocks && use_rtm) {
2367     rtm_stack_locking(flag, oop, displaced_header, temp, /*temp*/ current_header,
2368                       stack_rtm_counters, method_data, profile_rtm,
2369                       cont, object_has_monitor);
2370   }
2371 #endif // INCLUDE_RTM_OPT
2372 
2373   // Handle existing monitor.
2374   if ((EmitSync & 0x02) == 0) {
2375     // The object has an existing monitor iff (mark & monitor_value) != 0.
2376     andi_(temp, displaced_header, markOopDesc::monitor_value);
2377     bne(CCR0, object_has_monitor);
2378   }
2379 
2380   // Set displaced_header to be (markOop of object | UNLOCK_VALUE).
2381   ori(displaced_header, displaced_header, markOopDesc::unlocked_value);
2382 
2383   // Load Compare Value application register.
2384 
2385   // Initialize the box. (Must happen before we update the object mark!)
2386   std(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2387 
2388   // Must fence; otherwise, the preceding store(s) may float below the cmpxchg.
2389   // Compare object markOop with mark and if equal exchange scratch1 with object markOop.
2390   // CmpxchgX sets cr_reg to cmpX(current, displaced).
2391   membar(Assembler::StoreStore);
2392   cmpxchgd(/*flag=*/flag,
2393            /*current_value=*/current_header,
2394            /*compare_value=*/displaced_header,
2395            /*exchange_value=*/box,
2396            /*where=*/oop,
2397            MacroAssembler::MemBarAcq,
2398            MacroAssembler::cmpxchgx_hint_acquire_lock(),
2399            noreg,
2400            &cas_failed);
2401   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2402 
2403   // If the compare-and-exchange succeeded, then we found an unlocked
2404   // object and we have now locked it.
2405   b(cont);
2406 
2407   bind(cas_failed);
2408   // We did not see an unlocked object so try the fast recursive case.
2409 
2410   // Check if the owner is self by comparing the value in the markOop of object
2411   // (current_header) with the stack pointer.
2412   sub(current_header, current_header, R1_SP);
2413   load_const_optimized(temp, (address) (~(os::vm_page_size()-1) |
2414                                         markOopDesc::lock_mask_in_place));
2415 
2416   and_(R0/*==0?*/, current_header, temp);
2417   // If the condition is true we are done (recursive lock) and hence we can store 0
2418   // as the displaced header in the box, which indicates that it is a recursive lock.
2419   mcrf(flag,CCR0);
2420   std(R0/*==0, perhaps*/, BasicLock::displaced_header_offset_in_bytes(), box);
2421 
2422   // Handle existing monitor.
2423   if ((EmitSync & 0x02) == 0) {
2424     b(cont);
2425 
2426     bind(object_has_monitor);
2427     // The object's monitor m is unlocked iff m->owner == NULL,
2428     // otherwise m->owner may contain a thread or a stack address.
2429 
2430 #if INCLUDE_RTM_OPT
2431     // Use the same RTM locking code in 32- and 64-bit VM.
2432     if (use_rtm) {
2433       rtm_inflated_locking(flag, oop, displaced_header, box, temp, /*temp*/ current_header,
2434                            rtm_counters, method_data, profile_rtm, cont);
2435     } else {
2436 #endif // INCLUDE_RTM_OPT
2437 
2438     // Try to CAS m->owner from NULL to current thread.
2439     addi(temp, displaced_header, ObjectMonitor::owner_offset_in_bytes()-markOopDesc::monitor_value);
2440     li(displaced_header, 0);
2441     // CmpxchgX sets flag to cmpX(current, displaced).
2442     cmpxchgd(/*flag=*/flag,
2443              /*current_value=*/current_header,
2444              /*compare_value=*/(intptr_t)0,
2445              /*exchange_value=*/R16_thread,
2446              /*where=*/temp,
2447              MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2448              MacroAssembler::cmpxchgx_hint_acquire_lock());
2449 
2450     // Store a non-null value into the box.
2451     std(box, BasicLock::displaced_header_offset_in_bytes(), box);
2452 
2453 #   ifdef ASSERT
2454     bne(flag, cont);
2455     // We have acquired the monitor, check some invariants.
2456     addi(/*monitor=*/temp, temp, -ObjectMonitor::owner_offset_in_bytes());
2457     // Invariant 1: _recursions should be 0.
2458     //assert(ObjectMonitor::recursions_size_in_bytes() == 8, "unexpected size");
2459     asm_assert_mem8_is_zero(ObjectMonitor::recursions_offset_in_bytes(), temp,
2460                             "monitor->_recursions should be 0", -1);
2461     // Invariant 2: OwnerIsThread shouldn't be 0.
2462     //assert(ObjectMonitor::OwnerIsThread_size_in_bytes() == 4, "unexpected size");
2463     //asm_assert_mem4_isnot_zero(ObjectMonitor::OwnerIsThread_offset_in_bytes(), temp,
2464     //                           "monitor->OwnerIsThread shouldn't be 0", -1);
2465 #   endif
2466 
2467 #if INCLUDE_RTM_OPT
2468     } // use_rtm()
2469 #endif
2470   }
2471 
2472   bind(cont);
2473   // flag == EQ indicates success
2474   // flag == NE indicates failure
2475 }
2476 
2477 void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register oop, Register box,
2478                                                  Register temp, Register displaced_header, Register current_header,
2479                                                  bool try_bias, bool use_rtm) {
2480   assert_different_registers(oop, box, temp, displaced_header, current_header);
2481   assert(flag != CCR0, "bad condition register");
2482   Label cont;
2483   Label object_has_monitor;
2484 
2485   // Always do locking in runtime.
2486   if (EmitSync & 0x01) {
2487     cmpdi(flag, oop, 0); // Oop can't be 0 here => always false.
2488     return;
2489   }
2490 
2491   if (try_bias) {
2492     biased_locking_exit(flag, oop, current_header, cont);
2493   }
2494 
2495 #if INCLUDE_RTM_OPT
2496   if (UseRTMForStackLocks && use_rtm) {
2497     assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
2498     Label L_regular_unlock;
2499     ld(current_header, oopDesc::mark_offset_in_bytes(), oop);         // fetch markword
2500     andi(R0, current_header, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
2501     cmpwi(flag, R0, markOopDesc::unlocked_value);                     // bits = 001 unlocked
2502     bne(flag, L_regular_unlock);                                      // else RegularLock
2503     tend_();                                                          // otherwise end...
2504     b(cont);                                                          // ... and we're done
2505     bind(L_regular_unlock);
2506   }
2507 #endif
2508 
2509   // Find the lock address and load the displaced header from the stack.
2510   ld(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2511 
2512   // If the displaced header is 0, we have a recursive unlock.
2513   cmpdi(flag, displaced_header, 0);
2514   beq(flag, cont);
2515 
2516   // Handle existing monitor.
2517   if ((EmitSync & 0x02) == 0) {
2518     // The object has an existing monitor iff (mark & monitor_value) != 0.
2519     RTM_OPT_ONLY( if (!(UseRTMForStackLocks && use_rtm)) ) // skip load if already done
2520     ld(current_header, oopDesc::mark_offset_in_bytes(), oop);
2521     andi_(R0, current_header, markOopDesc::monitor_value);
2522     bne(CCR0, object_has_monitor);
2523   }
2524 
2525   // Check if it is still a lightweight lock; this is true if we see
2526   // the stack address of the basicLock in the markOop of the object.
2527   // Cmpxchg sets flag to cmpd(current_header, box).
2528   cmpxchgd(/*flag=*/flag,
2529            /*current_value=*/current_header,
2530            /*compare_value=*/box,
2531            /*exchange_value=*/displaced_header,
2532            /*where=*/oop,
2533            MacroAssembler::MemBarRel,
2534            MacroAssembler::cmpxchgx_hint_release_lock(),
2535            noreg,
2536            &cont);
2537 
2538   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2539 
2540   // Handle existing monitor.
2541   if ((EmitSync & 0x02) == 0) {
2542     b(cont);
2543 
2544     bind(object_has_monitor);
2545     addi(current_header, current_header, -markOopDesc::monitor_value); // monitor
2546     ld(temp,             ObjectMonitor::owner_offset_in_bytes(), current_header);
2547 
2548     // It's inflated.
2549 #if INCLUDE_RTM_OPT
2550     if (use_rtm) {
2551       Label L_regular_inflated_unlock;
2552       // Clean monitor_value bit to get valid pointer
2553       cmpdi(flag, temp, 0);
2554       bne(flag, L_regular_inflated_unlock);
2555       tend_();
2556       b(cont);
2557       bind(L_regular_inflated_unlock);
2558     }
2559 #endif
2560 
2561     ld(displaced_header, ObjectMonitor::recursions_offset_in_bytes(), current_header);
2562     xorr(temp, R16_thread, temp);      // Will be 0 if we are the owner.
2563     orr(temp, temp, displaced_header); // Will be 0 if there are 0 recursions.
2564     cmpdi(flag, temp, 0);
2565     bne(flag, cont);
2566 
2567     ld(temp,             ObjectMonitor::EntryList_offset_in_bytes(), current_header);
2568     ld(displaced_header, ObjectMonitor::cxq_offset_in_bytes(), current_header);
2569     orr(temp, temp, displaced_header); // Will be 0 if both are 0.
2570     cmpdi(flag, temp, 0);
2571     bne(flag, cont);
2572     release();
2573     std(temp, ObjectMonitor::owner_offset_in_bytes(), current_header);
2574   }
2575 
2576   bind(cont);
2577   // flag == EQ indicates success
2578   // flag == NE indicates failure
2579 }
2580 
2581 // Write the serialization page so the VM thread can do a pseudo remote membar.
2582 // We use the current thread pointer to calculate a thread-specific
2583 // offset to write to within the page. This minimizes bus traffic
2584 // due to cache line collisions.
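     // The address written to is
     //   get_memory_serialize_page()
     //     + ((thread >> get_serialize_page_shift_count()) & (vm_page_size() - sizeof(int))).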
2585 void MacroAssembler::serialize_memory(Register thread, Register tmp1, Register tmp2) {
2586   srdi(tmp2, thread, os::get_serialize_page_shift_count());
2587 
2588   int mask = os::vm_page_size() - sizeof(int);
2589   if (Assembler::is_simm(mask, 16)) {
2590     andi(tmp2, tmp2, mask);
2591   } else {
2592     lis(tmp1, (int)((signed short) (mask >> 16)));
2593     ori(tmp1, tmp1, mask & 0x0000ffff);
2594     andr(tmp2, tmp2, tmp1);
2595   }
2596 
2597   load_const(tmp1, (long) os::get_memory_serialize_page());
2598   release();
2599   stwx(R0, tmp1, tmp2);
2600 }
2601 
2602 
2603 // GC barrier helper macros
2604 
2605 // Write the card table byte if needed.
2606 void MacroAssembler::card_write_barrier_post(Register Rstore_addr, Register Rnew_val, Register Rtmp) {
2607   CardTableModRefBS* bs =
2608     barrier_set_cast<CardTableModRefBS>(Universe::heap()->barrier_set());
2609   assert(bs->kind() == BarrierSet::CardTableForRS ||
2610          bs->kind() == BarrierSet::CardTableExtension, "wrong barrier");
2611 #ifdef ASSERT
2612   cmpdi(CCR0, Rnew_val, 0);
2613   asm_assert_ne("null oop not allowed", 0x321);
2614 #endif
2615   card_table_write(bs->byte_map_base, Rtmp, Rstore_addr);
2616 }
2617 
2618 // Write the card table byte.
2619 void MacroAssembler::card_table_write(jbyte* byte_map_base, Register Rtmp, Register Robj) {
2620   assert_different_registers(Robj, Rtmp, R0);
2621   load_const_optimized(Rtmp, (address)byte_map_base, R0);
2622   srdi(Robj, Robj, CardTableModRefBS::card_shift);
2623   li(R0, 0); // dirty
2624   if (UseConcMarkSweepGC) membar(Assembler::StoreStore);
2625   stbx(R0, Rtmp, Robj);
2626 }
2627 
2628 #if INCLUDE_ALL_GCS
2629 // General G1 pre-barrier generator.
2630 // Goal: record the previous value if it is not null.
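     //
     // In outline:
     //   if (SATB marking is active && pre_val != NULL) {
     //     if (the thread's SATB queue has room) enqueue pre_val in the buffer;
     //     else                                  call the g1_wb_pre runtime entry;
     //   }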
2631 void MacroAssembler::g1_write_barrier_pre(Register Robj, RegisterOrConstant offset, Register Rpre_val,
2632                                           Register Rtmp1, Register Rtmp2, bool needs_frame) {
2633   Label runtime, filtered;
2634 
2635   // Is marking active?
2636   if (in_bytes(SATBMarkQueue::byte_width_of_active()) == 4) {
2637     lwz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_active()), R16_thread);
2638   } else {
2639     guarantee(in_bytes(SATBMarkQueue::byte_width_of_active()) == 1, "Assumption");
2640     lbz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_active()), R16_thread);
2641   }
2642   cmpdi(CCR0, Rtmp1, 0);
2643   beq(CCR0, filtered);
2644 
2645   // Do we need to load the previous value?
2646   if (Robj != noreg) {
2647     // Load the previous value...
2648     if (UseCompressedOops) {
2649       lwz(Rpre_val, offset, Robj);
2650     } else {
2651       ld(Rpre_val, offset, Robj);
2652     }
2653     // Previous value has been loaded into Rpre_val.
2654   }
2655   assert(Rpre_val != noreg, "must have a real register");
2656 
2657   // Is the previous value null?
2658   cmpdi(CCR0, Rpre_val, 0);
2659   beq(CCR0, filtered);
2660 
2661   if (Robj != noreg && UseCompressedOops) {
2662     decode_heap_oop_not_null(Rpre_val);
2663   }
2664 
2665   // OK, it's not filtered, so we'll need to enqueue the previous value.
2666   // In the normal case we can record it in the thread-local SATB buffer;
2667   // if the buffer is full (index == 0) we have to call into the runtime
2668   // (see the slow path below).
2669 
2670   // Can we store original value in the thread's buffer?
2671   // Is index == 0?
2672   // (The index field is typed as size_t.)
2673   const Register Rbuffer = Rtmp1, Rindex = Rtmp2;
2674 
2675   ld(Rindex, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_index()), R16_thread);
2676   cmpdi(CCR0, Rindex, 0);
2677   beq(CCR0, runtime); // If index == 0, goto runtime.
2678   ld(Rbuffer, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_buf()), R16_thread);
2679 
2680   addi(Rindex, Rindex, -wordSize); // Decrement index.
2681   std(Rindex, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_index()), R16_thread);
2682 
2683   // Record the previous value.
2684   stdx(Rpre_val, Rbuffer, Rindex);
2685   b(filtered);
2686 
2687   bind(runtime);
2688 
2689   // The VM call needs a frame to access (write) volatile registers.
2690   if (needs_frame) {
2691     save_LR_CR(Rtmp1);
2692     push_frame_reg_args(0, Rtmp2);
2693   }
2694 
2695   if (Rpre_val->is_volatile() && Robj == noreg) mr(R31, Rpre_val); // Save pre_val across C call if it was preloaded.
2696   call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), Rpre_val, R16_thread);
2697   if (Rpre_val->is_volatile() && Robj == noreg) mr(Rpre_val, R31); // restore
2698 
2699   if (needs_frame) {
2700     pop_frame();
2701     restore_LR_CR(Rtmp1);
2702   }
2703 
2704   bind(filtered);
2705 }
2706 
2707 // General G1 post-barrier generator
2708 // Store cross-region card.
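// The filtering and card marking below correspond roughly to this sketch:
//   if (((store_addr ^ new_val) >> LogOfHRGrainBytes) == 0) goto filtered;   // Same region.
//   card = byte_map_base + (store_addr >> card_shift);
//   if (*card == g1_young_card_val()) goto filtered;
//   StoreLoad barrier, then reload *card;
//   if (*card == dirty_card_val()) goto filtered;
//   *card = dirty_card_val();
//   enqueue the card address in the thread's dirty card queue, or call
//   SharedRuntime::g1_wb_post(card, thread) if the queue is full (index == 0).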
2709 void MacroAssembler::g1_write_barrier_post(Register Rstore_addr, Register Rnew_val, Register Rtmp1, Register Rtmp2, Register Rtmp3, Label *filtered_ext) {
2710   Label runtime, filtered_int;
2711   Label& filtered = (filtered_ext != NULL) ? *filtered_ext : filtered_int;
2712   assert_different_registers(Rstore_addr, Rnew_val, Rtmp1, Rtmp2);
2713 
2714   G1SATBCardTableLoggingModRefBS* bs =
2715     barrier_set_cast<G1SATBCardTableLoggingModRefBS>(Universe::heap()->barrier_set());
2716 
2717   // Does store cross heap regions?
2718   if (G1RSBarrierRegionFilter) {
2719     xorr(Rtmp1, Rstore_addr, Rnew_val);
2720     srdi_(Rtmp1, Rtmp1, HeapRegion::LogOfHRGrainBytes);
2721     beq(CCR0, filtered);
2722   }
2723 
2724   // Crosses regions, storing NULL?
2725 #ifdef ASSERT
2726   cmpdi(CCR0, Rnew_val, 0);
2727   asm_assert_ne("null oop not allowed (G1)", 0x322); // Checked by caller on PPC64, so following branch is obsolete:
2728   //beq(CCR0, filtered);
2729 #endif
2730 
2731   // Storing region crossing non-NULL, is card already dirty?
2732   assert(sizeof(*bs->byte_map_base) == sizeof(jbyte), "adjust this code");
2733   const Register Rcard_addr = Rtmp1;
2734   Register Rbase = Rtmp2;
2735   load_const_optimized(Rbase, (address)bs->byte_map_base, /*temp*/ Rtmp3);
2736 
2737   srdi(Rcard_addr, Rstore_addr, CardTableModRefBS::card_shift);
2738 
2739   // Get the address of the card.
2740   lbzx(/*card value*/ Rtmp3, Rbase, Rcard_addr);
2741   cmpwi(CCR0, Rtmp3, (int)G1SATBCardTableModRefBS::g1_young_card_val());
2742   beq(CCR0, filtered);
2743 
2744   membar(Assembler::StoreLoad);
2745   lbzx(/*card value*/ Rtmp3, Rbase, Rcard_addr);  // Reload after membar.
2746   cmpwi(CCR0, Rtmp3 /* card value */, CardTableModRefBS::dirty_card_val());
2747   beq(CCR0, filtered);
2748 
2749   // Storing a region crossing, non-NULL oop, card is clean.
2750   // Dirty card and log.
2751   li(Rtmp3, CardTableModRefBS::dirty_card_val());
2752   //release(); // G1: oops are allowed to get visible after dirty marking.
2753   stbx(Rtmp3, Rbase, Rcard_addr);
2754 
2755   add(Rcard_addr, Rbase, Rcard_addr); // This is the address which needs to get enqueued.
2756   Rbase = noreg; // end of lifetime
2757 
2758   const Register Rqueue_index = Rtmp2,
2759                  Rqueue_buf   = Rtmp3;
2760   ld(Rqueue_index, in_bytes(JavaThread::dirty_card_queue_offset() + DirtyCardQueue::byte_offset_of_index()), R16_thread);
2761   cmpdi(CCR0, Rqueue_index, 0);
2762   beq(CCR0, runtime); // index == 0 then jump to runtime
2763   ld(Rqueue_buf, in_bytes(JavaThread::dirty_card_queue_offset() + DirtyCardQueue::byte_offset_of_buf()), R16_thread);
2764 
2765   addi(Rqueue_index, Rqueue_index, -wordSize); // decrement index
2766   std(Rqueue_index, in_bytes(JavaThread::dirty_card_queue_offset() + DirtyCardQueue::byte_offset_of_index()), R16_thread);
2767 
2768   stdx(Rcard_addr, Rqueue_buf, Rqueue_index); // store card
2769   b(filtered);
2770 
2771   bind(runtime);
2772 
2773   // Save the live input values.
2774   call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), Rcard_addr, R16_thread);
2775 
2776   bind(filtered_int);
2777 }
2778 #endif // INCLUDE_ALL_GCS
2779 
2780 // Values for last_Java_pc and last_Java_sp must comply with the rules
2781 // in frame_ppc.hpp.
2782 void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) {
2783   // Always set last_Java_pc and flags first because once last_Java_sp
2784   // is visible, has_last_Java_frame is true and users will look at the
2785   // rest of the fields. (Note: flags should always be zero before we
2786   // get here, so they don't need to be set.)
2787 
2788   // Verify that last_Java_pc was zeroed on return to Java
2789   asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread,
2790                           "last_Java_pc not zeroed before leaving Java", 0x200);
2791 
2792   // When returning from calling out of Java mode, the frame anchor's
2793   // last_Java_pc will always be set to NULL. It is set here so that,
2794   // if we are doing a call to native (not VM) code, we capture the
2795   // known pc and don't have to rely on the native call having a
2796   // standard frame linkage where we can find the pc.
2797   if (last_Java_pc != noreg)
2798     std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
2799 
2800   // Set last_Java_sp last.
2801   std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
2802 }
2803 
2804 void MacroAssembler::reset_last_Java_frame(void) {
2805   asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()),
2806                              R16_thread, "SP was not set, still zero", 0x202);
2807 
2808   BLOCK_COMMENT("reset_last_Java_frame {");
2809   li(R0, 0);
2810 
2811   // _last_Java_sp = 0
2812   std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
2813 
2814   // _last_Java_pc = 0
2815   std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
2816   BLOCK_COMMENT("} reset_last_Java_frame");
2817 }
2818 
2819 void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1) {
2820   assert_different_registers(sp, tmp1);
2821 
2822   // sp points to a TOP_IJAVA_FRAME, retrieve frame's PC via
2823   // TOP_IJAVA_FRAME_ABI.
2824   // FIXME: assert that we really have a TOP_IJAVA_FRAME here!
2825   address entry = pc();
2826   load_const_optimized(tmp1, entry);
2827 
2828   set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1);
2829 }
2830 
2831 void MacroAssembler::get_vm_result(Register oop_result) {
2832   // Read:
2833   //   R16_thread
2834   //   R16_thread->in_bytes(JavaThread::vm_result_offset())
2835   //
2836   // Updated:
2837   //   oop_result
2838   //   R16_thread->in_bytes(JavaThread::vm_result_offset())
2839 
2840   verify_thread();
2841 
2842   ld(oop_result, in_bytes(JavaThread::vm_result_offset()), R16_thread);
2843   li(R0, 0);
2844   std(R0, in_bytes(JavaThread::vm_result_offset()), R16_thread);
2845 
2846   verify_oop(oop_result);
2847 }
2848 
2849 void MacroAssembler::get_vm_result_2(Register metadata_result) {
2850   // Read:
2851   //   R16_thread
2852   //   R16_thread->in_bytes(JavaThread::vm_result_2_offset())
2853   //
2854   // Updated:
2855   //   metadata_result
2856   //   R16_thread->in_bytes(JavaThread::vm_result_2_offset())
2857 
2858   ld(metadata_result, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
2859   li(R0, 0);
2860   std(R0, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
2861 }
2862 
2863 Register MacroAssembler::encode_klass_not_null(Register dst, Register src) {
2864   Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided.
2865   if (Universe::narrow_klass_base() != 0) {
2866     // Use dst as temp if it is free.
2867     sub_const_optimized(dst, current, Universe::narrow_klass_base(), R0);
2868     current = dst;
2869   }
2870   if (Universe::narrow_klass_shift() != 0) {
2871     srdi(dst, current, Universe::narrow_klass_shift());
2872     current = dst;
2873   }
2874   return current;
2875 }
2876 
2877 void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) {
2878   if (UseCompressedClassPointers) {
2879     Register compressedKlass = encode_klass_not_null(ck, klass);
2880     stw(compressedKlass, oopDesc::klass_offset_in_bytes(), dst_oop);
2881   } else {
2882     std(klass, oopDesc::klass_offset_in_bytes(), dst_oop);
2883   }
2884 }
2885 
2886 void MacroAssembler::store_klass_gap(Register dst_oop, Register val) {
2887   if (UseCompressedClassPointers) {
2888     if (val == noreg) {
2889       val = R0;
2890       li(val, 0);
2891     }
2892     stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop); // klass gap if compressed
2893   }
2894 }
2895 
2896 int MacroAssembler::instr_size_for_decode_klass_not_null() {
2897   if (!UseCompressedClassPointers) return 0;
2898   int num_instrs = 1;  // shift or move
2899   if (Universe::narrow_klass_base() != 0) num_instrs = 7;  // shift + load const + add
2900   return num_instrs * BytesPerInstWord;
2901 }
2902 
2903 void MacroAssembler::decode_klass_not_null(Register dst, Register src) {
2904   assert(dst != R0, "Dst reg may not be R0, as R0 is used here.");
2905   if (src == noreg) src = dst;
2906   Register shifted_src = src;
2907   if (Universe::narrow_klass_shift() != 0 ||
2908       (Universe::narrow_klass_base() == 0 && src != dst)) {  // Move required.
2909     shifted_src = dst;
2910     sldi(shifted_src, src, Universe::narrow_klass_shift());
2911   }
2912   if (Universe::narrow_klass_base() != 0) {
2913     add_const_optimized(dst, shifted_src, Universe::narrow_klass_base(), R0);
2914   }
2915 }
2916 
2917 void MacroAssembler::load_klass(Register dst, Register src) {
2918   if (UseCompressedClassPointers) {
2919     lwz(dst, oopDesc::klass_offset_in_bytes(), src);
2920     // Attention: no null check here!
2921     decode_klass_not_null(dst, dst);
2922   } else {
2923     ld(dst, oopDesc::klass_offset_in_bytes(), src);
2924   }
2925 }
2926 
2927 void MacroAssembler::load_klass_with_trap_null_check(Register dst, Register src) {
2928   if (!os::zero_page_read_protected()) {
2929     if (TrapBasedNullChecks) {
2930       trap_null_check(src);
2931     }
2932   }
2933   load_klass(dst, src);
2934 }
2935 
2936 void MacroAssembler::reinit_heapbase(Register d, Register tmp) {
2937   if (Universe::heap() != NULL) {
2938     load_const_optimized(R30, Universe::narrow_ptrs_base(), tmp);
2939   } else {
2940     // Heap not yet allocated. Load indirectly.
2941     int simm16_offset = load_const_optimized(R30, Universe::narrow_ptrs_base_addr(), tmp, true);
2942     ld(R30, simm16_offset, R30);
2943   }
2944 }
2945 
2946 // Clear Array
2947 // Kills both input registers. tmp == R0 is allowed.
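// Rough outline of the strategy below (the cache line size comes from VM_Version,
// typically 128 bytes on PPC64):
//   if (cnt_dwords is too small for dcbz) {
//     clear dword by dword (restloop);
//   } else {
//     clear dwords until base_ptr is cache-line aligned (startloop);
//     clear one full cache line per iteration with dcbz (fastloop);
//     clear the remaining dwords (restloop);
//   }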
2948 void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp) {
2949   // Procedure for large arrays (uses data cache block zero instruction).
2950     Label startloop, fast, fastloop, small_rest, restloop, done;
2951     const int cl_size         = VM_Version::get_cache_line_size(),
2952               cl_dwords       = cl_size>>3,
2953               cl_dw_addr_bits = exact_log2(cl_dwords),
2954               dcbz_min        = 1;                     // Min count of dcbz executions, needs to be >0.
2955 
2956 //2:
2957     cmpdi(CCR1, cnt_dwords, ((dcbz_min+1)<<cl_dw_addr_bits)-1); // Big enough? (ensure >=dcbz_min lines included).
2958     blt(CCR1, small_rest);                                      // Too small.
2959     rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits);           // Extract dword offset within first cache line.
2960     beq(CCR0, fast);                                            // Already 128byte aligned.
2961 
2962     subfic(tmp, tmp, cl_dwords);
2963     mtctr(tmp);                        // Set ctr to hit 128byte boundary (0<ctr<cl_dwords).
2964     subf(cnt_dwords, tmp, cnt_dwords); // rest.
2965     li(tmp, 0);
2966 //10:
2967   bind(startloop);                     // Clear at the beginning to reach 128byte boundary.
2968     std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
2969     addi(base_ptr, base_ptr, 8);
2970     bdnz(startloop);
2971 //13:
2972   bind(fast);                                  // Clear 128byte blocks.
2973     srdi(tmp, cnt_dwords, cl_dw_addr_bits);    // Loop count for 128byte loop (>0).
2974     andi(cnt_dwords, cnt_dwords, cl_dwords-1); // Rest in dwords.
2975     mtctr(tmp);                                // Load counter.
2976 //16:
2977   bind(fastloop);
2978     dcbz(base_ptr);                    // Clear 128byte aligned block.
2979     addi(base_ptr, base_ptr, cl_size);
2980     bdnz(fastloop);
2981     if (InsertEndGroupPPC64) { endgroup(); } else { nop(); }
2982 //20:
2983   bind(small_rest);
2984     cmpdi(CCR0, cnt_dwords, 0);        // size 0?
2985     beq(CCR0, done);                   // rest == 0
2986     li(tmp, 0);
2987     mtctr(cnt_dwords);                 // Load counter.
2988 //24:
2989   bind(restloop);                      // Clear rest.
2990     std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
2991     addi(base_ptr, base_ptr, 8);
2992     bdnz(restloop);
2993 //27:
2994   bind(done);
2995 }
2996 
2997 /////////////////////////////////////////// String intrinsics ////////////////////////////////////////////
2998 
2999 // Search for a single jchar in a jchar[].
3000 //
3001 // Assumes that result differs from all other registers.
3002 //
3003 // Haystack, needle are the addresses of jchar-arrays.
3004 // NeedleChar is needle[0] if it is known at compile time.
3005 // Haycnt is the length of the haystack. We assume haycnt >=1.
3006 //
3007 // Preserves haystack, haycnt, kills all other registers.
3008 //
3009 // If needle == R0, we search for the constant needleChar.
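// Equivalent scalar logic (illustrative sketch; the emitted loop is 2x unrolled):
//   jchar nc = (needle != R0) ? needle[0] : needleChar;
//   for (int i = 0; i < haycnt; i++) {
//     if (haystack[i] == nc) return i;
//   }
//   return -1;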
3010 void MacroAssembler::string_indexof_1(Register result, Register haystack, Register haycnt,
3011                                       Register needle, jchar needleChar,
3012                                       Register tmp1, Register tmp2) {
3013 
3014   assert_different_registers(result, haystack, haycnt, needle, tmp1, tmp2);
3015 
3016   Label L_InnerLoop, L_FinalCheck, L_Found1, L_Found2, L_Found3, L_NotFound, L_End;
3017   Register needle0 = needle, // Contains needle[0].
3018            addr = tmp1,
3019            ch1 = tmp2,
3020            ch2 = R0;
3021 
3022 //2 (variable) or 3 (const):
3023    if (needle != R0) lhz(needle0, 0, needle); // Preload needle character, needle has len==1.
3024    dcbtct(haystack, 0x00);                        // Indicate R/O access to haystack.
3025 
3026    srwi_(tmp2, haycnt, 1);   // Shift right by exact_log2(UNROLL_FACTOR).
3027    mr(addr, haystack);
3028    beq(CCR0, L_FinalCheck);
3029    mtctr(tmp2);              // Move to count register.
3030 //8:
3031   bind(L_InnerLoop);             // Main work horse (2x unrolled search loop).
3032    lhz(ch1, 0, addr);        // Load characters from haystack.
3033    lhz(ch2, 2, addr);
3034    (needle != R0) ? cmpw(CCR0, ch1, needle0) : cmplwi(CCR0, ch1, needleChar);
3035    (needle != R0) ? cmpw(CCR1, ch2, needle0) : cmplwi(CCR1, ch2, needleChar);
3036    beq(CCR0, L_Found1);   // Did we find the needle?
3037    beq(CCR1, L_Found2);
3038    addi(addr, addr, 4);
3039    bdnz(L_InnerLoop);
3040 //16:
3041   bind(L_FinalCheck);
3042    andi_(R0, haycnt, 1);
3043    beq(CCR0, L_NotFound);
3044    lhz(ch1, 0, addr);        // One position left at which we have to compare.
3045    (needle != R0) ? cmpw(CCR1, ch1, needle0) : cmplwi(CCR1, ch1, needleChar);
3046    beq(CCR1, L_Found3);
3047 //21:
3048   bind(L_NotFound);
3049    li(result, -1);           // Not found.
3050    b(L_End);
3051 
3052   bind(L_Found2);
3053    addi(addr, addr, 2);
3054 //24:
3055   bind(L_Found1);
3056   bind(L_Found3);                  // Return index ...
3057    subf(addr, haystack, addr); // relative to haystack,
3058    srdi(result, addr, 1);      // in characters.
3059   bind(L_End);
3060 }
3061 
3062 
3063 // Implementation of IndexOf for jchar arrays.
3064 //
3065 // The lengths of haystack and needle are not constant, i.e. they are passed in registers.
3066 //
3067 // Preserves registers haystack, needle.
3068 // Kills registers haycnt, needlecnt.
3069 // Assumes that result differs from all other registers.
3070 // Haystack, needle are the addresses of jchar-arrays.
3071 // Haycnt, needlecnt are the lengths of them, respectively.
3072 //
3073 // Needlecntval must be zero or a 15-bit unsigned immediate greater than 1.
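// Equivalent scalar logic (illustrative sketch; the emitted code scans for the first
// two needle characters in a 2x unrolled loop and only then compares the rest):
//   for (int i = 0; i + needlecnt <= haycnt; i++) {
//     int j = 0;
//     while (j < needlecnt && haystack[i + j] == needle[j]) j++;
//     if (j == needlecnt) return i;
//   }
//   return -1;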
3074 void MacroAssembler::string_indexof(Register result, Register haystack, Register haycnt,
3075                                     Register needle, ciTypeArray* needle_values, Register needlecnt, int needlecntval,
3076                                     Register tmp1, Register tmp2, Register tmp3, Register tmp4) {
3077 
3078   // Ensure 0<needlecnt<=haycnt in ideal graph as prerequisite!
3079   Label L_TooShort, L_Found, L_NotFound, L_End;
3080   Register last_addr = haycnt, // Kill haycnt at the beginning.
3081            addr      = tmp1,
3082            n_start   = tmp2,
3083            ch1       = tmp3,
3084            ch2       = R0;
3085 
3086   // **************************************************************************************************
3087   // Prepare for main loop: optimized for needle count >=2, bail out otherwise.
3088   // **************************************************************************************************
3089 
3090 //1 (variable) or 3 (const):
3091    dcbtct(needle, 0x00);    // Indicate R/O access to str1.
3092    dcbtct(haystack, 0x00);  // Indicate R/O access to str2.
3093 
3094   // Compute last haystack addr to use if no match gets found.
3095   if (needlecntval == 0) { // variable needlecnt
3096 //3:
3097    subf(ch1, needlecnt, haycnt);      // Last character index to compare is haycnt-needlecnt.
3098    addi(addr, haystack, -2);          // Accesses use pre-increment.
3099    cmpwi(CCR6, needlecnt, 2);
3100    blt(CCR6, L_TooShort);          // Variable needlecnt: handle short needle separately.
3101    slwi(ch1, ch1, 1);                 // Scale to number of bytes.
3102    lwz(n_start, 0, needle);           // Load first 2 characters of needle.
3103    add(last_addr, haystack, ch1);     // Point to last address to compare (haystack+2*(haycnt-needlecnt)).
3104    addi(needlecnt, needlecnt, -2);    // Rest of needle.
3105   } else { // constant needlecnt
3106    guarantee(needlecntval != 1, "IndexOf with single-character needle must be handled separately");
3107    assert((needlecntval & 0x7fff) == needlecntval, "wrong immediate");
3108 //5:
3109    addi(ch1, haycnt, -needlecntval);  // Last character index to compare is haycnt-needlecnt.
3110    lwz(n_start, 0, needle);           // Load first 2 characters of needle.
3111    addi(addr, haystack, -2);          // Accesses use pre-increment.
3112    slwi(ch1, ch1, 1);                 // Scale to number of bytes.
3113    add(last_addr, haystack, ch1);     // Point to last address to compare (haystack+2*(haycnt-needlecnt)).
3114    li(needlecnt, needlecntval-2);     // Rest of needle.
3115   }
3116 
3117   // Main Loop (now we have at least 3 characters).
3118 //11:
3119   Label L_OuterLoop, L_InnerLoop, L_FinalCheck, L_Comp1, L_Comp2, L_Comp3;
3120   bind(L_OuterLoop); // Search for 1st 2 characters.
3121   Register addr_diff = tmp4;
3122    subf(addr_diff, addr, last_addr); // Difference between already checked address and last address to check.
3123    addi(addr, addr, 2);              // This is the new address we want to use for comparing.
3124    srdi_(ch2, addr_diff, 2);
3125    beq(CCR0, L_FinalCheck);       // 2 characters left?
3126    mtctr(ch2);                       // addr_diff/4
3127 //16:
3128   bind(L_InnerLoop);                // Main work horse (2x unrolled search loop)
3129    lwz(ch1, 0, addr);           // Load 2 characters of haystack (ignore alignment).
3130    lwz(ch2, 2, addr);
3131    cmpw(CCR0, ch1, n_start); // Compare 2 characters (1 would be sufficient but try to reduce branches to CompLoop).
3132    cmpw(CCR1, ch2, n_start);
3133    beq(CCR0, L_Comp1);       // Did we find the needle start?
3134    beq(CCR1, L_Comp2);
3135    addi(addr, addr, 4);
3136    bdnz(L_InnerLoop);
3137 //24:
3138   bind(L_FinalCheck);
3139    rldicl_(addr_diff, addr_diff, 64-1, 63); // Remaining characters not covered by InnerLoop: (addr_diff>>1)&1.
3140    beq(CCR0, L_NotFound);
3141    lwz(ch1, 0, addr);                       // One position left at which we have to compare.
3142    cmpw(CCR1, ch1, n_start);
3143    beq(CCR1, L_Comp3);
3144 //29:
3145   bind(L_NotFound);
3146    li(result, -1); // not found
3147    b(L_End);
3148 
3149 
3150    // **************************************************************************************************
3151    // Special Case: unfortunately, the variable needle case can be called with needlecnt<2
3152    // **************************************************************************************************
3153 //31:
3154  if ((needlecntval >> 1) != 1) { // Skip this block if const needlecnt is 2 or 3 (to reduce code size).
3155   int nopcnt = 5;
3156   if (needlecntval != 0) ++nopcnt; // Balance alignment (other case: see below).
3157   if (needlecntval == 0) {         // We have to handle these cases separately.
3158   Label L_OneCharLoop;
3159   bind(L_TooShort);
3160    mtctr(haycnt);
3161    lhz(n_start, 0, needle);    // First character of needle
3162   bind(L_OneCharLoop);
3163    lhzu(ch1, 2, addr);
3164    cmpw(CCR1, ch1, n_start);
3165    beq(CCR1, L_Found);      // Did we find the one character needle?
3166    bdnz(L_OneCharLoop);
3167    li(result, -1);             // Not found.
3168    b(L_End);
3169   } // 8 instructions, so no impact on alignment.
3170   for (int x = 0; x < nopcnt; ++x) nop();
3171  }
3172 
3173   // **************************************************************************************************
3174   // Regular Case Part II: compare rest of needle (first 2 characters have been compared already)
3175   // **************************************************************************************************
3176 
3177   // Compare the rest
3178 //36 if needlecntval==0, else 37:
3179   bind(L_Comp2);
3180    addi(addr, addr, 2); // First comparison has failed, 2nd one hit.
3181   bind(L_Comp1);            // Addr points to possible needle start.
3182   bind(L_Comp3);            // Could have created a copy and use a different return address but saving code size here.
3183   if (needlecntval != 2) {  // Const needlecnt==2?
3184    if (needlecntval != 3) {
3185     if (needlecntval == 0) beq(CCR6, L_Found); // Variable needlecnt==2?
3186     Register ind_reg = tmp4;
3187     li(ind_reg, 2*2);   // First 2 characters are already compared, use index 2.
3188     mtctr(needlecnt);   // Decremented by 2, still > 0.
3189 //40:
3190    Label L_CompLoop;
3191    bind(L_CompLoop);
3192     lhzx(ch2, needle, ind_reg);
3193     lhzx(ch1, addr, ind_reg);
3194     cmpw(CCR1, ch1, ch2);
3195     bne(CCR1, L_OuterLoop);
3196     addi(ind_reg, ind_reg, 2);
3197     bdnz(L_CompLoop);
3198    } else { // No loop required if there's only one needle character left.
3199     lhz(ch2, 2*2, needle);
3200     lhz(ch1, 2*2, addr);
3201     cmpw(CCR1, ch1, ch2);
3202     bne(CCR1, L_OuterLoop);
3203    }
3204   }
3205   // Return index ...
3206 //46:
3207   bind(L_Found);
3208    subf(addr, haystack, addr); // relative to haystack, ...
3209    srdi(result, addr, 1);      // in characters.
3210 //48:
3211   bind(L_End);
3212 }
3213 
3214 // Implementation of Compare for jchar arrays.
3215 //
3216 // Kills the registers str1, str2, cnt1, cnt2.
3217 // Kills cr0, ctr.
3218 // Assumes that result differs from the input registers.
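// Equivalent scalar logic (illustrative sketch):
//   int lim = MIN2(cnt1, cnt2);
//   for (int i = 0; i < lim; i++) {
//     if (str1[i] != str2[i]) return str1[i] - str2[i];
//   }
//   return cnt1 - cnt2;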
3219 void MacroAssembler::string_compare(Register str1_reg, Register str2_reg, Register cnt1_reg, Register cnt2_reg,
3220                                     Register result_reg, Register tmp_reg) {
3221    assert_different_registers(result_reg, str1_reg, str2_reg, cnt1_reg, cnt2_reg, tmp_reg);
3222 
3223    Label Ldone, Lslow_case, Lslow_loop, Lfast_loop;
3224    Register cnt_diff = R0,
3225             limit_reg = cnt1_reg,
3226             chr1_reg = result_reg,
3227             chr2_reg = cnt2_reg,
3228             addr_diff = str2_reg;
3229 
3230    // Offset 0 should be 32 byte aligned.
3231 //-4:
3232     dcbtct(str1_reg, 0x00);  // Indicate R/O access to str1.
3233     dcbtct(str2_reg, 0x00);  // Indicate R/O access to str2.
3234 //-2:
3235    // Compute min(cnt1, cnt2) and check if 0 (bail out if we don't need to compare characters).
3236     subf(result_reg, cnt2_reg, cnt1_reg);  // difference between cnt1/2
3237     subf_(addr_diff, str1_reg, str2_reg);  // alias?
3238     beq(CCR0, Ldone);                   // Return cnt difference if both point to the same array.
3239     srawi(limit_reg, result_reg, 31);      // generate signmask (cnt1/2 must be non-negative so cnt_diff can't overflow)
3240     mr(cnt_diff, result_reg);
3241     andr(limit_reg, result_reg, limit_reg); // difference or zero (negative): cnt1<cnt2 ? cnt1-cnt2 : 0
3242     add_(limit_reg, cnt2_reg, limit_reg);  // min(cnt1, cnt2)==0?
3243     beq(CCR0, Ldone);                   // return cnt difference if one has 0 length
3244 
3245     lhz(chr1_reg, 0, str1_reg);            // optional: early out if first characters mismatch
3246     lhzx(chr2_reg, str1_reg, addr_diff);   // optional: early out if first characters mismatch
3247     addi(tmp_reg, limit_reg, -1);          // min(cnt1, cnt2)-1
3248     subf_(result_reg, chr2_reg, chr1_reg); // optional: early out if first characters mismatch
3249     bne(CCR0, Ldone);                   // optional: early out if first characters mismatch
3250 
3251    // Set loop counter by scaling down tmp_reg
3252     srawi_(chr2_reg, tmp_reg, exact_log2(4)); // (min(cnt1, cnt2)-1)/4
3253     ble(CCR0, Lslow_case);                 // need >4 characters for fast loop
3254     andi(limit_reg, tmp_reg, 4-1);            // remaining characters
3255 
3256    // Adapt str1_reg str2_reg for the first loop iteration
3257     mtctr(chr2_reg);                 // (min(cnt1, cnt2)-1)/4
3258     addi(limit_reg, limit_reg, 4+1); // compare last 5-8 characters in slow_case if mismatch found in fast_loop
3259 //16:
3260    // Compare the rest of the characters
3261    bind(Lfast_loop);
3262     ld(chr1_reg, 0, str1_reg);
3263     ldx(chr2_reg, str1_reg, addr_diff);
3264     cmpd(CCR0, chr2_reg, chr1_reg);
3265     bne(CCR0, Lslow_case); // return chr1_reg
3266     addi(str1_reg, str1_reg, 4*2);
3267     bdnz(Lfast_loop);
3268     addi(limit_reg, limit_reg, -4); // no mismatch found in fast_loop, only 1-4 characters missing
3269 //23:
3270    bind(Lslow_case);
3271     mtctr(limit_reg);
3272 //24:
3273    bind(Lslow_loop);
3274     lhz(chr1_reg, 0, str1_reg);
3275     lhzx(chr2_reg, str1_reg, addr_diff);
3276     subf_(result_reg, chr2_reg, chr1_reg);
3277     bne(CCR0, Ldone); // return chr1_reg
3278     addi(str1_reg, str1_reg, 1*2);
3279     bdnz(Lslow_loop);
3280 //30:
3281    // If strings are equal up to min length, return the length difference.
3282     mr(result_reg, cnt_diff);
3283     nop(); // alignment
3284 //32:
3285    // Otherwise, return the difference between the first mismatched chars.
3286    bind(Ldone);
3287 }
3288 
3289 
3290 // Compare char[] arrays.
3291 //
3292 // str1_reg   USE only
3293 // str2_reg   USE only
3294 // cnt_reg    USE_DEF, due to tmp reg shortage
3295 // result_reg DEF only, might compromise USE only registers
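// Equivalent scalar logic (illustrative sketch; the emitted code compares 4 jchars
// (8 bytes) per main-loop iteration and the remaining 0..3 jchars one at a time):
//   for (int i = 0; i < cnt; i++) {
//     if (str1[i] != str2[i]) return 0;   // false
//   }
//   return 1;                             // true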
3296 void MacroAssembler::char_arrays_equals(Register str1_reg, Register str2_reg, Register cnt_reg, Register result_reg,
3297                                         Register tmp1_reg, Register tmp2_reg, Register tmp3_reg, Register tmp4_reg,
3298                                         Register tmp5_reg) {
3299 
3300   // Str1 may be the same register as str2 which can occur e.g. after scalar replacement.
3301   assert_different_registers(result_reg, str1_reg, cnt_reg, tmp1_reg, tmp2_reg, tmp3_reg, tmp4_reg, tmp5_reg);
3302   assert_different_registers(result_reg, str2_reg, cnt_reg, tmp1_reg, tmp2_reg, tmp3_reg, tmp4_reg, tmp5_reg);
3303 
3304   // Offset 0 should be 32 byte aligned.
3305   Label Linit_cbc, Lcbc, Lloop, Ldone_true, Ldone_false;
3306   Register index_reg = tmp5_reg;
3307   Register cbc_iter  = tmp4_reg;
3308 
3309 //-1:
3310   dcbtct(str1_reg, 0x00);  // Indicate R/O access to str1.
3311   dcbtct(str2_reg, 0x00);  // Indicate R/O access to str2.
3312 //1:
3313   andi(cbc_iter, cnt_reg, 4-1);            // Remaining iterations after 4 java characters per iteration loop.
3314   li(index_reg, 0); // init
3315   li(result_reg, 0); // assume false
3316   srwi_(tmp2_reg, cnt_reg, exact_log2(4)); // Div: 4 java characters per iteration (main loop).
3317 
3318   cmpwi(CCR1, cbc_iter, 0);             // CCR1 = (cbc_iter==0)
3319   beq(CCR0, Linit_cbc);                 // too short
3320     mtctr(tmp2_reg);
3321 //8:
3322     bind(Lloop);
3323       ldx(tmp1_reg, str1_reg, index_reg);
3324       ldx(tmp2_reg, str2_reg, index_reg);
3325       cmpd(CCR0, tmp1_reg, tmp2_reg);
3326       bne(CCR0, Ldone_false);  // Unequal char pair found -> done.
3327       addi(index_reg, index_reg, 4*sizeof(jchar));
3328       bdnz(Lloop);
3329 //14:
3330   bind(Linit_cbc);
3331   beq(CCR1, Ldone_true);
3332     mtctr(cbc_iter);
3333 //16:
3334     bind(Lcbc);
3335       lhzx(tmp1_reg, str1_reg, index_reg);
3336       lhzx(tmp2_reg, str2_reg, index_reg);
3337       cmpw(CCR0, tmp1_reg, tmp2_reg);
3338       bne(CCR0, Ldone_false);  // Unequal char pair found -> done.
3339       addi(index_reg, index_reg, 1*sizeof(jchar));
3340       bdnz(Lcbc);
3341     nop();
3342   bind(Ldone_true);
3343   li(result_reg, 1);
3344 //24:
3345   bind(Ldone_false);
3346 }
3347 
3348 
3349 void MacroAssembler::char_arrays_equalsImm(Register str1_reg, Register str2_reg, int cntval, Register result_reg,
3350                                            Register tmp1_reg, Register tmp2_reg) {
3351   // Str1 may be the same register as str2 which can occur e.g. after scalar replacement.
3352   assert_different_registers(result_reg, str1_reg, tmp1_reg, tmp2_reg);
3353   assert_different_registers(result_reg, str2_reg, tmp1_reg, tmp2_reg);
3354   assert(sizeof(jchar) == 2, "must be");
3355   assert(cntval >= 0 && ((cntval & 0x7fff) == cntval), "wrong immediate");
3356 
3357   Label Ldone_false;
3358 
3359   if (cntval < 16) { // short case
3360     if (cntval != 0) li(result_reg, 0); // assume false
3361 
3362     const int num_bytes = cntval*sizeof(jchar);
3363     int index = 0;
3364     for (int next_index; (next_index = index + 8) <= num_bytes; index = next_index) {
3365       ld(tmp1_reg, index, str1_reg);
3366       ld(tmp2_reg, index, str2_reg);
3367       cmpd(CCR0, tmp1_reg, tmp2_reg);
3368       bne(CCR0, Ldone_false);
3369     }
3370     if (cntval & 2) {
3371       lwz(tmp1_reg, index, str1_reg);
3372       lwz(tmp2_reg, index, str2_reg);
3373       cmpw(CCR0, tmp1_reg, tmp2_reg);
3374       bne(CCR0, Ldone_false);
3375       index += 4;
3376     }
3377     if (cntval & 1) {
3378       lhz(tmp1_reg, index, str1_reg);
3379       lhz(tmp2_reg, index, str2_reg);
3380       cmpw(CCR0, tmp1_reg, tmp2_reg);
3381       bne(CCR0, Ldone_false);
3382     }
3383     // fallthrough: true
3384   } else {
3385     Label Lloop;
3386     Register index_reg = tmp1_reg;
3387     const int loopcnt = cntval/4;
3388     assert(loopcnt > 0, "must be");
3389     // Offset 0 should be 32 byte aligned.
3390     //2:
3391     dcbtct(str1_reg, 0x00);  // Indicate R/O access to str1.
3392     dcbtct(str2_reg, 0x00);  // Indicate R/O access to str2.
3393     li(tmp2_reg, loopcnt);
3394     li(index_reg, 0); // init
3395     li(result_reg, 0); // assume false
3396     mtctr(tmp2_reg);
3397     //8:
3398     bind(Lloop);
3399     ldx(R0, str1_reg, index_reg);
3400     ldx(tmp2_reg, str2_reg, index_reg);
3401     cmpd(CCR0, R0, tmp2_reg);
3402     bne(CCR0, Ldone_false);  // Unequal char pair found -> done.
3403     addi(index_reg, index_reg, 4*sizeof(jchar));
3404     bdnz(Lloop);
3405     //14:
3406     if (cntval & 2) {
3407       lwzx(R0, str1_reg, index_reg);
3408       lwzx(tmp2_reg, str2_reg, index_reg);
3409       cmpw(CCR0, R0, tmp2_reg);
3410       bne(CCR0, Ldone_false);
3411       if (cntval & 1) addi(index_reg, index_reg, 2*sizeof(jchar));
3412     }
3413     if (cntval & 1) {
3414       lhzx(R0, str1_reg, index_reg);
3415       lhzx(tmp2_reg, str2_reg, index_reg);
3416       cmpw(CCR0, R0, tmp2_reg);
3417       bne(CCR0, Ldone_false);
3418     }
3419     // fallthru: true
3420   }
3421   li(result_reg, 1);
3422   bind(Ldone_false);
3423 }
3424 
3425 // Helpers for Intrinsic Emitters
3426 //
3427 // Revert the byte order of a 32bit value in a register
3428 //   src: 0x44556677
3429 //   dst: 0x77665544
3430 // Three steps to obtain the result:
3431 //  1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word
3432 //     into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared.
3433 //     This value initializes dst.
3434 //  2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost
3435 //     byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go.
3436 //     This value is mask inserted into dst with a [0..23] mask of 1s.
3437 //  3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position.
3438 //     This value is mask inserted into dst with a [8..15] mask of 1s.
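// Worked example for src = 0x44556677 (value held in the low word of the register):
//   after step 1 (rldicl): low word of dst = 0x00000044
//   after step 2 (rlwimi): low word of dst = 0x77445544
//   after step 3 (rlwimi): low word of dst = 0x77665544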
3439 void MacroAssembler::load_reverse_32(Register dst, Register src) {
3440   assert_different_registers(dst, src);
3441 
3442   rldicl(dst, src, (4+1)*8, 56);       // Rotate byte 4 into position 7 (rightmost), clear all to the left.
3443   rlwimi(dst, src,     3*8,  0, 23);   // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone.
3444   rlwimi(dst, src,     1*8,  8, 15);   // Insert byte 6 into position 5, leave the rest alone.
3445 }
3446 
3447 // Calculate the column addresses of the crc32 lookup table into distinct registers.
3448 // This loop-invariant calculation is moved out of the loop body, reducing the loop
3449 // body size from 20 to 16 instructions.
3450 // Returns the offset that was used to calculate the address of column tc3.
3451 // Due to register shortage, setting tc3 may overwrite table. With the return offset
3452 // at hand, the original table address can be easily reconstructed.
3453 int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) {
3454 
3455 #ifdef VM_LITTLE_ENDIAN
3456   // This is what we implement (the DOLIT4 part):
3457   // =========================================================================
3458   // #define DOLIT4 c ^= *buf4++; \
3459   //         c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^ \
3460   //             crc_table[1][(c >> 16) & 0xff] ^ crc_table[0][c >> 24]
3461   // #define DOLIT32 DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4
3462   // =========================================================================
3463   const int ix0 = 3*(4*CRC32_COLUMN_SIZE);
3464   const int ix1 = 2*(4*CRC32_COLUMN_SIZE);
3465   const int ix2 = 1*(4*CRC32_COLUMN_SIZE);
3466   const int ix3 = 0*(4*CRC32_COLUMN_SIZE);
3467 #else
3468   // This is what we implement (the DOBIG4 part):
3469   // =========================================================================
3470   // #define DOBIG4 c ^= *++buf4; \
3471   //         c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \
3472   //             crc_table[6][(c >> 16) & 0xff] ^ crc_table[7][c >> 24]
3473   // #define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4
3474   // =========================================================================
3475   const int ix0 = 4*(4*CRC32_COLUMN_SIZE);
3476   const int ix1 = 5*(4*CRC32_COLUMN_SIZE);
3477   const int ix2 = 6*(4*CRC32_COLUMN_SIZE);
3478   const int ix3 = 7*(4*CRC32_COLUMN_SIZE);
3479 #endif
3480   assert_different_registers(table, tc0, tc1, tc2);
3481   assert(table == tc3, "must be!");
3482 
3483   if (ix0 != 0) addi(tc0, table, ix0);
3484   if (ix1 != 0) addi(tc1, table, ix1);
3485   if (ix2 != 0) addi(tc2, table, ix2);
3486   if (ix3 != 0) addi(tc3, table, ix3);
3487 
3488   return ix3;
3489 }
3490 
3491 /**
3492  * uint32_t crc;
3493  * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
3494  */
3495 void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) {
3496   assert_different_registers(crc, table, tmp);
3497   assert_different_registers(val, table);
3498 
3499   if (crc == val) {                   // Must rotate first to use the unmodified value.
3500     rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
3501                                       // As we use a word (4-byte) instruction, we have to adapt the mask bit positions.
3502     srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
3503   } else {
3504     srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
3505     rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
3506   }
3507   lwzx(tmp, table, tmp);
3508   xorr(crc, crc, tmp);
3509 }
3510 
3511 /**
3512  * uint32_t crc;
3513  * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
3514  */
3515 void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
3516   fold_byte_crc32(crc, crc, table, tmp);
3517 }
3518 
3519 /**
3520  * Emits code to update CRC-32 with a byte value according to constants in table.
3521  *
3522  * @param [in,out]crc   Register containing the crc.
3523  * @param [in]val       Register containing the byte to fold into the CRC.
3524  * @param [in]table     Register containing the table of crc constants.
3525  *
3526  * uint32_t crc;
3527  * val = crc_table[(val ^ crc) & 0xFF];
3528  * crc = val ^ (crc >> 8);
3529  */
3530 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
3531   BLOCK_COMMENT("update_byte_crc32:");
3532   xorr(val, val, crc);
3533   fold_byte_crc32(crc, val, table, val);
3534 }
3535 
3536 /**
3537  * @param crc   register containing existing CRC (32-bit)
3538  * @param buf   register pointing to input byte buffer (byte*)
3539  * @param len   register containing number of bytes
3540  * @param table register pointing to CRC table
3541  */
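// Equivalent scalar logic (illustrative sketch; the table holds uint32_t entries):
//   if (invertCRC) crc = ~crc;
//   while (len-- > 0) {
//     crc = table[(crc ^ *buf++) & 0xff] ^ (crc >> 8);
//   }
//   if (invertCRC) crc = ~crc;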
3542 void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table,
3543                                            Register data, bool loopAlignment, bool invertCRC) {
3544   assert_different_registers(crc, buf, len, table, data);
3545 
3546   Label L_mainLoop, L_done;
3547   const int mainLoop_stepping  = 1;
3548   const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4;
3549 
3550   // Process all bytes in a single-byte loop.
3551   cmpdi(CCR0, len, 0);                           // Anything to do?
3552   mtctr(len);
3553   beq(CCR0, L_done);
3554 
3555   if (invertCRC) {
3556     nand(crc, crc, crc);                         // ~c
3557   }
3558 
3559   align(mainLoop_alignment);
3560   BIND(L_mainLoop);
3561     lbz(data, 0, buf);                           // Byte from buffer, zero-extended.
3562     addi(buf, buf, mainLoop_stepping);           // Advance buffer position.
3563     update_byte_crc32(crc, data, table);
3564     bdnz(L_mainLoop);                            // Iterate.
3565 
3566   if (invertCRC) {
3567     nand(crc, crc, crc);                         // ~c
3568   }
3569 
3570   bind(L_done);
3571 }
3572 
3573 /**
3574  * Emits code to update CRC-32 with a 4-byte value according to constants in table
3575  * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c
3576  */
3577 // A note on the lookup table address(es):
3578 // The lookup table consists of two sets of four columns each.
3579 // The columns {0..3} are used for little-endian machines.
3580 // The columns {4..7} are used for big-endian machines.
3581 // To save the effort of adding the column offset to the table address each time
3582 // a table element is looked up, it is possible to pass the pre-calculated
3583 // column addresses.
3584 // Uses R9..R12 as work registers. Must be saved/restored by caller, if necessary.
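// Equivalent scalar logic (illustrative sketch; tc0..tc3 are treated as uint32_t*
// holding the pre-calculated column addresses, see crc32_table_columns):
//   c   = crc ^ *(uint32_t*)(buf + bufDisp);  buf += bufInc;
//   crc = tc0[c & 0xff] ^ tc1[(c >> 8) & 0xff] ^ tc2[(c >> 16) & 0xff] ^ tc3[c >> 24];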
3585 void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc,
3586                                         Register t0,  Register t1,  Register t2,  Register t3,
3587                                         Register tc0, Register tc1, Register tc2, Register tc3) {
3588   assert_different_registers(crc, t3);
3589 
3590   // XOR crc with next four bytes of buffer.
3591   lwz(t3, bufDisp, buf);
3592   if (bufInc != 0) {
3593     addi(buf, buf, bufInc);
3594   }
3595   xorr(t3, t3, crc);
3596 
3597   // Chop crc into 4 single-byte pieces, shifted left 2 bits, to form the table indices.
3598   rlwinm(t0, t3,  2,         24-2, 31-2);  // ((t3 >>  0) & 0xff) << 2
3599   rlwinm(t1, t3,  32+(2- 8), 24-2, 31-2);  // ((t3 >>  8) & 0xff) << 2
3600   rlwinm(t2, t3,  32+(2-16), 24-2, 31-2);  // ((t3 >> 16) & 0xff) << 2
3601   rlwinm(t3, t3,  32+(2-24), 24-2, 31-2);  // ((t3 >> 24) & 0xff) << 2
3602 
3603   // Use the pre-calculated column addresses.
3604   // Load pre-calculated table values.
3605   lwzx(t0, tc0, t0);
3606   lwzx(t1, tc1, t1);
3607   lwzx(t2, tc2, t2);
3608   lwzx(t3, tc3, t3);
3609 
3610   // Calculate new crc from table values.
3611   xorr(t0,  t0, t1);
3612   xorr(t2,  t2, t3);
3613   xorr(crc, t0, t2);  // Now crc contains the final checksum value.
3614 }
3615 
3616 /**
3617  * @param crc   register containing existing CRC (32-bit)
3618  * @param buf   register pointing to input byte buffer (byte*)
3619  * @param len   register containing number of bytes
3620  * @param table register pointing to CRC table
3621  *
3622  * Uses R9..R12 as work register. Must be saved/restored by caller!
3623  */
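// Overall structure (illustrative sketch):
//   crc = ~crc;
//   byte loop until buf is 8-byte aligned        (update_byteLoop_crc32);
//   main loop: 8 bytes per iteration, i.e. two
//              4-byte table-driven updates       (update_1word_crc32);
//   byte loop over the remaining tail bytes      (update_byteLoop_crc32);
//   crc = ~crc;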
3624 void MacroAssembler::kernel_crc32_2word(Register crc, Register buf, Register len, Register table,
3625                                         Register t0,  Register t1,  Register t2,  Register t3,
3626                                         Register tc0, Register tc1, Register tc2, Register tc3) {
3627   assert_different_registers(crc, buf, len, table);
3628 
3629   Label L_mainLoop, L_tail;
3630   Register  tmp  = t0;
3631   Register  data = t0;
3632   Register  tmp2 = t1;
3633   const int mainLoop_stepping  = 8;
3634   const int tailLoop_stepping  = 1;
3635   const int log_stepping       = exact_log2(mainLoop_stepping);
3636   const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
3637   const int complexThreshold   = 2*mainLoop_stepping;
3638 
3639   // Don't test for len <= 0 here. This pathological case should not occur anyway.
3640   // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles.
3641   // The situation itself is detected and handled correctly by the conditional branches
3642   // following the length adjustments (len -= stepping and len += stepping).
3643   assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
3644 
3645   BLOCK_COMMENT("kernel_crc32_2word {");
3646 
3647   nand(crc, crc, crc);                           // ~c
3648 
3649   // Check for short (<mainLoop_stepping) buffer.
3650   cmpdi(CCR0, len, complexThreshold);
3651   blt(CCR0, L_tail);
3652 
3653   // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
3654   // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
3655   {
3656     // Align buf addr to mainLoop_stepping boundary.
3657     neg(tmp2, buf);                           // Calculate # preLoop iterations for alignment.
3658     rldicl(tmp2, tmp2, 0, 64-log_stepping);   // Rotate tmp2 0 bits, insert into tmp2, anding with mask with 1s from 62..63.
3659 
3660     if (complexThreshold > mainLoop_stepping) {
3661       sub(len, len, tmp2);                       // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
3662     } else {
3663       sub(tmp, len, tmp2);                       // Remaining bytes for main loop.
3664       cmpdi(CCR0, tmp, mainLoop_stepping);
3665       blt(CCR0, L_tail);                         // For less than one mainloop_stepping left, do only tail processing
3666       mr(len, tmp);                              // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
3667     }
3668     update_byteLoop_crc32(crc, buf, tmp2, table, data, false, false);
3669   }
3670 
3671   srdi(tmp2, len, log_stepping);                 // #iterations for mainLoop
3672   andi(len, len, mainLoop_stepping-1);           // remaining bytes for tailLoop
3673   mtctr(tmp2);
3674 
3675 #ifdef VM_LITTLE_ENDIAN
3676   Register crc_rv = crc;
3677 #else
3678   Register crc_rv = tmp;                         // Load_reverse needs separate registers to work on.
3679                                                  // Occupies tmp, but frees up crc.
3680   load_reverse_32(crc_rv, crc);                  // Revert byte order because we are dealing with big-endian data.
3681   tmp = crc;
3682 #endif
3683 
3684   int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
3685 
3686   align(mainLoop_alignment);                     // Octoword-aligned loop address. Shows 2% improvement.
3687   BIND(L_mainLoop);
3688     update_1word_crc32(crc_rv, buf, table, 0, 0, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
3689     update_1word_crc32(crc_rv, buf, table, 4, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
3690     bdnz(L_mainLoop);
3691 
3692 #ifndef VM_LITTLE_ENDIAN
3693   load_reverse_32(crc, crc_rv);                  // Revert byte order because we are dealing with big-endian data.
3694   tmp = crc_rv;                                  // Tmp uses its original register again.
3695 #endif
3696 
3697   // Restore original table address for tailLoop.
3698   if (reconstructTableOffset != 0) {
3699     addi(table, table, -reconstructTableOffset);
3700   }
3701 
3702   // Process last few (<complexThreshold) bytes of buffer.
3703   BIND(L_tail);
3704   update_byteLoop_crc32(crc, buf, len, table, data, false, false);
3705 
3706   nand(crc, crc, crc);                           // ~c
3707   BLOCK_COMMENT("} kernel_crc32_2word");
3708 }
3709 
3710 /**
3711  * @param crc   register containing existing CRC (32-bit)
3712  * @param buf   register pointing to input byte buffer (byte*)
3713  * @param len   register containing number of bytes
3714  * @param table register pointing to CRC table
3715  *
3716  * Uses R9..R12 as work registers. Must be saved/restored by caller!
3717  */
3718 void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table,
3719                                         Register t0,  Register t1,  Register t2,  Register t3,
3720                                         Register tc0, Register tc1, Register tc2, Register tc3) {
3721   assert_different_registers(crc, buf, len, table);
3722 
3723   Label L_mainLoop, L_tail;
3724   Register  tmp          = t0;
3725   Register  data         = t0;
3726   Register  tmp2         = t1;
3727   const int mainLoop_stepping  = 4;
3728   const int tailLoop_stepping  = 1;
3729   const int log_stepping       = exact_log2(mainLoop_stepping);
3730   const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
3731   const int complexThreshold   = 2*mainLoop_stepping;
3732 
3733   // Don't test for len <= 0 here. This pathological case should not occur anyway.
3734   // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles.
3735   // The situation itself is detected and handled correctly by the conditional branches
3736   // following the length adjustments (len -= stepping and len += stepping).
3737   assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
3738 
3739   BLOCK_COMMENT("kernel_crc32_1word {");
3740 
3741   nand(crc, crc, crc);                           // ~c
3742 
3743   // Check for short (<mainLoop_stepping) buffer.
3744   cmpdi(CCR0, len, complexThreshold);
3745   blt(CCR0, L_tail);
3746 
3747   // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
3748   // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
3749   {
3750     // Align buf addr to mainLoop_stepping boundary.
3751     neg(tmp2, buf);                              // Calculate # preLoop iterations for alignment.
3752     rldicl(tmp2, tmp2, 0, 64-log_stepping);      // Rotate tmp2 0 bits, insert into tmp2, anding with mask with 1s from 62..63.
3753 
3754     if (complexThreshold > mainLoop_stepping) {
3755       sub(len, len, tmp2);                       // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
3756     } else {
3757       sub(tmp, len, tmp2);                       // Remaining bytes for main loop.
3758       cmpdi(CCR0, tmp, mainLoop_stepping);
3759       blt(CCR0, L_tail);                         // For less than one mainloop_stepping left, do only tail processing
3760       mr(len, tmp);                              // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
3761     }
3762     update_byteLoop_crc32(crc, buf, tmp2, table, data, false, false);
3763   }
3764 
3765   srdi(tmp2, len, log_stepping);                 // #iterations for mainLoop
3766   andi(len, len, mainLoop_stepping-1);           // remaining bytes for tailLoop
3767   mtctr(tmp2);
3768 
3769 #ifdef VM_LITTLE_ENDIAN
3770   Register crc_rv = crc;
3771 #else
3772   Register crc_rv = tmp;                         // Load_reverse needs separate registers to work on.
3773                                                  // Occupies tmp, but frees up crc.
3774   load_reverse_32(crc_rv, crc);                  // Revert byte order because we are dealing with big-endian data.
3775   tmp = crc;
3776 #endif
3777 
3778   int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
3779 
3780   align(mainLoop_alignment);                     // Octoword-aligned loop address. Shows 2% improvement.
3781   BIND(L_mainLoop);
3782     update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
3783     bdnz(L_mainLoop);
3784 
3785 #ifndef VM_LITTLE_ENDIAN
3786   load_reverse_32(crc, crc_rv);                  // Revert byte order because we are dealing with big-endian data.
3787   tmp = crc_rv;                                  // Tmp uses its original register again.
3788 #endif
3789 
3790   // Restore original table address for tailLoop.
3791   if (reconstructTableOffset != 0) {
3792     addi(table, table, -reconstructTableOffset);
3793   }
3794 
3795   // Process last few (<complexThreshold) bytes of buffer.
3796   BIND(L_tail);
3797   update_byteLoop_crc32(crc, buf, len, table, data, false, false);
3798 
3799   nand(crc, crc, crc);                           // ~c
3800   BLOCK_COMMENT("} kernel_crc32_1word");
3801 }
3802 
3803 /**
3804  * @param crc   register containing existing CRC (32-bit)
3805  * @param buf   register pointing to input byte buffer (byte*)
3806  * @param len   register containing number of bytes
3807  * @param table register pointing to CRC table
3808  *
3809  * Uses R7_ARG5, R8_ARG6 as work registers.
3810  */
3811 void MacroAssembler::kernel_crc32_1byte(Register crc, Register buf, Register len, Register table,
3812                                         Register t0,  Register t1,  Register t2,  Register t3) {
3813   assert_different_registers(crc, buf, len, table);
3814 
3815   Register  data = t0;                   // Holds the current byte to be folded into crc.
3816 
3817   BLOCK_COMMENT("kernel_crc32_1byte {");
3818 
3819   // Process all bytes in a single-byte loop.
3820   update_byteLoop_crc32(crc, buf, len, table, data, true, true);
3821 
3822   BLOCK_COMMENT("} kernel_crc32_1byte");
3823 }
3824 
3825 void MacroAssembler::kernel_crc32_singleByte(Register crc, Register buf, Register len, Register table, Register tmp) {
3826   assert_different_registers(crc, buf, /* len,  not used!! */ table, tmp);
3827 
3828   BLOCK_COMMENT("kernel_crc32_singleByte:");
3829   nand(crc, crc, crc);       // ~c
3830 
3831   lbz(tmp, 0, buf);          // Byte from buffer, zero-extended.
3832   update_byte_crc32(crc, tmp, table);
3833 
3834   nand(crc, crc, crc);       // ~c
3835 }
3836 
3837 // dest_lo += src1 + src2
3838 // dest_hi += carry1 + carry2
3839 void MacroAssembler::add2_with_carry(Register dest_hi,
3840                                      Register dest_lo,
3841                                      Register src1, Register src2) {
3842   li(R0, 0);
3843   addc(dest_lo, dest_lo, src1);
3844   adde(dest_hi, dest_hi, R0);
3845   addc(dest_lo, dest_lo, src2);
3846   adde(dest_hi, dest_hi, R0);
3847 }
3848 
3849 // Multiply 64 bit by 64 bit first loop.
3850 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart,
3851                                            Register x_xstart,
3852                                            Register y, Register y_idx,
3853                                            Register z,
3854                                            Register carry,
3855                                            Register product_high, Register product,
3856                                            Register idx, Register kdx,
3857                                            Register tmp) {
3858   //  jlong carry, x[], y[], z[];
3859   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
3860   //    huge_128 product = y[idx] * x[xstart] + carry;
3861   //    z[kdx] = (jlong)product;
3862   //    carry  = (jlong)(product >>> 64);
3863   //  }
3864   //  z[xstart] = carry;
3865 
3866   Label L_first_loop, L_first_loop_exit;
3867   Label L_one_x, L_one_y, L_multiply;
3868 
3869   addic_(xstart, xstart, -1);
3870   blt(CCR0, L_one_x);   // Special case: length of x is 1.
3871 
3872   // Load next two integers of x.
3873   sldi(tmp, xstart, LogBytesPerInt);
3874   ldx(x_xstart, x, tmp);
3875 #ifdef VM_LITTLE_ENDIAN
3876   rldicl(x_xstart, x_xstart, 32, 0);
3877 #endif
3878 
3879   align(32, 16);
3880   bind(L_first_loop);
3881 
3882   cmpdi(CCR0, idx, 1);
3883   blt(CCR0, L_first_loop_exit);
3884   addi(idx, idx, -2);
3885   beq(CCR0, L_one_y);

  // Load next two integers of y.
  sldi(tmp, idx, LogBytesPerInt);
  ldx(y_idx, y, tmp);
#ifdef VM_LITTLE_ENDIAN
  rldicl(y_idx, y_idx, 32, 0);
#endif


  bind(L_multiply);
  multiply64(product_high, product, x_xstart, y_idx);

  li(tmp, 0);
  addc(product, product, carry);         // Add carry to result.
  adde(product_high, product_high, tmp); // Add carry of the last addition.
  addi(kdx, kdx, -2);

  // Store result.
#ifdef VM_LITTLE_ENDIAN
  rldicl(product, product, 32, 0);
#endif
  sldi(tmp, kdx, LogBytesPerInt);
  stdx(product, z, tmp);
  mr_if_needed(carry, product_high);
  b(L_first_loop);


  bind(L_one_y); // Load one 32 bit portion of y as (0,value).

  lwz(y_idx, 0, y);
  b(L_multiply);


  bind(L_one_x); // Load one 32 bit portion of x as (0,value).

  lwz(x_xstart, 0, x);
  b(L_first_loop);

  bind(L_first_loop_exit);
}

// Multiply 64 bit by 64 bit and add 128 bit.
void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y,
                                            Register z, Register yz_idx,
                                            Register idx, Register carry,
                                            Register product_high, Register product,
                                            Register tmp, int offset) {

  //  huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
  //  z[kdx] = (jlong)product;

  sldi(tmp, idx, LogBytesPerInt);
  if (offset) {
    addi(tmp, tmp, offset);
  }
  ldx(yz_idx, y, tmp);
#ifdef VM_LITTLE_ENDIAN
  rldicl(yz_idx, yz_idx, 32, 0);
#endif

  multiply64(product_high, product, x_xstart, yz_idx);
  ldx(yz_idx, z, tmp);
#ifdef VM_LITTLE_ENDIAN
  rldicl(yz_idx, yz_idx, 32, 0);
#endif

  add2_with_carry(product_high, product, carry, yz_idx);

  sldi(tmp, idx, LogBytesPerInt);
  if (offset) {
    addi(tmp, tmp, offset);
  }
#ifdef VM_LITTLE_ENDIAN
  rldicl(product, product, 32, 0);
#endif
  stdx(product, z, tmp);
}
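
// Illustrative C sketch of one multiply-add step above (not the emitted code; assumes
// unsigned __int128). y64 and z64 denote the 64-bit words addressed via idx/offset:
//
//   unsigned __int128 p = (unsigned __int128)x_xstart * y64 + z64 + carry;
//   z64 = (uint64_t)p;                       // stored back by the stdx above
//   // product_high now holds (uint64_t)(p >> 64); the caller uses it as the next carry.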

// Multiply 128 bit by 128 bit. Unrolled inner loop.
void MacroAssembler::multiply_128_x_128_loop(Register x_xstart,
                                             Register y, Register z,
                                             Register yz_idx, Register idx, Register carry,
                                             Register product_high, Register product,
                                             Register carry2, Register tmp) {

  //  jlong carry, x[], y[], z[];
  //  int kdx = ystart+1;
  //  for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
  //    huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
  //    z[kdx+idx+1] = (jlong)product;
  //    jlong carry2 = (jlong)(product >>> 64);
  //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
  //    z[kdx+idx] = (jlong)product;
  //    carry = (jlong)(product >>> 64);
  //  }
  //  idx += 2;
  //  if (idx > 0) {
  //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
  //    z[kdx+idx] = (jlong)product;
  //    carry = (jlong)(product >>> 64);
  //  }

  Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
  const Register jdx = R0;

  // Scale the index: jdx = number of unrolled loop iterations (four 32-bit integers each).
  srdi_(jdx, idx, 2);
  beq(CCR0, L_third_loop_exit);
  mtctr(jdx);

  align(32, 16);
  bind(L_third_loop);

  addi(idx, idx, -4);

  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8);
  mr_if_needed(carry2, product_high);

  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0);
  mr_if_needed(carry, product_high);
  bdnz(L_third_loop);

  bind(L_third_loop_exit);  // Handle any left-over operand parts.

  andi_(idx, idx, 0x3);
  beq(CCR0, L_post_third_loop_done);

  Label L_check_1;

  addic_(idx, idx, -2);
  blt(CCR0, L_check_1);

  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0);
  mr_if_needed(carry, product_high);

  bind(L_check_1);

  addi(idx, idx, 0x2);
  andi_(idx, idx, 0x1);
  addic_(idx, idx, -1);
  blt(CCR0, L_post_third_loop_done);

  // Multiply the last remaining integer of y with x_xstart.
  sldi(tmp, idx, LogBytesPerInt);
  lwzx(yz_idx, y, tmp);
  multiply64(product_high, product, x_xstart, yz_idx);

  // Add the corresponding integer of z and the incoming carry.
  lwzx(yz_idx, z, tmp);
  add2_with_carry(product_high, product, yz_idx, carry);

  // Store the low 32 bits of the result back into z.
  sldi(tmp, idx, LogBytesPerInt);
  stwx(product, z, tmp);

  // New carry = 128-bit result shifted right by 32 bits.
  srdi(product, product, 32);
  sldi(product_high, product_high, 32);
  orr(product, product, product_high);
  mr_if_needed(carry, product);

  bind(L_post_third_loop_done);
}   // multiply_128_x_128_loop

void MacroAssembler::multiply_to_len(Register x, Register xlen,
                                     Register y, Register ylen,
                                     Register z, Register zlen,
                                     Register tmp1, Register tmp2,
                                     Register tmp3, Register tmp4,
                                     Register tmp5, Register tmp6,
                                     Register tmp7, Register tmp8,
                                     Register tmp9, Register tmp10,
                                     Register tmp11, Register tmp12,
                                     Register tmp13) {

  ShortBranchVerifier sbv(this);

  assert_different_registers(x, xlen, y, ylen, z, zlen,
                             tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
  assert_different_registers(x, xlen, y, ylen, z, zlen,
                             tmp1, tmp2, tmp3, tmp4, tmp5, tmp7);
  assert_different_registers(x, xlen, y, ylen, z, zlen,
                             tmp1, tmp2, tmp3, tmp4, tmp5, tmp8);

  const Register idx = tmp1;
  const Register kdx = tmp2;
  const Register xstart = tmp3;

  const Register y_idx = tmp4;
  const Register carry = tmp5;
  const Register product = tmp6;
  const Register product_high = tmp7;
  const Register x_xstart = tmp8;
  const Register tmp = tmp9;

  // First Loop.
  //
  //  final static long LONG_MASK = 0xffffffffL;
  //  int xstart = xlen - 1;
  //  int ystart = ylen - 1;
  //  long carry = 0;
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
  //    z[kdx] = (int)product;
  //    carry = product >>> 32;
  //  }
  //  z[xstart] = (int)carry;

  mr_if_needed(idx, ylen);        // idx = ylen
  mr_if_needed(kdx, zlen);        // kdx = xlen + ylen
  li(carry, 0);                   // carry = 0

  Label L_done;

  addic_(xstart, xlen, -1);
  blt(CCR0, L_done);

  multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z,
                        carry, product_high, product, idx, kdx, tmp);

  Label L_second_loop;

  cmpdi(CCR0, kdx, 0);
  beq(CCR0, L_second_loop);

  Label L_carry;

  addic_(kdx, kdx, -1);
  beq(CCR0, L_carry);

  // Store lower 32 bits of carry.
  sldi(tmp, kdx, LogBytesPerInt);
  stwx(carry, z, tmp);
  srdi(carry, carry, 32);
  addi(kdx, kdx, -1);


  bind(L_carry);

  // Store upper 32 bits of carry.
  sldi(tmp, kdx, LogBytesPerInt);
  stwx(carry, z, tmp);

  // Second and third (nested) loops.
  //
  //  for (int i = xstart-1; i >= 0; i--) { // Second loop
  //    carry = 0;
  //    for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
  //      long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
  //                     (z[k] & LONG_MASK) + carry;
  //      z[k] = (int)product;
  //      carry = product >>> 32;
  //    }
  //    z[i] = (int)carry;
  //  }
  //
  //  i = xlen, j = tmp1 (idx), k = tmp2 (kdx), carry = tmp5, x[i] = x_xstart

  bind(L_second_loop);

  li(carry, 0);                   // carry = 0;

  addic_(xstart, xstart, -1);     // i = xstart-1;
  blt(CCR0, L_done);

  Register zsave = tmp10;

  mr(zsave, z);


  Label L_last_x;

  sldi(tmp, xstart, LogBytesPerInt);
  add(z, z, tmp);                 // z = z + k - j
  addi(z, z, 4);
  addic_(xstart, xstart, -1);     // i = xstart-1;
  blt(CCR0, L_last_x);

  sldi(tmp, xstart, LogBytesPerInt);
  ldx(x_xstart, x, tmp);
#ifdef VM_LITTLE_ENDIAN
  rldicl(x_xstart, x_xstart, 32, 0);
#endif


  Label L_third_loop_prologue;

  bind(L_third_loop_prologue);

  Register xsave = tmp11;
  Register xlensave = tmp12;
  Register ylensave = tmp13;

  mr(xsave, x);
  mr(xlensave, xstart);
  mr(ylensave, ylen);


  multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen,
                          carry, product_high, product, x, tmp);

  mr(z, zsave);
  mr(x, xsave);
  mr(xlen, xlensave);   // This is the decrement of the loop counter!
  mr(ylen, ylensave);

  addi(tmp3, xlen, 1);
  sldi(tmp, tmp3, LogBytesPerInt);
  stwx(carry, z, tmp);
  addic_(tmp3, tmp3, -1);
  blt(CCR0, L_done);

  srdi(carry, carry, 32);
  sldi(tmp, tmp3, LogBytesPerInt);
  stwx(carry, z, tmp);
  b(L_second_loop);
  // Infrequently executed code is placed outside the loops.
  bind(L_last_x);

  lwz(x_xstart, 0, x);
  b(L_third_loop_prologue);

  bind(L_done);
}   // multiply_to_len
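
// Taken together, the loops above implement the following Java-level algorithm
// (a reference sketch of BigInteger::multiplyToLen, not the emitted code):
//
//   static void multiplyToLen(int[] x, int xlen, int[] y, int ylen, int[] z) {
//     final long LONG_MASK = 0xffffffffL;
//     long carry = 0;
//     int kdx = xlen + ylen;
//     for (int idx = ylen - 1; idx >= 0; idx--) {                      // First loop.
//       long product = (y[idx] & LONG_MASK) * (x[xlen - 1] & LONG_MASK) + carry;
//       z[--kdx] = (int)product;
//       carry = product >>> 32;
//     }
//     z[xlen - 1] = (int)carry;
//     for (int i = xlen - 2; i >= 0; i--) {                            // Second loop.
//       carry = 0;
//       for (int jdx = ylen - 1, k = ylen + i; jdx >= 0; jdx--, k--) { // Third loop.
//         long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
//                        (z[k] & LONG_MASK) + carry;
//         z[k] = (int)product;
//         carry = product >>> 32;
//       }
//       z[i] = (int)carry;
//     }
//   }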

void MacroAssembler::asm_assert(bool check_equal, const char *msg, int id) {
#ifdef ASSERT
  Label ok;
  if (check_equal) {
    beq(CCR0, ok);
  } else {
    bne(CCR0, ok);
  }
  stop(msg, id);
  bind(ok);
#endif
}
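
// Illustrative use (hypothetical message and id): the condition to check must already
// be in CCR0, set by a preceding compare, e.g.
//
//   cmpdi(CCR0, R0, 0);
//   asm_assert(/*check_equal=*/true, "R0 expected to be zero", 0x123);
//
// which stops the VM in debug builds if R0 is not zero. asm_assert_mems_zero below is
// an in-tree example of this pattern.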

void MacroAssembler::asm_assert_mems_zero(bool check_equal, int size, int mem_offset,
                                          Register mem_base, const char* msg, int id) {
#ifdef ASSERT
  switch (size) {
    case 4:
      lwz(R0, mem_offset, mem_base);
      cmpwi(CCR0, R0, 0);
      break;
    case 8:
      ld(R0, mem_offset, mem_base);
      cmpdi(CCR0, R0, 0);
      break;
    default:
      ShouldNotReachHere();
  }
  asm_assert(check_equal, msg, id);
#endif // ASSERT
}

void MacroAssembler::verify_thread() {
  if (VerifyThread) {
    unimplemented("'VerifyThread' currently not implemented on PPC");
  }
}

// READ: oop. KILL: R0. May also kill volatile floating point registers.
void MacroAssembler::verify_oop(Register oop, const char* msg) {
  if (!VerifyOops) {
    return;
  }

  address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
  const Register tmp = R11; // Will be preserved.
  const int nbytes_save = 11*8; // Volatile gprs except R0.
  save_volatile_gprs(R1_SP, -nbytes_save); // except R0

  if (oop == tmp) mr(R4_ARG2, oop);
  save_LR_CR(tmp); // save in old frame
  push_frame_reg_args(nbytes_save, tmp);
  // load FunctionDescriptor** / entry_address *
  load_const_optimized(tmp, fd, R0);
  // load FunctionDescriptor* / entry_address
  ld(tmp, 0, tmp);
  if (oop != tmp) mr_if_needed(R4_ARG2, oop);
  load_const_optimized(R3_ARG1, (address)msg, R0);
  // Call destination for its side effect.
  call_c(tmp);

  pop_frame();
  restore_LR_CR(tmp);
  restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
}
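
// Typical use (illustrative; the message text is made up): guard a value that must be
// a valid oop, e.g. an incoming argument:
//
//   verify_oop(R3_ARG1, "broken oop in argument");
//
// The check emits nothing unless -XX:+VerifyOops is specified.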

const char* stop_types[] = {
  "stop",
  "untested",
  "unimplemented",
  "shouldnotreachhere"
};

static void stop_on_request(int tp, const char* msg) {
  tty->print("PPC assembly code requires stop: (%s) %s\n", stop_types[tp%/*stop_end*/4], msg);
  guarantee(false, "PPC assembly code requires stop: %s", msg);
}

// Call a C-function that prints output.
void MacroAssembler::stop(int type, const char* msg, int id) {
#ifndef PRODUCT
  block_comment(err_msg("stop: %s %s {", stop_types[type%stop_end], msg));
#else
  block_comment("stop {");
#endif

  // setup arguments
  load_const_optimized(R3_ARG1, type);
  load_const_optimized(R4_ARG2, (void *)msg, /*tmp=*/R0);
  call_VM_leaf(CAST_FROM_FN_PTR(address, stop_on_request), R3_ARG1, R4_ARG2);
  illtrap();
  emit_int32(id);
  block_comment("} stop;");
}

#ifndef PRODUCT
// Write pattern 0x0101010101010101 in memory region [low-before, high+after].
// Val, addr are temp registers.
// If low == addr, addr is killed.
// High is preserved.
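//
// Roughly equivalent C sketch (illustrative only; mirrors the general loop case):
//
//   for (intptr_t* p = (intptr_t*)low - before; p <= (intptr_t*)high + after; p++) {
//     *p = 0x0101010101010101;
//   }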
void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) {
  if (!ZapMemory) return;

  assert_different_registers(low, val);

  BLOCK_COMMENT("zap memory region {");
  load_const_optimized(val, 0x0101010101010101);
  int size = before + after;
  if (low == high && size < 5 && size > 0) {
    int offset = -before*BytesPerWord;
    for (int i = 0; i < size; ++i) {
      std(val, offset, low);
      offset += (1*BytesPerWord);
    }
  } else {
    addi(addr, low, -before*BytesPerWord);
    assert_different_registers(high, val);
    if (after) addi(high, high, after * BytesPerWord);
    Label loop;
    bind(loop);
    std(val, 0, addr);
    addi(addr, addr, 8);
    cmpd(CCR6, addr, high);
    ble(CCR6, loop);
    if (after) addi(high, high, -after * BytesPerWord);  // Correct back to old value.
  }
  BLOCK_COMMENT("} zap memory region");
}

#endif // !PRODUCT

SkipIfEqualZero::SkipIfEqualZero(MacroAssembler* masm, Register temp, const bool* flag_addr) : _masm(masm), _label() {
  int simm16_offset = masm->load_const_optimized(temp, (address)flag_addr, R0, true);
  assert(sizeof(bool) == 1, "PowerPC ABI");
  masm->lbz(temp, simm16_offset, temp);
  masm->cmpwi(CCR0, temp, 0);
  masm->beq(CCR0, _label);
}

SkipIfEqualZero::~SkipIfEqualZero() {
  _masm->bind(_label);
}
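
// Illustrative use of SkipIfEqualZero (register and flag names are hypothetical):
//
//   {
//     SkipIfEqualZero skip_if(masm, Rtmp, &SomeDevelopFlag);
//     // Code emitted here is only executed at run time when the flag is true.
//   }
//   // The destructor binds the label, so execution continues here when the flag is false.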