1 /*
   2  * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/assembler.hpp"
  27 #include "asm/assembler.inline.hpp"
  28 #include "compiler/disassembler.hpp"
  29 #include "gc/shared/cardTableModRefBS.hpp"
  30 #include "gc/shared/collectedHeap.inline.hpp"
  31 #include "interpreter/interpreter.hpp"
  32 #include "memory/resourceArea.hpp"
  33 #include "memory/universe.hpp"
  34 #include "prims/methodHandles.hpp"
  35 #include "runtime/biasedLocking.hpp"
  36 #include "runtime/interfaceSupport.hpp"
  37 #include "runtime/objectMonitor.hpp"
  38 #include "runtime/os.hpp"
  39 #include "runtime/sharedRuntime.hpp"
  40 #include "runtime/stubRoutines.hpp"
  41 #include "utilities/macros.hpp"
  42 #if INCLUDE_ALL_GCS
  43 #include "gc/g1/g1CollectedHeap.inline.hpp"
  44 #include "gc/g1/g1SATBCardTableModRefBS.hpp"
  45 #include "gc/g1/heapRegion.hpp"
  46 #endif // INCLUDE_ALL_GCS
  47 
  48 #ifdef PRODUCT
  49 #define BLOCK_COMMENT(str) /* nothing */
  50 #define STOP(error) stop(error)
  51 #else
  52 #define BLOCK_COMMENT(str) block_comment(str)
  53 #define STOP(error) block_comment(error); stop(error)
  54 #endif
  55 
  56 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  57 
  58 PRAGMA_FORMAT_MUTE_WARNINGS_FOR_GCC
  59 
  60 #ifdef ASSERT
  61 bool AbstractAssembler::pd_check_instruction_mark() { return true; }
  62 #endif
  63 
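     // Maps each condition code to its negation, e.g. reverse[Assembler::less] is
     // Assembler::greaterEqual; handy for flipping the sense of a conditional jump.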
  64 static Assembler::Condition reverse[] = {
  65     Assembler::noOverflow     /* overflow      = 0x0 */ ,
  66     Assembler::overflow       /* noOverflow    = 0x1 */ ,
  67     Assembler::aboveEqual     /* carrySet      = 0x2, below         = 0x2 */ ,
  68     Assembler::below          /* aboveEqual    = 0x3, carryClear    = 0x3 */ ,
  69     Assembler::notZero        /* zero          = 0x4, equal         = 0x4 */ ,
  70     Assembler::zero           /* notZero       = 0x5, notEqual      = 0x5 */ ,
  71     Assembler::above          /* belowEqual    = 0x6 */ ,
  72     Assembler::belowEqual     /* above         = 0x7 */ ,
  73     Assembler::positive       /* negative      = 0x8 */ ,
  74     Assembler::negative       /* positive      = 0x9 */ ,
  75     Assembler::noParity       /* parity        = 0xa */ ,
  76     Assembler::parity         /* noParity      = 0xb */ ,
  77     Assembler::greaterEqual   /* less          = 0xc */ ,
  78     Assembler::less           /* greaterEqual  = 0xd */ ,
  79     Assembler::greater        /* lessEqual     = 0xe */ ,
  80     Assembler::lessEqual      /* greater       = 0xf */
  81 
  82 };
  83 
  84 
  85 // Implementation of MacroAssembler
  86 
  87 // First, all the versions that differ between 32-bit and 64-bit,
  88 // unless the difference is trivial (a line or so).
  89 
  90 #ifndef _LP64
  91 
  92 // 32bit versions
  93 
  94 Address MacroAssembler::as_Address(AddressLiteral adr) {
  95   return Address(adr.target(), adr.rspec());
  96 }
  97 
  98 Address MacroAssembler::as_Address(ArrayAddress adr) {
  99   return Address::make_array(adr);
 100 }
 101 
 102 void MacroAssembler::call_VM_leaf_base(address entry_point,
 103                                        int number_of_arguments) {
 104   call(RuntimeAddress(entry_point));
 105   increment(rsp, number_of_arguments * wordSize);
 106 }
 107 
 108 void MacroAssembler::cmpklass(Address src1, Metadata* obj) {
 109   cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate());
 110 }
 111 
 112 void MacroAssembler::cmpklass(Register src1, Metadata* obj) {
 113   cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate());
 114 }
 115 
 116 void MacroAssembler::cmpoop(Address src1, jobject obj) {
 117   cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
 118 }
 119 
 120 void MacroAssembler::cmpoop(Register src1, jobject obj) {
 121   cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
 122 }
 123 
 124 void MacroAssembler::extend_sign(Register hi, Register lo) {
 125   // According to Intel Doc. AP-526, "Integer Divide", p.18.
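       // (cdql sign-extends eax into edx in one instruction; for other register pairs the
       //  movl/sarl fallback below computes the same result: hi = (lo < 0) ? -1 : 0.)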
 126   if (VM_Version::is_P6() && hi == rdx && lo == rax) {
 127     cdql();
 128   } else {
 129     movl(hi, lo);
 130     sarl(hi, 31);
 131   }
 132 }
 133 
 134 void MacroAssembler::jC2(Register tmp, Label& L) {
 135   // set parity bit if FPU flag C2 is set (via rax)
 136   save_rax(tmp);
 137   fwait(); fnstsw_ax();
 138   sahf();
 139   restore_rax(tmp);
 140   // branch
 141   jcc(Assembler::parity, L);
 142 }
 143 
 144 void MacroAssembler::jnC2(Register tmp, Label& L) {
 145   // set parity bit if FPU flag C2 is set (via rax)
 146   save_rax(tmp);
 147   fwait(); fnstsw_ax();
 148   sahf();
 149   restore_rax(tmp);
 150   // branch
 151   jcc(Assembler::noParity, L);
 152 }
 153 
 154 // 32bit can do a case table jump in one instruction but we no longer allow the base
 155 // to be installed in the Address class
 156 void MacroAssembler::jump(ArrayAddress entry) {
 157   jmp(as_Address(entry));
 158 }
 159 
 160 // Note: y_lo will be destroyed
 161 void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
 162   // Long compare for Java (semantics as described in JVM spec.)
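       // Result is -1, 0 or +1 in x_hi: -1 if x < y, 0 if x == y, +1 if x > y
       // (signed compare on the high words, unsigned compare on the low words).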
 163   Label high, low, done;
 164 
 165   cmpl(x_hi, y_hi);
 166   jcc(Assembler::less, low);
 167   jcc(Assembler::greater, high);
 168   // x_hi is the return register
 169   xorl(x_hi, x_hi);
 170   cmpl(x_lo, y_lo);
 171   jcc(Assembler::below, low);
 172   jcc(Assembler::equal, done);
 173 
 174   bind(high);
 175   xorl(x_hi, x_hi);
 176   increment(x_hi);
 177   jmp(done);
 178 
 179   bind(low);
 180   xorl(x_hi, x_hi);
 181   decrementl(x_hi);
 182 
 183   bind(done);
 184 }
 185 
 186 void MacroAssembler::lea(Register dst, AddressLiteral src) {
 187     mov_literal32(dst, (int32_t)src.target(), src.rspec());
 188 }
 189 
 190 void MacroAssembler::lea(Address dst, AddressLiteral adr) {
 191   // leal(dst, as_Address(adr));
 192   // see note in movl as to why we must use a move
 193   mov_literal32(dst, (int32_t) adr.target(), adr.rspec());
 194 }
 195 
 196 void MacroAssembler::leave() {
 197   mov(rsp, rbp);
 198   pop(rbp);
 199 }
 200 
 201 void MacroAssembler::lmul(int x_rsp_offset, int y_rsp_offset) {
 202   // Multiplication of two Java long values stored on the stack
 203   // as illustrated below. Result is in rdx:rax.
 204   //
 205   // rsp ---> [  ??  ] \               \
 206   //            ....    | y_rsp_offset  |
 207   //          [ y_lo ] /  (in bytes)    | x_rsp_offset
 208   //          [ y_hi ]                  | (in bytes)
 209   //            ....                    |
 210   //          [ x_lo ]                 /
 211   //          [ x_hi ]
 212   //            ....
 213   //
 214   // Basic idea: lo(result) = lo(x_lo * y_lo)
 215   //             hi(result) = hi(x_lo * y_lo) + lo(x_hi * y_lo) + lo(x_lo * y_hi)
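       //
       // For example, with B = 2^32, x = x_hi*B + x_lo and y = y_hi*B + y_lo:
       //   x*y mod 2^64 = x_lo*y_lo + (x_hi*y_lo + x_lo*y_hi)*B  (the x_hi*y_hi*B^2 term
       //   drops out), which is exactly what the three mull steps below compute.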
 216   Address x_hi(rsp, x_rsp_offset + wordSize); Address x_lo(rsp, x_rsp_offset);
 217   Address y_hi(rsp, y_rsp_offset + wordSize); Address y_lo(rsp, y_rsp_offset);
 218   Label quick;
 219   // load x_hi, y_hi and check if quick
 220   // multiplication is possible
 221   movl(rbx, x_hi);
 222   movl(rcx, y_hi);
 223   movl(rax, rbx);
 224   orl(rbx, rcx);                                 // rbx, = 0 <=> x_hi = 0 and y_hi = 0
 225   jcc(Assembler::zero, quick);                   // if rbx, = 0 do quick multiply
 226   // do full multiplication
 227   // 1st step
 228   mull(y_lo);                                    // x_hi * y_lo
 229   movl(rbx, rax);                                // save lo(x_hi * y_lo) in rbx,
 230   // 2nd step
 231   movl(rax, x_lo);
 232   mull(rcx);                                     // x_lo * y_hi
 233   addl(rbx, rax);                                // add lo(x_lo * y_hi) to rbx,
 234   // 3rd step
 235   bind(quick);                                   // note: rbx, = 0 if quick multiply!
 236   movl(rax, x_lo);
 237   mull(y_lo);                                    // x_lo * y_lo
 238   addl(rdx, rbx);                                // correct hi(x_lo * y_lo)
 239 }
 240 
 241 void MacroAssembler::lneg(Register hi, Register lo) {
 242   negl(lo);
 243   adcl(hi, 0);
 244   negl(hi);
 245 }
 246 
 247 void MacroAssembler::lshl(Register hi, Register lo) {
 248   // Java shift left long support (semantics as described in JVM spec., p.305)
 249   // (basic idea for shift counts s >= n: x << s == (x << n) << (s - n))
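       //  e.g. for s = 40: x << 40 == (x << 32) << 8, so hi := lo and lo := 0 here,
       //  and the remaining shift by 8 (= 40 mod 32) is done by shldl/shll below.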
 250   // shift value is in rcx !
 251   assert(hi != rcx, "must not use rcx");
 252   assert(lo != rcx, "must not use rcx");
 253   const Register s = rcx;                        // shift count
 254   const int      n = BitsPerWord;
 255   Label L;
 256   andl(s, 0x3f);                                 // s := s & 0x3f (s < 0x40)
 257   cmpl(s, n);                                    // if (s < n)
 258   jcc(Assembler::less, L);                       // else (s >= n)
 259   movl(hi, lo);                                  // x := x << n
 260   xorl(lo, lo);
 261   // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n!
 262   bind(L);                                       // s (mod n) < n
 263   shldl(hi, lo);                                 // x := x << s
 264   shll(lo);
 265 }
 266 
 267 
 268 void MacroAssembler::lshr(Register hi, Register lo, bool sign_extension) {
 269   // Java shift right long support (semantics as described in JVM spec., p.306 & p.310)
 270   // (basic idea for shift counts s >= n: x >> s == (x >> n) >> (s - n))
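       //  e.g. for s = 40: x >> 40 == (x >> 32) >> 8, so lo := hi and hi is sign- or
       //  zero-filled here; the remaining shift by 8 (= 40 mod 32) is done by shrdl below.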
 271   assert(hi != rcx, "must not use rcx");
 272   assert(lo != rcx, "must not use rcx");
 273   const Register s = rcx;                        // shift count
 274   const int      n = BitsPerWord;
 275   Label L;
 276   andl(s, 0x3f);                                 // s := s & 0x3f (s < 0x40)
 277   cmpl(s, n);                                    // if (s < n)
 278   jcc(Assembler::less, L);                       // else (s >= n)
 279   movl(lo, hi);                                  // x := x >> n
 280   if (sign_extension) sarl(hi, 31);
 281   else                xorl(hi, hi);
 282   // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n!
 283   bind(L);                                       // s (mod n) < n
 284   shrdl(lo, hi);                                 // x := x >> s
 285   if (sign_extension) sarl(hi);
 286   else                shrl(hi);
 287 }
 288 
 289 void MacroAssembler::movoop(Register dst, jobject obj) {
 290   mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
 291 }
 292 
 293 void MacroAssembler::movoop(Address dst, jobject obj) {
 294   mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
 295 }
 296 
 297 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
 298   mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate());
 299 }
 300 
 301 void MacroAssembler::mov_metadata(Address dst, Metadata* obj) {
 302   mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate());
 303 }
 304 
 305 void MacroAssembler::movptr(Register dst, AddressLiteral src, Register scratch) {
 306   // scratch register is not used,
 307   // it is defined to match parameters of 64-bit version of this method.
 308   if (src.is_lval()) {
 309     mov_literal32(dst, (intptr_t)src.target(), src.rspec());
 310   } else {
 311     movl(dst, as_Address(src));
 312   }
 313 }
 314 
 315 void MacroAssembler::movptr(ArrayAddress dst, Register src) {
 316   movl(as_Address(dst), src);
 317 }
 318 
 319 void MacroAssembler::movptr(Register dst, ArrayAddress src) {
 320   movl(dst, as_Address(src));
 321 }
 322 
 323 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
 324 void MacroAssembler::movptr(Address dst, intptr_t src) {
 325   movl(dst, src);
 326 }
 327 
 328 
 329 void MacroAssembler::pop_callee_saved_registers() {
 330   pop(rcx);
 331   pop(rdx);
 332   pop(rdi);
 333   pop(rsi);
 334 }
 335 
 336 void MacroAssembler::pop_fTOS() {
 337   fld_d(Address(rsp, 0));
 338   addl(rsp, 2 * wordSize);
 339 }
 340 
 341 void MacroAssembler::push_callee_saved_registers() {
 342   push(rsi);
 343   push(rdi);
 344   push(rdx);
 345   push(rcx);
 346 }
 347 
 348 void MacroAssembler::push_fTOS() {
 349   subl(rsp, 2 * wordSize);
 350   fstp_d(Address(rsp, 0));
 351 }
 352 
 353 
 354 void MacroAssembler::pushoop(jobject obj) {
 355   push_literal32((int32_t)obj, oop_Relocation::spec_for_immediate());
 356 }
 357 
 358 void MacroAssembler::pushklass(Metadata* obj) {
 359   push_literal32((int32_t)obj, metadata_Relocation::spec_for_immediate());
 360 }
 361 
 362 void MacroAssembler::pushptr(AddressLiteral src) {
 363   if (src.is_lval()) {
 364     push_literal32((int32_t)src.target(), src.rspec());
 365   } else {
 366     pushl(as_Address(src));
 367   }
 368 }
 369 
 370 void MacroAssembler::set_word_if_not_zero(Register dst) {
 371   xorl(dst, dst);
 372   set_byte_if_not_zero(dst);
 373 }
 374 
 375 static void pass_arg0(MacroAssembler* masm, Register arg) {
 376   masm->push(arg);
 377 }
 378 
 379 static void pass_arg1(MacroAssembler* masm, Register arg) {
 380   masm->push(arg);
 381 }
 382 
 383 static void pass_arg2(MacroAssembler* masm, Register arg) {
 384   masm->push(arg);
 385 }
 386 
 387 static void pass_arg3(MacroAssembler* masm, Register arg) {
 388   masm->push(arg);
 389 }
 390 
 391 #ifndef PRODUCT
 392 extern "C" void findpc(intptr_t x);
 393 #endif
 394 
 395 void MacroAssembler::debug32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip, char* msg) {
 396   // In order to get locks to work, we need to fake an in_VM state
 397   JavaThread* thread = JavaThread::current();
 398   JavaThreadState saved_state = thread->thread_state();
 399   thread->set_thread_state(_thread_in_vm);
 400   if (ShowMessageBoxOnError) {
 401     JavaThread* thread = JavaThread::current();
 402     JavaThreadState saved_state = thread->thread_state();
 403     thread->set_thread_state(_thread_in_vm);
 404     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
 405       ttyLocker ttyl;
 406       BytecodeCounter::print();
 407     }
 408     // To see where a verify_oop failed, get $ebx+40/X for this frame.
 409     // This is the value of eip which points to where verify_oop will return.
 410     if (os::message_box(msg, "Execution stopped, print registers?")) {
 411       print_state32(rdi, rsi, rbp, rsp, rbx, rdx, rcx, rax, eip);
 412       BREAKPOINT;
 413     }
 414   } else {
 415     ttyLocker ttyl;
 416     ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n", msg);
 417   }
 418   // Don't assert holding the ttyLock
 419   assert(false, err_msg("DEBUG MESSAGE: %s", msg));
 420   ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
 421 }
 422 
 423 void MacroAssembler::print_state32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip) {
 424   ttyLocker ttyl;
 425   FlagSetting fs(Debugging, true);
 426   tty->print_cr("eip = 0x%08x", eip);
 427 #ifndef PRODUCT
 428   if ((WizardMode || Verbose) && PrintMiscellaneous) {
 429     tty->cr();
 430     findpc(eip);
 431     tty->cr();
 432   }
 433 #endif
 434 #define PRINT_REG(rax) \
 435   { tty->print("%s = ", #rax); os::print_location(tty, rax); }
 436   PRINT_REG(rax);
 437   PRINT_REG(rbx);
 438   PRINT_REG(rcx);
 439   PRINT_REG(rdx);
 440   PRINT_REG(rdi);
 441   PRINT_REG(rsi);
 442   PRINT_REG(rbp);
 443   PRINT_REG(rsp);
 444 #undef PRINT_REG
 445   // Print some words near top of stack.
 446   int* dump_sp = (int*) rsp;
 447   for (int col1 = 0; col1 < 8; col1++) {
 448     tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
 449     os::print_location(tty, *dump_sp++);
 450   }
 451   for (int row = 0; row < 16; row++) {
 452     tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
 453     for (int col = 0; col < 8; col++) {
 454       tty->print(" 0x%08x", *dump_sp++);
 455     }
 456     tty->cr();
 457   }
 458   // Print some instructions around pc:
 459   Disassembler::decode((address)eip-64, (address)eip);
 460   tty->print_cr("--------");
 461   Disassembler::decode((address)eip, (address)eip+32);
 462 }
 463 
 464 void MacroAssembler::stop(const char* msg) {
 465   ExternalAddress message((address)msg);
 466   // push address of message
 467   pushptr(message.addr());
 468   { Label L; call(L, relocInfo::none); bind(L); }     // push eip
 469   pusha();                                            // push registers
 470   call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug32)));
 471   hlt();
 472 }
 473 
 474 void MacroAssembler::warn(const char* msg) {
 475   push_CPU_state();
 476 
 477   ExternalAddress message((address) msg);
 478   // push address of message
 479   pushptr(message.addr());
 480 
 481   call(RuntimeAddress(CAST_FROM_FN_PTR(address, warning)));
 482   addl(rsp, wordSize);       // discard argument
 483   pop_CPU_state();
 484 }
 485 
 486 void MacroAssembler::print_state() {
 487   { Label L; call(L, relocInfo::none); bind(L); }     // push eip
 488   pusha();                                            // push registers
 489 
 490   push_CPU_state();
 491   call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::print_state32)));
 492   pop_CPU_state();
 493 
 494   popa();
 495   addl(rsp, wordSize);
 496 }
 497 
 498 #else // _LP64
 499 
 500 // 64 bit versions
 501 
 502 Address MacroAssembler::as_Address(AddressLiteral adr) {
 503   // amd64 always does this as a pc-relative address;
 504   // we can be absolute or displacement-based depending on the instruction type:
 505   // jmp/call use displacements, others are absolute
 506   assert(!adr.is_lval(), "must be rval");
 507   assert(reachable(adr), "must be");
 508   return Address((int32_t)(intptr_t)(adr.target() - pc()), adr.target(), adr.reloc());
 509 
 510 }
 511 
 512 Address MacroAssembler::as_Address(ArrayAddress adr) {
 513   AddressLiteral base = adr.base();
 514   lea(rscratch1, base);
 515   Address index = adr.index();
 516   assert(index._disp == 0, "must not have disp"); // maybe it can?
 517   Address array(rscratch1, index._index, index._scale, index._disp);
 518   return array;
 519 }
 520 
 521 void MacroAssembler::call_VM_leaf_base(address entry_point, int num_args) {
 522   Label L, E;
 523 
 524 #ifdef _WIN64
 525   // Windows always allocates space for its register args
 526   assert(num_args <= 4, "only register arguments supported");
 527   subq(rsp,  frame::arg_reg_save_area_bytes);
 528 #endif
 529 
 530   // Align stack if necessary
 531   testl(rsp, 15);
 532   jcc(Assembler::zero, L);
 533 
 534   subq(rsp, 8);
 535   {
 536     call(RuntimeAddress(entry_point));
 537   }
 538   addq(rsp, 8);
 539   jmp(E);
 540 
 541   bind(L);
 542   {
 543     call(RuntimeAddress(entry_point));
 544   }
 545 
 546   bind(E);
 547 
 548 #ifdef _WIN64
 549   // restore stack pointer
 550   addq(rsp, frame::arg_reg_save_area_bytes);
 551 #endif
 552 
 553 }
 554 
 555 void MacroAssembler::cmp64(Register src1, AddressLiteral src2) {
 556   assert(!src2.is_lval(), "should use cmpptr");
 557 
 558   if (reachable(src2)) {
 559     cmpq(src1, as_Address(src2));
 560   } else {
 561     lea(rscratch1, src2);
 562     Assembler::cmpq(src1, Address(rscratch1, 0));
 563   }
 564 }
 565 
 566 int MacroAssembler::corrected_idivq(Register reg) {
 567   // Full implementation of Java ldiv and lrem; checks for special
 568   // case as described in JVM spec., p.243 & p.271.  The function
 569   // returns the (pc) offset of the idivq instruction - may be needed
 570   // for implicit exceptions.
 571   //
 572   //         normal case                           special case
 573   //
 574   // input : rax: dividend                         min_long
 575   //         reg: divisor   (may not be eax/edx)   -1
 576   //
 577   // output: rax: quotient  (= rax idiv reg)       min_long
 578   //         rdx: remainder (= rax irem reg)       0
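       //
       // (min_long / -1 == 2^63 is not representable as a 64-bit two's-complement result
       //  and would raise a divide error in idivq, hence the explicit min_long rem 0 answer.)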
 579   assert(reg != rax && reg != rdx, "reg cannot be rax or rdx register");
 580   static const int64_t min_long = 0x8000000000000000;
 581   Label normal_case, special_case;
 582 
 583   // check for special case
 584   cmp64(rax, ExternalAddress((address) &min_long));
 585   jcc(Assembler::notEqual, normal_case);
 586   xorl(rdx, rdx); // prepare rdx for possible special case (where
 587                   // remainder = 0)
 588   cmpq(reg, -1);
 589   jcc(Assembler::equal, special_case);
 590 
 591   // handle normal case
 592   bind(normal_case);
 593   cdqq();
 594   int idivq_offset = offset();
 595   idivq(reg);
 596 
 597   // normal and special case exit
 598   bind(special_case);
 599 
 600   return idivq_offset;
 601 }
 602 
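     // Note for the increment/decrement helpers below: value == min_jint is handled
     // first because -min_jint overflows an int, so it cannot be forwarded as -value.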
 603 void MacroAssembler::decrementq(Register reg, int value) {
 604   if (value == min_jint) { subq(reg, value); return; }
 605   if (value <  0) { incrementq(reg, -value); return; }
 606   if (value == 0) {                        ; return; }
 607   if (value == 1 && UseIncDec) { decq(reg) ; return; }
 608   /* else */      { subq(reg, value)       ; return; }
 609 }
 610 
 611 void MacroAssembler::decrementq(Address dst, int value) {
 612   if (value == min_jint) { subq(dst, value); return; }
 613   if (value <  0) { incrementq(dst, -value); return; }
 614   if (value == 0) {                        ; return; }
 615   if (value == 1 && UseIncDec) { decq(dst) ; return; }
 616   /* else */      { subq(dst, value)       ; return; }
 617 }
 618 
 619 void MacroAssembler::incrementq(AddressLiteral dst) {
 620   if (reachable(dst)) {
 621     incrementq(as_Address(dst));
 622   } else {
 623     lea(rscratch1, dst);
 624     incrementq(Address(rscratch1, 0));
 625   }
 626 }
 627 
 628 void MacroAssembler::incrementq(Register reg, int value) {
 629   if (value == min_jint) { addq(reg, value); return; }
 630   if (value <  0) { decrementq(reg, -value); return; }
 631   if (value == 0) {                        ; return; }
 632   if (value == 1 && UseIncDec) { incq(reg) ; return; }
 633   /* else */      { addq(reg, value)       ; return; }
 634 }
 635 
 636 void MacroAssembler::incrementq(Address dst, int value) {
 637   if (value == min_jint) { addq(dst, value); return; }
 638   if (value <  0) { decrementq(dst, -value); return; }
 639   if (value == 0) {                        ; return; }
 640   if (value == 1 && UseIncDec) { incq(dst) ; return; }
 641   /* else */      { addq(dst, value)       ; return; }
 642 }
 643 
 644 // 32bit can do a case table jump in one instruction but we no longer allow the base
 645 // to be installed in the Address class
 646 void MacroAssembler::jump(ArrayAddress entry) {
 647   lea(rscratch1, entry.base());
 648   Address dispatch = entry.index();
 649   assert(dispatch._base == noreg, "must be");
 650   dispatch._base = rscratch1;
 651   jmp(dispatch);
 652 }
 653 
 654 void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
 655   ShouldNotReachHere(); // 64bit doesn't use two regs
 656   cmpq(x_lo, y_lo);
 657 }
 658 
 659 void MacroAssembler::lea(Register dst, AddressLiteral src) {
 660     mov_literal64(dst, (intptr_t)src.target(), src.rspec());
 661 }
 662 
 663 void MacroAssembler::lea(Address dst, AddressLiteral adr) {
 664   mov_literal64(rscratch1, (intptr_t)adr.target(), adr.rspec());
 665   movptr(dst, rscratch1);
 666 }
 667 
 668 void MacroAssembler::leave() {
 669   // %%% is this really better? Why not on 32bit too?
 670   emit_int8((unsigned char)0xC9); // LEAVE
 671 }
 672 
 673 void MacroAssembler::lneg(Register hi, Register lo) {
 674   ShouldNotReachHere(); // 64bit doesn't use two regs
 675   negq(lo);
 676 }
 677 
 678 void MacroAssembler::movoop(Register dst, jobject obj) {
 679   mov_literal64(dst, (intptr_t)obj, oop_Relocation::spec_for_immediate());
 680 }
 681 
 682 void MacroAssembler::movoop(Address dst, jobject obj) {
 683   mov_literal64(rscratch1, (intptr_t)obj, oop_Relocation::spec_for_immediate());
 684   movq(dst, rscratch1);
 685 }
 686 
 687 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
 688   mov_literal64(dst, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
 689 }
 690 
 691 void MacroAssembler::mov_metadata(Address dst, Metadata* obj) {
 692   mov_literal64(rscratch1, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
 693   movq(dst, rscratch1);
 694 }
 695 
 696 void MacroAssembler::movptr(Register dst, AddressLiteral src, Register scratch) {
 697   if (src.is_lval()) {
 698     mov_literal64(dst, (intptr_t)src.target(), src.rspec());
 699   } else {
 700     if (reachable(src)) {
 701       movq(dst, as_Address(src));
 702     } else {
 703       lea(scratch, src);
 704       movq(dst, Address(scratch, 0));
 705     }
 706   }
 707 }
 708 
 709 void MacroAssembler::movptr(ArrayAddress dst, Register src) {
 710   movq(as_Address(dst), src);
 711 }
 712 
 713 void MacroAssembler::movptr(Register dst, ArrayAddress src) {
 714   movq(dst, as_Address(src));
 715 }
 716 
 717 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
 718 void MacroAssembler::movptr(Address dst, intptr_t src) {
 719   mov64(rscratch1, src);
 720   movq(dst, rscratch1);
 721 }
 722 
 723 // These are mostly for initializing NULL
 724 void MacroAssembler::movptr(Address dst, int32_t src) {
 725   movslq(dst, src);
 726 }
 727 
 728 void MacroAssembler::movptr(Register dst, int32_t src) {
 729   mov64(dst, (intptr_t)src);
 730 }
 731 
 732 void MacroAssembler::pushoop(jobject obj) {
 733   movoop(rscratch1, obj);
 734   push(rscratch1);
 735 }
 736 
 737 void MacroAssembler::pushklass(Metadata* obj) {
 738   mov_metadata(rscratch1, obj);
 739   push(rscratch1);
 740 }
 741 
 742 void MacroAssembler::pushptr(AddressLiteral src) {
 743   lea(rscratch1, src);
 744   if (src.is_lval()) {
 745     push(rscratch1);
 746   } else {
 747     pushq(Address(rscratch1, 0));
 748   }
 749 }
 750 
 751 void MacroAssembler::reset_last_Java_frame(bool clear_fp,
 752                                            bool clear_pc) {
 753   // we must set sp to zero to clear frame
 754   movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), NULL_WORD);
 755   // must clear fp, so that compiled frames are not confused; it is
 756   // possible that we need it only for debugging
 757   if (clear_fp) {
 758     movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
 759   }
 760 
 761   if (clear_pc) {
 762     movptr(Address(r15_thread, JavaThread::last_Java_pc_offset()), NULL_WORD);
 763   }
 764 }
 765 
 766 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 767                                          Register last_java_fp,
 768                                          address  last_java_pc) {
 769   // determine last_java_sp register
 770   if (!last_java_sp->is_valid()) {
 771     last_java_sp = rsp;
 772   }
 773 
 774   // last_java_fp is optional
 775   if (last_java_fp->is_valid()) {
 776     movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()),
 777            last_java_fp);
 778   }
 779 
 780   // last_java_pc is optional
 781   if (last_java_pc != NULL) {
 782     Address java_pc(r15_thread,
 783                     JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset());
 784     lea(rscratch1, InternalAddress(last_java_pc));
 785     movptr(java_pc, rscratch1);
 786   }
 787 
 788   movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
 789 }
 790 
 791 static void pass_arg0(MacroAssembler* masm, Register arg) {
 792   if (c_rarg0 != arg ) {
 793     masm->mov(c_rarg0, arg);
 794   }
 795 }
 796 
 797 static void pass_arg1(MacroAssembler* masm, Register arg) {
 798   if (c_rarg1 != arg ) {
 799     masm->mov(c_rarg1, arg);
 800   }
 801 }
 802 
 803 static void pass_arg2(MacroAssembler* masm, Register arg) {
 804   if (c_rarg2 != arg ) {
 805     masm->mov(c_rarg2, arg);
 806   }
 807 }
 808 
 809 static void pass_arg3(MacroAssembler* masm, Register arg) {
 810   if (c_rarg3 != arg ) {
 811     masm->mov(c_rarg3, arg);
 812   }
 813 }
 814 
 815 void MacroAssembler::stop(const char* msg) {
 816   address rip = pc();
 817   pusha(); // get regs on stack
 818   lea(c_rarg0, ExternalAddress((address) msg));
 819   lea(c_rarg1, InternalAddress(rip));
 820   movq(c_rarg2, rsp); // pass pointer to regs array
 821   andq(rsp, -16); // align stack as required by ABI
 822   call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64)));
 823   hlt();
 824 }
 825 
 826 void MacroAssembler::warn(const char* msg) {
 827   push(rbp);
 828   movq(rbp, rsp);
 829   andq(rsp, -16);     // align stack as required by push_CPU_state and call
 830   push_CPU_state();   // keeps alignment at 16 bytes
 831   lea(c_rarg0, ExternalAddress((address) msg));
 832   call_VM_leaf(CAST_FROM_FN_PTR(address, warning), c_rarg0);
 833   pop_CPU_state();
 834   mov(rsp, rbp);
 835   pop(rbp);
 836 }
 837 
 838 void MacroAssembler::print_state() {
 839   address rip = pc();
 840   pusha();            // get regs on stack
 841   push(rbp);
 842   movq(rbp, rsp);
 843   andq(rsp, -16);     // align stack as required by push_CPU_state and call
 844   push_CPU_state();   // keeps alignment at 16 bytes
 845 
 846   lea(c_rarg0, InternalAddress(rip));
 847   lea(c_rarg1, Address(rbp, wordSize)); // pass pointer to regs array
 848   call_VM_leaf(CAST_FROM_FN_PTR(address, MacroAssembler::print_state64), c_rarg0, c_rarg1);
 849 
 850   pop_CPU_state();
 851   mov(rsp, rbp);
 852   pop(rbp);
 853   popa();
 854 }
 855 
 856 #ifndef PRODUCT
 857 extern "C" void findpc(intptr_t x);
 858 #endif
 859 
 860 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[]) {
 861   // In order to get locks to work, we need to fake an in_VM state
 862   if (ShowMessageBoxOnError) {
 863     JavaThread* thread = JavaThread::current();
 864     JavaThreadState saved_state = thread->thread_state();
 865     thread->set_thread_state(_thread_in_vm);
 866 #ifndef PRODUCT
 867     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
 868       ttyLocker ttyl;
 869       BytecodeCounter::print();
 870     }
 871 #endif
 872     // To see where a verify_oop failed, get $ebx+40/X for this frame.
 873     // XXX correct this offset for amd64
 874     // This is the value of eip which points to where verify_oop will return.
 875     if (os::message_box(msg, "Execution stopped, print registers?")) {
 876       print_state64(pc, regs);
 877       BREAKPOINT;
 878       assert(false, "start up GDB");
 879     }
 880     ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
 881   } else {
 882     ttyLocker ttyl;
 883     ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n",
 884                     msg);
 885     assert(false, err_msg("DEBUG MESSAGE: %s", msg));
 886   }
 887 }
 888 
 889 void MacroAssembler::print_state64(int64_t pc, int64_t regs[]) {
 890   ttyLocker ttyl;
 891   FlagSetting fs(Debugging, true);
 892   tty->print_cr("rip = 0x%016lx", pc);
 893 #ifndef PRODUCT
 894   tty->cr();
 895   findpc(pc);
 896   tty->cr();
 897 #endif
 898 #define PRINT_REG(rax, value) \
 899   { tty->print("%s = ", #rax); os::print_location(tty, value); }
 900   PRINT_REG(rax, regs[15]);
 901   PRINT_REG(rbx, regs[12]);
 902   PRINT_REG(rcx, regs[14]);
 903   PRINT_REG(rdx, regs[13]);
 904   PRINT_REG(rdi, regs[8]);
 905   PRINT_REG(rsi, regs[9]);
 906   PRINT_REG(rbp, regs[10]);
 907   PRINT_REG(rsp, regs[11]);
 908   PRINT_REG(r8 , regs[7]);
 909   PRINT_REG(r9 , regs[6]);
 910   PRINT_REG(r10, regs[5]);
 911   PRINT_REG(r11, regs[4]);
 912   PRINT_REG(r12, regs[3]);
 913   PRINT_REG(r13, regs[2]);
 914   PRINT_REG(r14, regs[1]);
 915   PRINT_REG(r15, regs[0]);
 916 #undef PRINT_REG
 917   // Print some words near top of stack.
 918   int64_t* rsp = (int64_t*) regs[11];
 919   int64_t* dump_sp = rsp;
 920   for (int col1 = 0; col1 < 8; col1++) {
 921     tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (int64_t)dump_sp);
 922     os::print_location(tty, *dump_sp++);
 923   }
 924   for (int row = 0; row < 25; row++) {
 925     tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (int64_t)dump_sp);
 926     for (int col = 0; col < 4; col++) {
 927       tty->print(" 0x%016lx", *dump_sp++);
 928     }
 929     tty->cr();
 930   }
 931   // Print some instructions around pc:
 932   Disassembler::decode((address)pc-64, (address)pc);
 933   tty->print_cr("--------");
 934   Disassembler::decode((address)pc, (address)pc+32);
 935 }
 936 
 937 #endif // _LP64
 938 
 939 // Now versions that are common to 32/64 bit
 940 
 941 void MacroAssembler::addptr(Register dst, int32_t imm32) {
 942   LP64_ONLY(addq(dst, imm32)) NOT_LP64(addl(dst, imm32));
 943 }
 944 
 945 void MacroAssembler::addptr(Register dst, Register src) {
 946   LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
 947 }
 948 
 949 void MacroAssembler::addptr(Address dst, Register src) {
 950   LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
 951 }
 952 
 953 void MacroAssembler::addsd(XMMRegister dst, AddressLiteral src) {
 954   if (reachable(src)) {
 955     Assembler::addsd(dst, as_Address(src));
 956   } else {
 957     lea(rscratch1, src);
 958     Assembler::addsd(dst, Address(rscratch1, 0));
 959   }
 960 }
 961 
 962 void MacroAssembler::addss(XMMRegister dst, AddressLiteral src) {
 963   if (reachable(src)) {
 964     addss(dst, as_Address(src));
 965   } else {
 966     lea(rscratch1, src);
 967     addss(dst, Address(rscratch1, 0));
 968   }
 969 }
 970 
 971 void MacroAssembler::align(int modulus) {
 972   if (offset() % modulus != 0) {
 973     nop(modulus - (offset() % modulus));
 974   }
 975 }
 976 
 977 void MacroAssembler::andpd(XMMRegister dst, AddressLiteral src) {
 978   // Used in sign-masking with aligned address.
 979   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
 980   if (reachable(src)) {
 981     Assembler::andpd(dst, as_Address(src));
 982   } else {
 983     lea(rscratch1, src);
 984     Assembler::andpd(dst, Address(rscratch1, 0));
 985   }
 986 }
 987 
 988 void MacroAssembler::andps(XMMRegister dst, AddressLiteral src) {
 989   // Used in sign-masking with aligned address.
 990   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
 991   if (reachable(src)) {
 992     Assembler::andps(dst, as_Address(src));
 993   } else {
 994     lea(rscratch1, src);
 995     Assembler::andps(dst, Address(rscratch1, 0));
 996   }
 997 }
 998 
 999 void MacroAssembler::andptr(Register dst, int32_t imm32) {
1000   LP64_ONLY(andq(dst, imm32)) NOT_LP64(andl(dst, imm32));
1001 }
1002 
1003 void MacroAssembler::atomic_incl(Address counter_addr) {
1004   if (os::is_MP())
1005     lock();
1006   incrementl(counter_addr);
1007 }
1008 
1009 void MacroAssembler::atomic_incl(AddressLiteral counter_addr, Register scr) {
1010   if (reachable(counter_addr)) {
1011     atomic_incl(as_Address(counter_addr));
1012   } else {
1013     lea(scr, counter_addr);
1014     atomic_incl(Address(scr, 0));
1015   }
1016 }
1017 
1018 #ifdef _LP64
1019 void MacroAssembler::atomic_incq(Address counter_addr) {
1020   if (os::is_MP())
1021     lock();
1022   incrementq(counter_addr);
1023 }
1024 
1025 void MacroAssembler::atomic_incq(AddressLiteral counter_addr, Register scr) {
1026   if (reachable(counter_addr)) {
1027     atomic_incq(as_Address(counter_addr));
1028   } else {
1029     lea(scr, counter_addr);
1030     atomic_incq(Address(scr, 0));
1031   }
1032 }
1033 #endif
1034 
1035 // Writes to successive stack pages until the given size has been banged, to check
1036 // for stack overflow plus shadow pages.  This clobbers tmp.
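     // For example, with 4K pages and size = 64K the loop below stores to 16 successive
     // pages (one store per page) before the shadow pages are banged as well.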
1037 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
1038   movptr(tmp, rsp);
1039   // Bang stack for total size given plus shadow page size.
1040   // Bang one page at a time because large size can bang beyond yellow and
1041   // red zones.
1042   Label loop;
1043   bind(loop);
1044   movl(Address(tmp, (-os::vm_page_size())), size );
1045   subptr(tmp, os::vm_page_size());
1046   subl(size, os::vm_page_size());
1047   jcc(Assembler::greater, loop);
1048 
1049   // Bang down shadow pages too.
1050   // At this point, (tmp-0) is the last address touched, so don't
1051   // touch it again.  (It was touched as (tmp-pagesize) but then tmp
1052   // was post-decremented.)  Skip this address by starting at i=1, and
1053   // touch a few more pages below.  N.B.  It is important to touch all
1054   // the way down to and including i=StackShadowPages.
1055   for (int i = 1; i < StackShadowPages; i++) {
1056     // this could be any sized move but this can be a debugging crumb
1057     // so the bigger the better.
1058     movptr(Address(tmp, (-i*os::vm_page_size())), size );
1059   }
1060 }
1061 
1062 int MacroAssembler::biased_locking_enter(Register lock_reg,
1063                                          Register obj_reg,
1064                                          Register swap_reg,
1065                                          Register tmp_reg,
1066                                          bool swap_reg_contains_mark,
1067                                          Label& done,
1068                                          Label* slow_case,
1069                                          BiasedLockingCounters* counters) {
1070   assert(UseBiasedLocking, "why call this otherwise?");
1071   assert(swap_reg == rax, "swap_reg must be rax for cmpxchgq");
1072   assert(tmp_reg != noreg, "tmp_reg must be supplied");
1073   assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg);
1074   assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
1075   Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
1076   Address saved_mark_addr(lock_reg, 0);
1077 
1078   if (PrintBiasedLockingStatistics && counters == NULL) {
1079     counters = BiasedLocking::counters();
1080   }
1081   // Biased locking
1082   // See whether the lock is currently biased toward our thread and
1083   // whether the epoch is still valid
1084   // Note that the runtime guarantees sufficient alignment of JavaThread
1085   // pointers to allow age to be placed into low bits
1086   // First check to see whether biasing is even enabled for this object
1087   Label cas_label;
1088   int null_check_offset = -1;
1089   if (!swap_reg_contains_mark) {
1090     null_check_offset = offset();
1091     movptr(swap_reg, mark_addr);
1092   }
1093   movptr(tmp_reg, swap_reg);
1094   andptr(tmp_reg, markOopDesc::biased_lock_mask_in_place);
1095   cmpptr(tmp_reg, markOopDesc::biased_lock_pattern);
1096   jcc(Assembler::notEqual, cas_label);
1097   // The bias pattern is present in the object's header. Need to check
1098   // whether the bias owner and the epoch are both still current.
1099 #ifndef _LP64
1100   // Note that because there is no current thread register on x86_32 we
1101   // need to store off the mark word we read out of the object to
1102   // avoid reloading it and needing to recheck invariants below. This
1103   // store is unfortunate but it makes the overall code shorter and
1104   // simpler.
1105   movptr(saved_mark_addr, swap_reg);
1106 #endif
1107   if (swap_reg_contains_mark) {
1108     null_check_offset = offset();
1109   }
1110   load_prototype_header(tmp_reg, obj_reg);
1111 #ifdef _LP64
1112   orptr(tmp_reg, r15_thread);
1113   xorptr(tmp_reg, swap_reg);
1114   Register header_reg = tmp_reg;
1115 #else
1116   xorptr(tmp_reg, swap_reg);
1117   get_thread(swap_reg);
1118   xorptr(swap_reg, tmp_reg);
1119   Register header_reg = swap_reg;
1120 #endif
1121   andptr(header_reg, ~((int) markOopDesc::age_mask_in_place));
1122   if (counters != NULL) {
1123     cond_inc32(Assembler::zero,
1124                ExternalAddress((address) counters->biased_lock_entry_count_addr()));
1125   }
1126   jcc(Assembler::equal, done);
1127 
1128   Label try_revoke_bias;
1129   Label try_rebias;
1130 
1131   // At this point we know that the header has the bias pattern and
1132   // that we are not the bias owner in the current epoch. We need to
1133   // figure out more details about the state of the header in order to
1134   // know what operations can be legally performed on the object's
1135   // header.
1136 
1137   // If the low three bits in the xor result aren't clear, that means
1138   // the prototype header is no longer biased and we have to revoke
1139   // the bias on this object.
1140   testptr(header_reg, markOopDesc::biased_lock_mask_in_place);
1141   jccb(Assembler::notZero, try_revoke_bias);
1142 
1143   // Biasing is still enabled for this data type. See whether the
1144   // epoch of the current bias is still valid, meaning that the epoch
1145   // bits of the mark word are equal to the epoch bits of the
1146   // prototype header. (Note that the prototype header's epoch bits
1147   // only change at a safepoint.) If not, attempt to rebias the object
1148   // toward the current thread. Note that we must be absolutely sure
1149   // that the current epoch is invalid in order to do this because
1150   // otherwise the manipulations it performs on the mark word are
1151   // illegal.
1152   testptr(header_reg, markOopDesc::epoch_mask_in_place);
1153   jccb(Assembler::notZero, try_rebias);
1154 
1155   // The epoch of the current bias is still valid but we know nothing
1156   // about the owner; it might be set or it might be clear. Try to
1157   // acquire the bias of the object using an atomic operation. If this
1158   // fails we will go in to the runtime to revoke the object's bias.
1159   // Note that we first construct the presumed unbiased header so we
1160   // don't accidentally blow away another thread's valid bias.
1161   NOT_LP64( movptr(swap_reg, saved_mark_addr); )
1162   andptr(swap_reg,
1163          markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
1164 #ifdef _LP64
1165   movptr(tmp_reg, swap_reg);
1166   orptr(tmp_reg, r15_thread);
1167 #else
1168   get_thread(tmp_reg);
1169   orptr(tmp_reg, swap_reg);
1170 #endif
1171   if (os::is_MP()) {
1172     lock();
1173   }
1174   cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
1175   // If the biasing toward our thread failed, this means that
1176   // another thread succeeded in biasing it toward itself and we
1177   // need to revoke that bias. The revocation will occur in the
1178   // interpreter runtime in the slow case.
1179   if (counters != NULL) {
1180     cond_inc32(Assembler::zero,
1181                ExternalAddress((address) counters->anonymously_biased_lock_entry_count_addr()));
1182   }
1183   if (slow_case != NULL) {
1184     jcc(Assembler::notZero, *slow_case);
1185   }
1186   jmp(done);
1187 
1188   bind(try_rebias);
1189   // At this point we know the epoch has expired, meaning that the
1190   // current "bias owner", if any, is actually invalid. Under these
1191   // circumstances _only_, we are allowed to use the current header's
1192   // value as the comparison value when doing the cas to acquire the
1193   // bias in the current epoch. In other words, we allow transfer of
1194   // the bias from one thread to another directly in this situation.
1195   //
1196   // FIXME: due to a lack of registers we currently blow away the age
1197   // bits in this situation. Should attempt to preserve them.
1198   load_prototype_header(tmp_reg, obj_reg);
1199 #ifdef _LP64
1200   orptr(tmp_reg, r15_thread);
1201 #else
1202   get_thread(swap_reg);
1203   orptr(tmp_reg, swap_reg);
1204   movptr(swap_reg, saved_mark_addr);
1205 #endif
1206   if (os::is_MP()) {
1207     lock();
1208   }
1209   cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
1210   // If the biasing toward our thread failed, then another thread
1211   // succeeded in biasing it toward itself and we need to revoke that
1212   // bias. The revocation will occur in the runtime in the slow case.
1213   if (counters != NULL) {
1214     cond_inc32(Assembler::zero,
1215                ExternalAddress((address) counters->rebiased_lock_entry_count_addr()));
1216   }
1217   if (slow_case != NULL) {
1218     jcc(Assembler::notZero, *slow_case);
1219   }
1220   jmp(done);
1221 
1222   bind(try_revoke_bias);
1223   // The prototype mark in the klass doesn't have the bias bit set any
1224   // more, indicating that objects of this data type are not supposed
1225   // to be biased any more. We are going to try to reset the mark of
1226   // this object to the prototype value and fall through to the
1227   // CAS-based locking scheme. Note that if our CAS fails, it means
1228   // that another thread raced us for the privilege of revoking the
1229   // bias of this particular object, so it's okay to continue in the
1230   // normal locking code.
1231   //
1232   // FIXME: due to a lack of registers we currently blow away the age
1233   // bits in this situation. Should attempt to preserve them.
1234   NOT_LP64( movptr(swap_reg, saved_mark_addr); )
1235   load_prototype_header(tmp_reg, obj_reg);
1236   if (os::is_MP()) {
1237     lock();
1238   }
1239   cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
1240   // Fall through to the normal CAS-based lock, because no matter what
1241   // the result of the above CAS, some thread must have succeeded in
1242   // removing the bias bit from the object's header.
1243   if (counters != NULL) {
1244     cond_inc32(Assembler::zero,
1245                ExternalAddress((address) counters->revoked_lock_entry_count_addr()));
1246   }
1247 
1248   bind(cas_label);
1249 
1250   return null_check_offset;
1251 }
1252 
1253 void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
1254   assert(UseBiasedLocking, "why call this otherwise?");
1255 
1256   // Check for biased locking unlock case, which is a no-op
1257   // Note: we do not have to check the thread ID for two reasons.
1258   // First, the interpreter checks for IllegalMonitorStateException at
1259   // a higher level. Second, if the bias was revoked while we held the
1260   // lock, the object could not be rebiased toward another thread, so
1261   // the bias bit would be clear.
1262   movptr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
1263   andptr(temp_reg, markOopDesc::biased_lock_mask_in_place);
1264   cmpptr(temp_reg, markOopDesc::biased_lock_pattern);
1265   jcc(Assembler::equal, done);
1266 }
1267 
1268 #ifdef COMPILER2
1269 
1270 #if INCLUDE_RTM_OPT
1271 
1272 // Update rtm_counters based on abort status
1273 // input: abort_status
1274 //        rtm_counters (RTMLockingCounters*)
1275 // flags are killed
1276 void MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {
1277 
1278   atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
1279   if (PrintPreciseRTMLockingStatistics) {
1280     for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
1281       Label check_abort;
1282       testl(abort_status, (1<<i));
1283       jccb(Assembler::equal, check_abort);
1284       atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
1285       bind(check_abort);
1286     }
1287   }
1288 }
1289 
1290 // Branch if ((random & (count-1)) != 0), where count is a power of two (2^n)
1291 // tmp, scr and flags are killed
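     // e.g. with count = 8 the low bits of the time-stamp counter are masked with 0x7, so
     // the branch is taken ~7 times out of 8 and only ~1/count of calls fall through.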
1292 void MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
1293   assert(tmp == rax, "");
1294   assert(scr == rdx, "");
1295   rdtsc(); // modifies EDX:EAX
1296   andptr(tmp, count-1);
1297   jccb(Assembler::notZero, brLabel);
1298 }
1299 
1300 // Perform abort ratio calculation, set no_rtm bit if high ratio
1301 // input:  rtm_counters_Reg (RTMLockingCounters* address)
1302 // tmpReg, rtm_counters_Reg and flags are killed
1303 void MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
1304                                                  Register rtm_counters_Reg,
1305                                                  RTMLockingCounters* rtm_counters,
1306                                                  Metadata* method_data) {
1307   Label L_done, L_check_always_rtm1, L_check_always_rtm2;
1308 
1309   if (RTMLockingCalculationDelay > 0) {
1310     // Delay calculation
1311     movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()), tmpReg);
1312     testptr(tmpReg, tmpReg);
1313     jccb(Assembler::equal, L_done);
1314   }
1315   // Abort ratio calculation only if abort_count > RTMAbortThreshold
1316   //   Aborted transactions = abort_count * 100
1317   //   All transactions = total_count *  RTMTotalCountIncrRate
1318   //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
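       //   i.e. set no_rtm when abort_count / (total_count * RTMTotalCountIncrRate) >= RTMAbortRatio%.
       //   For example, with RTMAbortRatio = 50 and RTMTotalCountIncrRate = 1, 600 aborts out of
       //   1000 transactions gives 600*100 = 60000 >= 1000*50, so rtm_state is set to NoRTM.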
1319 
1320   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
1321   cmpptr(tmpReg, RTMAbortThreshold);
1322   jccb(Assembler::below, L_check_always_rtm2);
1323   imulptr(tmpReg, tmpReg, 100);
1324 
1325   Register scrReg = rtm_counters_Reg;
1326   movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
1327   imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
1328   imulptr(scrReg, scrReg, RTMAbortRatio);
1329   cmpptr(tmpReg, scrReg);
1330   jccb(Assembler::below, L_check_always_rtm1);
1331   if (method_data != NULL) {
1332     // set rtm_state to "no rtm" in MDO
1333     mov_metadata(tmpReg, method_data);
1334     if (os::is_MP()) {
1335       lock();
1336     }
1337     orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), NoRTM);
1338   }
1339   jmpb(L_done);
1340   bind(L_check_always_rtm1);
1341   // Reload RTMLockingCounters* address
1342   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
1343   bind(L_check_always_rtm2);
1344   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
1345   cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
1346   jccb(Assembler::below, L_done);
1347   if (method_data != NULL) {
1348     // set rtm_state to "always rtm" in MDO
1349     mov_metadata(tmpReg, method_data);
1350     if (os::is_MP()) {
1351       lock();
1352     }
1353     orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), UseRTM);
1354   }
1355   bind(L_done);
1356 }
1357 
1358 // Update counters and perform abort ratio calculation
1359 // input:  abort_status_Reg
1360 // rtm_counters_Reg, flags are killed
1361 void MacroAssembler::rtm_profiling(Register abort_status_Reg,
1362                                    Register rtm_counters_Reg,
1363                                    RTMLockingCounters* rtm_counters,
1364                                    Metadata* method_data,
1365                                    bool profile_rtm) {
1366 
1367   assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
1368   // update rtm counters based on rax value at abort
1369   // reads abort_status_Reg, updates flags
1370   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
1371   rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
1372   if (profile_rtm) {
1373     // Save abort status because abort_status_Reg is used by following code.
1374     if (RTMRetryCount > 0) {
1375       push(abort_status_Reg);
1376     }
1377     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
1378     rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
1379     // restore abort status
1380     if (RTMRetryCount > 0) {
1381       pop(abort_status_Reg);
1382     }
1383   }
1384 }
1385 
1386 // Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
1387 // inputs: retry_count_Reg
1388 //       : abort_status_Reg
1389 // output: retry_count_Reg decremented by 1
1390 // flags are killed
1391 void MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
1392   Label doneRetry;
1393   assert(abort_status_Reg == rax, "");
1394   // The abort reason bits are in eax (see all states in rtmLocking.hpp)
1395   // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
1396   // if reason is in 0x6 and retry count != 0 then retry
1397   andptr(abort_status_Reg, 0x6);
1398   jccb(Assembler::zero, doneRetry);
1399   testl(retry_count_Reg, retry_count_Reg);
1400   jccb(Assembler::zero, doneRetry);
1401   pause();
1402   decrementl(retry_count_Reg);
1403   jmp(retryLabel);
1404   bind(doneRetry);
1405 }
1406 
1407 // Spin and retry if lock is busy,
1408 // inputs: box_Reg (monitor address)
1409 //       : retry_count_Reg
1410 // output: retry_count_Reg decremented by 1
1411 //       : clear z flag if retry count exceeded
1412 // tmp_Reg, scr_Reg, flags are killed
1413 void MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
1414                                             Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
1415   Label SpinLoop, SpinExit, doneRetry;
1416   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
1417 
1418   testl(retry_count_Reg, retry_count_Reg);
1419   jccb(Assembler::zero, doneRetry);
1420   decrementl(retry_count_Reg);
1421   movptr(scr_Reg, RTMSpinLoopCount);
1422 
1423   bind(SpinLoop);
1424   pause();
1425   decrementl(scr_Reg);
1426   jccb(Assembler::lessEqual, SpinExit);
1427   movptr(tmp_Reg, Address(box_Reg, owner_offset));
1428   testptr(tmp_Reg, tmp_Reg);
1429   jccb(Assembler::notZero, SpinLoop);
1430 
1431   bind(SpinExit);
1432   jmp(retryLabel);
1433   bind(doneRetry);
1434   incrementl(retry_count_Reg); // clear z flag
1435 }
1436 
1437 // Use RTM for normal stack locks
1438 // Input: objReg (object to lock)
1439 void MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
1440                                        Register retry_on_abort_count_Reg,
1441                                        RTMLockingCounters* stack_rtm_counters,
1442                                        Metadata* method_data, bool profile_rtm,
1443                                        Label& DONE_LABEL, Label& IsInflated) {
1444   assert(UseRTMForStackLocks, "why call this otherwise?");
1445   assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
1446   assert(tmpReg == rax, "");
1447   assert(scrReg == rdx, "");
1448   Label L_rtm_retry, L_decrement_retry, L_on_abort;
1449 
1450   if (RTMRetryCount > 0) {
1451     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
1452     bind(L_rtm_retry);
1453   }
1454   movptr(tmpReg, Address(objReg, 0));
1455   testptr(tmpReg, markOopDesc::monitor_value);  // inflated vs stack-locked|neutral|biased
1456   jcc(Assembler::notZero, IsInflated);
1457 
1458   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
1459     Label L_noincrement;
1460     if (RTMTotalCountIncrRate > 1) {
1461       // tmpReg, scrReg and flags are killed
1462       branch_on_random_using_rdtsc(tmpReg, scrReg, (int)RTMTotalCountIncrRate, L_noincrement);
1463     }
1464     assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
1465     atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
1466     bind(L_noincrement);
1467   }
1468   xbegin(L_on_abort);
1469   movptr(tmpReg, Address(objReg, 0));       // fetch markword
1470   andptr(tmpReg, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
1471   cmpptr(tmpReg, markOopDesc::unlocked_value);            // bits = 001 unlocked
1472   jcc(Assembler::equal, DONE_LABEL);        // all done if unlocked
1473 
1474   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
1475   if (UseRTMXendForLockBusy) {
1476     xend();
1477     movptr(abort_status_Reg, 0x2);   // Set the abort status to 2 (so we can retry)
1478     jmp(L_decrement_retry);
1479   }
1480   else {
1481     xabort(0);
1482   }
1483   bind(L_on_abort);
1484   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
1485     rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
1486   }
1487   bind(L_decrement_retry);
1488   if (RTMRetryCount > 0) {
1489     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
1490     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
1491   }
1492 }
1493 
1494 // Use RTM for inflated locks
1495 // inputs: objReg (object to lock)
1496 //         boxReg (on-stack box address (displaced header location) - KILLED)
1497 //         tmpReg (ObjectMonitor address + markOopDesc::monitor_value)
1498 void MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
1499                                           Register scrReg, Register retry_on_busy_count_Reg,
1500                                           Register retry_on_abort_count_Reg,
1501                                           RTMLockingCounters* rtm_counters,
1502                                           Metadata* method_data, bool profile_rtm,
1503                                           Label& DONE_LABEL) {
1504   assert(UseRTMLocking, "why call this otherwise?");
1505   assert(tmpReg == rax, "");
1506   assert(scrReg == rdx, "");
1507   Label L_rtm_retry, L_decrement_retry, L_on_abort;
1508   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
1509 
1510   // Without cast to int32_t a movptr will destroy r10 which is typically obj
1511   movptr(Address(boxReg, 0), (int32_t)intptr_t(markOopDesc::unused_mark()));
1512   movptr(boxReg, tmpReg); // Save ObjectMonitor address
1513 
1514   if (RTMRetryCount > 0) {
1515     movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
1516     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
1517     bind(L_rtm_retry);
1518   }
1519   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
1520     Label L_noincrement;
1521     if (RTMTotalCountIncrRate > 1) {
1522       // tmpReg, scrReg and flags are killed
1523       branch_on_random_using_rdtsc(tmpReg, scrReg, (int)RTMTotalCountIncrRate, L_noincrement);
1524     }
1525     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
1526     atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
1527     bind(L_noincrement);
1528   }
1529   xbegin(L_on_abort);
1530   movptr(tmpReg, Address(objReg, 0));
1531   movptr(tmpReg, Address(tmpReg, owner_offset));
1532   testptr(tmpReg, tmpReg);
1533   jcc(Assembler::zero, DONE_LABEL);
1534   if (UseRTMXendForLockBusy) {
1535     xend();
1536     jmp(L_decrement_retry);
1537   }
1538   else {
1539     xabort(0);
1540   }
1541   bind(L_on_abort);
1542   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
1543   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
1544     rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
1545   }
1546   if (RTMRetryCount > 0) {
1547     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
1548     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
1549   }
1550 
1551   movptr(tmpReg, Address(boxReg, owner_offset));
1552   testptr(tmpReg, tmpReg);
1553   jccb(Assembler::notZero, L_decrement_retry);
1554 
1555   // Appears unlocked - try to swing _owner from null to non-null.
1556   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
1557 #ifdef _LP64
1558   Register threadReg = r15_thread;
1559 #else
1560   get_thread(scrReg);
1561   Register threadReg = scrReg;
1562 #endif
1563   if (os::is_MP()) {
1564     lock();
1565   }
1566   cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg
1567 
1568   if (RTMRetryCount > 0) {
1569     // If the CAS succeeded we are done; otherwise retry
1570     jccb(Assembler::equal, DONE_LABEL);
1571     bind(L_decrement_retry);
1572     // Spin and retry if lock is busy.
1573     rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
1574   }
1575   else {
1576     bind(L_decrement_retry);
1577   }
1578 }
1579 
1580 #endif //  INCLUDE_RTM_OPT
1581 
1582 // Fast_Lock and Fast_Unlock used by C2
1583 
1584 // Because the transitions from emitted code to the runtime
1585 // monitorenter/exit helper stubs are so slow it's critical that
1586 // we inline both the stack-locking fast-path and the inflated fast path.
1587 //
1588 // See also: cmpFastLock and cmpFastUnlock.
1589 //
1590 // What follows is a specialized inline transliteration of the code
1591 // in slow_enter() and slow_exit().  If we're concerned about I$ bloat
1592 // another option would be to emit TrySlowEnter and TrySlowExit methods
1593 // at startup-time.  These methods would accept arguments as
1594 // (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
1595 // indications in the icc.ZFlag.  Fast_Lock and Fast_Unlock would simply
1596 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
1597 // In practice, however, the # of lock sites is bounded and is usually small.
1598 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
1599 // if the processor uses simple bimodal branch predictors keyed by EIP,
1600 // since the helper routines would be called from multiple synchronization
1601 // sites.
1602 //
1603 // An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
1604 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
1605 // to those specialized methods.  That'd give us a mostly platform-independent
1606 // implementation that the JITs could optimize and inline at their pleasure.
1607 // Done correctly, the only time we'd need to cross into native code would be
1608 // to park() or unpark() threads.  We'd also need a few more unsafe operators
1609 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
1610 // (b) explicit barriers or fence operations.
1611 //
1612 // TODO:
1613 //
1614 // *  Arrange for C2 to pass "Self" into Fast_Lock and Fast_Unlock in one of the registers (scr).
1615 //    This avoids manifesting the Self pointer in the Fast_Lock and Fast_Unlock terminals.
1616 //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
1617 //    the lock operators would typically be faster than reifying Self.
1618 //
1619 // *  Ideally I'd define the primitives as:
1620 //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
1621 //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
1622 //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
1623 //    Instead, we're stuck with the rather awkward and brittle register assignments below.
1624 //    Furthermore the register assignments are overconstrained, possibly resulting in
1625 //    sub-optimal code near the synchronization site.
1626 //
1627 // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
1628 //    Alternately, use a better sp-proximity test.
1629 //
1630 // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
1631 //    Either one is sufficient to uniquely identify a thread.
1632 //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
1633 //
1634 // *  Intrinsify notify() and notifyAll() for the common cases where the
1635 //    object is locked by the calling thread but the waitlist is empty.
1636 //    This avoids the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
1637 //
1638 // *  use jccb and jmpb instead of jcc and jmp to improve code density.
1639 //    But beware of excessive branch density on AMD Opterons.
1640 //
1641 // *  Both Fast_Lock and Fast_Unlock set the ICC.ZF to indicate success
1642 //    or failure of the fast-path.  If the fast-path fails then we pass
1643 //    control to the slow-path, typically in C.  In Fast_Lock and
1644 //    Fast_Unlock we often branch to DONE_LABEL, just to find that C2
1645 //    will emit a conditional branch immediately after the node.
1646 //    So we have branches to branches and lots of ICC.ZF games.
1647 //    Instead, it might be better to have C2 pass a "FailureLabel"
1648 //    into Fast_Lock and Fast_Unlock.  In the case of success, control
1649 //    will drop through the node.  ICC.ZF is undefined at exit.
1650 //    In the case of failure, the node will branch directly to the
1651 //    FailureLabel.
1652 
1653 
1654 // obj: object to lock
1655 // box: on-stack box address (displaced header location) - KILLED
1656 // rax: tmp -- KILLED
1657 // scr: tmp -- KILLED
1658 void MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
1659                                Register scrReg, Register cx1Reg, Register cx2Reg,
1660                                BiasedLockingCounters* counters,
1661                                RTMLockingCounters* rtm_counters,
1662                                RTMLockingCounters* stack_rtm_counters,
1663                                Metadata* method_data,
1664                                bool use_rtm, bool profile_rtm) {
1665   // Ensure the register assignments are disjoint
1666   assert(tmpReg == rax, "");
1667 
1668   if (use_rtm) {
1669     assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
1670   } else {
1671     assert(cx1Reg == noreg, "");
1672     assert(cx2Reg == noreg, "");
1673     assert_different_registers(objReg, boxReg, tmpReg, scrReg);
1674   }
1675 
1676   if (counters != NULL) {
1677     atomic_incl(ExternalAddress((address)counters->total_entry_count_addr()), scrReg);
1678   }
1679   if (EmitSync & 1) {
1680       // set box->dhw = markOopDesc::unused_mark()
1681       // Force all sync thru slow-path: slow_enter() and slow_exit()
1682       movptr (Address(boxReg, 0), (int32_t)intptr_t(markOopDesc::unused_mark()));
1683       cmpptr (rsp, (int32_t)NULL_WORD);
1684   } else {
1685     // Possible cases that we'll encounter in fast_lock
1686     // ------------------------------------------------
1687     // * Inflated
1688     //    -- unlocked
1689     //    -- Locked
1690     //       = by self
1691     //       = by other
1692     // * biased
1693     //    -- by Self
1694     //    -- by other
1695     // * neutral
1696     // * stack-locked
1697     //    -- by self
1698     //       = sp-proximity test hits
1699     //       = sp-proximity test generates false-negative
1700     //    -- by other
1701     //
1702 
1703     Label IsInflated, DONE_LABEL;
1704 
1705     // it's stack-locked, biased or neutral
1706     // TODO: optimize away redundant LDs of obj->mark and improve the markword triage
1707     // order to reduce the number of conditional branches in the most common cases.
1708     // Beware -- there's a subtle invariant that fetch of the markword
1709     // at [FETCH], below, will never observe a biased encoding (*101b).
1710     // If this invariant is not held we risk exclusion (safety) failure.
1711     if (UseBiasedLocking && !UseOptoBiasInlining) {
1712       biased_locking_enter(boxReg, objReg, tmpReg, scrReg, false, DONE_LABEL, NULL, counters);
1713     }
1714 
1715 #if INCLUDE_RTM_OPT
1716     if (UseRTMForStackLocks && use_rtm) {
1717       rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
1718                         stack_rtm_counters, method_data, profile_rtm,
1719                         DONE_LABEL, IsInflated);
1720     }
1721 #endif // INCLUDE_RTM_OPT
1722 
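         // Rough sketch (not literal code) of the stack-locking attempt emitted below:
         //   mark = obj->mark();
         //   if (mark is a monitor)  goto IsInflated;                   // take the monitor path
         //   box->dhw = mark | unlocked_value;                          // anticipate success
         //   if (CAS(&obj->mark, mark | unlocked_value, box))  ZF = 1;  // stack-locked by us
         //   else  ZF = ((mark - rsp) & page_mask) == 0;                // recursive stack-lock?
         //   goto DONE_LABEL;                                           // ZF == 0 -> slow path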
1723     movptr(tmpReg, Address(objReg, 0));          // [FETCH]
1724     testptr(tmpReg, markOopDesc::monitor_value); // inflated vs stack-locked|neutral|biased
1725     jccb(Assembler::notZero, IsInflated);
1726 
1727     // Attempt stack-locking ...
1728     orptr (tmpReg, markOopDesc::unlocked_value);
1729     movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
1730     if (os::is_MP()) {
1731       lock();
1732     }
1733     cmpxchgptr(boxReg, Address(objReg, 0));      // Updates tmpReg
1734     if (counters != NULL) {
1735       cond_inc32(Assembler::equal,
1736                  ExternalAddress((address)counters->fast_path_entry_count_addr()));
1737     }
1738     jcc(Assembler::equal, DONE_LABEL);           // Success
1739 
1740     // Recursive locking.
1741     // The object is stack-locked: markword contains stack pointer to BasicLock.
1742     // Locked by current thread if difference with current SP is less than one page.
1743     subptr(tmpReg, rsp);
1744     // The next instruction sets ZFlag == 1 (Success) if the difference is less than one page.
1745     andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) );
1746     movptr(Address(boxReg, 0), tmpReg);
1747     if (counters != NULL) {
1748       cond_inc32(Assembler::equal,
1749                  ExternalAddress((address)counters->fast_path_entry_count_addr()));
1750     }
1751     jmp(DONE_LABEL);
1752 
1753     bind(IsInflated);
1754     // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markOopDesc::monitor_value
1755 
1756 #if INCLUDE_RTM_OPT
1757     // Use the same RTM locking code in 32- and 64-bit VM.
1758     if (use_rtm) {
1759       rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
1760                            rtm_counters, method_data, profile_rtm, DONE_LABEL);
1761     } else {
1762 #endif // INCLUDE_RTM_OPT
1763 
1764 #ifndef _LP64
1765     // The object is inflated.
1766 
1767     // boxReg refers to the on-stack BasicLock in the current frame.
1768     // We'd like to write:
1769     //   set box->_displaced_header = markOopDesc::unused_mark().  Any non-0 value suffices.
1770     // This is convenient but results in a ST-before-CAS penalty.  The following CAS suffers
1771     // additional latency as we have another ST in the store buffer that must drain.
1772 
1773     if (EmitSync & 8192) {
1774        movptr(Address(boxReg, 0), 3);            // results in ST-before-CAS penalty
1775        get_thread (scrReg);
1776        movptr(boxReg, tmpReg);                    // consider: LEA box, [tmp-2]
1777        movptr(tmpReg, NULL_WORD);                 // consider: xor vs mov
1778        if (os::is_MP()) {
1779          lock();
1780        }
1781        cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
1782     } else
1783     if ((EmitSync & 128) == 0) {                      // avoid ST-before-CAS
1784        movptr(scrReg, boxReg);
1785        movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]
1786 
1787        // Using a prefetchw helps avoid later RTS->RTO upgrades and cache probes
1788        if ((EmitSync & 2048) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
1789           // prefetchw [eax + Offset(_owner)-2]
1790           prefetchw(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
1791        }
1792 
1793        if ((EmitSync & 64) == 0) {
1794          // Optimistic form: consider XORL tmpReg,tmpReg
1795          movptr(tmpReg, NULL_WORD);
1796        } else {
1797          // Can suffer RTS->RTO upgrades on shared or cold $ lines
1798          // Test-And-CAS instead of CAS
1799          movptr(tmpReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));   // rax, = m->_owner
1800          testptr(tmpReg, tmpReg);                   // Locked ?
1801          jccb  (Assembler::notZero, DONE_LABEL);
1802        }
1803 
1804        // Appears unlocked - try to swing _owner from null to non-null.
1805        // Ideally, I'd manifest "Self" with get_thread and then attempt
1806        // to CAS the register containing Self into m->Owner.
1807        // But we don't have enough registers, so instead we can either try to CAS
1808        // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
1809        // we later store "Self" into m->Owner.  Transiently storing a stack address
1810        // (rsp or the address of the box) into  m->owner is harmless.
1811        // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
1812        if (os::is_MP()) {
1813          lock();
1814        }
1815        cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
1816        movptr(Address(scrReg, 0), 3);          // box->_displaced_header = 3
1817        jccb  (Assembler::notZero, DONE_LABEL);
1818        get_thread (scrReg);                    // beware: clobbers ICCs
1819        movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
1820        xorptr(boxReg, boxReg);                 // set icc.ZFlag = 1 to indicate success
1821 
1822        // If the CAS fails we can either retry or pass control to the slow-path.
1823        // We use the latter tactic.
1824        // Pass the CAS result in the icc.ZFlag into DONE_LABEL
1825        // If the CAS was successful ...
1826        //   Self has acquired the lock
1827        //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
1828        // Intentional fall-through into DONE_LABEL ...
1829     } else {
1830        movptr(Address(boxReg, 0), intptr_t(markOopDesc::unused_mark()));  // results in ST-before-CAS penalty
1831        movptr(boxReg, tmpReg);
1832 
1833        // Using a prefetchw helps avoid later RTS->RTO upgrades and cache probes
1834        if ((EmitSync & 2048) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
1835           // prefetchw [eax + Offset(_owner)-2]
1836           prefetchw(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
1837        }
1838 
1839        if ((EmitSync & 64) == 0) {
1840          // Optimistic form
1841          xorptr  (tmpReg, tmpReg);
1842        } else {
1843          // Can suffer RTS->RTO upgrades on shared or cold $ lines
1844          movptr(tmpReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));   // rax, = m->_owner
1845          testptr(tmpReg, tmpReg);                   // Locked ?
1846          jccb  (Assembler::notZero, DONE_LABEL);
1847        }
1848 
1849        // Appears unlocked - try to swing _owner from null to non-null.
1850        // Use either "Self" (in scr) or rsp as thread identity in _owner.
1851        // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
1852        get_thread (scrReg);
1853        if (os::is_MP()) {
1854          lock();
1855        }
1856        cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
1857 
1858        // If the CAS fails we can either retry or pass control to the slow-path.
1859        // We use the latter tactic.
1860        // Pass the CAS result in the icc.ZFlag into DONE_LABEL
1861        // If the CAS was successful ...
1862        //   Self has acquired the lock
1863        //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
1864        // Intentional fall-through into DONE_LABEL ...
1865     }
1866 #else // _LP64
1867     // It's inflated
1868     movq(scrReg, tmpReg);
1869     xorq(tmpReg, tmpReg);
1870 
1871     if (os::is_MP()) {
1872       lock();
1873     }
1874     cmpxchgptr(r15_thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
1875     // Unconditionally set box->_displaced_header = markOopDesc::unused_mark().
1876     // Without cast to int32_t movptr will destroy r10 which is typically obj.
1877     movptr(Address(boxReg, 0), (int32_t)intptr_t(markOopDesc::unused_mark()));
1878     // Intentional fall-through into DONE_LABEL ...
1879     // Propagate ICC.ZF from CAS above into DONE_LABEL.
1880 #endif // _LP64
1881 #if INCLUDE_RTM_OPT
1882     } // use_rtm()
1883 #endif
1884     // DONE_LABEL is a hot target - we'd really like to place it at the
1885     // start of a cache line by padding with NOPs.
1886     // See the AMD and Intel software optimization manuals for the
1887     // most efficient "long" NOP encodings.
1888     // Unfortunately none of our alignment mechanisms suffice.
1889     bind(DONE_LABEL);
1890 
1891     // At DONE_LABEL the icc ZFlag is set as follows ...
1892     // Fast_Unlock uses the same protocol.
1893     // ZFlag == 1 -> Success
1894     // ZFlag == 0 -> Failure - force control through the slow-path
1895   }
1896 }
1897 
1898 // obj: object to unlock
1899 // box: box address (displaced header location), killed.  Must be EAX.
1900 // tmp: killed, cannot be obj nor box.
1901 //
1902 // Some commentary on balanced locking:
1903 //
1904 // Fast_Lock and Fast_Unlock are emitted only for provably balanced lock sites.
1905 // Methods that don't have provably balanced locking are forced to run in the
1906 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
1907 // The interpreter provides two properties:
1908 // I1:  At return-time the interpreter automatically and quietly unlocks any
1909 //      objects acquired by the current activation (frame).  Recall that the
1910 //      interpreter maintains an on-stack list of locks currently held by
1911 //      a frame.
1912 // I2:  If a method attempts to unlock an object that is not held by
1913 //      the frame, the interpreter throws IMSX.
1914 //
1915 // Let's say A(), which has provably balanced locking, acquires O and then calls B().
1916 // B() doesn't have provably balanced locking so it runs in the interpreter.
1917 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
1918 // is still locked by A().
1919 //
1920 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
1921 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
1922 // should not be unlocked by "normal" java-level locking and vice-versa.  The specification
1923 // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
1924 // Arguably given that the spec legislates the JNI case as undefined our implementation
1925 // could reasonably *avoid* checking owner in Fast_Unlock().
1926 // In the interest of performance we elide the m->Owner==Self check in unlock.
1927 // A perfectly viable alternative is to elide the owner check except when
1928 // Xcheck:jni is enabled.
1929 
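     // Rough decision tree for fast_unlock (a sketch, not literal code):
     //   if (box->dhw == 0)                       ZF = 1;  // recursive stack-lock, nothing to undo
     //   else if (obj->mark() is not a monitor)   ZF = CAS(&obj->mark, box, box->dhw);  // pop stack-lock
     //   else                                     attempt the 1-0 monitor exit below
     //   // ZF == 0 at DONE_LABEL routes control to the slow path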
1930 void MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
1931   assert(boxReg == rax, "");
1932   assert_different_registers(objReg, boxReg, tmpReg);
1933 
1934   if (EmitSync & 4) {
1935     // Disable - inhibit all inlining.  Force control through the slow-path
1936     cmpptr (rsp, 0);
1937   } else {
1938     Label DONE_LABEL, Stacked, CheckSucc;
1939 
1940     // Critically, the biased locking test must have precedence over
1941     // and appear before the (box->dhw == 0) recursive stack-lock test.
1942     if (UseBiasedLocking && !UseOptoBiasInlining) {
1943        biased_locking_exit(objReg, tmpReg, DONE_LABEL);
1944     }
1945 
1946 #if INCLUDE_RTM_OPT
1947     if (UseRTMForStackLocks && use_rtm) {
1948       assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
1949       Label L_regular_unlock;
1950       movptr(tmpReg, Address(objReg, 0));           // fetch markword
1951       andptr(tmpReg, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
1952       cmpptr(tmpReg, markOopDesc::unlocked_value);            // bits = 001 unlocked
1953       jccb(Assembler::notEqual, L_regular_unlock);  // if !HLE RegularLock
1954       xend();                                       // otherwise end...
1955       jmp(DONE_LABEL);                              // ... and we're done
1956       bind(L_regular_unlock);
1957     }
1958 #endif
1959 
1960     cmpptr(Address(boxReg, 0), (int32_t)NULL_WORD); // Examine the displaced header
1961     jcc   (Assembler::zero, DONE_LABEL);            // 0 indicates recursive stack-lock
1962     movptr(tmpReg, Address(objReg, 0));             // Examine the object's markword
1963     testptr(tmpReg, markOopDesc::monitor_value);    // Inflated?
1964     jccb  (Assembler::zero, Stacked);
1965 
1966     // It's inflated.
1967 #if INCLUDE_RTM_OPT
1968     if (use_rtm) {
1969       Label L_regular_inflated_unlock;
1970       int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
1971       movptr(boxReg, Address(tmpReg, owner_offset));
1972       testptr(boxReg, boxReg);
1973       jccb(Assembler::notZero, L_regular_inflated_unlock);
1974       xend();
1975       jmpb(DONE_LABEL);
1976       bind(L_regular_inflated_unlock);
1977     }
1978 #endif
1979 
1980     // Despite our balanced locking property we still check that m->_owner == Self
1981     // as java routines or native JNI code called by this thread might
1982     // have released the lock.
1983     // Refer to the comments in synchronizer.cpp for how we might encode extra
1984     // state in _succ so we can avoid fetching EntryList|cxq.
1985     //
1986     // I'd like to add more cases in fast_lock() and fast_unlock() --
1987     // such as recursive enter and exit -- but we have to be wary of
1988     // I$ bloat, T$ effects and BP$ effects.
1989     //
1990     // If there's no contention try a 1-0 exit.  That is, exit without
1991     // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
1992     // we detect and recover from the race that the 1-0 exit admits.
1993     //
1994     // Conceptually Fast_Unlock() must execute a STST|LDST "release" barrier
1995     // before it STs null into _owner, releasing the lock.  Updates
1996     // to data protected by the critical section must be visible before
1997     // we drop the lock (and thus before any other thread could acquire
1998     // the lock and observe the fields protected by the lock).
1999     // IA32's memory-model is SPO, so STs are ordered with respect to
2000     // each other and there's no need for an explicit barrier (fence).
2001     // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
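         // The 1-0 exit attempted below is roughly (a sketch, not literal code):
         //   if (m->_recursions != 0)                      goto DONE;  // ZF == 0 -> slow path
         //   if ((m->_cxq | m->_EntryList) == 0) { m->_owner = NULL;   goto DONE; }  // ZF == 1
         //   // contended: release the lock, fence, then re-examine _succ at CheckSucc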
2002 #ifndef _LP64
2003     get_thread (boxReg);
2004     if ((EmitSync & 4096) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
2005       // prefetchw [ebx + Offset(_owner)-2]
2006       prefetchw(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
2007     }
2008 
2009     // Note that we could employ various encoding schemes to reduce
2010     // the number of loads below (currently 4) to just 2 or 3.
2011     // Refer to the comments in synchronizer.cpp.
2012     // In practice the chain of fetches doesn't seem to impact performance, however.
2013     xorptr(boxReg, boxReg);
2014     if ((EmitSync & 65536) == 0 && (EmitSync & 256)) {
2015        // Attempt to reduce branch density - AMD's branch predictor.
2016        orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
2017        orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
2018        orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
2019        jccb  (Assembler::notZero, DONE_LABEL);
2020        movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
2021        jmpb  (DONE_LABEL);
2022     } else {
2023        orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
2024        jccb  (Assembler::notZero, DONE_LABEL);
2025        movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
2026        orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
2027        jccb  (Assembler::notZero, CheckSucc);
2028        movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
2029        jmpb  (DONE_LABEL);
2030     }
2031 
2032     // The following code fragment (EmitSync & 65536) improves the performance of
2033     // contended applications and contended synchronization microbenchmarks.
2034     // Unfortunately the emission of the code - even though not executed - causes regressions
2035     // in scimark and jetstream, evidently because of $ effects.  Replacing the code
2036     // with an equal number of never-executed NOPs results in the same regression.
2037     // We leave it off by default.
2038 
2039     if ((EmitSync & 65536) != 0) {
2040        Label LSuccess, LGoSlowPath ;
2041 
2042        bind  (CheckSucc);
2043 
2044        // Optional pre-test ... it's safe to elide this
2045        cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
2046        jccb(Assembler::zero, LGoSlowPath);
2047 
2048        // We have a classic Dekker-style idiom:
2049        //    ST m->_owner = 0 ; MEMBAR; LD m->_succ
2050        // There are a number of ways to implement the barrier:
2051        // (1) lock:andl &m->_owner, 0
2052        //     is fast, but masm doesn't currently support the "ANDL M,IMM32" form.
2053        //     LOCK: ANDL [ebx+Offset(_Owner)-2], 0
2054        //     Encodes as 81 31 OFF32 IMM32 or 83 63 OFF8 IMM8
2055        // (2) If supported, an explicit MFENCE is appealing.
2056        //     In older IA32 processors MFENCE is slower than lock:add or xchg
2057        //     particularly if the write-buffer is full, as might be the case
2058        //     if stores closely precede the fence or fence-equivalent instruction.
2059        //     See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
2060        //     as the situation has changed with Nehalem and Shanghai.
2061        // (3) In lieu of an explicit fence, use lock:addl to the top-of-stack
2062        //     The $lines underlying the top-of-stack should be in M-state.
2063        //     The locked add instruction is serializing, of course.
2064        // (4) Use xchg, which is serializing
2065        //     mov boxReg, 0; xchgl boxReg, [tmpReg + Offset(_owner)-2] also works
2066        // (5) ST m->_owner = 0 and then execute lock:orl &m->_succ, 0.
2067        //     The integer condition codes will tell us if succ was 0.
2068        //     Since _succ and _owner should reside in the same $line and
2069        //     we just stored into _owner, it's likely that the $line
2070        //     remains in M-state for the lock:orl.
2071        //
2072        // We currently use (3), although it's likely that switching to (2)
2073        // is correct for the future.
2074 
2075        movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
2076        if (os::is_MP()) {
2077          lock(); addptr(Address(rsp, 0), 0);
2078        }
2079        // Ratify _succ remains non-null
2080        cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), 0);
2081        jccb  (Assembler::notZero, LSuccess);
2082 
2083        xorptr(boxReg, boxReg);                  // box is really EAX
2084        if (os::is_MP()) { lock(); }
2085        cmpxchgptr(rsp, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
2086        jccb  (Assembler::notEqual, LSuccess);
2087        // Since we're low on registers we installed rsp as a placeholder in _owner.
2088        // Now install Self over rsp.  This is safe as we're transitioning from
2089        // non-null to non-null.
2090        get_thread (boxReg);
2091        movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), boxReg);
2092        // Intentional fall-through into LGoSlowPath ...
2093 
2094        bind  (LGoSlowPath);
2095        orptr(boxReg, 1);                      // set ICC.ZF=0 to indicate failure
2096        jmpb  (DONE_LABEL);
2097 
2098        bind  (LSuccess);
2099        xorptr(boxReg, boxReg);                 // set ICC.ZF=1 to indicate success
2100        jmpb  (DONE_LABEL);
2101     }
2102 
2103     bind (Stacked);
2104     // It's not inflated and it's not recursively stack-locked and it's not biased.
2105     // It must be stack-locked.
2106     // Try to reset the header to displaced header.
2107     // The "box" value on the stack is stable, so we can reload
2108     // and be assured we observe the same value as above.
2109     movptr(tmpReg, Address(boxReg, 0));
2110     if (os::is_MP()) {
2111       lock();
2112     }
2113     cmpxchgptr(tmpReg, Address(objReg, 0)); // Uses RAX which is box
2114     // Intentional fall-through into DONE_LABEL
2115 
2116     // DONE_LABEL is a hot target - we'd really like to place it at the
2117     // start of a cache line by padding with NOPs.
2118     // See the AMD and Intel software optimization manuals for the
2119     // most efficient "long" NOP encodings.
2120     // Unfortunately none of our alignment mechanisms suffice.
2121     if ((EmitSync & 65536) == 0) {
2122        bind (CheckSucc);
2123     }
2124 #else // _LP64
2125     // It's inflated
2126     if (EmitSync & 1024) {
2127       // Emit code to check that _owner == Self
2128       // We could fold the _owner test into subsequent code more efficiently
2129       // than using a stand-alone check, but since _owner checking is off by
2130       // default we don't bother. We also might consider predicating the
2131       // _owner==Self check on Xcheck:jni or running on a debug build.
2132       movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
2133       xorptr(boxReg, r15_thread);
2134     } else {
2135       xorptr(boxReg, boxReg);
2136     }
2137     orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
2138     jccb  (Assembler::notZero, DONE_LABEL);
2139     movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
2140     orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
2141     jccb  (Assembler::notZero, CheckSucc);
2142     movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
2143     jmpb  (DONE_LABEL);
2144 
2145     if ((EmitSync & 65536) == 0) {
2146       // Try to avoid passing control into the slow_path ...
2147       Label LSuccess, LGoSlowPath ;
2148       bind  (CheckSucc);
2149 
2150       // The following optional optimization can be elided if necessary
2151       // Effectively: if (succ == null) goto SlowPath
2152       // The code reduces the window for a race, however,
2153       // and thus benefits performance.
2154       cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
2155       jccb  (Assembler::zero, LGoSlowPath);
2156 
2157       if ((EmitSync & 16) && os::is_MP()) {
2158         orptr(boxReg, boxReg);
2159         xchgptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
2160       } else {
2161         movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
2162         if (os::is_MP()) {
2163           // Memory barrier/fence
2164           // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
2165           // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
2166           // This is faster on Nehalem and AMD Shanghai/Barcelona.
2167           // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
2168           // We might also restructure (ST Owner=0;barrier;LD _Succ) to
2169           // (mov box,0; xchgq box, &m->Owner; LD _succ) .
2170           lock(); addl(Address(rsp, 0), 0);
2171         }
2172       }
2173       cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
2174       jccb  (Assembler::notZero, LSuccess);
2175 
2176       // Rare inopportune interleaving - race.
2177       // The successor vanished in the small window above.
2178       // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
2179       // We need to ensure progress and succession.
2180       // Try to reacquire the lock.
2181       // If that fails then the new owner is responsible for succession and this
2182       // thread needs to take no further action and can exit via the fast path (success).
2183       // If the re-acquire succeeds then pass control into the slow path.
2184       // As implemented, this latter mode is horrible because we generate more
2185       // coherence traffic on the lock *and* artificially extend the critical section
2186       // length by virtue of passing control into the slow path.
2187 
2188       // box is really RAX -- the following CMPXCHG depends on that binding
2189       // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
2190       movptr(boxReg, (int32_t)NULL_WORD);
2191       if (os::is_MP()) { lock(); }
2192       cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
2193       jccb  (Assembler::notEqual, LSuccess);
2194       // Intentional fall-through into slow-path
2195 
2196       bind  (LGoSlowPath);
2197       orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
2198       jmpb  (DONE_LABEL);
2199 
2200       bind  (LSuccess);
2201       testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
2202       jmpb  (DONE_LABEL);
2203     }
2204 
2205     bind  (Stacked);
2206     movptr(tmpReg, Address (boxReg, 0));      // re-fetch
2207     if (os::is_MP()) { lock(); }
2208     cmpxchgptr(tmpReg, Address(objReg, 0)); // Uses RAX which is box
2209 
2210     if (EmitSync & 65536) {
2211        bind (CheckSucc);
2212     }
2213 #endif
2214     bind(DONE_LABEL);
2215   }
2216 }
2217 #endif // COMPILER2
2218 
2219 void MacroAssembler::c2bool(Register x) {
2220   // implements x == 0 ? 0 : 1
2221   // note: must only look at least-significant byte of x
2222   //       since C-style booleans are stored in one byte
2223   //       only! (was bug)
2224   andl(x, 0xFF);
2225   setb(Assembler::notZero, x);
2226 }
2227 
2228 // Wouldn't be needed if the AddressLiteral version had a new name
2229 void MacroAssembler::call(Label& L, relocInfo::relocType rtype) {
2230   Assembler::call(L, rtype);
2231 }
2232 
2233 void MacroAssembler::call(Register entry) {
2234   Assembler::call(entry);
2235 }
2236 
2237 void MacroAssembler::call(AddressLiteral entry) {
2238   if (reachable(entry)) {
2239     Assembler::call_literal(entry.target(), entry.rspec());
2240   } else {
2241     lea(rscratch1, entry);
2242     Assembler::call(rscratch1);
2243   }
2244 }
2245 
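     // ic_call emits a virtual (inline-cache) call: rax carries the cached IC value at the
     // call site, and Universe::non_oop_word() serves as a sentinel that can never equal a
     // real oop/Klass, so the first invocation should miss the cache and go through IC
     // resolution.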
2246 void MacroAssembler::ic_call(address entry) {
2247   RelocationHolder rh = virtual_call_Relocation::spec(pc());
2248   movptr(rax, (intptr_t)Universe::non_oop_word());
2249   call(AddressLiteral(entry, rh));
2250 }
2251 
2252 // Implementation of call_VM versions
2253 
2254 void MacroAssembler::call_VM(Register oop_result,
2255                              address entry_point,
2256                              bool check_exceptions) {
2257   Label C, E;
2258   call(C, relocInfo::none);
2259   jmp(E);
2260 
2261   bind(C);
2262   call_VM_helper(oop_result, entry_point, 0, check_exceptions);
2263   ret(0);
2264 
2265   bind(E);
2266 }
2267 
2268 void MacroAssembler::call_VM(Register oop_result,
2269                              address entry_point,
2270                              Register arg_1,
2271                              bool check_exceptions) {
2272   Label C, E;
2273   call(C, relocInfo::none);
2274   jmp(E);
2275 
2276   bind(C);
2277   pass_arg1(this, arg_1);
2278   call_VM_helper(oop_result, entry_point, 1, check_exceptions);
2279   ret(0);
2280 
2281   bind(E);
2282 }
2283 
2284 void MacroAssembler::call_VM(Register oop_result,
2285                              address entry_point,
2286                              Register arg_1,
2287                              Register arg_2,
2288                              bool check_exceptions) {
2289   Label C, E;
2290   call(C, relocInfo::none);
2291   jmp(E);
2292 
2293   bind(C);
2294 
2295   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2296 
2297   pass_arg2(this, arg_2);
2298   pass_arg1(this, arg_1);
2299   call_VM_helper(oop_result, entry_point, 2, check_exceptions);
2300   ret(0);
2301 
2302   bind(E);
2303 }
2304 
2305 void MacroAssembler::call_VM(Register oop_result,
2306                              address entry_point,
2307                              Register arg_1,
2308                              Register arg_2,
2309                              Register arg_3,
2310                              bool check_exceptions) {
2311   Label C, E;
2312   call(C, relocInfo::none);
2313   jmp(E);
2314 
2315   bind(C);
2316 
2317   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
2318   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
2319   pass_arg3(this, arg_3);
2320 
2321   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2322   pass_arg2(this, arg_2);
2323 
2324   pass_arg1(this, arg_1);
2325   call_VM_helper(oop_result, entry_point, 3, check_exceptions);
2326   ret(0);
2327 
2328   bind(E);
2329 }
2330 
2331 void MacroAssembler::call_VM(Register oop_result,
2332                              Register last_java_sp,
2333                              address entry_point,
2334                              int number_of_arguments,
2335                              bool check_exceptions) {
2336   Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
2337   call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
2338 }
2339 
2340 void MacroAssembler::call_VM(Register oop_result,
2341                              Register last_java_sp,
2342                              address entry_point,
2343                              Register arg_1,
2344                              bool check_exceptions) {
2345   pass_arg1(this, arg_1);
2346   call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
2347 }
2348 
2349 void MacroAssembler::call_VM(Register oop_result,
2350                              Register last_java_sp,
2351                              address entry_point,
2352                              Register arg_1,
2353                              Register arg_2,
2354                              bool check_exceptions) {
2355 
2356   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2357   pass_arg2(this, arg_2);
2358   pass_arg1(this, arg_1);
2359   call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
2360 }
2361 
2362 void MacroAssembler::call_VM(Register oop_result,
2363                              Register last_java_sp,
2364                              address entry_point,
2365                              Register arg_1,
2366                              Register arg_2,
2367                              Register arg_3,
2368                              bool check_exceptions) {
2369   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
2370   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
2371   pass_arg3(this, arg_3);
2372   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2373   pass_arg2(this, arg_2);
2374   pass_arg1(this, arg_1);
2375   call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
2376 }
2377 
2378 void MacroAssembler::super_call_VM(Register oop_result,
2379                                    Register last_java_sp,
2380                                    address entry_point,
2381                                    int number_of_arguments,
2382                                    bool check_exceptions) {
2383   Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
2384   MacroAssembler::call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
2385 }
2386 
2387 void MacroAssembler::super_call_VM(Register oop_result,
2388                                    Register last_java_sp,
2389                                    address entry_point,
2390                                    Register arg_1,
2391                                    bool check_exceptions) {
2392   pass_arg1(this, arg_1);
2393   super_call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
2394 }
2395 
2396 void MacroAssembler::super_call_VM(Register oop_result,
2397                                    Register last_java_sp,
2398                                    address entry_point,
2399                                    Register arg_1,
2400                                    Register arg_2,
2401                                    bool check_exceptions) {
2402 
2403   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2404   pass_arg2(this, arg_2);
2405   pass_arg1(this, arg_1);
2406   super_call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
2407 }
2408 
2409 void MacroAssembler::super_call_VM(Register oop_result,
2410                                    Register last_java_sp,
2411                                    address entry_point,
2412                                    Register arg_1,
2413                                    Register arg_2,
2414                                    Register arg_3,
2415                                    bool check_exceptions) {
2416   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
2417   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
2418   pass_arg3(this, arg_3);
2419   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2420   pass_arg2(this, arg_2);
2421   pass_arg1(this, arg_1);
2422   super_call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
2423 }
2424 
2425 void MacroAssembler::call_VM_base(Register oop_result,
2426                                   Register java_thread,
2427                                   Register last_java_sp,
2428                                   address  entry_point,
2429                                   int      number_of_arguments,
2430                                   bool     check_exceptions) {
2431   // determine java_thread register
2432   if (!java_thread->is_valid()) {
2433 #ifdef _LP64
2434     java_thread = r15_thread;
2435 #else
2436     java_thread = rdi;
2437     get_thread(java_thread);
2438 #endif // LP64
2439   }
2440   // determine last_java_sp register
2441   if (!last_java_sp->is_valid()) {
2442     last_java_sp = rsp;
2443   }
2444   // debugging support
2445   assert(number_of_arguments >= 0   , "cannot have negative number of arguments");
2446   LP64_ONLY(assert(java_thread == r15_thread, "unexpected register"));
2447 #ifdef ASSERT
2448   // TraceBytecodes does not use r12 but saves it over the call, so don't verify
2449   // r12 is the heapbase.
2450   LP64_ONLY(if ((UseCompressedOops || UseCompressedClassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");)
2451 #endif // ASSERT
2452 
2453   assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
2454   assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");
2455 
2456   // push java thread (becomes first argument of C function)
2457 
2458   NOT_LP64(push(java_thread); number_of_arguments++);
2459   LP64_ONLY(mov(c_rarg0, r15_thread));
2460 
2461   // set last Java frame before call
2462   assert(last_java_sp != rbp, "can't use ebp/rbp");
2463 
2464   // Only interpreter should have to set fp
2465   set_last_Java_frame(java_thread, last_java_sp, rbp, NULL);
2466 
2467   // do the call, remove parameters
2468   MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments);
2469 
2470   // restore the thread (cannot use the pushed argument since arguments
2471   // may be overwritten by C code generated by an optimizing compiler);
2472   // however, we can use the register value directly if it is callee saved.
2473   if (LP64_ONLY(true ||) java_thread == rdi || java_thread == rsi) {
2474     // rdi & rsi (also r15) are callee saved -> nothing to do
2475 #ifdef ASSERT
2476     guarantee(java_thread != rax, "change this code");
2477     push(rax);
2478     { Label L;
2479       get_thread(rax);
2480       cmpptr(java_thread, rax);
2481       jcc(Assembler::equal, L);
2482       STOP("MacroAssembler::call_VM_base: rdi not callee saved?");
2483       bind(L);
2484     }
2485     pop(rax);
2486 #endif
2487   } else {
2488     get_thread(java_thread);
2489   }
2490   // reset last Java frame
2491   // Only interpreter should have to clear fp
2492   reset_last_Java_frame(java_thread, true, false);
2493 
2494 #ifndef CC_INTERP
2495    // C++ interp handles this in the interpreter
2496   check_and_handle_popframe(java_thread);
2497   check_and_handle_earlyret(java_thread);
2498 #endif /* CC_INTERP */
2499 
2500   if (check_exceptions) {
2501     // check for pending exceptions (java_thread is set upon return)
2502     cmpptr(Address(java_thread, Thread::pending_exception_offset()), (int32_t) NULL_WORD);
2503 #ifndef _LP64
2504     jump_cc(Assembler::notEqual,
2505             RuntimeAddress(StubRoutines::forward_exception_entry()));
2506 #else
2507     // This used to conditionally jump to forward_exception; however, it is
2508     // possible that after relocation the branch will not reach, so we must jump
2509     // around it so we can always reach the stub.
2510 
2511     Label ok;
2512     jcc(Assembler::equal, ok);
2513     jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2514     bind(ok);
2515 #endif // LP64
2516   }
2517 
2518   // get oop result if there is one and reset the value in the thread
2519   if (oop_result->is_valid()) {
2520     get_vm_result(oop_result, java_thread);
2521   }
2522 }
2523 
2524 void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
2525 
2526   // Calculating the value for last_Java_sp is
2527   // somewhat subtle. call_VM does an intermediate call
2528   // which places a return address on the stack just under the
2529   // stack pointer as the user finished with it. This allows
2530   // us to retrieve last_Java_pc from last_Java_sp[-1].
2531   // On 32-bit we then have to push additional args on the stack to accomplish
2532   // the actual requested call. On 64-bit call_VM can only use register args,
2533   // so the only extra space is the return address that call_VM created.
2534   // This hopefully explains the calculations here.
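       // Illustrative runtime stack layout at this point (a sketch; n = number_of_arguments,
       // n == 0 on 64-bit since args travel in registers):
       //   rsp + (n + 1) * wordSize   <- value computed below as last_Java_sp
       //   rsp +  n      * wordSize   return address pushed by the internal call (last_Java_pc)
       //   rsp + 0 .. n - 1 words     the pushed arguments (32-bit only)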
2535 
2536 #ifdef _LP64
2537   // We've pushed one address, correct last_Java_sp
2538   lea(rax, Address(rsp, wordSize));
2539 #else
2540   lea(rax, Address(rsp, (1 + number_of_arguments) * wordSize));
2541 #endif // LP64
2542 
2543   call_VM_base(oop_result, noreg, rax, entry_point, number_of_arguments, check_exceptions);
2544 
2545 }
2546 
2547 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
2548   call_VM_leaf_base(entry_point, number_of_arguments);
2549 }
2550 
2551 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
2552   pass_arg0(this, arg_0);
2553   call_VM_leaf(entry_point, 1);
2554 }
2555 
2556 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
2557 
2558   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
2559   pass_arg1(this, arg_1);
2560   pass_arg0(this, arg_0);
2561   call_VM_leaf(entry_point, 2);
2562 }
2563 
2564 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
2565   LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
2566   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2567   pass_arg2(this, arg_2);
2568   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
2569   pass_arg1(this, arg_1);
2570   pass_arg0(this, arg_0);
2571   call_VM_leaf(entry_point, 3);
2572 }
2573 
2574 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
2575   pass_arg0(this, arg_0);
2576   MacroAssembler::call_VM_leaf_base(entry_point, 1);
2577 }
2578 
2579 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
2580 
2581   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
2582   pass_arg1(this, arg_1);
2583   pass_arg0(this, arg_0);
2584   MacroAssembler::call_VM_leaf_base(entry_point, 2);
2585 }
2586 
2587 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
2588   LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
2589   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2590   pass_arg2(this, arg_2);
2591   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
2592   pass_arg1(this, arg_1);
2593   pass_arg0(this, arg_0);
2594   MacroAssembler::call_VM_leaf_base(entry_point, 3);
2595 }
2596 
2597 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
2598   LP64_ONLY(assert(arg_0 != c_rarg3, "smashed arg"));
2599   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
2600   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
2601   pass_arg3(this, arg_3);
2602   LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
2603   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2604   pass_arg2(this, arg_2);
2605   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
2606   pass_arg1(this, arg_1);
2607   pass_arg0(this, arg_0);
2608   MacroAssembler::call_VM_leaf_base(entry_point, 4);
2609 }
2610 
2611 void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
2612   movptr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
2613   movptr(Address(java_thread, JavaThread::vm_result_offset()), NULL_WORD);
2614   verify_oop(oop_result, "broken oop in call_VM_base");
2615 }
2616 
2617 void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
2618   movptr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
2619   movptr(Address(java_thread, JavaThread::vm_result_2_offset()), NULL_WORD);
2620 }
2621 
2622 void MacroAssembler::check_and_handle_earlyret(Register java_thread) {
2623 }
2624 
2625 void MacroAssembler::check_and_handle_popframe(Register java_thread) {
2626 }
2627 
2628 void MacroAssembler::cmp32(AddressLiteral src1, int32_t imm) {
2629   if (reachable(src1)) {
2630     cmpl(as_Address(src1), imm);
2631   } else {
2632     lea(rscratch1, src1);
2633     cmpl(Address(rscratch1, 0), imm);
2634   }
2635 }
2636 
2637 void MacroAssembler::cmp32(Register src1, AddressLiteral src2) {
2638   assert(!src2.is_lval(), "use cmpptr");
2639   if (reachable(src2)) {
2640     cmpl(src1, as_Address(src2));
2641   } else {
2642     lea(rscratch1, src2);
2643     cmpl(src1, Address(rscratch1, 0));
2644   }
2645 }
2646 
2647 void MacroAssembler::cmp32(Register src1, int32_t imm) {
2648   Assembler::cmpl(src1, imm);
2649 }
2650 
2651 void MacroAssembler::cmp32(Register src1, Address src2) {
2652   Assembler::cmpl(src1, src2);
2653 }
2654 
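     // cmpsd2int/cmpss2int materialize a three-way floating-point compare: dst becomes
     // -1, 0 or +1 for opr1 <, ==, > opr2.  A NaN operand yields -1 when unordered_is_less
     // and +1 otherwise, which matches the Java fcmpl/fcmpg (dcmpl/dcmpg) bytecode
     // semantics these helpers presumably serve.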
2655 void MacroAssembler::cmpsd2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
2656   ucomisd(opr1, opr2);
2657 
2658   Label L;
2659   if (unordered_is_less) {
2660     movl(dst, -1);
2661     jcc(Assembler::parity, L);
2662     jcc(Assembler::below , L);
2663     movl(dst, 0);
2664     jcc(Assembler::equal , L);
2665     increment(dst);
2666   } else { // unordered is greater
2667     movl(dst, 1);
2668     jcc(Assembler::parity, L);
2669     jcc(Assembler::above , L);
2670     movl(dst, 0);
2671     jcc(Assembler::equal , L);
2672     decrementl(dst);
2673   }
2674   bind(L);
2675 }
2676 
2677 void MacroAssembler::cmpss2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
2678   ucomiss(opr1, opr2);
2679 
2680   Label L;
2681   if (unordered_is_less) {
2682     movl(dst, -1);
2683     jcc(Assembler::parity, L);
2684     jcc(Assembler::below , L);
2685     movl(dst, 0);
2686     jcc(Assembler::equal , L);
2687     increment(dst);
2688   } else { // unordered is greater
2689     movl(dst, 1);
2690     jcc(Assembler::parity, L);
2691     jcc(Assembler::above , L);
2692     movl(dst, 0);
2693     jcc(Assembler::equal , L);
2694     decrementl(dst);
2695   }
2696   bind(L);
2697 }
2698 
2699 
2700 void MacroAssembler::cmp8(AddressLiteral src1, int imm) {
2701   if (reachable(src1)) {
2702     cmpb(as_Address(src1), imm);
2703   } else {
2704     lea(rscratch1, src1);
2705     cmpb(Address(rscratch1, 0), imm);
2706   }
2707 }
2708 
2709 void MacroAssembler::cmpptr(Register src1, AddressLiteral src2) {
2710 #ifdef _LP64
2711   if (src2.is_lval()) {
2712     movptr(rscratch1, src2);
2713     Assembler::cmpq(src1, rscratch1);
2714   } else if (reachable(src2)) {
2715     cmpq(src1, as_Address(src2));
2716   } else {
2717     lea(rscratch1, src2);
2718     Assembler::cmpq(src1, Address(rscratch1, 0));
2719   }
2720 #else
2721   if (src2.is_lval()) {
2722     cmp_literal32(src1, (int32_t) src2.target(), src2.rspec());
2723   } else {
2724     cmpl(src1, as_Address(src2));
2725   }
2726 #endif // _LP64
2727 }
2728 
2729 void MacroAssembler::cmpptr(Address src1, AddressLiteral src2) {
2730   assert(src2.is_lval(), "not a mem-mem compare");
2731 #ifdef _LP64
2732   // moves src2's literal address
2733   movptr(rscratch1, src2);
2734   Assembler::cmpq(src1, rscratch1);
2735 #else
2736   cmp_literal32(src1, (int32_t) src2.target(), src2.rspec());
2737 #endif // _LP64
2738 }
2739 
2740 void MacroAssembler::locked_cmpxchgptr(Register reg, AddressLiteral adr) {
2741   if (reachable(adr)) {
2742     if (os::is_MP())
2743       lock();
2744     cmpxchgptr(reg, as_Address(adr));
2745   } else {
2746     lea(rscratch1, adr);
2747     if (os::is_MP())
2748       lock();
2749     cmpxchgptr(reg, Address(rscratch1, 0));
2750   }
2751 }
2752 
2753 void MacroAssembler::cmpxchgptr(Register reg, Address adr) {
2754   LP64_ONLY(cmpxchgq(reg, adr)) NOT_LP64(cmpxchgl(reg, adr));
2755 }
2756 
2757 void MacroAssembler::comisd(XMMRegister dst, AddressLiteral src) {
2758   if (reachable(src)) {
2759     Assembler::comisd(dst, as_Address(src));
2760   } else {
2761     lea(rscratch1, src);
2762     Assembler::comisd(dst, Address(rscratch1, 0));
2763   }
2764 }
2765 
2766 void MacroAssembler::comiss(XMMRegister dst, AddressLiteral src) {
2767   if (reachable(src)) {
2768     Assembler::comiss(dst, as_Address(src));
2769   } else {
2770     lea(rscratch1, src);
2771     Assembler::comiss(dst, Address(rscratch1, 0));
2772   }
2773 }
2774 
2775 
2776 void MacroAssembler::cond_inc32(Condition cond, AddressLiteral counter_addr) {
2777   Condition negated_cond = negate_condition(cond);
2778   Label L;
2779   jcc(negated_cond, L);
2780   pushf(); // Preserve flags
2781   atomic_incl(counter_addr);
2782   popf();
2783   bind(L);
2784 }
2785 
2786 int MacroAssembler::corrected_idivl(Register reg) {
2787   // Full implementation of Java idiv and irem; checks for
2788   // special case as described in JVM spec., p.243 & p.271.
2789   // The function returns the (pc) offset of the idivl
2790   // instruction - may be needed for implicit exceptions.
2791   //
2792   //         normal case                           special case
2793   //
2794   // input : rax: dividend                           min_int
2795   //         reg: divisor   (may not be rax/rdx)     -1
2796   //
2797   // output: rax: quotient  (= rax idiv reg)         min_int
2798   //         rdx: remainder (= rax irem reg)         0
2799   assert(reg != rax && reg != rdx, "reg cannot be rax, or rdx register");
2800   const int min_int = 0x80000000;
2801   Label normal_case, special_case;
2802 
2803   // check for special case
2804   cmpl(rax, min_int);
2805   jcc(Assembler::notEqual, normal_case);
2806   xorl(rdx, rdx); // prepare rdx for possible special case (where remainder = 0)
2807   cmpl(reg, -1);
2808   jcc(Assembler::equal, special_case);
2809 
2810   // handle normal case
2811   bind(normal_case);
2812   cdql();
2813   int idivl_offset = offset();
2814   idivl(reg);
2815 
2816   // normal and special case exit
2817   bind(special_case);
2818 
2819   return idivl_offset;
2820 }
2821 
2822 
2823 
2824 void MacroAssembler::decrementl(Register reg, int value) {
2825   if (value == min_jint) {subl(reg, value) ; return; }
2826   if (value <  0) { incrementl(reg, -value); return; }
2827   if (value == 0) {                        ; return; }
2828   if (value == 1 && UseIncDec) { decl(reg) ; return; }
2829   /* else */      { subl(reg, value)       ; return; }
2830 }
2831 
2832 void MacroAssembler::decrementl(Address dst, int value) {
2833   if (value == min_jint) {subl(dst, value) ; return; }
2834   if (value <  0) { incrementl(dst, -value); return; }
2835   if (value == 0) {                        ; return; }
2836   if (value == 1 && UseIncDec) { decl(dst) ; return; }
2837   /* else */      { subl(dst, value)       ; return; }
2838 }
2839 
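     // Signed division by 2^shift_value with round-toward-zero semantics.
     // An arithmetic shift alone rounds toward -infinity, so negative values
     // are biased by (2^shift_value - 1) first, e.g. -5/4: (-5 + 3) >> 2 = -1,
     // whereas -5 >> 2 alone would yield -2.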
2840 void MacroAssembler::division_with_shift (Register reg, int shift_value) {
2841   assert (shift_value > 0, "illegal shift value");
2842   Label _is_positive;
2843   testl (reg, reg);
2844   jcc (Assembler::positive, _is_positive);
2845   int offset = (1 << shift_value) - 1 ;
2846 
2847   if (offset == 1) {
2848     incrementl(reg);
2849   } else {
2850     addl(reg, offset);
2851   }
2852 
2853   bind (_is_positive);
2854   sarl(reg, shift_value);
2855 }
2856 
2857 void MacroAssembler::divsd(XMMRegister dst, AddressLiteral src) {
2858   if (reachable(src)) {
2859     Assembler::divsd(dst, as_Address(src));
2860   } else {
2861     lea(rscratch1, src);
2862     Assembler::divsd(dst, Address(rscratch1, 0));
2863   }
2864 }
2865 
2866 void MacroAssembler::divss(XMMRegister dst, AddressLiteral src) {
2867   if (reachable(src)) {
2868     Assembler::divss(dst, as_Address(src));
2869   } else {
2870     lea(rscratch1, src);
2871     Assembler::divss(dst, Address(rscratch1, 0));
2872   }
2873 }
2874 
2875 // !defined(COMPILER2) is because of stupid core builds
2876 #if !defined(_LP64) || defined(COMPILER1) || !defined(COMPILER2)
2877 void MacroAssembler::empty_FPU_stack() {
2878   if (VM_Version::supports_mmx()) {
2879     emms();
2880   } else {
2881     for (int i = 8; i-- > 0; ) ffree(i);
2882   }
2883 }
2884 #endif // !LP64 || C1 || !C2
2885 
2886 
2887 // Defines obj, preserves var_size_in_bytes
2888 void MacroAssembler::eden_allocate(Register obj,
2889                                    Register var_size_in_bytes,
2890                                    int con_size_in_bytes,
2891                                    Register t1,
2892                                    Label& slow_case) {
2893   assert(obj == rax, "obj must be in rax, for cmpxchg");
2894   assert_different_registers(obj, var_size_in_bytes, t1);
2895   if (!Universe::heap()->supports_inline_contig_alloc()) {
2896     jmp(slow_case);
2897   } else {
2898     Register end = t1;
2899     Label retry;
2900     bind(retry);
2901     ExternalAddress heap_top((address) Universe::heap()->top_addr());
2902     movptr(obj, heap_top);
2903     if (var_size_in_bytes == noreg) {
2904       lea(end, Address(obj, con_size_in_bytes));
2905     } else {
2906       lea(end, Address(obj, var_size_in_bytes, Address::times_1));
2907     }
2908     // if end < obj then we wrapped around => object too long => slow case
2909     cmpptr(end, obj);
2910     jcc(Assembler::below, slow_case);
2911     cmpptr(end, ExternalAddress((address) Universe::heap()->end_addr()));
2912     jcc(Assembler::above, slow_case);
2913     // Compare obj with the top addr and, if still equal, store the new top addr
2914     // (in end) at the address of the top addr pointer. Sets ZF if they were equal,
2915     // and clears it otherwise. Use lock prefix for atomicity on MPs.
2916     locked_cmpxchgptr(end, heap_top);
2917     jcc(Assembler::notEqual, retry);
2918   }
2919 }
2920 
2921 void MacroAssembler::enter() {
2922   push(rbp);
2923   mov(rbp, rsp);
2924 }
2925 
2926 // A 5 byte nop that is safe for patching (see patch_verified_entry)
2927 void MacroAssembler::fat_nop() {
2928   if (UseAddressNop) {
2929     addr_nop_5();
2930   } else {
2931     emit_int8(0x26); // es:
2932     emit_int8(0x2e); // cs:
2933     emit_int8(0x64); // fs:
2934     emit_int8(0x65); // gs:
2935     emit_int8((unsigned char)0x90);
2936   }
2937 }
2938 
2939 void MacroAssembler::fcmp(Register tmp) {
2940   fcmp(tmp, 1, true, true);
2941 }
2942 
2943 void MacroAssembler::fcmp(Register tmp, int index, bool pop_left, bool pop_right) {
2944   assert(!pop_right || pop_left, "usage error");
2945   if (VM_Version::supports_cmov()) {
2946     assert(tmp == noreg, "unneeded temp");
2947     if (pop_left) {
2948       fucomip(index);
2949     } else {
2950       fucomi(index);
2951     }
2952     if (pop_right) {
2953       fpop();
2954     }
2955   } else {
2956     assert(tmp != noreg, "need temp");
2957     if (pop_left) {
2958       if (pop_right) {
2959         fcompp();
2960       } else {
2961         fcomp(index);
2962       }
2963     } else {
2964       fcom(index);
2965     }
2966     // convert FPU condition into eflags condition via rax
2967     save_rax(tmp);
2968     fwait(); fnstsw_ax();
2969     sahf();
2970     restore_rax(tmp);
2971   }
2972   // condition codes set as follows:
2973   //
2974   // CF (corresponds to C0) if x < y
2975   // PF (corresponds to C2) if unordered
2976   // ZF (corresponds to C3) if x = y
2977 }
2978 
2979 void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less) {
2980   fcmp2int(dst, unordered_is_less, 1, true, true);
2981 }
2982 
2983 void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less, int index, bool pop_left, bool pop_right) {
2984   fcmp(VM_Version::supports_cmov() ? noreg : dst, index, pop_left, pop_right);
2985   Label L;
2986   if (unordered_is_less) {
2987     movl(dst, -1);
2988     jcc(Assembler::parity, L);
2989     jcc(Assembler::below , L);
2990     movl(dst, 0);
2991     jcc(Assembler::equal , L);
2992     increment(dst);
2993   } else { // unordered is greater
2994     movl(dst, 1);
2995     jcc(Assembler::parity, L);
2996     jcc(Assembler::above , L);
2997     movl(dst, 0);
2998     jcc(Assembler::equal , L);
2999     decrementl(dst);
3000   }
3001   bind(L);
3002 }
3003 
3004 void MacroAssembler::fld_d(AddressLiteral src) {
3005   fld_d(as_Address(src));
3006 }
3007 
3008 void MacroAssembler::fld_s(AddressLiteral src) {
3009   fld_s(as_Address(src));
3010 }
3011 
3012 void MacroAssembler::fld_x(AddressLiteral src) {
3013   Assembler::fld_x(as_Address(src));
3014 }
3015 
3016 void MacroAssembler::fldcw(AddressLiteral src) {
3017   Assembler::fldcw(as_Address(src));
3018 }
3019 
3020 void MacroAssembler::pow_exp_core_encoding() {
3021   // kills rax, rcx, rdx
3022   subptr(rsp,sizeof(jdouble));
3023   // computes 2^X. Stack: X ...
3024   // f2xm1 computes 2^X-1 but only operates on -1<=X<=1. Get int(X) and
3025   // keep it on the thread's stack to compute 2^int(X) later
3026   // then compute 2^(X-int(X)) as (2^(X-int(X))-1)+1
3027   // final result is obtained with: 2^X = 2^int(X) * 2^(X-int(X))
3028   fld_s(0);                 // Stack: X X ...
3029   frndint();                // Stack: int(X) X ...
3030   fsuba(1);                 // Stack: int(X) X-int(X) ...
3031   fistp_s(Address(rsp,0));  // move int(X) as integer to thread's stack. Stack: X-int(X) ...
3032   f2xm1();                  // Stack: 2^(X-int(X))-1 ...
3033   fld1();                   // Stack: 1 2^(X-int(X))-1 ...
3034   faddp(1);                 // Stack: 2^(X-int(X))
3035   // computes 2^(int(X)): add exponent bias (1023) to int(X), then
3036   // shift int(X)+1023 to exponent position.
3037   // Exponent is limited to 11 bits: if int(X)+1023 does not fit in 11
3038   // bits, set the result to NaN. 0x000 and 0x7FF are reserved exponent
3039   // values, so detect them and set the result to NaN.
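       // For example, int(X) = 3: 3 + 1023 = 1026 = 0x402; shifted left by 20
       // this is 0x40200000, the high word of the IEEE-754 double 8.0 = 2^3
       // (sign 0, biased exponent 1026, mantissa 0).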
3040   movl(rax,Address(rsp,0));
3041   movl(rcx, -2048); // 11 bit mask and valid NaN binary encoding
3042   addl(rax, 1023);
3043   movl(rdx,rax);
3044   shll(rax,20);
3045   // Check that 0 < int(X)+1023 < 2047. Otherwise set rax to NaN.
3046   addl(rdx,1);
3047   // Check that 1 < int(X)+1023+1 < 2048
3048   // in 3 steps:
3049   // 1- (int(X)+1023+1)&-2048 == 0 => 0 <= int(X)+1023+1 < 2048
3050   // 2- (int(X)+1023+1)&-2048 != 0
3051   // 3- (int(X)+1023+1)&-2048 != 1
3052   // Do 2- first because addl just updated the flags.
3053   cmov32(Assembler::equal,rax,rcx);
3054   cmpl(rdx,1);
3055   cmov32(Assembler::equal,rax,rcx);
3056   testl(rdx,rcx);
3057   cmov32(Assembler::notEqual,rax,rcx);
3058   movl(Address(rsp,4),rax);
3059   movl(Address(rsp,0),0);
3060   fmul_d(Address(rsp,0));   // Stack: 2^X ...
3061   addptr(rsp,sizeof(jdouble));
3062 }
3063 
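     // increase_precision/restore_precision bracket x87 computations that need
     // full 64-bit mantissas: or-ing 0x300 into the control word sets the
     // precision-control field (bits 8-9) to 11b, i.e. 64-bit extended
     // precision. The original control word is left on the stack so that
     // restore_precision() can reload it; rax is clobbered.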
3064 void MacroAssembler::increase_precision() {
3065   subptr(rsp, BytesPerWord);
3066   fnstcw(Address(rsp, 0));
3067   movl(rax, Address(rsp, 0));
3068   orl(rax, 0x300);
3069   push(rax);
3070   fldcw(Address(rsp, 0));
3071   pop(rax);
3072 }
3073 
3074 void MacroAssembler::restore_precision() {
3075   fldcw(Address(rsp, 0));
3076   addptr(rsp, BytesPerWord);
3077 }
3078 
3079 void MacroAssembler::fast_pow() {
3080   // computes X^Y = 2^(Y * log2(X))
3081   // if fast computation is not possible, result is NaN. Requires
3082   // fallback from user of this macro.
3083   // increase precision for intermediate steps of the computation
3084   BLOCK_COMMENT("fast_pow {");
3085   increase_precision();
3086   fyl2x();                 // Stack: (Y*log2(X)) ...
3087   pow_exp_core_encoding(); // Stack: X^Y ...
3088   restore_precision();
3089   BLOCK_COMMENT("} fast_pow");
3090 }
3091 
3092 void MacroAssembler::fast_exp() {
3093   // computes exp(X) = 2^(X * log2(e))
3094   // if fast computation is not possible, result is NaN. Requires
3095   // fallback from user of this macro.
3096   // increase precision for intermediate steps of the computation
3097   increase_precision();
3098   fldl2e();                // Stack: log2(e) X ...
3099   fmulp(1);                // Stack: (X*log2(e)) ...
3100   pow_exp_core_encoding(); // Stack: exp(X) ...
3101   restore_precision();
3102 }
3103 
3104 void MacroAssembler::pow_or_exp(bool is_exp, int num_fpu_regs_in_use) {
3105   // kills rax, rcx, rdx
3106   // pow and exp need 2 extra registers on the fpu stack.
3107   Label slow_case, done;
3108   Register tmp = noreg;
3109   if (!VM_Version::supports_cmov()) {
3110     // fcmp needs a temporary so preserve rdx,
3111     tmp = rdx;
3112   }
3113   Register tmp2 = rax;
3114   Register tmp3 = rcx;
3115 
3116   if (is_exp) {
3117     // Stack: X
3118     fld_s(0);                   // duplicate argument for runtime call. Stack: X X
3119     fast_exp();                 // Stack: exp(X) X
3120     fcmp(tmp, 0, false, false); // Stack: exp(X) X
3121     // exp(X) not equal to itself: exp(X) is NaN, so go to the slow case.
3122     jcc(Assembler::parity, slow_case);
3123     // get rid of duplicate argument. Stack: exp(X)
3124     if (num_fpu_regs_in_use > 0) {
3125       fxch();
3126       fpop();
3127     } else {
3128       ffree(1);
3129     }
3130     jmp(done);
3131   } else {
3132     // Stack: X Y
3133     Label x_negative, y_not_2;
3134 
3135     static double two = 2.0;
3136     ExternalAddress two_addr((address)&two);
3137 
3138     // the constant may be too far away on 64 bit
3139     lea(tmp2, two_addr);
3140     fld_d(Address(tmp2, 0));    // Stack: 2 X Y
3141     fcmp(tmp, 2, true, false);  // Stack: X Y
3142     jcc(Assembler::parity, y_not_2);
3143     jcc(Assembler::notEqual, y_not_2);
3144 
3145     fxch(); fpop();             // Stack: X
3146     fmul(0);                    // Stack: X*X
3147 
3148     jmp(done);
3149 
3150     bind(y_not_2);
3151 
3152     fldz();                     // Stack: 0 X Y
3153     fcmp(tmp, 1, true, false);  // Stack: X Y
3154     jcc(Assembler::above, x_negative);
3155 
3156     // X >= 0
3157 
3158     fld_s(1);                   // duplicate arguments for runtime call. Stack: Y X Y
3159     fld_s(1);                   // Stack: X Y X Y
3160     fast_pow();                 // Stack: X^Y X Y
3161     fcmp(tmp, 0, false, false); // Stack: X^Y X Y
3162     // X^Y not equal to itself: X^Y is NaN, so go to the slow case.
3163     jcc(Assembler::parity, slow_case);
3164     // get rid of duplicate arguments. Stack: X^Y
3165     if (num_fpu_regs_in_use > 0) {
3166       fxch(); fpop();
3167       fxch(); fpop();
3168     } else {
3169       ffree(2);
3170       ffree(1);
3171     }
3172     jmp(done);
3173 
3174     // X <= 0
3175     bind(x_negative);
3176 
3177     fld_s(1);                   // Stack: Y X Y
3178     frndint();                  // Stack: int(Y) X Y
3179     fcmp(tmp, 2, false, false); // Stack: int(Y) X Y
3180     jcc(Assembler::notEqual, slow_case);
3181 
3182     subptr(rsp, 8);
3183 
3184     // For X^Y, when X < 0, Y has to be an integer and the final
3185     // result depends on whether it's odd or even. We just checked
3186     // that int(Y) == Y.  We move int(Y) to gp registers as a 64 bit
3187     // integer to test its parity. If int(Y) is huge and doesn't fit
3188     // in the 64 bit integer range, the integer indefinite value will
3189     // end up in the gp registers. Huge numbers are all even, the
3190     // integer indefinite number is even so it's fine.
3191 
3192 #ifdef ASSERT
3193     // Let's check we don't end up with an integer indefinite number
3194     // when not expected. First test for huge numbers: check whether
3195     // int(Y)+1 == int(Y) which is true for very large numbers and
3196     // those are all even. A 64 bit integer is guaranteed to not
3197     // overflow for numbers where y+1 != y (when precision is set to
3198     // double precision).
3199     Label y_not_huge;
3200 
3201     fld1();                     // Stack: 1 int(Y) X Y
3202     fadd(1);                    // Stack: 1+int(Y) int(Y) X Y
3203 
3204 #ifdef _LP64
3205     // trip to memory to force the precision down from double extended
3206     // precision
3207     fstp_d(Address(rsp, 0));
3208     fld_d(Address(rsp, 0));
3209 #endif
3210 
3211     fcmp(tmp, 1, true, false);  // Stack: int(Y) X Y
3212 #endif
3213 
3214     // move int(Y) as 64 bit integer to thread's stack
3215     fistp_d(Address(rsp,0));    // Stack: X Y
3216 
3217 #ifdef ASSERT
3218     jcc(Assembler::notEqual, y_not_huge);
3219 
3220     // Y is huge so we know it's even. It may not fit in a 64 bit
3221     // integer and we don't want the debug code below to see the
3222     // integer indefinite value so overwrite int(Y) on the thread's
3223     // stack with 0.
3224     movl(Address(rsp, 0), 0);
3225     movl(Address(rsp, 4), 0);
3226 
3227     bind(y_not_huge);
3228 #endif
3229 
3230     fld_s(1);                   // duplicate arguments for runtime call. Stack: Y X Y
3231     fld_s(1);                   // Stack: X Y X Y
3232     fabs();                     // Stack: abs(X) Y X Y
3233     fast_pow();                 // Stack: abs(X)^Y X Y
3234     fcmp(tmp, 0, false, false); // Stack: abs(X)^Y X Y
3235     // abs(X)^Y not equal to itself: abs(X)^Y is NaN, so go to the slow case.
3236 
3237     pop(tmp2);
3238     NOT_LP64(pop(tmp3));
3239     jcc(Assembler::parity, slow_case);
3240 
3241 #ifdef ASSERT
3242     // Check that int(Y) is not the integer indefinite value (int
3243     // overflow). This shouldn't happen because for values that would
3244     // overflow, 1+int(Y) == int(Y), which was tested earlier.
3245 #ifndef _LP64
3246     {
3247       Label integer;
3248       testl(tmp2, tmp2);
3249       jcc(Assembler::notZero, integer);
3250       cmpl(tmp3, 0x80000000);
3251       jcc(Assembler::notZero, integer);
3252       STOP("integer indefinite value shouldn't be seen here");
3253       bind(integer);
3254     }
3255 #else
3256     {
3257       Label integer;
3258       mov(tmp3, tmp2); // preserve tmp2 for parity check below
3259       shlq(tmp3, 1);
3260       jcc(Assembler::carryClear, integer);
3261       jcc(Assembler::notZero, integer);
3262       STOP("integer indefinite value shouldn't be seen here");
3263       bind(integer);
3264     }
3265 #endif
3266 #endif
3267 
3268     // get rid of duplicate arguments. Stack: X^Y
3269     if (num_fpu_regs_in_use > 0) {
3270       fxch(); fpop();
3271       fxch(); fpop();
3272     } else {
3273       ffree(2);
3274       ffree(1);
3275     }
3276 
3277     testl(tmp2, 1);
3278     jcc(Assembler::zero, done); // X <= 0, Y even: X^Y = abs(X)^Y
3279     // X <= 0, Y odd: X^Y = -abs(X)^Y
3280 
3281     fchs();                     // Stack: -abs(X)^Y
3282     jmp(done);
3283   }
3284 
3285   // slow case: runtime call
3286   bind(slow_case);
3287 
3288   fpop();                       // pop incorrect result or int(Y)
3289 
3290   fp_runtime_fallback(is_exp ? CAST_FROM_FN_PTR(address, SharedRuntime::dexp) : CAST_FROM_FN_PTR(address, SharedRuntime::dpow),
3291                       is_exp ? 1 : 2, num_fpu_regs_in_use);
3292 
3293   // Come here with result in F-TOS
3294   bind(done);
3295 }
3296 
3297 void MacroAssembler::fpop() {
3298   ffree();
3299   fincstp();
3300 }
3301 
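     // fprem computes only a partial remainder and sets the C2 status flag
     // while the reduction is incomplete, so loop until C2 clears. On 64-bit
     // C2 is tested directly as bit 10 (0x400) of the status word; on 32-bit
     // sahf transfers it into PF.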
3302 void MacroAssembler::fremr(Register tmp) {
3303   save_rax(tmp);
3304   { Label L;
3305     bind(L);
3306     fprem();
3307     fwait(); fnstsw_ax();
3308 #ifdef _LP64
3309     testl(rax, 0x400);
3310     jcc(Assembler::notEqual, L);
3311 #else
3312     sahf();
3313     jcc(Assembler::parity, L);
3314 #endif // _LP64
3315   }
3316   restore_rax(tmp);
3317   // Result is in ST0.
3318   // Note: fxch & fpop to get rid of ST1
3319   // (otherwise FPU stack could overflow eventually)
3320   fxch(1);
3321   fpop();
3322 }
3323 
3324 
3325 void MacroAssembler::incrementl(AddressLiteral dst) {
3326   if (reachable(dst)) {
3327     incrementl(as_Address(dst));
3328   } else {
3329     lea(rscratch1, dst);
3330     incrementl(Address(rscratch1, 0));
3331   }
3332 }
3333 
3334 void MacroAssembler::incrementl(ArrayAddress dst) {
3335   incrementl(as_Address(dst));
3336 }
3337 
3338 void MacroAssembler::incrementl(Register reg, int value) {
3339   if (value == min_jint) {addl(reg, value) ; return; }
3340   if (value <  0) { decrementl(reg, -value); return; }
3341   if (value == 0) {                        ; return; }
3342   if (value == 1 && UseIncDec) { incl(reg) ; return; }
3343   /* else */      { addl(reg, value)       ; return; }
3344 }
3345 
3346 void MacroAssembler::incrementl(Address dst, int value) {
3347   if (value == min_jint) {addl(dst, value) ; return; }
3348   if (value <  0) { decrementl(dst, -value); return; }
3349   if (value == 0) {                        ; return; }
3350   if (value == 1 && UseIncDec) { incl(dst) ; return; }
3351   /* else */      { addl(dst, value)       ; return; }
3352 }
3353 
3354 void MacroAssembler::jump(AddressLiteral dst) {
3355   if (reachable(dst)) {
3356     jmp_literal(dst.target(), dst.rspec());
3357   } else {
3358     lea(rscratch1, dst);
3359     jmp(rscratch1);
3360   }
3361 }
3362 
3363 void MacroAssembler::jump_cc(Condition cc, AddressLiteral dst) {
3364   if (reachable(dst)) {
3365     InstructionMark im(this);
3366     relocate(dst.reloc());
3367     const int short_size = 2;
3368     const int long_size = 6;
3369     int offs = (intptr_t)dst.target() - ((intptr_t)pc());
3370     if (dst.reloc() == relocInfo::none && is8bit(offs - short_size)) {
3371       // 0111 tttn #8-bit disp
3372       emit_int8(0x70 | cc);
3373       emit_int8((offs - short_size) & 0xFF);
3374     } else {
3375       // 0000 1111 1000 tttn #32-bit disp
3376       emit_int8(0x0F);
3377       emit_int8((unsigned char)(0x80 | cc));
3378       emit_int32(offs - long_size);
3379     }
3380   } else {
3381 #ifdef ASSERT
3382     warning("reversing conditional branch");
3383 #endif /* ASSERT */
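         // The target is out of 32-bit-displacement range: branch over an
         // indirect jump through rscratch1 using the inverted condition
         // (looked up in the reverse[] table).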
3384     Label skip;
3385     jccb(reverse[cc], skip);
3386     lea(rscratch1, dst);
3387     Assembler::jmp(rscratch1);
3388     bind(skip);
3389   }
3390 }
3391 
3392 void MacroAssembler::ldmxcsr(AddressLiteral src) {
3393   if (reachable(src)) {
3394     Assembler::ldmxcsr(as_Address(src));
3395   } else {
3396     lea(rscratch1, src);
3397     Assembler::ldmxcsr(Address(rscratch1, 0));
3398   }
3399 }
3400 
3401 int MacroAssembler::load_signed_byte(Register dst, Address src) {
3402   int off;
3403   if (LP64_ONLY(true ||) VM_Version::is_P6()) {
3404     off = offset();
3405     movsbl(dst, src); // movsxb
3406   } else {
3407     off = load_unsigned_byte(dst, src);
3408     shll(dst, 24);
3409     sarl(dst, 24);
3410   }
3411   return off;
3412 }
3413 
3414 // Note: load_signed_short used to be called load_signed_word.
3415 // Although the 'w' in x86 opcodes refers to the term "word" in the assembler
3416 // manual, which means 16 bits, that usage is found nowhere in HotSpot code.
3417 // The term "word" in HotSpot means a 32- or 64-bit machine word.
3418 int MacroAssembler::load_signed_short(Register dst, Address src) {
3419   int off;
3420   if (LP64_ONLY(true ||) VM_Version::is_P6()) {
3421     // This is dubious to me since it seems safe to do a signed 16 => 64 bit
3422     // version but this is what 64bit has always done. This seems to imply
3423     // that users are only using 32bits worth.
3424     off = offset();
3425     movswl(dst, src); // movsxw
3426   } else {
3427     off = load_unsigned_short(dst, src);
3428     shll(dst, 16);
3429     sarl(dst, 16);
3430   }
3431   return off;
3432 }
3433 
3434 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
3435   // According to Intel Doc. AP-526, "Zero-Extension of Short", p.16,
3436   // and "3.9 Partial Register Penalties", p. 22.
3437   int off;
3438   if (LP64_ONLY(true || ) VM_Version::is_P6() || src.uses(dst)) {
3439     off = offset();
3440     movzbl(dst, src); // movzxb
3441   } else {
3442     xorl(dst, dst);
3443     off = offset();
3444     movb(dst, src);
3445   }
3446   return off;
3447 }
3448 
3449 // Note: load_unsigned_short used to be called load_unsigned_word.
3450 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
3451   // According to Intel Doc. AP-526, "Zero-Extension of Short", p.16,
3452   // and "3.9 Partial Register Penalties", p. 22.
3453   int off;
3454   if (LP64_ONLY(true ||) VM_Version::is_P6() || src.uses(dst)) {
3455     off = offset();
3456     movzwl(dst, src); // movzxw
3457   } else {
3458     xorl(dst, dst);
3459     off = offset();
3460     movw(dst, src);
3461   }
3462   return off;
3463 }
3464 
3465 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
3466   switch (size_in_bytes) {
3467 #ifndef _LP64
3468   case  8:
3469     assert(dst2 != noreg, "second dest register required");
3470     movl(dst,  src);
3471     movl(dst2, src.plus_disp(BytesPerInt));
3472     break;
3473 #else
3474   case  8:  movq(dst, src); break;
3475 #endif
3476   case  4:  movl(dst, src); break;
3477   case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
3478   case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
3479   default:  ShouldNotReachHere();
3480   }
3481 }
3482 
3483 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
3484   switch (size_in_bytes) {
3485 #ifndef _LP64
3486   case  8:
3487     assert(src2 != noreg, "second source register required");
3488     movl(dst,                        src);
3489     movl(dst.plus_disp(BytesPerInt), src2);
3490     break;
3491 #else
3492   case  8:  movq(dst, src); break;
3493 #endif
3494   case  4:  movl(dst, src); break;
3495   case  2:  movw(dst, src); break;
3496   case  1:  movb(dst, src); break;
3497   default:  ShouldNotReachHere();
3498   }
3499 }
3500 
3501 void MacroAssembler::mov32(AddressLiteral dst, Register src) {
3502   if (reachable(dst)) {
3503     movl(as_Address(dst), src);
3504   } else {
3505     lea(rscratch1, dst);
3506     movl(Address(rscratch1, 0), src);
3507   }
3508 }
3509 
3510 void MacroAssembler::mov32(Register dst, AddressLiteral src) {
3511   if (reachable(src)) {
3512     movl(dst, as_Address(src));
3513   } else {
3514     lea(rscratch1, src);
3515     movl(dst, Address(rscratch1, 0));
3516   }
3517 }
3518 
3519 // C++ bool manipulation
3520 
3521 void MacroAssembler::movbool(Register dst, Address src) {
3522   if(sizeof(bool) == 1)
3523     movb(dst, src);
3524   else if(sizeof(bool) == 2)
3525     movw(dst, src);
3526   else if(sizeof(bool) == 4)
3527     movl(dst, src);
3528   else
3529     // unsupported
3530     ShouldNotReachHere();
3531 }
3532 
3533 void MacroAssembler::movbool(Address dst, bool boolconst) {
3534   if(sizeof(bool) == 1)
3535     movb(dst, (int) boolconst);
3536   else if(sizeof(bool) == 2)
3537     movw(dst, (int) boolconst);
3538   else if(sizeof(bool) == 4)
3539     movl(dst, (int) boolconst);
3540   else
3541     // unsupported
3542     ShouldNotReachHere();
3543 }
3544 
3545 void MacroAssembler::movbool(Address dst, Register src) {
3546   if(sizeof(bool) == 1)
3547     movb(dst, src);
3548   else if(sizeof(bool) == 2)
3549     movw(dst, src);
3550   else if(sizeof(bool) == 4)
3551     movl(dst, src);
3552   else
3553     // unsupported
3554     ShouldNotReachHere();
3555 }
3556 
3557 void MacroAssembler::movbyte(ArrayAddress dst, int src) {
3558   movb(as_Address(dst), src);
3559 }
3560 
3561 void MacroAssembler::movdl(XMMRegister dst, AddressLiteral src) {
3562   if (reachable(src)) {
3563     movdl(dst, as_Address(src));
3564   } else {
3565     lea(rscratch1, src);
3566     movdl(dst, Address(rscratch1, 0));
3567   }
3568 }
3569 
3570 void MacroAssembler::movq(XMMRegister dst, AddressLiteral src) {
3571   if (reachable(src)) {
3572     movq(dst, as_Address(src));
3573   } else {
3574     lea(rscratch1, src);
3575     movq(dst, Address(rscratch1, 0));
3576   }
3577 }
3578 
3579 void MacroAssembler::movdbl(XMMRegister dst, AddressLiteral src) {
3580   if (reachable(src)) {
3581     if (UseXmmLoadAndClearUpper) {
3582       movsd (dst, as_Address(src));
3583     } else {
3584       movlpd(dst, as_Address(src));
3585     }
3586   } else {
3587     lea(rscratch1, src);
3588     if (UseXmmLoadAndClearUpper) {
3589       movsd (dst, Address(rscratch1, 0));
3590     } else {
3591       movlpd(dst, Address(rscratch1, 0));
3592     }
3593   }
3594 }
3595 
3596 void MacroAssembler::movflt(XMMRegister dst, AddressLiteral src) {
3597   if (reachable(src)) {
3598     movss(dst, as_Address(src));
3599   } else {
3600     lea(rscratch1, src);
3601     movss(dst, Address(rscratch1, 0));
3602   }
3603 }
3604 
3605 void MacroAssembler::movptr(Register dst, Register src) {
3606   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
3607 }
3608 
3609 void MacroAssembler::movptr(Register dst, Address src) {
3610   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
3611 }
3612 
3613 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
3614 void MacroAssembler::movptr(Register dst, intptr_t src) {
3615   LP64_ONLY(mov64(dst, src)) NOT_LP64(movl(dst, src));
3616 }
3617 
3618 void MacroAssembler::movptr(Address dst, Register src) {
3619   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
3620 }
3621 
3622 void MacroAssembler::movdqu(XMMRegister dst, AddressLiteral src) {
3623   if (reachable(src)) {
3624     Assembler::movdqu(dst, as_Address(src));
3625   } else {
3626     lea(rscratch1, src);
3627     Assembler::movdqu(dst, Address(rscratch1, 0));
3628   }
3629 }
3630 
3631 void MacroAssembler::movdqa(XMMRegister dst, AddressLiteral src) {
3632   if (reachable(src)) {
3633     Assembler::movdqa(dst, as_Address(src));
3634   } else {
3635     lea(rscratch1, src);
3636     Assembler::movdqa(dst, Address(rscratch1, 0));
3637   }
3638 }
3639 
3640 void MacroAssembler::movsd(XMMRegister dst, AddressLiteral src) {
3641   if (reachable(src)) {
3642     Assembler::movsd(dst, as_Address(src));
3643   } else {
3644     lea(rscratch1, src);
3645     Assembler::movsd(dst, Address(rscratch1, 0));
3646   }
3647 }
3648 
3649 void MacroAssembler::movss(XMMRegister dst, AddressLiteral src) {
3650   if (reachable(src)) {
3651     Assembler::movss(dst, as_Address(src));
3652   } else {
3653     lea(rscratch1, src);
3654     Assembler::movss(dst, Address(rscratch1, 0));
3655   }
3656 }
3657 
3658 void MacroAssembler::mulsd(XMMRegister dst, AddressLiteral src) {
3659   if (reachable(src)) {
3660     Assembler::mulsd(dst, as_Address(src));
3661   } else {
3662     lea(rscratch1, src);
3663     Assembler::mulsd(dst, Address(rscratch1, 0));
3664   }
3665 }
3666 
3667 void MacroAssembler::mulss(XMMRegister dst, AddressLiteral src) {
3668   if (reachable(src)) {
3669     Assembler::mulss(dst, as_Address(src));
3670   } else {
3671     lea(rscratch1, src);
3672     Assembler::mulss(dst, Address(rscratch1, 0));
3673   }
3674 }
3675 
3676 void MacroAssembler::null_check(Register reg, int offset) {
3677   if (needs_explicit_null_check(offset)) {
3678     // provoke OS NULL exception if reg = NULL by
3679     // accessing M[reg] w/o changing any (non-CC) registers
3680     // NOTE: cmpl is plenty here to provoke a segv
3681     cmpptr(rax, Address(reg, 0));
3682     // Note: should probably use testl(rax, Address(reg, 0));
3683     //       may be shorter code (however, this version of
3684     //       testl needs to be implemented first)
3685   } else {
3686     // nothing to do, (later) access of M[reg + offset]
3687     // will provoke OS NULL exception if reg = NULL
3688   }
3689 }
3690 
3691 void MacroAssembler::os_breakpoint() {
3692   // instead of directly emitting a breakpoint, call os:breakpoint for better debugability
3693   // (e.g., MSVC can't call ps() otherwise)
3694   call(RuntimeAddress(CAST_FROM_FN_PTR(address, os::breakpoint)));
3695 }
3696 
3697 void MacroAssembler::pop_CPU_state() {
3698   pop_FPU_state();
3699   pop_IU_state();
3700 }
3701 
3702 void MacroAssembler::pop_FPU_state() {
3703   NOT_LP64(frstor(Address(rsp, 0));)
3704   LP64_ONLY(fxrstor(Address(rsp, 0));)
3705   addptr(rsp, FPUStateSizeInWords * wordSize);
3706 }
3707 
3708 void MacroAssembler::pop_IU_state() {
3709   popa();
3710   LP64_ONLY(addq(rsp, 8));
3711   popf();
3712 }
3713 
3714 // Save Integer and Float state
3715 // Warning: Stack must be 16 byte aligned (64bit)
3716 void MacroAssembler::push_CPU_state() {
3717   push_IU_state();
3718   push_FPU_state();
3719 }
3720 
3721 void MacroAssembler::push_FPU_state() {
3722   subptr(rsp, FPUStateSizeInWords * wordSize);
3723 #ifndef _LP64
3724   fnsave(Address(rsp, 0));
3725   fwait();
3726 #else
3727   fxsave(Address(rsp, 0));
3728 #endif // LP64
3729 }
3730 
3731 void MacroAssembler::push_IU_state() {
3732   // Push flags first because pusha kills them
3733   pushf();
3734   // Make sure rsp stays 16-byte aligned
3735   LP64_ONLY(subq(rsp, 8));
3736   pusha();
3737 }
3738 
3739 void MacroAssembler::reset_last_Java_frame(Register java_thread, bool clear_fp, bool clear_pc) {
3740   // determine java_thread register
3741   if (!java_thread->is_valid()) {
3742     java_thread = rdi;
3743     get_thread(java_thread);
3744   }
3745   // we must set sp to zero to clear frame
3746   movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), NULL_WORD);
3747   if (clear_fp) {
3748     movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
3749   }
3750 
3751   if (clear_pc)
3752     movptr(Address(java_thread, JavaThread::last_Java_pc_offset()), NULL_WORD);
3753 
3754 }
3755 
3756 void MacroAssembler::restore_rax(Register tmp) {
3757   if (tmp == noreg) pop(rax);
3758   else if (tmp != rax) mov(rax, tmp);
3759 }
3760 
3761 void MacroAssembler::round_to(Register reg, int modulus) {
3762   addptr(reg, modulus - 1);
3763   andptr(reg, -modulus);
3764 }
3765 
3766 void MacroAssembler::save_rax(Register tmp) {
3767   if (tmp == noreg) push(rax);
3768   else if (tmp != rax) mov(tmp, rax);
3769 }
3770 
3771 // Write serialization page so VM thread can do a pseudo remote membar.
3772 // We use the current thread pointer to calculate a thread specific
3773 // offset to write to within the page. This minimizes bus traffic
3774 // due to cache line collision.
3775 void MacroAssembler::serialize_memory(Register thread, Register tmp) {
3776   movl(tmp, thread);
3777   shrl(tmp, os::get_serialize_page_shift_count());
3778   andl(tmp, (os::vm_page_size() - sizeof(int)));
3779 
3780   Address index(noreg, tmp, Address::times_1);
3781   ExternalAddress page(os::get_memory_serialize_page());
3782 
3783   // Size of store must match masking code above
3784   movl(as_Address(ArrayAddress(page, index)), tmp);
3785 }
3786 
3787 // Calls to C land
3788 //
3789 // When entering C land, the rbp and rsp of the last Java frame have to be recorded
3790 // in the (thread-local) JavaThread object. When leaving C land, the last Java fp
3791 // has to be reset to 0. This is required to allow proper stack traversal.
3792 void MacroAssembler::set_last_Java_frame(Register java_thread,
3793                                          Register last_java_sp,
3794                                          Register last_java_fp,
3795                                          address  last_java_pc) {
3796   // determine java_thread register
3797   if (!java_thread->is_valid()) {
3798     java_thread = rdi;
3799     get_thread(java_thread);
3800   }
3801   // determine last_java_sp register
3802   if (!last_java_sp->is_valid()) {
3803     last_java_sp = rsp;
3804   }
3805 
3806   // last_java_fp is optional
3807 
3808   if (last_java_fp->is_valid()) {
3809     movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), last_java_fp);
3810   }
3811 
3812   // last_java_pc is optional
3813 
3814   if (last_java_pc != NULL) {
3815     lea(Address(java_thread,
3816                  JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()),
3817         InternalAddress(last_java_pc));
3818 
3819   }
3820   movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
3821 }
3822 
3823 void MacroAssembler::shlptr(Register dst, int imm8) {
3824   LP64_ONLY(shlq(dst, imm8)) NOT_LP64(shll(dst, imm8));
3825 }
3826 
3827 void MacroAssembler::shrptr(Register dst, int imm8) {
3828   LP64_ONLY(shrq(dst, imm8)) NOT_LP64(shrl(dst, imm8));
3829 }
3830 
3831 void MacroAssembler::sign_extend_byte(Register reg) {
3832   if (LP64_ONLY(true ||) (VM_Version::is_P6() && reg->has_byte_register())) {
3833     movsbl(reg, reg); // movsxb
3834   } else {
3835     shll(reg, 24);
3836     sarl(reg, 24);
3837   }
3838 }
3839 
3840 void MacroAssembler::sign_extend_short(Register reg) {
3841   if (LP64_ONLY(true ||) VM_Version::is_P6()) {
3842     movswl(reg, reg); // movsxw
3843   } else {
3844     shll(reg, 16);
3845     sarl(reg, 16);
3846   }
3847 }
3848 
3849 void MacroAssembler::testl(Register dst, AddressLiteral src) {
3850   assert(reachable(src), "Address should be reachable");
3851   testl(dst, as_Address(src));
3852 }
3853 
3854 void MacroAssembler::sqrtsd(XMMRegister dst, AddressLiteral src) {
3855   if (reachable(src)) {
3856     Assembler::sqrtsd(dst, as_Address(src));
3857   } else {
3858     lea(rscratch1, src);
3859     Assembler::sqrtsd(dst, Address(rscratch1, 0));
3860   }
3861 }
3862 
3863 void MacroAssembler::sqrtss(XMMRegister dst, AddressLiteral src) {
3864   if (reachable(src)) {
3865     Assembler::sqrtss(dst, as_Address(src));
3866   } else {
3867     lea(rscratch1, src);
3868     Assembler::sqrtss(dst, Address(rscratch1, 0));
3869   }
3870 }
3871 
3872 void MacroAssembler::subsd(XMMRegister dst, AddressLiteral src) {
3873   if (reachable(src)) {
3874     Assembler::subsd(dst, as_Address(src));
3875   } else {
3876     lea(rscratch1, src);
3877     Assembler::subsd(dst, Address(rscratch1, 0));
3878   }
3879 }
3880 
3881 void MacroAssembler::subss(XMMRegister dst, AddressLiteral src) {
3882   if (reachable(src)) {
3883     Assembler::subss(dst, as_Address(src));
3884   } else {
3885     lea(rscratch1, src);
3886     Assembler::subss(dst, Address(rscratch1, 0));
3887   }
3888 }
3889 
3890 void MacroAssembler::ucomisd(XMMRegister dst, AddressLiteral src) {
3891   if (reachable(src)) {
3892     Assembler::ucomisd(dst, as_Address(src));
3893   } else {
3894     lea(rscratch1, src);
3895     Assembler::ucomisd(dst, Address(rscratch1, 0));
3896   }
3897 }
3898 
3899 void MacroAssembler::ucomiss(XMMRegister dst, AddressLiteral src) {
3900   if (reachable(src)) {
3901     Assembler::ucomiss(dst, as_Address(src));
3902   } else {
3903     lea(rscratch1, src);
3904     Assembler::ucomiss(dst, Address(rscratch1, 0));
3905   }
3906 }
3907 
3908 void MacroAssembler::xorpd(XMMRegister dst, AddressLiteral src) {
3909   // Used in sign-bit flipping with aligned address.
3910   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
3911   if (reachable(src)) {
3912     Assembler::xorpd(dst, as_Address(src));
3913   } else {
3914     lea(rscratch1, src);
3915     Assembler::xorpd(dst, Address(rscratch1, 0));
3916   }
3917 }
3918 
3919 void MacroAssembler::xorps(XMMRegister dst, AddressLiteral src) {
3920   // Used in sign-bit flipping with aligned address.
3921   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
3922   if (reachable(src)) {
3923     Assembler::xorps(dst, as_Address(src));
3924   } else {
3925     lea(rscratch1, src);
3926     Assembler::xorps(dst, Address(rscratch1, 0));
3927   }
3928 }
3929 
3930 void MacroAssembler::pshufb(XMMRegister dst, AddressLiteral src) {
3931   // Used in sign-bit flipping with aligned address.
3932   bool aligned_adr = (((intptr_t)src.target() & 15) == 0);
3933   assert((UseAVX > 0) || aligned_adr, "SSE mode requires address alignment 16 bytes");
3934   if (reachable(src)) {
3935     Assembler::pshufb(dst, as_Address(src));
3936   } else {
3937     lea(rscratch1, src);
3938     Assembler::pshufb(dst, Address(rscratch1, 0));
3939   }
3940 }
3941 
3942 // AVX 3-operands instructions
3943 
3944 void MacroAssembler::vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3945   if (reachable(src)) {
3946     vaddsd(dst, nds, as_Address(src));
3947   } else {
3948     lea(rscratch1, src);
3949     vaddsd(dst, nds, Address(rscratch1, 0));
3950   }
3951 }
3952 
3953 void MacroAssembler::vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3954   if (reachable(src)) {
3955     vaddss(dst, nds, as_Address(src));
3956   } else {
3957     lea(rscratch1, src);
3958     vaddss(dst, nds, Address(rscratch1, 0));
3959   }
3960 }
3961 
3962 void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
3963   if (reachable(src)) {
3964     vandpd(dst, nds, as_Address(src), vector_len);
3965   } else {
3966     lea(rscratch1, src);
3967     vandpd(dst, nds, Address(rscratch1, 0), vector_len);
3968   }
3969 }
3970 
3971 void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
3972   if (reachable(src)) {
3973     vandps(dst, nds, as_Address(src), vector_len);
3974   } else {
3975     lea(rscratch1, src);
3976     vandps(dst, nds, Address(rscratch1, 0), vector_len);
3977   }
3978 }
3979 
3980 void MacroAssembler::vdivsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3981   if (reachable(src)) {
3982     vdivsd(dst, nds, as_Address(src));
3983   } else {
3984     lea(rscratch1, src);
3985     vdivsd(dst, nds, Address(rscratch1, 0));
3986   }
3987 }
3988 
3989 void MacroAssembler::vdivss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3990   if (reachable(src)) {
3991     vdivss(dst, nds, as_Address(src));
3992   } else {
3993     lea(rscratch1, src);
3994     vdivss(dst, nds, Address(rscratch1, 0));
3995   }
3996 }
3997 
3998 void MacroAssembler::vmulsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3999   if (reachable(src)) {
4000     vmulsd(dst, nds, as_Address(src));
4001   } else {
4002     lea(rscratch1, src);
4003     vmulsd(dst, nds, Address(rscratch1, 0));
4004   }
4005 }
4006 
4007 void MacroAssembler::vmulss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4008   if (reachable(src)) {
4009     vmulss(dst, nds, as_Address(src));
4010   } else {
4011     lea(rscratch1, src);
4012     vmulss(dst, nds, Address(rscratch1, 0));
4013   }
4014 }
4015 
4016 void MacroAssembler::vsubsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4017   if (reachable(src)) {
4018     vsubsd(dst, nds, as_Address(src));
4019   } else {
4020     lea(rscratch1, src);
4021     vsubsd(dst, nds, Address(rscratch1, 0));
4022   }
4023 }
4024 
4025 void MacroAssembler::vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4026   if (reachable(src)) {
4027     vsubss(dst, nds, as_Address(src));
4028   } else {
4029     lea(rscratch1, src);
4030     vsubss(dst, nds, Address(rscratch1, 0));
4031   }
4032 }
4033 
4034 void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
4035   if (reachable(src)) {
4036     vxorpd(dst, nds, as_Address(src), vector_len);
4037   } else {
4038     lea(rscratch1, src);
4039     vxorpd(dst, nds, Address(rscratch1, 0), vector_len);
4040   }
4041 }
4042 
4043 void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
4044   if (reachable(src)) {
4045     vxorps(dst, nds, as_Address(src), vector_len);
4046   } else {
4047     lea(rscratch1, src);
4048     vxorps(dst, nds, Address(rscratch1, 0), vector_len);
4049   }
4050 }
4051 
4052 
4053 //////////////////////////////////////////////////////////////////////////////////
4054 #if INCLUDE_ALL_GCS
4055 
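     // G1 SATB pre-barrier: while concurrent marking is active, record the value
     // that is about to be overwritten (pre_val) in the thread's SATB queue so
     // the concurrent marker still visits it; fall back to the runtime call when
     // the queue is full.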
4056 void MacroAssembler::g1_write_barrier_pre(Register obj,
4057                                           Register pre_val,
4058                                           Register thread,
4059                                           Register tmp,
4060                                           bool tosca_live,
4061                                           bool expand_call) {
4062 
4063   // If expand_call is true then we expand the call_VM_leaf macro
4064   // directly to skip generating the check by
4065   // InterpreterMacroAssembler::call_VM_leaf_base that checks _last_sp.
4066 
4067 #ifdef _LP64
4068   assert(thread == r15_thread, "must be");
4069 #endif // _LP64
4070 
4071   Label done;
4072   Label runtime;
4073 
4074   assert(pre_val != noreg, "check this code");
4075 
4076   if (obj != noreg) {
4077     assert_different_registers(obj, pre_val, tmp);
4078     assert(pre_val != rax, "check this code");
4079   }
4080 
4081   Address in_progress(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
4082                                        PtrQueue::byte_offset_of_active()));
4083   Address index(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
4084                                        PtrQueue::byte_offset_of_index()));
4085   Address buffer(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
4086                                        PtrQueue::byte_offset_of_buf()));
4087 
4088 
4089   // Is marking active?
4090   if (in_bytes(PtrQueue::byte_width_of_active()) == 4) {
4091     cmpl(in_progress, 0);
4092   } else {
4093     assert(in_bytes(PtrQueue::byte_width_of_active()) == 1, "Assumption");
4094     cmpb(in_progress, 0);
4095   }
4096   jcc(Assembler::equal, done);
4097 
4098   // Do we need to load the previous value?
4099   if (obj != noreg) {
4100     load_heap_oop(pre_val, Address(obj, 0));
4101   }
4102 
4103   // Is the previous value null?
4104   cmpptr(pre_val, (int32_t) NULL_WORD);
4105   jcc(Assembler::equal, done);
4106 
4107   // Can we store original value in the thread's buffer?
4108   // Is index == 0?
4109   // (The index field is typed as size_t.)
4110 
4111   movptr(tmp, index);                   // tmp := *index_adr
4112   cmpptr(tmp, 0);                       // tmp == 0?
4113   jcc(Assembler::equal, runtime);       // If yes, goto runtime
4114 
4115   subptr(tmp, wordSize);                // tmp := tmp - wordSize
4116   movptr(index, tmp);                   // *index_adr := tmp
4117   addptr(tmp, buffer);                  // tmp := tmp + *buffer_adr
4118 
4119   // Record the previous value
4120   movptr(Address(tmp, 0), pre_val);
4121   jmp(done);
4122 
4123   bind(runtime);
4124   // save the live input values
4125   if(tosca_live) push(rax);
4126 
4127   if (obj != noreg && obj != rax)
4128     push(obj);
4129 
4130   if (pre_val != rax)
4131     push(pre_val);
4132 
4133   // Calling the runtime using the regular call_VM_leaf mechanism generates
4134   // code (generated by InterpreterMacroAssembler::call_VM_leaf_base)
4135   // that checks that *(ebp+frame::interpreter_frame_last_sp) == NULL.
4136   //
4137   // If we are generating the pre-barrier without a frame (e.g. in the
4138   // intrinsified Reference.get() routine) then ebp might be pointing to
4139   // the caller frame and so this check will most likely fail at runtime.
4140   //
4141   // Expanding the call directly bypasses the generation of the check.
4142   // So when we do not have a full interpreter frame on the stack,
4143   // expand_call should be passed true.
4144 
4145   NOT_LP64( push(thread); )
4146 
4147   if (expand_call) {
4148     LP64_ONLY( assert(pre_val != c_rarg1, "smashed arg"); )
4149     pass_arg1(this, thread);
4150     pass_arg0(this, pre_val);
4151     MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), 2);
4152   } else {
4153     call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), pre_val, thread);
4154   }
4155 
4156   NOT_LP64( pop(thread); )
4157 
4158   // save the live input values
4159   if (pre_val != rax)
4160     pop(pre_val);
4161 
4162   if (obj != noreg && obj != rax)
4163     pop(obj);
4164 
4165   if(tosca_live) pop(rax);
4166 
4167   bind(done);
4168 }
4169 
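     // G1 post-barrier: filter out stores within the same heap region, stores
     // of NULL, and cards that are already young or dirty; otherwise dirty the
     // card and enqueue its address in the thread's dirty-card queue (runtime
     // call when the queue is full).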
4170 void MacroAssembler::g1_write_barrier_post(Register store_addr,
4171                                            Register new_val,
4172                                            Register thread,
4173                                            Register tmp,
4174                                            Register tmp2) {
4175 #ifdef _LP64
4176   assert(thread == r15_thread, "must be");
4177 #endif // _LP64
4178 
4179   Address queue_index(thread, in_bytes(JavaThread::dirty_card_queue_offset() +
4180                                        PtrQueue::byte_offset_of_index()));
4181   Address buffer(thread, in_bytes(JavaThread::dirty_card_queue_offset() +
4182                                        PtrQueue::byte_offset_of_buf()));
4183 
4184   CardTableModRefBS* ct =
4185     barrier_set_cast<CardTableModRefBS>(Universe::heap()->barrier_set());
4186   assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
4187 
4188   Label done;
4189   Label runtime;
4190 
4191   // Does store cross heap regions?
4192 
4193   movptr(tmp, store_addr);
4194   xorptr(tmp, new_val);
4195   shrptr(tmp, HeapRegion::LogOfHRGrainBytes);
4196   jcc(Assembler::equal, done);
4197 
4198   // crosses regions, storing NULL?
4199 
4200   cmpptr(new_val, (int32_t) NULL_WORD);
4201   jcc(Assembler::equal, done);
4202 
4203   // storing region crossing non-NULL, is card already dirty?
4204 
4205   const Register card_addr = tmp;
4206   const Register cardtable = tmp2;
4207 
4208   movptr(card_addr, store_addr);
4209   shrptr(card_addr, CardTableModRefBS::card_shift);
4210   // Do not use ExternalAddress to load 'byte_map_base', since 'byte_map_base' is NOT
4211   // a valid address and therefore is not properly handled by the relocation code.
4212   movptr(cardtable, (intptr_t)ct->byte_map_base);
4213   addptr(card_addr, cardtable);
4214 
4215   cmpb(Address(card_addr, 0), (int)G1SATBCardTableModRefBS::g1_young_card_val());
4216   jcc(Assembler::equal, done);
4217 
4218   membar(Assembler::Membar_mask_bits(Assembler::StoreLoad));
4219   cmpb(Address(card_addr, 0), (int)CardTableModRefBS::dirty_card_val());
4220   jcc(Assembler::equal, done);
4221 
4222 
4223   // storing a region crossing, non-NULL oop, card is clean.
4224   // dirty card and log.
4225 
4226   movb(Address(card_addr, 0), (int)CardTableModRefBS::dirty_card_val());
4227 
4228   cmpl(queue_index, 0);
4229   jcc(Assembler::equal, runtime);
4230   subl(queue_index, wordSize);
4231   movptr(tmp2, buffer);
4232 #ifdef _LP64
4233   movslq(rscratch1, queue_index);
4234   addq(tmp2, rscratch1);
4235   movq(Address(tmp2, 0), card_addr);
4236 #else
4237   addl(tmp2, queue_index);
4238   movl(Address(tmp2, 0), card_addr);
4239 #endif
4240   jmp(done);
4241 
4242   bind(runtime);
4243   // save the live input values
4244   push(store_addr);
4245   push(new_val);
4246 #ifdef _LP64
4247   call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), card_addr, r15_thread);
4248 #else
4249   push(thread);
4250   call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), card_addr, thread);
4251   pop(thread);
4252 #endif
4253   pop(new_val);
4254   pop(store_addr);
4255 
4256   bind(done);
4257 }
4258 
4259 #endif // INCLUDE_ALL_GCS
4260 //////////////////////////////////////////////////////////////////////////////////
4261 
4262 
4263 void MacroAssembler::store_check(Register obj, Address dst) {
4264   store_check(obj);
4265 }
4266 
4267 void MacroAssembler::store_check(Register obj) {
4268   // Does a store check for the oop in register obj. The content of
4269   // register obj is destroyed afterwards.
4270 
4271   BarrierSet* bs = Universe::heap()->barrier_set();
4272   assert(bs->kind() == BarrierSet::CardTableModRef, "Wrong barrier set kind");
4273 
4274   CardTableModRefBS* ct = barrier_set_cast<CardTableModRefBS>(bs);
4275   assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
4276 
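       // The card for an address is obtained by shifting the address right by
       // card_shift (each card covers 2^card_shift bytes of heap, 512 with the
       // usual shift of 9) and indexing the byte map at byte_map_base.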
4277   shrptr(obj, CardTableModRefBS::card_shift);
4278 
4279   Address card_addr;
4280 
4281   // The calculation for byte_map_base is as follows:
4282   // byte_map_base = _byte_map - (uintptr_t(low_bound) >> card_shift);
4283   // So this essentially converts an address to a displacement and it will
4284   // never need to be relocated. On 64bit however the value may be too
4285   // large for a 32bit displacement.
4286   intptr_t disp = (intptr_t) ct->byte_map_base;
4287   if (is_simm32(disp)) {
4288     card_addr = Address(noreg, obj, Address::times_1, disp);
4289   } else {
4290     // By doing it as an ExternalAddress 'disp' could be converted to a rip-relative
4291     // displacement and done in a single instruction given favorable mapping and a
4292     // smarter version of as_Address. However, 'ExternalAddress' generates a relocation
4293     // entry and that entry is not properly handled by the relocation code.
4294     AddressLiteral cardtable((address)ct->byte_map_base, relocInfo::none);
4295     Address index(noreg, obj, Address::times_1);
4296     card_addr = as_Address(ArrayAddress(cardtable, index));
4297   }
4298 
4299   int dirty = CardTableModRefBS::dirty_card_val();
4300   if (UseCondCardMark) {
4301     Label L_already_dirty;
4302     if (UseConcMarkSweepGC) {
4303       membar(Assembler::StoreLoad);
4304     }
4305     cmpb(card_addr, dirty);
4306     jcc(Assembler::equal, L_already_dirty);
4307     movb(card_addr, dirty);
4308     bind(L_already_dirty);
4309   } else {
4310     movb(card_addr, dirty);
4311   }
4312 }
4313 
4314 void MacroAssembler::subptr(Register dst, int32_t imm32) {
4315   LP64_ONLY(subq(dst, imm32)) NOT_LP64(subl(dst, imm32));
4316 }
4317 
4318 // Force generation of a 4 byte immediate value even if it fits into 8bit
4319 void MacroAssembler::subptr_imm32(Register dst, int32_t imm32) {
4320   LP64_ONLY(subq_imm32(dst, imm32)) NOT_LP64(subl_imm32(dst, imm32));
4321 }
4322 
4323 void MacroAssembler::subptr(Register dst, Register src) {
4324   LP64_ONLY(subq(dst, src)) NOT_LP64(subl(dst, src));
4325 }
4326 
4327 // C++ bool manipulation
4328 void MacroAssembler::testbool(Register dst) {
4329   if(sizeof(bool) == 1)
4330     testb(dst, 0xff);
4331   else if(sizeof(bool) == 2) {
4332     // testw implementation needed for two byte bools
4333     ShouldNotReachHere();
4334   } else if(sizeof(bool) == 4)
4335     testl(dst, dst);
4336   else
4337     // unsupported
4338     ShouldNotReachHere();
4339 }
4340 
4341 void MacroAssembler::testptr(Register dst, Register src) {
4342   LP64_ONLY(testq(dst, src)) NOT_LP64(testl(dst, src));
4343 }
4344 
4345 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
4346 void MacroAssembler::tlab_allocate(Register obj,
4347                                    Register var_size_in_bytes,
4348                                    int con_size_in_bytes,
4349                                    Register t1,
4350                                    Register t2,
4351                                    Label& slow_case) {
4352   assert_different_registers(obj, t1, t2);
4353   assert_different_registers(obj, var_size_in_bytes, t1);
4354   Register end = t2;
4355   Register thread = NOT_LP64(t1) LP64_ONLY(r15_thread);
4356 
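       // Bump-pointer allocation in the thread-local allocation buffer:
       // obj = tlab_top, end = obj + size; if end <= tlab_end publish end as
       // the new top. The TLAB is thread-private, so no atomics are needed.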
4357   verify_tlab();
4358 
4359   NOT_LP64(get_thread(thread));
4360 
4361   movptr(obj, Address(thread, JavaThread::tlab_top_offset()));
4362   if (var_size_in_bytes == noreg) {
4363     lea(end, Address(obj, con_size_in_bytes));
4364   } else {
4365     lea(end, Address(obj, var_size_in_bytes, Address::times_1));
4366   }
4367   cmpptr(end, Address(thread, JavaThread::tlab_end_offset()));
4368   jcc(Assembler::above, slow_case);
4369 
4370   // update the tlab top pointer
4371   movptr(Address(thread, JavaThread::tlab_top_offset()), end);
4372 
4373   // recover var_size_in_bytes if necessary
4374   if (var_size_in_bytes == end) {
4375     subptr(var_size_in_bytes, obj);
4376   }
4377   verify_tlab();
4378 }
4379 
4380 // Preserves rbx and rdx.
4381 Register MacroAssembler::tlab_refill(Label& retry,
4382                                      Label& try_eden,
4383                                      Label& slow_case) {
4384   Register top = rax;
4385   Register t1  = rcx;
4386   Register t2  = rsi;
4387   Register thread_reg = NOT_LP64(rdi) LP64_ONLY(r15_thread);
4388   assert_different_registers(top, thread_reg, t1, t2, /* preserve: */ rbx, rdx);
4389   Label do_refill, discard_tlab;
4390 
4391   if (!Universe::heap()->supports_inline_contig_alloc()) {
4392     // No allocation in the shared eden.
4393     jmp(slow_case);
4394   }
4395 
4396   NOT_LP64(get_thread(thread_reg));
4397 
4398   movptr(top, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
4399   movptr(t1,  Address(thread_reg, in_bytes(JavaThread::tlab_end_offset())));
4400 
4401   // calculate amount of free space
4402   subptr(t1, top);
4403   shrptr(t1, LogHeapWordSize);
4404 
4405   // Retain tlab and allocate object in shared space if
4406   // the amount free in the tlab is too large to discard.
4407   cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_refill_waste_limit_offset())));
4408   jcc(Assembler::lessEqual, discard_tlab);
4409 
4410   // Retain
4411   // %%% yuck as movptr...
4412   movptr(t2, (int32_t) ThreadLocalAllocBuffer::refill_waste_limit_increment());
4413   addptr(Address(thread_reg, in_bytes(JavaThread::tlab_refill_waste_limit_offset())), t2);
4414   if (TLABStats) {
4415     // increment number of slow_allocations
4416     addl(Address(thread_reg, in_bytes(JavaThread::tlab_slow_allocations_offset())), 1);
4417   }
4418   jmp(try_eden);
4419 
4420   bind(discard_tlab);
4421   if (TLABStats) {
4422     // increment number of refills
4423     addl(Address(thread_reg, in_bytes(JavaThread::tlab_number_of_refills_offset())), 1);
4424     // accumulate wastage -- t1 is amount free in tlab
4425     addl(Address(thread_reg, in_bytes(JavaThread::tlab_fast_refill_waste_offset())), t1);
4426   }
4427 
4428   // if tlab is currently allocated (top or end != null) then
4429   // fill [top, end + alignment_reserve) with array object
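       // Illustrative sketch of the filler laid down below (C-like, not VM source):
       // the leftover space is formatted as an int[] so heap walkers can step over
       // the retired TLAB:
       //
       //   filler.mark   = markOopDesc::prototype()->copy_set_hash(0x2);
       //   filler.length = (free_words - typeArrayOopDesc::header_size(T_INT)
       //                    + alignment_reserve) * (HeapWordSize / sizeof(jint));
       //   filler.klass  = intArrayKlass;           // stored last, for concurrent GC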
4430   testptr(top, top);
4431   jcc(Assembler::zero, do_refill);
4432 
4433   // set up the mark word
4434   movptr(Address(top, oopDesc::mark_offset_in_bytes()), (intptr_t)markOopDesc::prototype()->copy_set_hash(0x2));
4435   // set the length to the remaining space
4436   subptr(t1, typeArrayOopDesc::header_size(T_INT));
4437   addptr(t1, (int32_t)ThreadLocalAllocBuffer::alignment_reserve());
4438   shlptr(t1, log2_intptr(HeapWordSize/sizeof(jint)));
4439   movl(Address(top, arrayOopDesc::length_offset_in_bytes()), t1);
4440   // set klass to intArrayKlass
4441   // dubious reloc why not an oop reloc?
4442   movptr(t1, ExternalAddress((address)Universe::intArrayKlassObj_addr()));
4443   // store klass last.  concurrent GCs assume the length is valid if the
4444   // klass field is not null.
4445   store_klass(top, t1);
4446 
4447   movptr(t1, top);
4448   subptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_start_offset())));
4449   incr_allocated_bytes(thread_reg, t1, 0);
4450 
4451   // refill the tlab with an eden allocation
4452   bind(do_refill);
4453   movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_size_offset())));
4454   shlptr(t1, LogHeapWordSize);
4455   // allocate new tlab, address returned in top
4456   eden_allocate(top, t1, 0, t2, slow_case);
4457 
4458   // Check that t1 was preserved in eden_allocate.
4459 #ifdef ASSERT
4460   if (UseTLAB) {
4461     Label ok;
4462     Register tsize = rsi;
4463     assert_different_registers(tsize, thread_reg, t1);
4464     push(tsize);
4465     movptr(tsize, Address(thread_reg, in_bytes(JavaThread::tlab_size_offset())));
4466     shlptr(tsize, LogHeapWordSize);
4467     cmpptr(t1, tsize);
4468     jcc(Assembler::equal, ok);
4469     STOP("assert(t1 == tlab size)");
4470     should_not_reach_here();
4471 
4472     bind(ok);
4473     pop(tsize);
4474   }
4475 #endif
4476   movptr(Address(thread_reg, in_bytes(JavaThread::tlab_start_offset())), top);
4477   movptr(Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())), top);
4478   addptr(top, t1);
4479   subptr(top, (int32_t)ThreadLocalAllocBuffer::alignment_reserve_in_bytes());
4480   movptr(Address(thread_reg, in_bytes(JavaThread::tlab_end_offset())), top);
4481   verify_tlab();
4482   jmp(retry);
4483 
4484   return thread_reg; // for use by caller
4485 }
4486 
4487 void MacroAssembler::incr_allocated_bytes(Register thread,
4488                                           Register var_size_in_bytes,
4489                                           int con_size_in_bytes,
4490                                           Register t1) {
4491   if (!thread->is_valid()) {
4492 #ifdef _LP64
4493     thread = r15_thread;
4494 #else
4495     assert(t1->is_valid(), "need temp reg");
4496     thread = t1;
4497     get_thread(thread);
4498 #endif
4499   }
4500 
4501 #ifdef _LP64
4502   if (var_size_in_bytes->is_valid()) {
4503     addq(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), var_size_in_bytes);
4504   } else {
4505     addq(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), con_size_in_bytes);
4506   }
4507 #else
4508   if (var_size_in_bytes->is_valid()) {
4509     addl(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), var_size_in_bytes);
4510   } else {
4511     addl(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), con_size_in_bytes);
4512   }
4513   adcl(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())+4), 0);
4514 #endif
4515 }
4516 
4517 void MacroAssembler::fp_runtime_fallback(address runtime_entry, int nb_args, int num_fpu_regs_in_use) {
4518   pusha();
4519 
4520   // if we are coming from c1, xmm registers may be live
4521   int off = 0;
4522   if (UseSSE == 1)  {
4523     subptr(rsp, sizeof(jdouble)*8);
4524     movflt(Address(rsp,off++*sizeof(jdouble)),xmm0);
4525     movflt(Address(rsp,off++*sizeof(jdouble)),xmm1);
4526     movflt(Address(rsp,off++*sizeof(jdouble)),xmm2);
4527     movflt(Address(rsp,off++*sizeof(jdouble)),xmm3);
4528     movflt(Address(rsp,off++*sizeof(jdouble)),xmm4);
4529     movflt(Address(rsp,off++*sizeof(jdouble)),xmm5);
4530     movflt(Address(rsp,off++*sizeof(jdouble)),xmm6);
4531     movflt(Address(rsp,off++*sizeof(jdouble)),xmm7);
4532   } else if (UseSSE >= 2)  {
4533     if (UseAVX > 2) {
4534       movl(rbx, 0xffff);
4535 #ifdef _LP64
4536       kmovql(k1, rbx);
4537 #else
4538       kmovdl(k1, rbx);
4539 #endif
4540     }
4541 #ifdef COMPILER2
4542     if (MaxVectorSize > 16) {
4543       assert(UseAVX > 0, "256bit vectors are supported only with AVX");
4544       // Save upper half of YMM registers
4545       subptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8));
4546       vextractf128h(Address(rsp,  0),xmm0);
4547       vextractf128h(Address(rsp, 16),xmm1);
4548       vextractf128h(Address(rsp, 32),xmm2);
4549       vextractf128h(Address(rsp, 48),xmm3);
4550       vextractf128h(Address(rsp, 64),xmm4);
4551       vextractf128h(Address(rsp, 80),xmm5);
4552       vextractf128h(Address(rsp, 96),xmm6);
4553       vextractf128h(Address(rsp,112),xmm7);
4554 #ifdef _LP64
4555       vextractf128h(Address(rsp,128),xmm8);
4556       vextractf128h(Address(rsp,144),xmm9);
4557       vextractf128h(Address(rsp,160),xmm10);
4558       vextractf128h(Address(rsp,176),xmm11);
4559       vextractf128h(Address(rsp,192),xmm12);
4560       vextractf128h(Address(rsp,208),xmm13);
4561       vextractf128h(Address(rsp,224),xmm14);
4562       vextractf128h(Address(rsp,240),xmm15);
4563 #endif
4564     }
4565 #endif
4566     // Save whole 128bit (16 bytes) XMM registers
4567     subptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8));
4568     movdqu(Address(rsp,off++*16),xmm0);
4569     movdqu(Address(rsp,off++*16),xmm1);
4570     movdqu(Address(rsp,off++*16),xmm2);
4571     movdqu(Address(rsp,off++*16),xmm3);
4572     movdqu(Address(rsp,off++*16),xmm4);
4573     movdqu(Address(rsp,off++*16),xmm5);
4574     movdqu(Address(rsp,off++*16),xmm6);
4575     movdqu(Address(rsp,off++*16),xmm7);
4576 #ifdef _LP64
4577     movdqu(Address(rsp,off++*16),xmm8);
4578     movdqu(Address(rsp,off++*16),xmm9);
4579     movdqu(Address(rsp,off++*16),xmm10);
4580     movdqu(Address(rsp,off++*16),xmm11);
4581     movdqu(Address(rsp,off++*16),xmm12);
4582     movdqu(Address(rsp,off++*16),xmm13);
4583     movdqu(Address(rsp,off++*16),xmm14);
4584     movdqu(Address(rsp,off++*16),xmm15);
4585 #endif
4586   }
4587 
4588   // Preserve registers across runtime call
4589   int incoming_argument_and_return_value_offset = -1;
4590   if (num_fpu_regs_in_use > 1) {
4591     // Must preserve all other FPU regs (could alternatively convert
4592     // SharedRuntime::dsin, dcos etc. into assembly routines known not to trash
4593     // FPU state, but cannot trust the C compiler)
4594     NEEDS_CLEANUP;
4595     // NOTE that in this case we also push the incoming argument(s) to
4596     // the stack and restore it later; we also use this stack slot to
4597     // hold the return value from dsin, dcos etc.
4598     for (int i = 0; i < num_fpu_regs_in_use; i++) {
4599       subptr(rsp, sizeof(jdouble));
4600       fstp_d(Address(rsp, 0));
4601     }
4602     incoming_argument_and_return_value_offset = sizeof(jdouble)*(num_fpu_regs_in_use-1);
4603     for (int i = nb_args-1; i >= 0; i--) {
4604       fld_d(Address(rsp, incoming_argument_and_return_value_offset-i*sizeof(jdouble)));
4605     }
4606   }
4607 
4608   subptr(rsp, nb_args*sizeof(jdouble));
4609   for (int i = 0; i < nb_args; i++) {
4610     fstp_d(Address(rsp, i*sizeof(jdouble)));
4611   }
4612 
4613 #ifdef _LP64
4614   if (nb_args > 0) {
4615     movdbl(xmm0, Address(rsp, 0));
4616   }
4617   if (nb_args > 1) {
4618     movdbl(xmm1, Address(rsp, sizeof(jdouble)));
4619   }
4620   assert(nb_args <= 2, "unsupported number of args");
4621 #endif // _LP64
4622 
4623   // NOTE: we must not use call_VM_leaf here because that requires a
4624   // complete interpreter frame in debug mode -- same bug as 4387334
4625   // MacroAssembler::call_VM_leaf_base is perfectly safe and will
4626   // follow the proper 64-bit ABI
4627 
4628   NEEDS_CLEANUP;
4629   // Need to add stack banging before this runtime call if it needs to
4630   // be taken; however, there is no generic stack banging routine at
4631   // the MacroAssembler level
4632 
4633   MacroAssembler::call_VM_leaf_base(runtime_entry, 0);
4634 
4635 #ifdef _LP64
4636   movsd(Address(rsp, 0), xmm0);
4637   fld_d(Address(rsp, 0));
4638 #endif // _LP64
4639   addptr(rsp, sizeof(jdouble) * nb_args);
4640   if (num_fpu_regs_in_use > 1) {
4641     // Must save return value to stack and then restore entire FPU
4642     // stack except incoming arguments
4643     fstp_d(Address(rsp, incoming_argument_and_return_value_offset));
4644     for (int i = 0; i < num_fpu_regs_in_use - nb_args; i++) {
4645       fld_d(Address(rsp, 0));
4646       addptr(rsp, sizeof(jdouble));
4647     }
4648     fld_d(Address(rsp, (nb_args-1)*sizeof(jdouble)));
4649     addptr(rsp, sizeof(jdouble) * nb_args);
4650   }
4651 
4652   off = 0;
4653   if (UseSSE == 1)  {
4654     movflt(xmm0, Address(rsp,off++*sizeof(jdouble)));
4655     movflt(xmm1, Address(rsp,off++*sizeof(jdouble)));
4656     movflt(xmm2, Address(rsp,off++*sizeof(jdouble)));
4657     movflt(xmm3, Address(rsp,off++*sizeof(jdouble)));
4658     movflt(xmm4, Address(rsp,off++*sizeof(jdouble)));
4659     movflt(xmm5, Address(rsp,off++*sizeof(jdouble)));
4660     movflt(xmm6, Address(rsp,off++*sizeof(jdouble)));
4661     movflt(xmm7, Address(rsp,off++*sizeof(jdouble)));
4662     addptr(rsp, sizeof(jdouble)*8);
4663   } else if (UseSSE >= 2)  {
4664     // Restore whole 128bit (16 bytes) XMM registers
4665     movdqu(xmm0, Address(rsp,off++*16));
4666     movdqu(xmm1, Address(rsp,off++*16));
4667     movdqu(xmm2, Address(rsp,off++*16));
4668     movdqu(xmm3, Address(rsp,off++*16));
4669     movdqu(xmm4, Address(rsp,off++*16));
4670     movdqu(xmm5, Address(rsp,off++*16));
4671     movdqu(xmm6, Address(rsp,off++*16));
4672     movdqu(xmm7, Address(rsp,off++*16));
4673 #ifdef _LP64
4674     movdqu(xmm8, Address(rsp,off++*16));
4675     movdqu(xmm9, Address(rsp,off++*16));
4676     movdqu(xmm10, Address(rsp,off++*16));
4677     movdqu(xmm11, Address(rsp,off++*16));
4678     movdqu(xmm12, Address(rsp,off++*16));
4679     movdqu(xmm13, Address(rsp,off++*16));
4680     movdqu(xmm14, Address(rsp,off++*16));
4681     movdqu(xmm15, Address(rsp,off++*16));
4682 #endif
4683     addptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8));
4684 #ifdef COMPILER2
4685     if (MaxVectorSize > 16) {
4686       // Restore upper half of YMM registers.
4687       vinsertf128h(xmm0, Address(rsp,  0));
4688       vinsertf128h(xmm1, Address(rsp, 16));
4689       vinsertf128h(xmm2, Address(rsp, 32));
4690       vinsertf128h(xmm3, Address(rsp, 48));
4691       vinsertf128h(xmm4, Address(rsp, 64));
4692       vinsertf128h(xmm5, Address(rsp, 80));
4693       vinsertf128h(xmm6, Address(rsp, 96));
4694       vinsertf128h(xmm7, Address(rsp,112));
4695 #ifdef _LP64
4696       vinsertf128h(xmm8, Address(rsp,128));
4697       vinsertf128h(xmm9, Address(rsp,144));
4698       vinsertf128h(xmm10, Address(rsp,160));
4699       vinsertf128h(xmm11, Address(rsp,176));
4700       vinsertf128h(xmm12, Address(rsp,192));
4701       vinsertf128h(xmm13, Address(rsp,208));
4702       vinsertf128h(xmm14, Address(rsp,224));
4703       vinsertf128h(xmm15, Address(rsp,240));
4704 #endif
4705       addptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8));
4706     }
4707 #endif
4708   }
4709   popa();
4710 }
4711 
4712 static const double     pi_4 =  0.7853981633974483;
4713 
4714 void MacroAssembler::trigfunc(char trig, int num_fpu_regs_in_use) {
4715   // A hand-coded argument reduction for values in fabs(pi/4, pi/2)
4716   // was attempted in this code; unfortunately it appears that the
4717   // switch to 80-bit precision and back causes this to be
4718   // unprofitable compared with simply performing a runtime call if
4719   // the argument is out of the (-pi/4, pi/4) range.
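       // Illustrative control flow of this intrinsic (C-like sketch, not VM source):
       //
       //   if (fabs(x) <= pi/4) {
       //     result = fsin/fcos/ftan(x);                  // single x87 instruction
       //   } else {
       //     result = SharedRuntime::dsin/dcos/dtan(x);   // fp_runtime_fallback below
       //   }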
4720 
4721   Register tmp = noreg;
4722   if (!VM_Version::supports_cmov()) {
4723     // fcmp needs a temporary, so preserve rbx
4724     tmp = rbx;
4725     push(tmp);
4726   }
4727 
4728   Label slow_case, done;
4729 
4730   ExternalAddress pi4_adr = (address)&pi_4;
4731   if (reachable(pi4_adr)) {
4732     // x ?<= pi/4
4733     fld_d(pi4_adr);
4734     fld_s(1);                // Stack:  X  PI/4  X
4735     fabs();                  // Stack: |X| PI/4  X
4736     fcmp(tmp);
4737     jcc(Assembler::above, slow_case);
4738 
4739     // fastest case: -pi/4 <= x <= pi/4
4740     switch(trig) {
4741     case 's':
4742       fsin();
4743       break;
4744     case 'c':
4745       fcos();
4746       break;
4747     case 't':
4748       ftan();
4749       break;
4750     default:
4751       assert(false, "bad intrinsic");
4752       break;
4753     }
4754     jmp(done);
4755   }
4756 
4757   // slow case: runtime call
4758   bind(slow_case);
4759 
4760   switch(trig) {
4761   case 's':
4762     {
4763       fp_runtime_fallback(CAST_FROM_FN_PTR(address, SharedRuntime::dsin), 1, num_fpu_regs_in_use);
4764     }
4765     break;
4766   case 'c':
4767     {
4768       fp_runtime_fallback(CAST_FROM_FN_PTR(address, SharedRuntime::dcos), 1, num_fpu_regs_in_use);
4769     }
4770     break;
4771   case 't':
4772     {
4773       fp_runtime_fallback(CAST_FROM_FN_PTR(address, SharedRuntime::dtan), 1, num_fpu_regs_in_use);
4774     }
4775     break;
4776   default:
4777     assert(false, "bad intrinsic");
4778     break;
4779   }
4780 
4781   // Come here with result in F-TOS
4782   bind(done);
4783 
4784   if (tmp != noreg) {
4785     pop(tmp);
4786   }
4787 }
4788 
4789 
4790 // Look up the method for a megamorphic invokeinterface call.
4791 // The target method is determined by <intf_klass, itable_index>.
4792 // The receiver klass is in recv_klass.
4793 // On success, the result will be in method_result, and execution falls through.
4794 // On failure, execution transfers to the given label.
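     // Illustrative sketch of the scan performed below (C-like, not VM source; the
     // itable sits right after the vtable in the klass and is a NULL-terminated
     // list of (interface, offset) pairs, with 'offset' locating that interface's
     // block of method entries):
     //
     //   for (e = start_of_itable(recv_klass); ; e += scan_step) {
     //     if (e->interface() == intf_klass) break;
     //     if (e->interface() == NULL) goto L_no_such_interface;
     //   }
     //   method_result = *(Method**)((address)recv_klass + e->offset()
     //                               + itable_index * wordSize + itentry_off);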
4795 void MacroAssembler::lookup_interface_method(Register recv_klass,
4796                                              Register intf_klass,
4797                                              RegisterOrConstant itable_index,
4798                                              Register method_result,
4799                                              Register scan_temp,
4800                                              Label& L_no_such_interface) {
4801   assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);
4802   assert(itable_index.is_constant() || itable_index.as_register() == method_result,
4803          "caller must use same register for non-constant itable index as for method");
4804 
4805   // Compute start of first itableOffsetEntry (which is at the end of the vtable)
4806   int vtable_base = InstanceKlass::vtable_start_offset() * wordSize;
4807   int itentry_off = itableMethodEntry::method_offset_in_bytes();
4808   int scan_step   = itableOffsetEntry::size() * wordSize;
4809   int vte_size    = vtableEntry::size() * wordSize;
4810   Address::ScaleFactor times_vte_scale = Address::times_ptr;
4811   assert(vte_size == wordSize, "else adjust times_vte_scale");
4812 
4813   movl(scan_temp, Address(recv_klass, InstanceKlass::vtable_length_offset() * wordSize));
4814 
4815   // %%% Could store the aligned, prescaled offset in the klassoop.
4816   lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
4817   if (HeapWordsPerLong > 1) {
4818     // Round up to align_object_offset boundary
4819     // see code for InstanceKlass::start_of_itable!
4820     round_to(scan_temp, BytesPerLong);
4821   }
4822 
4823   // Adjust recv_klass by scaled itable_index, so we can free itable_index.
4824   assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
4825   lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
4826 
4827   // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
4828   //   if (scan->interface() == intf) {
4829   //     result = (klass + scan->offset() + itable_index);
4830   //   }
4831   // }
4832   Label search, found_method;
4833 
4834   for (int peel = 1; peel >= 0; peel--) {
4835     movptr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
4836     cmpptr(intf_klass, method_result);
4837 
4838     if (peel) {
4839       jccb(Assembler::equal, found_method);
4840     } else {
4841       jccb(Assembler::notEqual, search);
4842       // (invert the test to fall through to found_method...)
4843     }
4844 
4845     if (!peel)  break;
4846 
4847     bind(search);
4848 
4849     // Check that the previous entry is non-null.  A null entry means that
4850     // the receiver class doesn't implement the interface, and wasn't the
4851     // same as when the caller was compiled.
4852     testptr(method_result, method_result);
4853     jcc(Assembler::zero, L_no_such_interface);
4854     addptr(scan_temp, scan_step);
4855   }
4856 
4857   bind(found_method);
4858 
4859   // Got a hit.
4860   movl(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
4861   movptr(method_result, Address(recv_klass, scan_temp, Address::times_1));
4862 }
4863 
4864 
4865 // virtual method calling
4866 void MacroAssembler::lookup_virtual_method(Register recv_klass,
4867                                            RegisterOrConstant vtable_index,
4868                                            Register method_result) {
4869   const int base = InstanceKlass::vtable_start_offset() * wordSize;
4870   assert(vtableEntry::size() * wordSize == wordSize, "else adjust the scaling in the code below");
4871   Address vtable_entry_addr(recv_klass,
4872                             vtable_index, Address::times_ptr,
4873                             base + vtableEntry::method_offset_in_bytes());
4874   movptr(method_result, vtable_entry_addr);
4875 }
4876 
4877 
4878 void MacroAssembler::check_klass_subtype(Register sub_klass,
4879                            Register super_klass,
4880                            Register temp_reg,
4881                            Label& L_success) {
4882   Label L_failure;
4883   check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, NULL);
4884   check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
4885   bind(L_failure);
4886 }
4887 
4888 
4889 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
4890                                                    Register super_klass,
4891                                                    Register temp_reg,
4892                                                    Label* L_success,
4893                                                    Label* L_failure,
4894                                                    Label* L_slow_path,
4895                                         RegisterOrConstant super_check_offset) {
4896   assert_different_registers(sub_klass, super_klass, temp_reg);
4897   bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
4898   if (super_check_offset.is_register()) {
4899     assert_different_registers(sub_klass, super_klass,
4900                                super_check_offset.as_register());
4901   } else if (must_load_sco) {
4902     assert(temp_reg != noreg, "supply either a temp or a register offset");
4903   }
4904 
4905   Label L_fallthrough;
4906   int label_nulls = 0;
4907   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
4908   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
4909   if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
4910   assert(label_nulls <= 1, "at most one NULL in the batch");
4911 
4912   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
4913   int sco_offset = in_bytes(Klass::super_check_offset_offset());
4914   Address super_check_offset_addr(super_klass, sco_offset);
4915 
4916   // Hacked jcc, which "knows" that L_fallthrough, at least, is in
4917   // range of a jccb.  If this routine grows larger, reconsider at
4918   // least some of these.
4919 #define local_jcc(assembler_cond, label)                                \
4920   if (&(label) == &L_fallthrough)  jccb(assembler_cond, label);         \
4921   else                             jcc( assembler_cond, label) /*omit semi*/
4922 
4923   // Hacked jmp, which may only be used just before L_fallthrough.
4924 #define final_jmp(label)                                                \
4925   if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
4926   else                            jmp(label)                /*omit semi*/
4927 
4928   // If the pointers are equal, we are done (e.g., String[] elements).
4929   // This self-check enables sharing of secondary supertype arrays among
4930   // non-primary types such as array-of-interface.  Otherwise, each such
4931   // type would need its own customized SSA.
4932   // We move this check to the front of the fast path because many
4933   // type checks are in fact trivially successful in this manner,
4934   // so we get a nicely predicted branch right at the start of the check.
4935   cmpptr(sub_klass, super_klass);
4936   local_jcc(Assembler::equal, *L_success);
4937 
4938   // Check the supertype display:
4939   if (must_load_sco) {
4940     // Positive movl does right thing on LP64.
4941     movl(temp_reg, super_check_offset_addr);
4942     super_check_offset = RegisterOrConstant(temp_reg);
4943   }
4944   Address super_check_addr(sub_klass, super_check_offset, Address::times_1, 0);
4945   cmpptr(super_klass, super_check_addr); // load displayed supertype
4946 
4947   // This check has worked decisively for primary supers.
4948   // Secondary supers are sought in the super_cache ('super_cache_addr').
4949   // (Secondary supers are interfaces and very deeply nested subtypes.)
4950   // This works in the same check above because of a tricky aliasing
4951   // between the super_cache and the primary super display elements.
4952   // (The 'super_check_addr' can address either, as the case requires.)
4953   // Note that the cache is updated below if it does not help us find
4954   // what we need immediately.
4955   // So if it was a primary super, we can just fail immediately.
4956   // Otherwise, it's the slow path for us (no success at this point).
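       // Summarized as a C-like sketch (illustrative, not VM source; 'sco' is the
       // super_check_offset value and 'sc_offset' is the secondary-super cache
       // offset):
       //
       //   if (sub_klass == super_klass)                              return success;
       //   if (*(Klass**)((address)sub_klass + sco) == super_klass)   return success;
       //   // miss: a primary-super miss is final, a cache miss is not
       //   return (sco == sc_offset) ? slow_path : failure;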
4957 
4958   if (super_check_offset.is_register()) {
4959     local_jcc(Assembler::equal, *L_success);
4960     cmpl(super_check_offset.as_register(), sc_offset);
4961     if (L_failure == &L_fallthrough) {
4962       local_jcc(Assembler::equal, *L_slow_path);
4963     } else {
4964       local_jcc(Assembler::notEqual, *L_failure);
4965       final_jmp(*L_slow_path);
4966     }
4967   } else if (super_check_offset.as_constant() == sc_offset) {
4968     // Need a slow path; fast failure is impossible.
4969     if (L_slow_path == &L_fallthrough) {
4970       local_jcc(Assembler::equal, *L_success);
4971     } else {
4972       local_jcc(Assembler::notEqual, *L_slow_path);
4973       final_jmp(*L_success);
4974     }
4975   } else {
4976     // No slow path; it's a fast decision.
4977     if (L_failure == &L_fallthrough) {
4978       local_jcc(Assembler::equal, *L_success);
4979     } else {
4980       local_jcc(Assembler::notEqual, *L_failure);
4981       final_jmp(*L_success);
4982     }
4983   }
4984 
4985   bind(L_fallthrough);
4986 
4987 #undef local_jcc
4988 #undef final_jmp
4989 }
4990 
4991 
4992 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
4993                                                    Register super_klass,
4994                                                    Register temp_reg,
4995                                                    Register temp2_reg,
4996                                                    Label* L_success,
4997                                                    Label* L_failure,
4998                                                    bool set_cond_codes) {
4999   assert_different_registers(sub_klass, super_klass, temp_reg);
5000   if (temp2_reg != noreg)
5001     assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg);
5002 #define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)
5003 
5004   Label L_fallthrough;
5005   int label_nulls = 0;
5006   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
5007   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
5008   assert(label_nulls <= 1, "at most one NULL in the batch");
5009 
5010   // a couple of useful fields in sub_klass:
5011   int ss_offset = in_bytes(Klass::secondary_supers_offset());
5012   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
5013   Address secondary_supers_addr(sub_klass, ss_offset);
5014   Address super_cache_addr(     sub_klass, sc_offset);
5015 
5016   // Do a linear scan of the secondary super-klass chain.
5017   // This code is rarely used, so simplicity is a virtue here.
5018   // The repne_scan instruction uses fixed registers, which we must spill.
5019   // Don't worry too much about pre-existing connections with the input regs.
5020 
5021   assert(sub_klass != rax, "killed reg"); // killed by mov(rax, super)
5022   assert(sub_klass != rcx, "killed reg"); // killed by lea(rcx, &pst_counter)
5023 
5024   // Get super_klass value into rax (even if it was in rdi or rcx).
5025   bool pushed_rax = false, pushed_rcx = false, pushed_rdi = false;
5026   if (super_klass != rax || UseCompressedOops) {
5027     if (!IS_A_TEMP(rax)) { push(rax); pushed_rax = true; }
5028     mov(rax, super_klass);
5029   }
5030   if (!IS_A_TEMP(rcx)) { push(rcx); pushed_rcx = true; }
5031   if (!IS_A_TEMP(rdi)) { push(rdi); pushed_rdi = true; }
5032 
5033 #ifndef PRODUCT
5034   int* pst_counter = &SharedRuntime::_partial_subtype_ctr;
5035   ExternalAddress pst_counter_addr((address) pst_counter);
5036   NOT_LP64(  incrementl(pst_counter_addr) );
5037   LP64_ONLY( lea(rcx, pst_counter_addr) );
5038   LP64_ONLY( incrementl(Address(rcx, 0)) );
5039 #endif //PRODUCT
5040 
5041   // We will consult the secondary-super array.
5042   movptr(rdi, secondary_supers_addr);
5043   // Load the array length.  (Positive movl does right thing on LP64.)
5044   movl(rcx, Address(rdi, Array<Klass*>::length_offset_in_bytes()));
5045   // Skip to start of data.
5046   addptr(rdi, Array<Klass*>::base_offset_in_bytes());
5047 
5048   // Scan RCX words at [RDI] for an occurrence of RAX.
5049   // Set NZ/Z based on last compare.
5050   // Z flag value will not be set by 'repne' if RCX == 0 since 'repne' itself
5051   // does not change flags (only the repeated scas instruction sets them).
5052   // Set Z = 0 (not equal) before 'repne' to indicate that class was not found.
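       // Illustrative equivalent of the scan below (C-like, not VM source):
       //
       //   Array<Klass*>* ss = sub_klass->secondary_supers();
       //   for (int i = 0; i < ss->length(); i++) {
       //     if (ss->at(i) == super_klass) {
       //       // cache the hit (done below) and report success
       //       return success;
       //     }
       //   }
       //   return failure;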
5053 
5054   testptr(rax, rax); // Set Z = 0
5055   repne_scan();
5056 
5057   // Unspill the temp. registers:
5058   if (pushed_rdi)  pop(rdi);
5059   if (pushed_rcx)  pop(rcx);
5060   if (pushed_rax)  pop(rax);
5061 
5062   if (set_cond_codes) {
5063     // Special hack for the AD files:  rdi is guaranteed non-zero.
5064     assert(!pushed_rdi, "rdi must be left non-NULL");
5065     // Also, the condition codes are properly set Z/NZ on succeed/failure.
5066   }
5067 
5068   if (L_failure == &L_fallthrough)
5069         jccb(Assembler::notEqual, *L_failure);
5070   else  jcc(Assembler::notEqual, *L_failure);
5071 
5072   // Success.  Cache the super we found and proceed in triumph.
5073   movptr(super_cache_addr, super_klass);
5074 
5075   if (L_success != &L_fallthrough) {
5076     jmp(*L_success);
5077   }
5078 
5079 #undef IS_A_TEMP
5080 
5081   bind(L_fallthrough);
5082 }
5083 
5084 
5085 void MacroAssembler::cmov32(Condition cc, Register dst, Address src) {
5086   if (VM_Version::supports_cmov()) {
5087     cmovl(cc, dst, src);
5088   } else {
5089     Label L;
5090     jccb(negate_condition(cc), L);
5091     movl(dst, src);
5092     bind(L);
5093   }
5094 }
5095 
5096 void MacroAssembler::cmov32(Condition cc, Register dst, Register src) {
5097   if (VM_Version::supports_cmov()) {
5098     cmovl(cc, dst, src);
5099   } else {
5100     Label L;
5101     jccb(negate_condition(cc), L);
5102     movl(dst, src);
5103     bind(L);
5104   }
5105 }
5106 
5107 void MacroAssembler::verify_oop(Register reg, const char* s) {
5108   if (!VerifyOops) return;
5109 
5110   // Pass register number to verify_oop_subroutine
5111   const char* b = NULL;
5112   {
5113     ResourceMark rm;
5114     stringStream ss;
5115     ss.print("verify_oop: %s: %s", reg->name(), s);
5116     b = code_string(ss.as_string());
5117   }
5118   BLOCK_COMMENT("verify_oop {");
5119 #ifdef _LP64
5120   push(rscratch1);                    // save r10, trashed by movptr()
5121 #endif
5122   push(rax);                          // save rax
5123   push(reg);                          // pass register argument
5124   ExternalAddress buffer((address) b);
5125   // avoid using pushptr, as it modifies scratch registers
5126   // and our contract is not to modify anything
5127   movptr(rax, buffer.addr());
5128   push(rax);
5129   // call indirectly to solve generation ordering problem
5130   movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
5131   call(rax);
5132   // Caller pops the arguments (oop, message) and restores rax, r10
5133   BLOCK_COMMENT("} verify_oop");
5134 }
5135 
5136 
5137 RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
5138                                                       Register tmp,
5139                                                       int offset) {
5140   intptr_t value = *delayed_value_addr;
5141   if (value != 0)
5142     return RegisterOrConstant(value + offset);
5143 
5144   // load indirectly to solve generation ordering problem
5145   movptr(tmp, ExternalAddress((address) delayed_value_addr));
5146 
5147 #ifdef ASSERT
5148   { Label L;
5149     testptr(tmp, tmp);
5150     if (WizardMode) {
5151       const char* buf = NULL;
5152       {
5153         ResourceMark rm;
5154         stringStream ss;
5155         ss.print("DelayedValue=" INTPTR_FORMAT, delayed_value_addr[1]);
5156         buf = code_string(ss.as_string());
5157       }
5158       jcc(Assembler::notZero, L);
5159       STOP(buf);
5160     } else {
5161       jccb(Assembler::notZero, L);
5162       hlt();
5163     }
5164     bind(L);
5165   }
5166 #endif
5167 
5168   if (offset != 0)
5169     addptr(tmp, offset);
5170 
5171   return RegisterOrConstant(tmp);
5172 }
5173 
5174 
5175 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
5176                                          int extra_slot_offset) {
5177   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
5178   int stackElementSize = Interpreter::stackElementSize;
5179   int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
5180 #ifdef ASSERT
5181   int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
5182   assert(offset1 - offset == stackElementSize, "correct arithmetic");
5183 #endif
5184   Register             scale_reg    = noreg;
5185   Address::ScaleFactor scale_factor = Address::no_scale;
5186   if (arg_slot.is_constant()) {
5187     offset += arg_slot.as_constant() * stackElementSize;
5188   } else {
5189     scale_reg    = arg_slot.as_register();
5190     scale_factor = Address::times(stackElementSize);
5191   }
5192   offset += wordSize;           // return PC is on stack
5193   return Address(rsp, scale_reg, scale_factor, offset);
5194 }
5195 
5196 
5197 void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
5198   if (!VerifyOops) return;
5199 
5200   // Address adjust(addr.base(), addr.index(), addr.scale(), addr.disp() + BytesPerWord);
5201   // Pass register number to verify_oop_subroutine
5202   const char* b = NULL;
5203   {
5204     ResourceMark rm;
5205     stringStream ss;
5206     ss.print("verify_oop_addr: %s", s);
5207     b = code_string(ss.as_string());
5208   }
5209 #ifdef _LP64
5210   push(rscratch1);                    // save r10, trashed by movptr()
5211 #endif
5212   push(rax);                          // save rax
5213   // addr may contain rsp so we will have to adjust it based on the push
5214   // we just did (and on 64 bit we do two pushes)
5215   // NOTE: 64bit seemed to have had a bug in that it did movq(addr, rax); which
5216   // stores rax into addr which is backwards of what was intended.
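       // Worked example (illustrative): if the caller passed Address(rsp, 8), rsp
       // has since moved down by two words on 64-bit (rscratch1 and rax) or one
       // word on 32-bit (rax), so the slot the caller meant is now at
       //   [rsp + 8 + LP64_ONLY(2 *) BytesPerWord]
       // which is what the lea/pushptr pair below reconstructs.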
5217   if (addr.uses(rsp)) {
5218     lea(rax, addr);
5219     pushptr(Address(rax, LP64_ONLY(2 *) BytesPerWord));
5220   } else {
5221     pushptr(addr);
5222   }
5223 
5224   ExternalAddress buffer((address) b);
5225   // pass msg argument
5226   // avoid using pushptr, as it modifies scratch registers
5227   // and our contract is not to modify anything
5228   movptr(rax, buffer.addr());
5229   push(rax);
5230 
5231   // call indirectly to solve generation ordering problem
5232   movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
5233   call(rax);
5234   // Caller pops the arguments (addr, message) and restores rax, r10.
5235 }
5236 
5237 void MacroAssembler::verify_tlab() {
5238 #ifdef ASSERT
5239   if (UseTLAB && VerifyOops) {
5240     Label next, ok;
5241     Register t1 = rsi;
5242     Register thread_reg = NOT_LP64(rbx) LP64_ONLY(r15_thread);
5243 
5244     push(t1);
5245     NOT_LP64(push(thread_reg));
5246     NOT_LP64(get_thread(thread_reg));
5247 
5248     movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
5249     cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_start_offset())));
5250     jcc(Assembler::aboveEqual, next);
5251     STOP("assert(top >= start)");
5252     should_not_reach_here();
5253 
5254     bind(next);
5255     movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_end_offset())));
5256     cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
5257     jcc(Assembler::aboveEqual, ok);
5258     STOP("assert(top <= end)");
5259     should_not_reach_here();
5260 
5261     bind(ok);
5262     NOT_LP64(pop(thread_reg));
5263     pop(t1);
5264   }
5265 #endif
5266 }
5267 
5268 class ControlWord {
5269  public:
5270   int32_t _value;
5271 
5272   int  rounding_control() const        { return  (_value >> 10) & 3      ; }
5273   int  precision_control() const       { return  (_value >>  8) & 3      ; }
5274   bool precision() const               { return ((_value >>  5) & 1) != 0; }
5275   bool underflow() const               { return ((_value >>  4) & 1) != 0; }
5276   bool overflow() const                { return ((_value >>  3) & 1) != 0; }
5277   bool zero_divide() const             { return ((_value >>  2) & 1) != 0; }
5278   bool denormalized() const            { return ((_value >>  1) & 1) != 0; }
5279   bool invalid() const                 { return ((_value >>  0) & 1) != 0; }
5280 
5281   void print() const {
5282     // rounding control
5283     const char* rc;
5284     switch (rounding_control()) {
5285       case 0: rc = "round near"; break;
5286       case 1: rc = "round down"; break;
5287       case 2: rc = "round up  "; break;
5288       case 3: rc = "chop      "; break;
5289     }
5290     // precision control
5291     const char* pc;
5292     switch (precision_control()) {
5293       case 0: pc = "24 bits "; break;
5294       case 1: pc = "reserved"; break;
5295       case 2: pc = "53 bits "; break;
5296       case 3: pc = "64 bits "; break;
5297     }
5298     // flags
5299     char f[9];
5300     f[0] = ' ';
5301     f[1] = ' ';
5302     f[2] = (precision   ()) ? 'P' : 'p';
5303     f[3] = (underflow   ()) ? 'U' : 'u';
5304     f[4] = (overflow    ()) ? 'O' : 'o';
5305     f[5] = (zero_divide ()) ? 'Z' : 'z';
5306     f[6] = (denormalized()) ? 'D' : 'd';
5307     f[7] = (invalid     ()) ? 'I' : 'i';
5308     f[8] = '\x0';
5309     // output
5310     printf("%04x  masks = %s, %s, %s", _value & 0xFFFF, f, rc, pc);
5311   }
5312 
5313 };
5314 
5315 class StatusWord {
5316  public:
5317   int32_t _value;
5318 
5319   bool busy() const                    { return ((_value >> 15) & 1) != 0; }
5320   bool C3() const                      { return ((_value >> 14) & 1) != 0; }
5321   bool C2() const                      { return ((_value >> 10) & 1) != 0; }
5322   bool C1() const                      { return ((_value >>  9) & 1) != 0; }
5323   bool C0() const                      { return ((_value >>  8) & 1) != 0; }
5324   int  top() const                     { return  (_value >> 11) & 7      ; }
5325   bool error_status() const            { return ((_value >>  7) & 1) != 0; }
5326   bool stack_fault() const             { return ((_value >>  6) & 1) != 0; }
5327   bool precision() const               { return ((_value >>  5) & 1) != 0; }
5328   bool underflow() const               { return ((_value >>  4) & 1) != 0; }
5329   bool overflow() const                { return ((_value >>  3) & 1) != 0; }
5330   bool zero_divide() const             { return ((_value >>  2) & 1) != 0; }
5331   bool denormalized() const            { return ((_value >>  1) & 1) != 0; }
5332   bool invalid() const                 { return ((_value >>  0) & 1) != 0; }
5333 
5334   void print() const {
5335     // condition codes
5336     char c[5];
5337     c[0] = (C3()) ? '3' : '-';
5338     c[1] = (C2()) ? '2' : '-';
5339     c[2] = (C1()) ? '1' : '-';
5340     c[3] = (C0()) ? '0' : '-';
5341     c[4] = '\x0';
5342     // flags
5343     char f[9];
5344     f[0] = (error_status()) ? 'E' : '-';
5345     f[1] = (stack_fault ()) ? 'S' : '-';
5346     f[2] = (precision   ()) ? 'P' : '-';
5347     f[3] = (underflow   ()) ? 'U' : '-';
5348     f[4] = (overflow    ()) ? 'O' : '-';
5349     f[5] = (zero_divide ()) ? 'Z' : '-';
5350     f[6] = (denormalized()) ? 'D' : '-';
5351     f[7] = (invalid     ()) ? 'I' : '-';
5352     f[8] = '\x0';
5353     // output
5354     printf("%04x  flags = %s, cc =  %s, top = %d", _value & 0xFFFF, f, c, top());
5355   }
5356 
5357 };
5358 
5359 class TagWord {
5360  public:
5361   int32_t _value;
5362 
5363   int tag_at(int i) const              { return (_value >> (i*2)) & 3; }
5364 
5365   void print() const {
5366     printf("%04x", _value & 0xFFFF);
5367   }
5368 
5369 };
5370 
5371 class FPU_Register {
5372  public:
5373   int32_t _m0;
5374   int32_t _m1;
5375   int16_t _ex;
5376 
5377   bool is_indefinite() const           {
5378     return _ex == -1 && _m1 == (int32_t)0xC0000000 && _m0 == 0;
5379   }
5380 
5381   void print() const {
5382     char  sign = (_ex < 0) ? '-' : '+';
5383     const char* kind = (_ex == 0x7FFF || _ex == (int16_t)-1) ? "NaN" : "   ";
5384     printf("%c%04hx.%08x%08x  %s", sign, _ex, _m1, _m0, kind);
5385   };
5386 
5387 };
5388 
5389 class FPU_State {
5390  public:
5391   enum {
5392     register_size       = 10,
5393     number_of_registers =  8,
5394     register_mask       =  7
5395   };
5396 
5397   ControlWord  _control_word;
5398   StatusWord   _status_word;
5399   TagWord      _tag_word;
5400   int32_t      _error_offset;
5401   int32_t      _error_selector;
5402   int32_t      _data_offset;
5403   int32_t      _data_selector;
5404   int8_t       _register[register_size * number_of_registers];
5405 
5406   int tag_for_st(int i) const          { return _tag_word.tag_at((_status_word.top() + i) & register_mask); }
5407   FPU_Register* st(int i) const        { return (FPU_Register*)&_register[register_size * i]; }
5408 
5409   const char* tag_as_string(int tag) const {
5410     switch (tag) {
5411       case 0: return "valid";
5412       case 1: return "zero";
5413       case 2: return "special";
5414       case 3: return "empty";
5415     }
5416     ShouldNotReachHere();
5417     return NULL;
5418   }
5419 
5420   void print() const {
5421     // print computation registers
5422     { int t = _status_word.top();
5423       for (int i = 0; i < number_of_registers; i++) {
5424         int j = (i - t) & register_mask;
5425         printf("%c r%d = ST%d = ", (j == 0 ? '*' : ' '), i, j);
5426         st(j)->print();
5427         printf(" %s\n", tag_as_string(_tag_word.tag_at(i)));
5428       }
5429     }
5430     printf("\n");
5431     // print control registers
5432     printf("ctrl = "); _control_word.print(); printf("\n");
5433     printf("stat = "); _status_word .print(); printf("\n");
5434     printf("tags = "); _tag_word    .print(); printf("\n");
5435   }
5436 
5437 };
5438 
5439 class Flag_Register {
5440  public:
5441   int32_t _value;
5442 
5443   bool overflow() const                { return ((_value >> 11) & 1) != 0; }
5444   bool direction() const               { return ((_value >> 10) & 1) != 0; }
5445   bool sign() const                    { return ((_value >>  7) & 1) != 0; }
5446   bool zero() const                    { return ((_value >>  6) & 1) != 0; }
5447   bool auxiliary_carry() const         { return ((_value >>  4) & 1) != 0; }
5448   bool parity() const                  { return ((_value >>  2) & 1) != 0; }
5449   bool carry() const                   { return ((_value >>  0) & 1) != 0; }
5450 
5451   void print() const {
5452     // flags
5453     char f[8];
5454     f[0] = (overflow       ()) ? 'O' : '-';
5455     f[1] = (direction      ()) ? 'D' : '-';
5456     f[2] = (sign           ()) ? 'S' : '-';
5457     f[3] = (zero           ()) ? 'Z' : '-';
5458     f[4] = (auxiliary_carry()) ? 'A' : '-';
5459     f[5] = (parity         ()) ? 'P' : '-';
5460     f[6] = (carry          ()) ? 'C' : '-';
5461     f[7] = '\x0';
5462     // output
5463     printf("%08x  flags = %s", _value, f);
5464   }
5465 
5466 };
5467 
5468 class IU_Register {
5469  public:
5470   int32_t _value;
5471 
5472   void print() const {
5473     printf("%08x  %11d", _value, _value);
5474   }
5475 
5476 };
5477 
5478 class IU_State {
5479  public:
5480   Flag_Register _eflags;
5481   IU_Register   _rdi;
5482   IU_Register   _rsi;
5483   IU_Register   _rbp;
5484   IU_Register   _rsp;
5485   IU_Register   _rbx;
5486   IU_Register   _rdx;
5487   IU_Register   _rcx;
5488   IU_Register   _rax;
5489 
5490   void print() const {
5491     // computation registers
5492     printf("rax,  = "); _rax.print(); printf("\n");
5493     printf("rbx,  = "); _rbx.print(); printf("\n");
5494     printf("rcx  = "); _rcx.print(); printf("\n");
5495     printf("rdx  = "); _rdx.print(); printf("\n");
5496     printf("rdi  = "); _rdi.print(); printf("\n");
5497     printf("rsi  = "); _rsi.print(); printf("\n");
5498     printf("rbp,  = "); _rbp.print(); printf("\n");
5499     printf("rsp  = "); _rsp.print(); printf("\n");
5500     printf("\n");
5501     // control registers
5502     printf("flgs = "); _eflags.print(); printf("\n");
5503   }
5504 };
5505 
5506 
5507 class CPU_State {
5508  public:
5509   FPU_State _fpu_state;
5510   IU_State  _iu_state;
5511 
5512   void print() const {
5513     printf("--------------------------------------------------\n");
5514     _iu_state .print();
5515     printf("\n");
5516     _fpu_state.print();
5517     printf("--------------------------------------------------\n");
5518   }
5519 
5520 };
5521 
5522 
5523 static void _print_CPU_state(CPU_State* state) {
5524   state->print();
5525 };
5526 
5527 
5528 void MacroAssembler::print_CPU_state() {
5529   push_CPU_state();
5530   push(rsp);                // pass CPU state
5531   call(RuntimeAddress(CAST_FROM_FN_PTR(address, _print_CPU_state)));
5532   addptr(rsp, wordSize);       // discard argument
5533   pop_CPU_state();
5534 }
5535 
5536 
5537 static bool _verify_FPU(int stack_depth, char* s, CPU_State* state) {
5538   static int counter = 0;
5539   FPU_State* fs = &state->_fpu_state;
5540   counter++;
5541   // For leaf calls, only verify that the top few elements remain empty.
5542   // We only need 1 empty at the top for C2 code.
5543   if( stack_depth < 0 ) {
5544     if( fs->tag_for_st(7) != 3 ) {
5545       printf("FPR7 not empty\n");
5546       state->print();
5547       assert(false, "error");
5548       return false;
5549     }
5550     return true;                // All other stack states do not matter
5551   }
5552 
5553   assert((fs->_control_word._value & 0xffff) == StubRoutines::_fpu_cntrl_wrd_std,
5554          "bad FPU control word");
5555 
5556   // compute stack depth
5557   int i = 0;
5558   while (i < FPU_State::number_of_registers && fs->tag_for_st(i)  < 3) i++;
5559   int d = i;
5560   while (i < FPU_State::number_of_registers && fs->tag_for_st(i) == 3) i++;
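       // Example (illustrative): with two values live on the x87 stack the tags
       // read [valid, valid, empty, empty, empty, empty, empty, empty], so the
       // first loop stops at i == 2 (d == 2) and the second advances i to 8,
       // i.e. the stack is contiguous with depth 2.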
5561   // verify findings
5562   if (i != FPU_State::number_of_registers) {
5563     // stack not contiguous
5564     printf("%s: stack not contiguous at ST%d\n", s, i);
5565     state->print();
5566     assert(false, "error");
5567     return false;
5568   }
5569   // check if computed stack depth corresponds to expected stack depth
5570   if (stack_depth < 0) {
5571     // expected stack depth is -stack_depth or less
5572     if (d > -stack_depth) {
5573       // too many elements on the stack
5574       printf("%s: <= %d stack elements expected but found %d\n", s, -stack_depth, d);
5575       state->print();
5576       assert(false, "error");
5577       return false;
5578     }
5579   } else {
5580     // expected stack depth is stack_depth
5581     if (d != stack_depth) {
5582       // wrong stack depth
5583       printf("%s: %d stack elements expected but found %d\n", s, stack_depth, d);
5584       state->print();
5585       assert(false, "error");
5586       return false;
5587     }
5588   }
5589   // everything is cool
5590   return true;
5591 }
5592 
5593 
5594 void MacroAssembler::verify_FPU(int stack_depth, const char* s) {
5595   if (!VerifyFPU) return;
5596   push_CPU_state();
5597   push(rsp);                // pass CPU state
5598   ExternalAddress msg((address) s);
5599   // pass message string s
5600   pushptr(msg.addr());
5601   push(stack_depth);        // pass stack depth
5602   call(RuntimeAddress(CAST_FROM_FN_PTR(address, _verify_FPU)));
5603   addptr(rsp, 3 * wordSize);   // discard arguments
5604   // check for error
5605   { Label L;
5606     testl(rax, rax);
5607     jcc(Assembler::notZero, L);
5608     int3();                  // break if error condition
5609     bind(L);
5610   }
5611   pop_CPU_state();
5612 }
5613 
5614 void MacroAssembler::restore_cpu_control_state_after_jni() {
5615   // Either restore the MXCSR register after returning from the JNI Call
5616   // or verify that it wasn't changed (with -Xcheck:jni flag).
5617   if (VM_Version::supports_sse()) {
5618     if (RestoreMXCSROnJNICalls) {
5619       ldmxcsr(ExternalAddress(StubRoutines::addr_mxcsr_std()));
5620     } else if (CheckJNICalls) {
5621       call(RuntimeAddress(StubRoutines::x86::verify_mxcsr_entry()));
5622     }
5623   }
5624   if (VM_Version::supports_avx()) {
5625     // Clear upper bits of YMM registers to avoid SSE <-> AVX transition penalty.
5626     vzeroupper();
5627   }
5628 
5629 #ifndef _LP64
5630   // Either restore the x87 floating pointer control word after returning
5631   // from the JNI call or verify that it wasn't changed.
5632   if (CheckJNICalls) {
5633     call(RuntimeAddress(StubRoutines::x86::verify_fpu_cntrl_wrd_entry()));
5634   }
5635 #endif // _LP64
5636 }
5637 
5638 
5639 void MacroAssembler::load_klass(Register dst, Register src) {
5640 #ifdef _LP64
5641   if (UseCompressedClassPointers) {
5642     movl(dst, Address(src, oopDesc::klass_offset_in_bytes()));
5643     decode_klass_not_null(dst);
5644   } else
5645 #endif
5646     movptr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
5647 }
5648 
5649 void MacroAssembler::load_prototype_header(Register dst, Register src) {
5650   load_klass(dst, src);
5651   movptr(dst, Address(dst, Klass::prototype_header_offset()));
5652 }
5653 
5654 void MacroAssembler::store_klass(Register dst, Register src) {
5655 #ifdef _LP64
5656   if (UseCompressedClassPointers) {
5657     encode_klass_not_null(src);
5658     movl(Address(dst, oopDesc::klass_offset_in_bytes()), src);
5659   } else
5660 #endif
5661     movptr(Address(dst, oopDesc::klass_offset_in_bytes()), src);
5662 }
5663 
5664 void MacroAssembler::load_heap_oop(Register dst, Address src) {
5665 #ifdef _LP64
5666   // FIXME: Must change all places where we try to load the klass.
5667   if (UseCompressedOops) {
5668     movl(dst, src);
5669     decode_heap_oop(dst);
5670   } else
5671 #endif
5672     movptr(dst, src);
5673 }
5674 
5675 // Doesn't do verification, generates fixed size code
5676 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src) {
5677 #ifdef _LP64
5678   if (UseCompressedOops) {
5679     movl(dst, src);
5680     decode_heap_oop_not_null(dst);
5681   } else
5682 #endif
5683     movptr(dst, src);
5684 }
5685 
5686 void MacroAssembler::store_heap_oop(Address dst, Register src) {
5687 #ifdef _LP64
5688   if (UseCompressedOops) {
5689     assert(!dst.uses(src), "not enough registers");
5690     encode_heap_oop(src);
5691     movl(dst, src);
5692   } else
5693 #endif
5694     movptr(dst, src);
5695 }
5696 
5697 void MacroAssembler::cmp_heap_oop(Register src1, Address src2, Register tmp) {
5698   assert_different_registers(src1, tmp);
5699 #ifdef _LP64
5700   if (UseCompressedOops) {
5701     bool did_push = false;
5702     if (tmp == noreg) {
5703       tmp = rax;
5704       push(tmp);
5705       did_push = true;
5706       assert(!src2.uses(rsp), "can't push");
5707     }
5708     load_heap_oop(tmp, src2);
5709     cmpptr(src1, tmp);
5710     if (did_push)  pop(tmp);
5711   } else
5712 #endif
5713     cmpptr(src1, src2);
5714 }
5715 
5716 // Used for storing NULLs.
5717 void MacroAssembler::store_heap_oop_null(Address dst) {
5718 #ifdef _LP64
5719   if (UseCompressedOops) {
5720     movl(dst, (int32_t)NULL_WORD);
5721   } else {
5722     movslq(dst, (int32_t)NULL_WORD);
5723   }
5724 #else
5725   movl(dst, (int32_t)NULL_WORD);
5726 #endif
5727 }
5728 
5729 #ifdef _LP64
5730 void MacroAssembler::store_klass_gap(Register dst, Register src) {
5731   if (UseCompressedClassPointers) {
5732     // Store to klass gap in destination
5733     movl(Address(dst, oopDesc::klass_gap_offset_in_bytes()), src);
5734   }
5735 }
5736 
5737 #ifdef ASSERT
5738 void MacroAssembler::verify_heapbase(const char* msg) {
5739   assert (UseCompressedOops, "should be compressed");
5740   assert (Universe::heap() != NULL, "java heap should be initialized");
5741   if (CheckCompressedOops) {
5742     Label ok;
5743     push(rscratch1); // cmpptr trashes rscratch1
5744     cmpptr(r12_heapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
5745     jcc(Assembler::equal, ok);
5746     STOP(msg);
5747     bind(ok);
5748     pop(rscratch1);
5749   }
5750 }
5751 #endif
5752 
5753 // Algorithm must match oop.inline.hpp encode_heap_oop.
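     // Illustrative arithmetic (C-like sketch, not VM source):
     //
     //   narrow = (obj == NULL) ? 0 : (uint32_t)(((uintptr_t)obj - base) >> shift);
     //   obj    = (narrow == 0) ? NULL : (oop)(base + ((uintptr_t)narrow << shift));
     //
     // where base is Universe::narrow_oop_base() (cached in r12_heapbase) and
     // shift is Universe::narrow_oop_shift(); when either is zero the matching
     // sub/add or shift is simply omitted below.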
5754 void MacroAssembler::encode_heap_oop(Register r) {
5755 #ifdef ASSERT
5756   verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
5757 #endif
5758   verify_oop(r, "broken oop in encode_heap_oop");
5759   if (Universe::narrow_oop_base() == NULL) {
5760     if (Universe::narrow_oop_shift() != 0) {
5761       assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
5762       shrq(r, LogMinObjAlignmentInBytes);
5763     }
5764     return;
5765   }
5766   testq(r, r);
5767   cmovq(Assembler::equal, r, r12_heapbase);
5768   subq(r, r12_heapbase);
5769   shrq(r, LogMinObjAlignmentInBytes);
5770 }
5771 
5772 void MacroAssembler::encode_heap_oop_not_null(Register r) {
5773 #ifdef ASSERT
5774   verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
5775   if (CheckCompressedOops) {
5776     Label ok;
5777     testq(r, r);
5778     jcc(Assembler::notEqual, ok);
5779     STOP("null oop passed to encode_heap_oop_not_null");
5780     bind(ok);
5781   }
5782 #endif
5783   verify_oop(r, "broken oop in encode_heap_oop_not_null");
5784   if (Universe::narrow_oop_base() != NULL) {
5785     subq(r, r12_heapbase);
5786   }
5787   if (Universe::narrow_oop_shift() != 0) {
5788     assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
5789     shrq(r, LogMinObjAlignmentInBytes);
5790   }
5791 }
5792 
5793 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
5794 #ifdef ASSERT
5795   verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
5796   if (CheckCompressedOops) {
5797     Label ok;
5798     testq(src, src);
5799     jcc(Assembler::notEqual, ok);
5800     STOP("null oop passed to encode_heap_oop_not_null2");
5801     bind(ok);
5802   }
5803 #endif
5804   verify_oop(src, "broken oop in encode_heap_oop_not_null2");
5805   if (dst != src) {
5806     movq(dst, src);
5807   }
5808   if (Universe::narrow_oop_base() != NULL) {
5809     subq(dst, r12_heapbase);
5810   }
5811   if (Universe::narrow_oop_shift() != 0) {
5812     assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
5813     shrq(dst, LogMinObjAlignmentInBytes);
5814   }
5815 }
5816 
5817 void  MacroAssembler::decode_heap_oop(Register r) {
5818 #ifdef ASSERT
5819   verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
5820 #endif
5821   if (Universe::narrow_oop_base() == NULL) {
5822     if (Universe::narrow_oop_shift() != 0) {
5823       assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
5824       shlq(r, LogMinObjAlignmentInBytes);
5825     }
5826   } else {
5827     Label done;
5828     shlq(r, LogMinObjAlignmentInBytes);
5829     jccb(Assembler::equal, done);
5830     addq(r, r12_heapbase);
5831     bind(done);
5832   }
5833   verify_oop(r, "broken oop in decode_heap_oop");
5834 }
5835 
5836 void  MacroAssembler::decode_heap_oop_not_null(Register r) {
5837   // Note: it will change flags
5838   assert (UseCompressedOops, "should only be used for compressed headers");
5839   assert (Universe::heap() != NULL, "java heap should be initialized");
5840   // Cannot assert, unverified entry point counts instructions (see .ad file)
5841   // vtableStubs also counts instructions in pd_code_size_limit.
5842   // Also do not verify_oop as this is called by verify_oop.
5843   if (Universe::narrow_oop_shift() != 0) {
5844     assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
5845     shlq(r, LogMinObjAlignmentInBytes);
5846     if (Universe::narrow_oop_base() != NULL) {
5847       addq(r, r12_heapbase);
5848     }
5849   } else {
5850     assert (Universe::narrow_oop_base() == NULL, "sanity");
5851   }
5852 }
5853 
5854 void  MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
5855   // Note: it will change flags
5856   assert (UseCompressedOops, "should only be used for compressed headers");
5857   assert (Universe::heap() != NULL, "java heap should be initialized");
5858   // Cannot assert, unverified entry point counts instructions (see .ad file)
5859   // vtableStubs also counts instructions in pd_code_size_limit.
5860   // Also do not verify_oop as this is called by verify_oop.
5861   if (Universe::narrow_oop_shift() != 0) {
5862     assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
5863     if (LogMinObjAlignmentInBytes == Address::times_8) {
5864       leaq(dst, Address(r12_heapbase, src, Address::times_8, 0));
5865     } else {
5866       if (dst != src) {
5867         movq(dst, src);
5868       }
5869       shlq(dst, LogMinObjAlignmentInBytes);
5870       if (Universe::narrow_oop_base() != NULL) {
5871         addq(dst, r12_heapbase);
5872       }
5873     }
5874   } else {
5875     assert (Universe::narrow_oop_base() == NULL, "sanity");
5876     if (dst != src) {
5877       movq(dst, src);
5878     }
5879   }
5880 }
5881 
5882 void MacroAssembler::encode_klass_not_null(Register r) {
5883   if (Universe::narrow_klass_base() != NULL) {
5884     // Use r12 as a scratch register in which to temporarily load the narrow_klass_base.
5885     assert(r != r12_heapbase, "Encoding a klass in r12");
5886     mov64(r12_heapbase, (int64_t)Universe::narrow_klass_base());
5887     subq(r, r12_heapbase);
5888   }
5889   if (Universe::narrow_klass_shift() != 0) {
5890     assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
5891     shrq(r, LogKlassAlignmentInBytes);
5892   }
5893   if (Universe::narrow_klass_base() != NULL) {
5894     reinit_heapbase();
5895   }
5896 }
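
// Illustrative sketch (not compiled) of the klass compression performed above;
// the klass pointer is assumed to be non-NULL:
//
//   narrowKlass encode_klass(Klass* k) {
//     return (narrowKlass)(((uintptr_t)k - (uintptr_t)Universe::narrow_klass_base())
//                          >> Universe::narrow_klass_shift());
//   }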
5897 
5898 void MacroAssembler::encode_klass_not_null(Register dst, Register src) {
5899   if (dst == src) {
5900     encode_klass_not_null(src);
5901   } else {
5902     if (Universe::narrow_klass_base() != NULL) {
5903       mov64(dst, (int64_t)Universe::narrow_klass_base());
5904       negq(dst);
5905       addq(dst, src);
5906     } else {
5907       movptr(dst, src);
5908     }
5909     if (Universe::narrow_klass_shift() != 0) {
5910       assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
5911       shrq(dst, LogKlassAlignmentInBytes);
5912     }
5913   }
5914 }
5915 
5916 // Function instr_size_for_decode_klass_not_null() counts the instructions
5917 // generated by decode_klass_not_null(register r) and reinit_heapbase(),
5918 // when (Universe::heap() != NULL).  Hence, if the instructions they
5919 // generate change, then this method needs to be updated.
5920 int MacroAssembler::instr_size_for_decode_klass_not_null() {
5921   assert (UseCompressedClassPointers, "only for compressed klass ptrs");
5922   if (Universe::narrow_klass_base() != NULL) {
5923     // mov64 + addq + shlq? + mov64  (for reinit_heapbase()).
5924     return (Universe::narrow_klass_shift() == 0 ? 20 : 24);
5925   } else {
5926     // longest load decode klass function, mov64, leaq
5927     return 16;
5928   }
5929 }
5930 
5931 // !!! If the instructions that get generated here change then function
5932 // instr_size_for_decode_klass_not_null() needs to get updated.
5933 void  MacroAssembler::decode_klass_not_null(Register r) {
5934   // Note: it will change flags
5935   assert (UseCompressedClassPointers, "should only be used for compressed headers");
5936   assert(r != r12_heapbase, "Decoding a klass in r12");
5937   // Cannot assert, unverified entry point counts instructions (see .ad file)
5938   // vtableStubs also counts instructions in pd_code_size_limit.
5939   // Also do not verify_oop as this is called by verify_oop.
5940   if (Universe::narrow_klass_shift() != 0) {
5941     assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
5942     shlq(r, LogKlassAlignmentInBytes);
5943   }
5944   // Use r12 as a scratch register in which to temporarily load the narrow_klass_base.
5945   if (Universe::narrow_klass_base() != NULL) {
5946     mov64(r12_heapbase, (int64_t)Universe::narrow_klass_base());
5947     addq(r, r12_heapbase);
5948     reinit_heapbase();
5949   }
5950 }
5951 
5952 void  MacroAssembler::decode_klass_not_null(Register dst, Register src) {
5953   // Note: it will change flags
5954   assert (UseCompressedClassPointers, "should only be used for compressed headers");
5955   if (dst == src) {
5956     decode_klass_not_null(dst);
5957   } else {
5958     // Cannot assert, unverified entry point counts instructions (see .ad file)
5959     // vtableStubs also counts instructions in pd_code_size_limit.
5960     // Also do not verify_oop as this is called by verify_oop.
5961     mov64(dst, (int64_t)Universe::narrow_klass_base());
5962     if (Universe::narrow_klass_shift() != 0) {
5963       assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
5964       assert(LogKlassAlignmentInBytes == Address::times_8, "klass not aligned on 64bits?");
5965       leaq(dst, Address(dst, src, Address::times_8, 0));
5966     } else {
5967       addq(dst, src);
5968     }
5969   }
5970 }
5971 
5972 void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
5973   assert (UseCompressedOops, "should only be used for compressed headers");
5974   assert (Universe::heap() != NULL, "java heap should be initialized");
5975   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5976   int oop_index = oop_recorder()->find_index(obj);
5977   RelocationHolder rspec = oop_Relocation::spec(oop_index);
5978   mov_narrow_oop(dst, oop_index, rspec);
5979 }
5980 
5981 void  MacroAssembler::set_narrow_oop(Address dst, jobject obj) {
5982   assert (UseCompressedOops, "should only be used for compressed headers");
5983   assert (Universe::heap() != NULL, "java heap should be initialized");
5984   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5985   int oop_index = oop_recorder()->find_index(obj);
5986   RelocationHolder rspec = oop_Relocation::spec(oop_index);
5987   mov_narrow_oop(dst, oop_index, rspec);
5988 }
5989 
5990 void  MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
5991   assert (UseCompressedClassPointers, "should only be used for compressed headers");
5992   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5993   int klass_index = oop_recorder()->find_index(k);
5994   RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5995   mov_narrow_oop(dst, Klass::encode_klass(k), rspec);
5996 }
5997 
5998 void  MacroAssembler::set_narrow_klass(Address dst, Klass* k) {
5999   assert (UseCompressedClassPointers, "should only be used for compressed headers");
6000   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
6001   int klass_index = oop_recorder()->find_index(k);
6002   RelocationHolder rspec = metadata_Relocation::spec(klass_index);
6003   mov_narrow_oop(dst, Klass::encode_klass(k), rspec);
6004 }
6005 
6006 void  MacroAssembler::cmp_narrow_oop(Register dst, jobject obj) {
6007   assert (UseCompressedOops, "should only be used for compressed headers");
6008   assert (Universe::heap() != NULL, "java heap should be initialized");
6009   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
6010   int oop_index = oop_recorder()->find_index(obj);
6011   RelocationHolder rspec = oop_Relocation::spec(oop_index);
6012   Assembler::cmp_narrow_oop(dst, oop_index, rspec);
6013 }
6014 
6015 void  MacroAssembler::cmp_narrow_oop(Address dst, jobject obj) {
6016   assert (UseCompressedOops, "should only be used for compressed headers");
6017   assert (Universe::heap() != NULL, "java heap should be initialized");
6018   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
6019   int oop_index = oop_recorder()->find_index(obj);
6020   RelocationHolder rspec = oop_Relocation::spec(oop_index);
6021   Assembler::cmp_narrow_oop(dst, oop_index, rspec);
6022 }
6023 
6024 void  MacroAssembler::cmp_narrow_klass(Register dst, Klass* k) {
6025   assert (UseCompressedClassPointers, "should only be used for compressed headers");
6026   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
6027   int klass_index = oop_recorder()->find_index(k);
6028   RelocationHolder rspec = metadata_Relocation::spec(klass_index);
6029   Assembler::cmp_narrow_oop(dst, Klass::encode_klass(k), rspec);
6030 }
6031 
6032 void  MacroAssembler::cmp_narrow_klass(Address dst, Klass* k) {
6033   assert (UseCompressedClassPointers, "should only be used for compressed headers");
6034   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
6035   int klass_index = oop_recorder()->find_index(k);
6036   RelocationHolder rspec = metadata_Relocation::spec(klass_index);
6037   Assembler::cmp_narrow_oop(dst, Klass::encode_klass(k), rspec);
6038 }
6039 
6040 void MacroAssembler::reinit_heapbase() {
6041   if (UseCompressedOops || UseCompressedClassPointers) {
6042     if (Universe::heap() != NULL) {
6043       if (Universe::narrow_oop_base() == NULL) {
6044         MacroAssembler::xorptr(r12_heapbase, r12_heapbase);
6045       } else {
6046         mov64(r12_heapbase, (int64_t)Universe::narrow_ptrs_base());
6047       }
6048     } else {
6049       movptr(r12_heapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
6050     }
6051   }
6052 }
6053 
6054 #endif // _LP64
6055 
6056 
6057 // C2 compiled method's prolog code.
6058 void MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b) {
6059 
6060   // WARNING: Initial instruction MUST be 5 bytes or longer so that
6061   // NativeJump::patch_verified_entry will be able to patch out the entry
6062   // code safely. The push to verify stack depth is ok at 5 bytes,
6063   // the frame allocation can be either 3 or 6 bytes. So if we don't do
6064   // stack bang then we must use the 6 byte frame allocation even if
6065   // we have no frame. :-(
6066   assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");
6067 
6068   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
6069   // Remove word for return addr
6070   framesize -= wordSize;
6071   stack_bang_size -= wordSize;
6072 
6073   // Calls to C2R adapters often do not accept exceptional returns.
6074   // We require that their callers must bang for them.  But be careful, because
6075   // some VM calls (such as call site linkage) can use several kilobytes of
6076   // stack.  But the stack safety zone should account for that.
6077   // See bugs 4446381, 4468289, 4497237.
6078   if (stack_bang_size > 0) {
6079     generate_stack_overflow_check(stack_bang_size);
6080 
6081     // We always push rbp, so that on return to interpreter rbp, will be
6082     // restored correctly and we can correct the stack.
6083     push(rbp);
6084     // Save caller's stack pointer into RBP if the frame pointer is preserved.
6085     if (PreserveFramePointer) {
6086       mov(rbp, rsp);
6087     }
6088     // Remove word for ebp
6089     framesize -= wordSize;
6090 
6091     // Create frame
6092     if (framesize) {
6093       subptr(rsp, framesize);
6094     }
6095   } else {
6096     // Create frame (force generation of a 4 byte immediate value)
6097     subptr_imm32(rsp, framesize);
6098 
6099     // Save RBP register now.
6100     framesize -= wordSize;
6101     movptr(Address(rsp, framesize), rbp);
6102     // Save caller's stack pointer into RBP if the frame pointer is preserved.
6103     if (PreserveFramePointer) {
6104       movptr(rbp, rsp);
6105       addptr(rbp, framesize + wordSize);
6106     }
6107   }
6108 
6109   if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
6110     framesize -= wordSize;
6111     movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
6112   }
6113 
6114 #ifndef _LP64
6115   // If method sets FPU control word do it now
6116   if (fp_mode_24b) {
6117     fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_24()));
6118   }
6119   if (UseSSE >= 2 && VerifyFPU) {
6120     verify_FPU(0, "FPU stack must be clean on entry");
6121   }
6122 #endif
6123 
6124 #ifdef ASSERT
6125   if (VerifyStackAtCalls) {
6126     Label L;
6127     push(rax);
6128     mov(rax, rsp);
6129     andptr(rax, StackAlignmentInBytes-1);
6130     cmpptr(rax, StackAlignmentInBytes-wordSize);
6131     pop(rax);
6132     jcc(Assembler::equal, L);
6133     STOP("Stack is not properly aligned!");
6134     bind(L);
6135   }
6136 #endif
6137 
6138 }
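
// Sketch of the resulting frame layout on the stack-bang path (higher addresses
// first); illustrative only, slot names are descriptive rather than taken from
// the code:
//
//   [ return address     ]
//   [ saved rbp          ]
//   [ 0xbadb100d cookie  ]   <- only if VerifyStackAtCalls
//   [ ... rest of frame  ]
//   [                    ]   <- rsp after the frame allocation above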
6139 
6140 void MacroAssembler::clear_mem(Register base, Register cnt, Register tmp) {
6141   // cnt - number of qwords (8-byte words).
6142   // base - start address, qword aligned.
6143   assert(base==rdi, "base register must be edi for rep stos");
6144   assert(tmp==rax,   "tmp register must be eax for rep stos");
6145   assert(cnt==rcx,   "cnt register must be ecx for rep stos");
6146 
6147   xorptr(tmp, tmp);
6148   if (UseFastStosb) {
6149     shlptr(cnt,3); // convert to number of bytes
6150     rep_stosb();
6151   } else {
6152     NOT_LP64(shlptr(cnt,1);) // convert to number of dwords for 32-bit VM
6153     rep_stos();
6154   }
6155 }
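
// Scalar equivalent of clear_mem (illustrative only):
//
//   void clear_mem(julong* base, size_t cnt) {    // cnt is a count of 8-byte words
//     for (size_t i = 0; i < cnt; i++) base[i] = 0;
//   }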
6156 
6157 // IndexOf for constant substrings with size >= 8 chars
6158 // which don't need to be loaded through stack.
6159 void MacroAssembler::string_indexofC8(Register str1, Register str2,
6160                                       Register cnt1, Register cnt2,
6161                                       int int_cnt2,  Register result,
6162                                       XMMRegister vec, Register tmp) {
6163   ShortBranchVerifier sbv(this);
6164   assert(UseSSE42Intrinsics, "SSE4.2 is required");
6165 
6166   // This method uses pcmpestri instruction with bound registers
6167   //   inputs:
6168   //     xmm - substring
6169   //     rax - substring length (elements count)
6170   //     mem - scanned string
6171   //     rdx - string length (elements count)
6172   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
6173   //   outputs:
6174   //     rcx - matched index in string
6175   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
6176 
6177   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
6178         RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
6179         MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
6180 
6181   // Note, inline_string_indexOf() generates checks:
6182   // if (substr.count > string.count) return -1;
6183   // if (substr.count == 0) return 0;
6184   assert(int_cnt2 >= 8, "this code is used only for cnt2 >= 8 chars");
6185 
6186   // Load substring.
6187   movdqu(vec, Address(str2, 0));
6188   movl(cnt2, int_cnt2);
6189   movptr(result, str1); // string addr
6190 
6191   if (int_cnt2 > 8) {
6192     jmpb(SCAN_TO_SUBSTR);
6193 
6194     // Reload substr for rescan, this code
6195     // is executed only for large substrings (> 8 chars)
6196     bind(RELOAD_SUBSTR);
6197     movdqu(vec, Address(str2, 0));
6198     negptr(cnt2); // Jumped here with negative cnt2, convert to positive
6199 
6200     bind(RELOAD_STR);
6201     // We came here after the beginning of the substring was
6202     // matched but the rest of it was not, so we need to search
6203     // again. Start from the next element after the previous match.
6204 
6205     // cnt2 is the number of remaining substring elements and
6206     // cnt1 is the number of remaining string elements when cmp failed.
6207     // Restored cnt1 = cnt1 - cnt2 + int_cnt2
6208     subl(cnt1, cnt2);
6209     addl(cnt1, int_cnt2);
6210     movl(cnt2, int_cnt2); // Now restore cnt2
6211 
6212     decrementl(cnt1);     // Shift to next element
6213     cmpl(cnt1, cnt2);
6214     jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
6215 
6216     addptr(result, 2);
6217 
6218   } // (int_cnt2 > 8)
6219 
6220   // Scan string for start of substr in 16-byte vectors
6221   bind(SCAN_TO_SUBSTR);
6222   pcmpestri(vec, Address(result, 0), 0x0d);
6223   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
6224   subl(cnt1, 8);
6225   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
6226   cmpl(cnt1, cnt2);
6227   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
6228   addptr(result, 16);
6229   jmpb(SCAN_TO_SUBSTR);
6230 
6231   // Found a potential substr
6232   bind(FOUND_CANDIDATE);
6233   // Matched whole vector if first element matched (tmp(rcx) == 0).
6234   if (int_cnt2 == 8) {
6235     jccb(Assembler::overflow, RET_FOUND);    // OF == 1
6236   } else { // int_cnt2 > 8
6237     jccb(Assembler::overflow, FOUND_SUBSTR);
6238   }
6239   // After pcmpestri tmp(rcx) contains matched element index
6240   // Compute start addr of substr
6241   lea(result, Address(result, tmp, Address::times_2));
6242 
6243   // Make sure string is still long enough
6244   subl(cnt1, tmp);
6245   cmpl(cnt1, cnt2);
6246   if (int_cnt2 == 8) {
6247     jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
6248   } else { // int_cnt2 > 8
6249     jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
6250   }
6251   // Left less than substring.
6252 
6253   bind(RET_NOT_FOUND);
6254   movl(result, -1);
6255   jmpb(EXIT);
6256 
6257   if (int_cnt2 > 8) {
6258     // This code is optimized for the case when whole substring
6259     // is matched if its head is matched.
6260     bind(MATCH_SUBSTR_HEAD);
6261     pcmpestri(vec, Address(result, 0), 0x0d);
6262     // Reload only the string if it does not match
6263     jccb(Assembler::noOverflow, RELOAD_STR); // OF == 0
6264 
6265     Label CONT_SCAN_SUBSTR;
6266     // Compare the rest of substring (> 8 chars).
6267     bind(FOUND_SUBSTR);
6268     // First 8 chars are already matched.
6269     negptr(cnt2);
6270     addptr(cnt2, 8);
6271 
6272     bind(SCAN_SUBSTR);
6273     subl(cnt1, 8);
6274     cmpl(cnt2, -8); // Do not read beyond substring
6275     jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
6276     // Back-up strings to avoid reading beyond substring:
6277     // cnt1 = cnt1 - cnt2 + 8
6278     addl(cnt1, cnt2); // cnt2 is negative
6279     addl(cnt1, 8);
6280     movl(cnt2, 8); negptr(cnt2);
6281     bind(CONT_SCAN_SUBSTR);
6282     if (int_cnt2 < (int)G) {
6283       movdqu(vec, Address(str2, cnt2, Address::times_2, int_cnt2*2));
6284       pcmpestri(vec, Address(result, cnt2, Address::times_2, int_cnt2*2), 0x0d);
6285     } else {
6286       // calculate index in register to avoid integer overflow (int_cnt2*2)
6287       movl(tmp, int_cnt2);
6288       addptr(tmp, cnt2);
6289       movdqu(vec, Address(str2, tmp, Address::times_2, 0));
6290       pcmpestri(vec, Address(result, tmp, Address::times_2, 0), 0x0d);
6291     }
6292     // Need to reload strings pointers if not matched whole vector
6293     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
6294     addptr(cnt2, 8);
6295     jcc(Assembler::negative, SCAN_SUBSTR);
6296     // Fall through if found full substring
6297 
6298   } // (int_cnt2 > 8)
6299 
6300   bind(RET_FOUND);
6301   // Found result if we matched full small substring.
6302   // Compute substr offset
6303   subptr(result, str1);
6304   shrl(result, 1); // index
6305   bind(EXIT);
6306 
6307 } // string_indexofC8
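
// The SSE4.2 code above computes the same result as this scalar sketch
// (illustrative only; the callers guarantee 8 <= int_cnt2 <= cnt1):
//
//   int indexOf(jchar* str, int cnt1, jchar* substr, int cnt2) {
//     for (int i = 0; i + cnt2 <= cnt1; i++) {
//       int j = 0;
//       while (j < cnt2 && str[i + j] == substr[j]) j++;
//       if (j == cnt2) return i;   // match found at index i
//     }
//     return -1;                   // not found
//   }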
6308 
6309 // Small strings are loaded through stack if they cross page boundary.
6310 void MacroAssembler::string_indexof(Register str1, Register str2,
6311                                     Register cnt1, Register cnt2,
6312                                     int int_cnt2,  Register result,
6313                                     XMMRegister vec, Register tmp) {
6314   ShortBranchVerifier sbv(this);
6315   assert(UseSSE42Intrinsics, "SSE4.2 is required");
6316   //
6317   // int_cnt2 is length of small (< 8 chars) constant substring
6318   // or (-1) for non constant substring in which case its length
6319   // is in cnt2 register.
6320   //
6321   // Note, inline_string_indexOf() generates checks:
6322   // if (substr.count > string.count) return -1;
6323   // if (substr.count == 0) return 0;
6324   //
6325   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < 8), "should be != 0");
6326 
6327   // This method uses pcmpestri instruction with bound registers
6328   //   inputs:
6329   //     xmm - substring
6330   //     rax - substring length (elements count)
6331   //     mem - scanned string
6332   //     rdx - string length (elements count)
6333   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
6334   //   outputs:
6335   //     rcx - matched index in string
6336   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
6337 
6338   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
6339         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
6340         FOUND_CANDIDATE;
6341 
6342   { //========================================================
6343     // We don't know where these strings are located
6344     // and we can't read beyond them. Load them through stack.
6345     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
6346 
6347     movptr(tmp, rsp); // save old SP
6348 
6349     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
6350       if (int_cnt2 == 1) {  // One char
6351         load_unsigned_short(result, Address(str2, 0));
6352         movdl(vec, result); // move 32 bits
6353       } else if (int_cnt2 == 2) { // Two chars
6354         movdl(vec, Address(str2, 0)); // move 32 bits
6355       } else if (int_cnt2 == 4) { // Four chars
6356         movq(vec, Address(str2, 0));  // move 64 bits
6357       } else { // cnt2 = { 3, 5, 6, 7 }
6358         // Array header size is 12 bytes in 32-bit VM
6359         // + 6 bytes for 3 chars == 18 bytes,
6360         // enough space to load vec and shift.
6361         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
6362         movdqu(vec, Address(str2, (int_cnt2*2)-16));
6363         psrldq(vec, 16-(int_cnt2*2));
6364       }
6365     } else { // not constant substring
6366       cmpl(cnt2, 8);
6367       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
6368 
6369       // We can read beyond the string if str+16 does not cross a page boundary
6370       // since heaps are aligned and mapped by pages.
6371       assert(os::vm_page_size() < (int)G, "default page should be small");
6372       movl(result, str2); // We need only low 32 bits
6373       andl(result, (os::vm_page_size()-1));
6374       cmpl(result, (os::vm_page_size()-16));
6375       jccb(Assembler::belowEqual, CHECK_STR);
6376 
6377       // Move small strings to stack to allow load 16 bytes into vec.
6378       subptr(rsp, 16);
6379       int stk_offset = wordSize-2;
6380       push(cnt2);
6381 
6382       bind(COPY_SUBSTR);
6383       load_unsigned_short(result, Address(str2, cnt2, Address::times_2, -2));
6384       movw(Address(rsp, cnt2, Address::times_2, stk_offset), result);
6385       decrement(cnt2);
6386       jccb(Assembler::notZero, COPY_SUBSTR);
6387 
6388       pop(cnt2);
6389       movptr(str2, rsp);  // New substring address
6390     } // non constant
6391 
6392     bind(CHECK_STR);
6393     cmpl(cnt1, 8);
6394     jccb(Assembler::aboveEqual, BIG_STRINGS);
6395 
6396     // Check cross page boundary.
6397     movl(result, str1); // We need only low 32 bits
6398     andl(result, (os::vm_page_size()-1));
6399     cmpl(result, (os::vm_page_size()-16));
6400     jccb(Assembler::belowEqual, BIG_STRINGS);
6401 
6402     subptr(rsp, 16);
6403     int stk_offset = -2;
6404     if (int_cnt2 < 0) { // not constant
6405       push(cnt2);
6406       stk_offset += wordSize;
6407     }
6408     movl(cnt2, cnt1);
6409 
6410     bind(COPY_STR);
6411     load_unsigned_short(result, Address(str1, cnt2, Address::times_2, -2));
6412     movw(Address(rsp, cnt2, Address::times_2, stk_offset), result);
6413     decrement(cnt2);
6414     jccb(Assembler::notZero, COPY_STR);
6415 
6416     if (int_cnt2 < 0) { // not constant
6417       pop(cnt2);
6418     }
6419     movptr(str1, rsp);  // New string address
6420 
6421     bind(BIG_STRINGS);
6422     // Load substring.
6423     if (int_cnt2 < 0) { // -1
6424       movdqu(vec, Address(str2, 0));
6425       push(cnt2);       // substr count
6426       push(str2);       // substr addr
6427       push(str1);       // string addr
6428     } else {
6429       // Small (< 8 chars) constant substrings are loaded already.
6430       movl(cnt2, int_cnt2);
6431     }
6432     push(tmp);  // original SP
6433 
6434   } // Finished loading
6435 
6436   //========================================================
6437   // Start search
6438   //
6439 
6440   movptr(result, str1); // string addr
6441 
6442   if (int_cnt2  < 0) {  // Only for non constant substring
6443     jmpb(SCAN_TO_SUBSTR);
6444 
6445     // SP saved at sp+0
6446     // String saved at sp+1*wordSize
6447     // Substr saved at sp+2*wordSize
6448     // Substr count saved at sp+3*wordSize
6449 
6450     // Reload substr for rescan, this code
6451     // is executed only for large substrings (> 8 chars)
6452     bind(RELOAD_SUBSTR);
6453     movptr(str2, Address(rsp, 2*wordSize));
6454     movl(cnt2, Address(rsp, 3*wordSize));
6455     movdqu(vec, Address(str2, 0));
6456     // We came here after the beginning of the substring was
6457     // matched but the rest of it was not, so we need to search
6458     // again. Start from the next element after the previous match.
6459     subptr(str1, result); // Restore counter
6460     shrl(str1, 1);
6461     addl(cnt1, str1);
6462     decrementl(cnt1);   // Shift to next element
6463     cmpl(cnt1, cnt2);
6464     jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
6465 
6466     addptr(result, 2);
6467   } // non constant
6468 
6469   // Scan string for start of substr in 16-byte vectors
6470   bind(SCAN_TO_SUBSTR);
6471   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
6472   pcmpestri(vec, Address(result, 0), 0x0d);
6473   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
6474   subl(cnt1, 8);
6475   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
6476   cmpl(cnt1, cnt2);
6477   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
6478   addptr(result, 16);
6479 
6480   bind(ADJUST_STR);
6481   cmpl(cnt1, 8); // Do not read beyond string
6482   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
6483   // Back-up string to avoid reading beyond string.
6484   lea(result, Address(result, cnt1, Address::times_2, -16));
6485   movl(cnt1, 8);
6486   jmpb(SCAN_TO_SUBSTR);
6487 
6488   // Found a potential substr
6489   bind(FOUND_CANDIDATE);
6490   // After pcmpestri tmp(rcx) contains matched element index
6491 
6492   // Make sure string is still long enough
6493   subl(cnt1, tmp);
6494   cmpl(cnt1, cnt2);
6495   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
6496   // Left less than substring.
6497 
6498   bind(RET_NOT_FOUND);
6499   movl(result, -1);
6500   jmpb(CLEANUP);
6501 
6502   bind(FOUND_SUBSTR);
6503   // Compute start addr of substr
6504   lea(result, Address(result, tmp, Address::times_2));
6505 
6506   if (int_cnt2 > 0) { // Constant substring
6507     // Repeat search for small substring (< 8 chars)
6508     // from new point without reloading substring.
6509     // Have to check that we don't read beyond string.
6510     cmpl(tmp, 8-int_cnt2);
6511     jccb(Assembler::greater, ADJUST_STR);
6512     // Fall through if matched whole substring.
6513   } else { // non constant
6514     assert(int_cnt2 == -1, "should be != 0");
6515 
6516     addl(tmp, cnt2);
6517     // Found result if we matched whole substring.
6518     cmpl(tmp, 8);
6519     jccb(Assembler::lessEqual, RET_FOUND);
6520 
6521     // Repeat search for small substring (<= 8 chars)
6522     // from new point 'str1' without reloading substring.
6523     cmpl(cnt2, 8);
6524     // Have to check that we don't read beyond string.
6525     jccb(Assembler::lessEqual, ADJUST_STR);
6526 
6527     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
6528     // Compare the rest of substring (> 8 chars).
6529     movptr(str1, result);
6530 
6531     cmpl(tmp, cnt2);
6532     // First 8 chars are already matched.
6533     jccb(Assembler::equal, CHECK_NEXT);
6534 
6535     bind(SCAN_SUBSTR);
6536     pcmpestri(vec, Address(str1, 0), 0x0d);
6537     // Need to reload strings pointers if not matched whole vector
6538     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
6539 
6540     bind(CHECK_NEXT);
6541     subl(cnt2, 8);
6542     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
6543     addptr(str1, 16);
6544     addptr(str2, 16);
6545     subl(cnt1, 8);
6546     cmpl(cnt2, 8); // Do not read beyond substring
6547     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
6548     // Back-up strings to avoid reading beyond substring.
6549     lea(str2, Address(str2, cnt2, Address::times_2, -16));
6550     lea(str1, Address(str1, cnt2, Address::times_2, -16));
6551     subl(cnt1, cnt2);
6552     movl(cnt2, 8);
6553     addl(cnt1, 8);
6554     bind(CONT_SCAN_SUBSTR);
6555     movdqu(vec, Address(str2, 0));
6556     jmpb(SCAN_SUBSTR);
6557 
6558     bind(RET_FOUND_LONG);
6559     movptr(str1, Address(rsp, wordSize));
6560   } // non constant
6561 
6562   bind(RET_FOUND);
6563   // Compute substr offset
6564   subptr(result, str1);
6565   shrl(result, 1); // index
6566 
6567   bind(CLEANUP);
6568   pop(rsp); // restore SP
6569 
6570 } // string_indexof
6571 
6572 // Compare strings.
6573 void MacroAssembler::string_compare(Register str1, Register str2,
6574                                     Register cnt1, Register cnt2, Register result,
6575                                     XMMRegister vec1) {
6576   ShortBranchVerifier sbv(this);
6577   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
6578 
6579   // Compute the minimum of the string lengths and the
6580   // difference of the string lengths (stack).
6581   // Do the conditional move stuff
6582   movl(result, cnt1);
6583   subl(cnt1, cnt2);
6584   push(cnt1);
6585   cmov32(Assembler::lessEqual, cnt2, result);
6586 
6587   // Is the minimum length zero?
6588   testl(cnt2, cnt2);
6589   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
6590 
6591   // Compare first characters
6592   load_unsigned_short(result, Address(str1, 0));
6593   load_unsigned_short(cnt1, Address(str2, 0));
6594   subl(result, cnt1);
6595   jcc(Assembler::notZero,  POP_LABEL);
6596   cmpl(cnt2, 1);
6597   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
6598 
6599   // Check if the strings start at the same location.
6600   cmpptr(str1, str2);
6601   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
6602 
6603   Address::ScaleFactor scale = Address::times_2;
6604   int stride = 8;
6605 
6606   if (UseAVX >= 2 && UseSSE42Intrinsics) {
6607     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
6608     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
6609     Label COMPARE_TAIL_LONG;
6610     int pcmpmask = 0x19;
6611 
6612     // Setup to compare 16-chars (32-bytes) vectors,
6613     // start from first character again because it has aligned address.
6614     int stride2 = 16;
6615     int adr_stride  = stride  << scale;
6616 
6617     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
6618     // rax and rdx are used by pcmpestri as elements counters
6619     movl(result, cnt2);
6620     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
6621     jcc(Assembler::zero, COMPARE_TAIL_LONG);
6622 
6623     // fast path : compare first 2 8-char vectors.
6624     bind(COMPARE_16_CHARS);
6625     movdqu(vec1, Address(str1, 0));
6626     pcmpestri(vec1, Address(str2, 0), pcmpmask);
6627     jccb(Assembler::below, COMPARE_INDEX_CHAR);
6628 
6629     movdqu(vec1, Address(str1, adr_stride));
6630     pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
6631     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
6632     addl(cnt1, stride);
6633 
6634     // Compare the characters at index in cnt1
6635     bind(COMPARE_INDEX_CHAR); //cnt1 has the offset of the mismatching character
6636     load_unsigned_short(result, Address(str1, cnt1, scale));
6637     load_unsigned_short(cnt2, Address(str2, cnt1, scale));
6638     subl(result, cnt2);
6639     jmp(POP_LABEL);
6640 
6641     // Setup the registers to start vector comparison loop
6642     bind(COMPARE_WIDE_VECTORS);
6643     lea(str1, Address(str1, result, scale));
6644     lea(str2, Address(str2, result, scale));
6645     subl(result, stride2);
6646     subl(cnt2, stride2);
6647     jccb(Assembler::zero, COMPARE_WIDE_TAIL);
6648     negptr(result);
6649 
6650     //  In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
6651     bind(COMPARE_WIDE_VECTORS_LOOP);
6652     vmovdqu(vec1, Address(str1, result, scale));
6653     vpxor(vec1, Address(str2, result, scale));
6654     vptest(vec1, vec1);
6655     jccb(Assembler::notZero, VECTOR_NOT_EQUAL);
6656     addptr(result, stride2);
6657     subl(cnt2, stride2);
6658     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
6659     // clean upper bits of YMM registers
6660     vpxor(vec1, vec1);
6661 
6662     // compare wide vectors tail
6663     bind(COMPARE_WIDE_TAIL);
6664     testptr(result, result);
6665     jccb(Assembler::zero, LENGTH_DIFF_LABEL);
6666 
6667     movl(result, stride2);
6668     movl(cnt2, result);
6669     negptr(result);
6670     jmpb(COMPARE_WIDE_VECTORS_LOOP);
6671 
6672     // Identifies the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
6673     bind(VECTOR_NOT_EQUAL);
6674     // clean upper bits of YMM registers
6675     vpxor(vec1, vec1);
6676     lea(str1, Address(str1, result, scale));
6677     lea(str2, Address(str2, result, scale));
6678     jmp(COMPARE_16_CHARS);
6679 
6680     // Compare tail chars, length between 1 to 15 chars
6681     bind(COMPARE_TAIL_LONG);
6682     movl(cnt2, result);
6683     cmpl(cnt2, stride);
6684     jccb(Assembler::less, COMPARE_SMALL_STR);
6685 
6686     movdqu(vec1, Address(str1, 0));
6687     pcmpestri(vec1, Address(str2, 0), pcmpmask);
6688     jcc(Assembler::below, COMPARE_INDEX_CHAR);
6689     subptr(cnt2, stride);
6690     jccb(Assembler::zero, LENGTH_DIFF_LABEL);
6691     lea(str1, Address(str1, result, scale));
6692     lea(str2, Address(str2, result, scale));
6693     negptr(cnt2);
6694     jmpb(WHILE_HEAD_LABEL);
6695 
6696     bind(COMPARE_SMALL_STR);
6697   } else if (UseSSE42Intrinsics) {
6698     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
6699     int pcmpmask = 0x19;
6700     // Setup to compare 8-char (16-byte) vectors,
6701     // start from first character again because it has aligned address.
6702     movl(result, cnt2);
6703     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
6704     jccb(Assembler::zero, COMPARE_TAIL);
6705 
6706     lea(str1, Address(str1, result, scale));
6707     lea(str2, Address(str2, result, scale));
6708     negptr(result);
6709 
6710     // pcmpestri
6711     //   inputs:
6712     //     vec1- substring
6713     //     rax - negative string length (elements count)
6714     //     mem - scanned string
6715     //     rdx - string length (elements count)
6716     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
6717     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
6718     //   outputs:
6719     //     rcx - first mismatched element index
6720     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
6721 
6722     bind(COMPARE_WIDE_VECTORS);
6723     movdqu(vec1, Address(str1, result, scale));
6724     pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
6725     // After pcmpestri cnt1(rcx) contains mismatched element index
6726 
6727     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
6728     addptr(result, stride);
6729     subptr(cnt2, stride);
6730     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
6731 
6732     // compare wide vectors tail
6733     testptr(result, result);
6734     jccb(Assembler::zero, LENGTH_DIFF_LABEL);
6735 
6736     movl(cnt2, stride);
6737     movl(result, stride);
6738     negptr(result);
6739     movdqu(vec1, Address(str1, result, scale));
6740     pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
6741     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
6742 
6743     // Mismatched characters in the vectors
6744     bind(VECTOR_NOT_EQUAL);
6745     addptr(cnt1, result);
6746     load_unsigned_short(result, Address(str1, cnt1, scale));
6747     load_unsigned_short(cnt2, Address(str2, cnt1, scale));
6748     subl(result, cnt2);
6749     jmpb(POP_LABEL);
6750 
6751     bind(COMPARE_TAIL); // limit is zero
6752     movl(cnt2, result);
6753     // Fallthru to tail compare
6754   }
6755   // Shift str2 and str1 to the end of the arrays, negate min
6756   lea(str1, Address(str1, cnt2, scale));
6757   lea(str2, Address(str2, cnt2, scale));
6758   decrementl(cnt2);  // first character was compared already
6759   negptr(cnt2);
6760 
6761   // Compare the rest of the elements
6762   bind(WHILE_HEAD_LABEL);
6763   load_unsigned_short(result, Address(str1, cnt2, scale, 0));
6764   load_unsigned_short(cnt1, Address(str2, cnt2, scale, 0));
6765   subl(result, cnt1);
6766   jccb(Assembler::notZero, POP_LABEL);
6767   increment(cnt2);
6768   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
6769 
6770   // Strings are equal up to min length.  Return the length difference.
6771   bind(LENGTH_DIFF_LABEL);
6772   pop(result);
6773   jmpb(DONE_LABEL);
6774 
6775   // Discard the stored length difference
6776   bind(POP_LABEL);
6777   pop(cnt1);
6778 
6779   // That's it
6780   bind(DONE_LABEL);
6781 }
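
// string_compare implements the usual String.compareTo contract; an
// illustrative scalar sketch (not compiled):
//
//   int compare(jchar* s1, int len1, jchar* s2, int len2) {
//     int min = (len1 < len2) ? len1 : len2;
//     for (int i = 0; i < min; i++) {
//       if (s1[i] != s2[i]) return (int)s1[i] - (int)s2[i];   // first differing char
//     }
//     return len1 - len2;                                     // equal prefix: length difference
//   }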
6782 
6783 // Compare char[] arrays aligned to 4 bytes or substrings.
6784 void MacroAssembler::char_arrays_equals(bool is_array_equ, Register ary1, Register ary2,
6785                                         Register limit, Register result, Register chr,
6786                                         XMMRegister vec1, XMMRegister vec2) {
6787   ShortBranchVerifier sbv(this);
6788   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR;
6789 
6790   int length_offset  = arrayOopDesc::length_offset_in_bytes();
6791   int base_offset    = arrayOopDesc::base_offset_in_bytes(T_CHAR);
6792 
6793   // Check the input args
6794   cmpptr(ary1, ary2);
6795   jcc(Assembler::equal, TRUE_LABEL);
6796 
6797   if (is_array_equ) {
6798     // Need additional checks for arrays_equals.
6799     testptr(ary1, ary1);
6800     jcc(Assembler::zero, FALSE_LABEL);
6801     testptr(ary2, ary2);
6802     jcc(Assembler::zero, FALSE_LABEL);
6803 
6804     // Check the lengths
6805     movl(limit, Address(ary1, length_offset));
6806     cmpl(limit, Address(ary2, length_offset));
6807     jcc(Assembler::notEqual, FALSE_LABEL);
6808   }
6809 
6810   // count == 0
6811   testl(limit, limit);
6812   jcc(Assembler::zero, TRUE_LABEL);
6813 
6814   if (is_array_equ) {
6815     // Load array address
6816     lea(ary1, Address(ary1, base_offset));
6817     lea(ary2, Address(ary2, base_offset));
6818   }
6819 
6820   shll(limit, 1);      // byte count != 0
6821   movl(result, limit); // copy
6822 
6823   if (UseAVX >= 2) {
6824     // With AVX2, use 32-byte vector compare
6825     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
6826 
6827     // Compare 32-byte vectors
6828     andl(result, 0x0000001e);  //   tail count (in bytes)
6829     andl(limit, 0xffffffe0);   // vector count (in bytes)
6830     jccb(Assembler::zero, COMPARE_TAIL);
6831 
6832     lea(ary1, Address(ary1, limit, Address::times_1));
6833     lea(ary2, Address(ary2, limit, Address::times_1));
6834     negptr(limit);
6835 
6836     bind(COMPARE_WIDE_VECTORS);
6837     vmovdqu(vec1, Address(ary1, limit, Address::times_1));
6838     vmovdqu(vec2, Address(ary2, limit, Address::times_1));
6839     vpxor(vec1, vec2);
6840 
6841     vptest(vec1, vec1);
6842     jccb(Assembler::notZero, FALSE_LABEL);
6843     addptr(limit, 32);
6844     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
6845 
6846     testl(result, result);
6847     jccb(Assembler::zero, TRUE_LABEL);
6848 
6849     vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
6850     vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
6851     vpxor(vec1, vec2);
6852 
6853     vptest(vec1, vec1);
6854     jccb(Assembler::notZero, FALSE_LABEL);
6855     jmpb(TRUE_LABEL);
6856 
6857     bind(COMPARE_TAIL); // limit is zero
6858     movl(limit, result);
6859     // Fallthru to tail compare
6860   } else if (UseSSE42Intrinsics) {
6861     // With SSE4.2, use double quad vector compare
6862     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
6863 
6864     // Compare 16-byte vectors
6865     andl(result, 0x0000000e);  //   tail count (in bytes)
6866     andl(limit, 0xfffffff0);   // vector count (in bytes)
6867     jccb(Assembler::zero, COMPARE_TAIL);
6868 
6869     lea(ary1, Address(ary1, limit, Address::times_1));
6870     lea(ary2, Address(ary2, limit, Address::times_1));
6871     negptr(limit);
6872 
6873     bind(COMPARE_WIDE_VECTORS);
6874     movdqu(vec1, Address(ary1, limit, Address::times_1));
6875     movdqu(vec2, Address(ary2, limit, Address::times_1));
6876     pxor(vec1, vec2);
6877 
6878     ptest(vec1, vec1);
6879     jccb(Assembler::notZero, FALSE_LABEL);
6880     addptr(limit, 16);
6881     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
6882 
6883     testl(result, result);
6884     jccb(Assembler::zero, TRUE_LABEL);
6885 
6886     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
6887     movdqu(vec2, Address(ary2, result, Address::times_1, -16));
6888     pxor(vec1, vec2);
6889 
6890     ptest(vec1, vec1);
6891     jccb(Assembler::notZero, FALSE_LABEL);
6892     jmpb(TRUE_LABEL);
6893 
6894     bind(COMPARE_TAIL); // limit is zero
6895     movl(limit, result);
6896     // Fallthru to tail compare
6897   }
6898 
6899   // Compare 4-byte vectors
6900   andl(limit, 0xfffffffc); // vector count (in bytes)
6901   jccb(Assembler::zero, COMPARE_CHAR);
6902 
6903   lea(ary1, Address(ary1, limit, Address::times_1));
6904   lea(ary2, Address(ary2, limit, Address::times_1));
6905   negptr(limit);
6906 
6907   bind(COMPARE_VECTORS);
6908   movl(chr, Address(ary1, limit, Address::times_1));
6909   cmpl(chr, Address(ary2, limit, Address::times_1));
6910   jccb(Assembler::notEqual, FALSE_LABEL);
6911   addptr(limit, 4);
6912   jcc(Assembler::notZero, COMPARE_VECTORS);
6913 
6914   // Compare trailing char (final 2 bytes), if any
6915   bind(COMPARE_CHAR);
6916   testl(result, 0x2);   // tail  char
6917   jccb(Assembler::zero, TRUE_LABEL);
6918   load_unsigned_short(chr, Address(ary1, 0));
6919   load_unsigned_short(limit, Address(ary2, 0));
6920   cmpl(chr, limit);
6921   jccb(Assembler::notEqual, FALSE_LABEL);
6922 
6923   bind(TRUE_LABEL);
6924   movl(result, 1);   // return true
6925   jmpb(DONE);
6926 
6927   bind(FALSE_LABEL);
6928   xorl(result, result); // return false
6929 
6930   // That's it
6931   bind(DONE);
6932   if (UseAVX >= 2) {
6933     // clean upper bits of YMM registers
6934     vpxor(vec1, vec1);
6935     vpxor(vec2, vec2);
6936   }
6937 }
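
// Scalar sketch of the comparison above (illustrative only); for arrays_equals
// the NULL and length checks shown in the code run before this loop:
//
//   bool equals(jchar* a1, jchar* a2, int limit) {   // limit is the element count
//     for (int i = 0; i < limit; i++) {
//       if (a1[i] != a2[i]) return false;
//     }
//     return true;
//   }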
6938 
6939 void MacroAssembler::generate_fill(BasicType t, bool aligned,
6940                                    Register to, Register value, Register count,
6941                                    Register rtmp, XMMRegister xtmp) {
6942   ShortBranchVerifier sbv(this);
6943   assert_different_registers(to, value, count, rtmp);
6944   Label L_exit, L_skip_align1, L_skip_align2, L_fill_byte;
6945   Label L_fill_2_bytes, L_fill_4_bytes;
6946 
6947   int shift = -1;
6948   switch (t) {
6949     case T_BYTE:
6950       shift = 2;
6951       break;
6952     case T_SHORT:
6953       shift = 1;
6954       break;
6955     case T_INT:
6956       shift = 0;
6957       break;
6958     default: ShouldNotReachHere();
6959   }
6960 
6961   if (t == T_BYTE) {
6962     andl(value, 0xff);
6963     movl(rtmp, value);
6964     shll(rtmp, 8);
6965     orl(value, rtmp);
6966   }
6967   if (t == T_SHORT) {
6968     andl(value, 0xffff);
6969   }
6970   if (t == T_BYTE || t == T_SHORT) {
6971     movl(rtmp, value);
6972     shll(rtmp, 16);
6973     orl(value, rtmp);
6974   }
6975 
6976   cmpl(count, 2<<shift); // Short arrays (< 8 bytes) are filled by element
6977   jcc(Assembler::below, L_fill_4_bytes); // use unsigned cmp
6978   if (!UseUnalignedLoadStores && !aligned && (t == T_BYTE || t == T_SHORT)) {
6979     // align source address at 4 bytes address boundary
6980     if (t == T_BYTE) {
6981       // One byte misalignment happens only for byte arrays
6982       testptr(to, 1);
6983       jccb(Assembler::zero, L_skip_align1);
6984       movb(Address(to, 0), value);
6985       increment(to);
6986       decrement(count);
6987       BIND(L_skip_align1);
6988     }
6989     // Two bytes misalignment happens only for byte and short (char) arrays
6990     testptr(to, 2);
6991     jccb(Assembler::zero, L_skip_align2);
6992     movw(Address(to, 0), value);
6993     addptr(to, 2);
6994     subl(count, 1<<(shift-1));
6995     BIND(L_skip_align2);
6996   }
6997   if (UseSSE < 2) {
6998     Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
6999     // Fill 32-byte chunks
7000     subl(count, 8 << shift);
7001     jcc(Assembler::less, L_check_fill_8_bytes);
7002     align(16);
7003 
7004     BIND(L_fill_32_bytes_loop);
7005 
7006     for (int i = 0; i < 32; i += 4) {
7007       movl(Address(to, i), value);
7008     }
7009 
7010     addptr(to, 32);
7011     subl(count, 8 << shift);
7012     jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
7013     BIND(L_check_fill_8_bytes);
7014     addl(count, 8 << shift);
7015     jccb(Assembler::zero, L_exit);
7016     jmpb(L_fill_8_bytes);
7017 
7018     //
7019     // length is too short, just fill qwords
7020     //
7021     BIND(L_fill_8_bytes_loop);
7022     movl(Address(to, 0), value);
7023     movl(Address(to, 4), value);
7024     addptr(to, 8);
7025     BIND(L_fill_8_bytes);
7026     subl(count, 1 << (shift + 1));
7027     jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
7028     // fall through to fill 4 bytes
7029   } else {
7030     Label L_fill_32_bytes;
7031     if (!UseUnalignedLoadStores) {
7032       // align to 8 bytes, we know we are 4 byte aligned to start
7033       testptr(to, 4);
7034       jccb(Assembler::zero, L_fill_32_bytes);
7035       movl(Address(to, 0), value);
7036       addptr(to, 4);
7037       subl(count, 1<<shift);
7038     }
7039     BIND(L_fill_32_bytes);
7040     {
7041       assert( UseSSE >= 2, "supported cpu only" );
7042       Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
7043       if (UseAVX > 2) {
7044         movl(rtmp, 0xffff);
7045 #ifdef _LP64
7046         kmovql(k1, rtmp);
7047 #else
7048         kmovdl(k1, rtmp);
7049 #endif
7050       }
7051       movdl(xtmp, value);
7052       if (UseAVX > 2 && UseUnalignedLoadStores) {
7053         // Fill 64-byte chunks
7054         Label L_fill_64_bytes_loop, L_check_fill_32_bytes;
7055         evpbroadcastd(xtmp, xtmp, Assembler::AVX_512bit);
7056 
7057         subl(count, 16 << shift);
7058         jcc(Assembler::less, L_check_fill_32_bytes);
7059         align(16);
7060 
7061         BIND(L_fill_64_bytes_loop);
7062         evmovdqu(Address(to, 0), xtmp, Assembler::AVX_512bit);
7063         addptr(to, 64);
7064         subl(count, 16 << shift);
7065         jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
7066 
7067         BIND(L_check_fill_32_bytes);
7068         addl(count, 8 << shift);
7069         jccb(Assembler::less, L_check_fill_8_bytes);
7070         evmovdqu(Address(to, 0), xtmp, Assembler::AVX_256bit);
7071         addptr(to, 32);
7072         subl(count, 8 << shift);
7073 
7074         BIND(L_check_fill_8_bytes);
7075       } else if (UseAVX == 2 && UseUnalignedLoadStores) {
7076         // Fill 64-byte chunks
7077         Label L_fill_64_bytes_loop, L_check_fill_32_bytes;
7078         vpbroadcastd(xtmp, xtmp);
7079 
7080         subl(count, 16 << shift);
7081         jcc(Assembler::less, L_check_fill_32_bytes);
7082         align(16);
7083 
7084         BIND(L_fill_64_bytes_loop);
7085         vmovdqu(Address(to, 0), xtmp);
7086         vmovdqu(Address(to, 32), xtmp);
7087         addptr(to, 64);
7088         subl(count, 16 << shift);
7089         jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
7090 
7091         BIND(L_check_fill_32_bytes);
7092         addl(count, 8 << shift);
7093         jccb(Assembler::less, L_check_fill_8_bytes);
7094         vmovdqu(Address(to, 0), xtmp);
7095         addptr(to, 32);
7096         subl(count, 8 << shift);
7097 
7098         BIND(L_check_fill_8_bytes);
7099         // clean upper bits of YMM registers
7100         movdl(xtmp, value);
7101         pshufd(xtmp, xtmp, 0);
7102       } else {
7103         // Fill 32-byte chunks
7104         pshufd(xtmp, xtmp, 0);
7105 
7106         subl(count, 8 << shift);
7107         jcc(Assembler::less, L_check_fill_8_bytes);
7108         align(16);
7109 
7110         BIND(L_fill_32_bytes_loop);
7111 
7112         if (UseUnalignedLoadStores) {
7113           movdqu(Address(to, 0), xtmp);
7114           movdqu(Address(to, 16), xtmp);
7115         } else {
7116           movq(Address(to, 0), xtmp);
7117           movq(Address(to, 8), xtmp);
7118           movq(Address(to, 16), xtmp);
7119           movq(Address(to, 24), xtmp);
7120         }
7121 
7122         addptr(to, 32);
7123         subl(count, 8 << shift);
7124         jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
7125 
7126         BIND(L_check_fill_8_bytes);
7127       }
7128       addl(count, 8 << shift);
7129       jccb(Assembler::zero, L_exit);
7130       jmpb(L_fill_8_bytes);
7131 
7132       //
7133       // length is too short, just fill qwords
7134       //
7135       BIND(L_fill_8_bytes_loop);
7136       movq(Address(to, 0), xtmp);
7137       addptr(to, 8);
7138       BIND(L_fill_8_bytes);
7139       subl(count, 1 << (shift + 1));
7140       jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
7141     }
7142   }
7143   // fill trailing 4 bytes
7144   BIND(L_fill_4_bytes);
7145   testl(count, 1<<shift);
7146   jccb(Assembler::zero, L_fill_2_bytes);
7147   movl(Address(to, 0), value);
7148   if (t == T_BYTE || t == T_SHORT) {
7149     addptr(to, 4);
7150     BIND(L_fill_2_bytes);
7151     // fill trailing 2 bytes
7152     testl(count, 1<<(shift-1));
7153     jccb(Assembler::zero, L_fill_byte);
7154     movw(Address(to, 0), value);
7155     if (t == T_BYTE) {
7156       addptr(to, 2);
7157       BIND(L_fill_byte);
7158       // fill trailing byte
7159       testl(count, 1);
7160       jccb(Assembler::zero, L_exit);
7161       movb(Address(to, 0), value);
7162     } else {
7163       BIND(L_fill_byte);
7164     }
7165   } else {
7166     BIND(L_fill_2_bytes);
7167   }
7168   BIND(L_exit);
7169 }
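
// Scalar equivalent of generate_fill (illustrative only); 'count' is in
// elements of type 't', and the code above pre-replicates 'value' so that
// wider stores write the same pattern:
//
//   void fill(BasicType t, void* to, jint value, int count) {
//     for (int i = 0; i < count; i++) {
//       if      (t == T_BYTE)  ((jbyte*) to)[i] = (jbyte) value;
//       else if (t == T_SHORT) ((jshort*)to)[i] = (jshort)value;
//       else                   ((jint*)  to)[i] = value;
//     }
//   }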
7170 
7171 // encode char[] to byte[] in ISO_8859_1
7172 void MacroAssembler::encode_iso_array(Register src, Register dst, Register len,
7173                                       XMMRegister tmp1Reg, XMMRegister tmp2Reg,
7174                                       XMMRegister tmp3Reg, XMMRegister tmp4Reg,
7175                                       Register tmp5, Register result) {
7176   // rsi: src
7177   // rdi: dst
7178   // rdx: len
7179   // rcx: tmp5
7180   // rax: result
7181   ShortBranchVerifier sbv(this);
7182   assert_different_registers(src, dst, len, tmp5, result);
7183   Label L_done, L_copy_1_char, L_copy_1_char_exit;
7184 
7185   // set result
7186   xorl(result, result);
7187   // check for zero length
7188   testl(len, len);
7189   jcc(Assembler::zero, L_done);
7190   movl(result, len);
7191 
7192   // Setup pointers
7193   lea(src, Address(src, len, Address::times_2)); // char[]
7194   lea(dst, Address(dst, len, Address::times_1)); // byte[]
7195   negptr(len);
7196 
7197   if (UseSSE42Intrinsics || UseAVX >= 2) {
7198     Label L_chars_8_check, L_copy_8_chars, L_copy_8_chars_exit;
7199     Label L_chars_16_check, L_copy_16_chars, L_copy_16_chars_exit;
7200 
7201     if (UseAVX >= 2) {
7202       Label L_chars_32_check, L_copy_32_chars, L_copy_32_chars_exit;
7203       movl(tmp5, 0xff00ff00);   // create mask to test for Unicode chars in vector
7204       movdl(tmp1Reg, tmp5);
7205       vpbroadcastd(tmp1Reg, tmp1Reg);
7206       jmpb(L_chars_32_check);
7207 
7208       bind(L_copy_32_chars);
7209       vmovdqu(tmp3Reg, Address(src, len, Address::times_2, -64));
7210       vmovdqu(tmp4Reg, Address(src, len, Address::times_2, -32));
7211       vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
7212       vptest(tmp2Reg, tmp1Reg);       // check for Unicode chars in  vector
7213       jccb(Assembler::notZero, L_copy_32_chars_exit);
7214       vpackuswb(tmp3Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
7215       vpermq(tmp4Reg, tmp3Reg, 0xD8, /* vector_len */ 1);
7216       vmovdqu(Address(dst, len, Address::times_1, -32), tmp4Reg);
7217 
7218       bind(L_chars_32_check);
7219       addptr(len, 32);
7220       jccb(Assembler::lessEqual, L_copy_32_chars);
7221 
7222       bind(L_copy_32_chars_exit);
7223       subptr(len, 16);
7224       jccb(Assembler::greater, L_copy_16_chars_exit);
7225 
7226     } else if (UseSSE42Intrinsics) {
7227       movl(tmp5, 0xff00ff00);   // create mask to test for Unicode chars in vector
7228       movdl(tmp1Reg, tmp5);
7229       pshufd(tmp1Reg, tmp1Reg, 0);
7230       jmpb(L_chars_16_check);
7231     }
7232 
7233     bind(L_copy_16_chars);
7234     if (UseAVX >= 2) {
7235       vmovdqu(tmp2Reg, Address(src, len, Address::times_2, -32));
7236       vptest(tmp2Reg, tmp1Reg);
7237       jccb(Assembler::notZero, L_copy_16_chars_exit);
7238       vpackuswb(tmp2Reg, tmp2Reg, tmp1Reg, /* vector_len */ 1);
7239       vpermq(tmp3Reg, tmp2Reg, 0xD8, /* vector_len */ 1);
7240     } else {
7241       if (UseAVX > 0) {
7242         movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
7243         movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
7244         vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 0);
7245       } else {
7246         movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
7247         por(tmp2Reg, tmp3Reg);
7248         movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
7249         por(tmp2Reg, tmp4Reg);
7250       }
7251       ptest(tmp2Reg, tmp1Reg);       // check for Unicode chars in  vector
7252       jccb(Assembler::notZero, L_copy_16_chars_exit);
7253       packuswb(tmp3Reg, tmp4Reg);
7254     }
7255     movdqu(Address(dst, len, Address::times_1, -16), tmp3Reg);
7256 
7257     bind(L_chars_16_check);
7258     addptr(len, 16);
7259     jccb(Assembler::lessEqual, L_copy_16_chars);
7260 
7261     bind(L_copy_16_chars_exit);
7262     if (UseAVX >= 2) {
7263       // clean upper bits of YMM registers
7264       vpxor(tmp2Reg, tmp2Reg);
7265       vpxor(tmp3Reg, tmp3Reg);
7266       vpxor(tmp4Reg, tmp4Reg);
7267       movdl(tmp1Reg, tmp5);
7268       pshufd(tmp1Reg, tmp1Reg, 0);
7269     }
7270     subptr(len, 8);
7271     jccb(Assembler::greater, L_copy_8_chars_exit);
7272 
7273     bind(L_copy_8_chars);
7274     movdqu(tmp3Reg, Address(src, len, Address::times_2, -16));
7275     ptest(tmp3Reg, tmp1Reg);
7276     jccb(Assembler::notZero, L_copy_8_chars_exit);
7277     packuswb(tmp3Reg, tmp1Reg);
7278     movq(Address(dst, len, Address::times_1, -8), tmp3Reg);
7279     addptr(len, 8);
7280     jccb(Assembler::lessEqual, L_copy_8_chars);
7281 
7282     bind(L_copy_8_chars_exit);
7283     subptr(len, 8);
7284     jccb(Assembler::zero, L_done);
7285   }
7286 
7287   bind(L_copy_1_char);
7288   load_unsigned_short(tmp5, Address(src, len, Address::times_2, 0));
7289   testl(tmp5, 0xff00);      // check if Unicode char
7290   jccb(Assembler::notZero, L_copy_1_char_exit);
7291   movb(Address(dst, len, Address::times_1, 0), tmp5);
7292   addptr(len, 1);
7293   jccb(Assembler::less, L_copy_1_char);
7294 
7295   bind(L_copy_1_char_exit);
7296   addptr(result, len); // len is the negative count of unprocessed elements
7297   bind(L_done);
7298 }
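
// Scalar sketch of encode_iso_array (illustrative only): copy chars to bytes
// until a char that does not fit in ISO-8859-1 is seen, and return how many
// chars were encoded:
//
//   int encode(jchar* src, jbyte* dst, int len) {
//     int i = 0;
//     for (; i < len; i++) {
//       jchar c = src[i];
//       if (c > 0xFF) break;        // non-Latin-1 char stops the copy
//       dst[i] = (jbyte)c;
//     }
//     return i;
//   }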
7299 
7300 #ifdef _LP64
7301 /**
7302  * Helper for multiply_to_len().
7303  */
7304 void MacroAssembler::add2_with_carry(Register dest_hi, Register dest_lo, Register src1, Register src2) {
7305   addq(dest_lo, src1);
7306   adcq(dest_hi, 0);
7307   addq(dest_lo, src2);
7308   adcq(dest_hi, 0);
7309 }
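
// Treating dest_hi:dest_lo as a single unsigned 128-bit value, the sequence
// above is equivalent to (illustrative only):
//
//   (dest_hi:dest_lo) += (unsigned __int128)src1;
//   (dest_hi:dest_lo) += (unsigned __int128)src2;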
7310 
7311 /**
7312  * Multiply 64 bit by 64 bit first loop.
7313  */
7314 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
7315                                            Register y, Register y_idx, Register z,
7316                                            Register carry, Register product,
7317                                            Register idx, Register kdx) {
7318   //
7319   //  jlong carry, x[], y[], z[];
7320   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
7321   //    huge_128 product = y[idx] * x[xstart] + carry;
7322   //    z[kdx] = (jlong)product;
7323   //    carry  = (jlong)(product >>> 64);
7324   //  }
7325   //  z[xstart] = carry;
7326   //
7327 
7328   Label L_first_loop, L_first_loop_exit;
7329   Label L_one_x, L_one_y, L_multiply;
7330 
7331   decrementl(xstart);
7332   jcc(Assembler::negative, L_one_x);
7333 
7334   movq(x_xstart, Address(x, xstart, Address::times_4,  0));
7335   rorq(x_xstart, 32); // convert big-endian to little-endian
7336 
7337   bind(L_first_loop);
7338   decrementl(idx);
7339   jcc(Assembler::negative, L_first_loop_exit);
7340   decrementl(idx);
7341   jcc(Assembler::negative, L_one_y);
7342   movq(y_idx, Address(y, idx, Address::times_4,  0));
7343   rorq(y_idx, 32); // convert big-endian to little-endian
7344   bind(L_multiply);
7345   movq(product, x_xstart);
7346   mulq(y_idx); // product(rax) * y_idx -> rdx:rax
7347   addq(product, carry);
7348   adcq(rdx, 0);
7349   subl(kdx, 2);
7350   movl(Address(z, kdx, Address::times_4,  4), product);
7351   shrq(product, 32);
7352   movl(Address(z, kdx, Address::times_4,  0), product);
7353   movq(carry, rdx);
7354   jmp(L_first_loop);
7355 
7356   bind(L_one_y);
7357   movl(y_idx, Address(y,  0));
7358   jmp(L_multiply);
7359 
7360   bind(L_one_x);
7361   movl(x_xstart, Address(x,  0));
7362   jmp(L_first_loop);
7363 
7364   bind(L_first_loop_exit);
7365 }
7366 
7367 /**
7368  * Multiply 64 bit by 64 bit and add 128 bit.
7369  */
7370 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y, Register z,
7371                                             Register yz_idx, Register idx,
7372                                             Register carry, Register product, int offset) {
7373   //     huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
7374   //     z[kdx] = (jlong)product;
7375 
7376   movq(yz_idx, Address(y, idx, Address::times_4,  offset));
7377   rorq(yz_idx, 32); // convert big-endian to little-endian
7378   movq(product, x_xstart);
7379   mulq(yz_idx);     // product(rax) * yz_idx -> rdx:product(rax)
7380   movq(yz_idx, Address(z, idx, Address::times_4,  offset));
7381   rorq(yz_idx, 32); // convert big-endian to little-endian
7382 
7383   add2_with_carry(rdx, product, carry, yz_idx);
7384 
7385   movl(Address(z, idx, Address::times_4,  offset+4), product);
7386   shrq(product, 32);
7387   movl(Address(z, idx, Address::times_4,  offset), product);
7388 
7389 }
7390 
7391 /**
7392  * Multiply 128 bit by 128 bit. Unrolled inner loop.
7393  */
7394 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart, Register y, Register z,
7395                                              Register yz_idx, Register idx, Register jdx,
7396                                              Register carry, Register product,
7397                                              Register carry2) {
7398   //   jlong carry, x[], y[], z[];
7399   //   int kdx = ystart+1;
7400   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
7401   //     huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
7402   //     z[kdx+idx+1] = (jlong)product;
7403   //     jlong carry2  = (jlong)(product >>> 64);
7404   //     product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
7405   //     z[kdx+idx] = (jlong)product;
7406   //     carry  = (jlong)(product >>> 64);
7407   //   }
7408   //   idx += 2;
7409   //   if (idx > 0) {
7410   //     product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
7411   //     z[kdx+idx] = (jlong)product;
7412   //     carry  = (jlong)(product >>> 64);
7413   //   }
7414   //
7415 
7416   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
7417 
7418   movl(jdx, idx);
7419   andl(jdx, 0xFFFFFFFC);
7420   shrl(jdx, 2);
7421 
7422   bind(L_third_loop);
7423   subl(jdx, 1);
7424   jcc(Assembler::negative, L_third_loop_exit);
7425   subl(idx, 4);
7426 
7427   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 8);
7428   movq(carry2, rdx);
7429 
7430   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product, 0);
7431   movq(carry, rdx);
7432   jmp(L_third_loop);
7433 
7434   bind (L_third_loop_exit);
7435 
7436   andl (idx, 0x3);
7437   jcc(Assembler::zero, L_post_third_loop_done);
7438 
7439   Label L_check_1;
7440   subl(idx, 2);
7441   jcc(Assembler::negative, L_check_1);
7442 
7443   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 0);
7444   movq(carry, rdx);
7445 
7446   bind (L_check_1);
7447   addl (idx, 0x2);
7448   andl (idx, 0x1);
7449   subl(idx, 1);
7450   jcc(Assembler::negative, L_post_third_loop_done);
7451 
7452   movl(yz_idx, Address(y, idx, Address::times_4,  0));
7453   movq(product, x_xstart);
7454   mulq(yz_idx); // product(rax) * yz_idx -> rdx:product(rax)
7455   movl(yz_idx, Address(z, idx, Address::times_4,  0));
7456 
7457   add2_with_carry(rdx, product, yz_idx, carry);
7458 
7459   movl(Address(z, idx, Address::times_4,  0), product);
7460   shrq(product, 32);
7461 
7462   shlq(rdx, 32);
7463   orq(product, rdx);
7464   movq(carry, product);
7465 
7466   bind(L_post_third_loop_done);
7467 }
7468 
7469 /**
7470  * Multiply 128 bit by 128 bit using BMI2. Unrolled inner loop.
7471  *
7472  */
7473 void MacroAssembler::multiply_128_x_128_bmi2_loop(Register y, Register z,
7474                                                   Register carry, Register carry2,
7475                                                   Register idx, Register jdx,
7476                                                   Register yz_idx1, Register yz_idx2,
7477                                                   Register tmp, Register tmp3, Register tmp4) {
7478   assert(UseBMI2Instructions, "should be used only when BMI2 is available");
7479 
7480   //   jlong carry, x[], y[], z[];
7481   //   int kdx = ystart+1;
7482   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
7483   //     huge_128 tmp3 = (y[idx+1] * rdx) + z[kdx+idx+1] + carry;
7484   //     jlong carry2  = (jlong)(tmp3 >>> 64);
7485   //     huge_128 tmp4 = (y[idx]   * rdx) + z[kdx+idx] + carry2;
7486   //     carry  = (jlong)(tmp4 >>> 64);
7487   //     z[kdx+idx+1] = (jlong)tmp3;
7488   //     z[kdx+idx] = (jlong)tmp4;
7489   //   }
7490   //   idx += 2;
7491   //   if (idx > 0) {
7492   //     yz_idx1 = (y[idx] * rdx) + z[kdx+idx] + carry;
7493   //     z[kdx+idx] = (jlong)yz_idx1;
7494   //     carry  = (jlong)(yz_idx1 >>> 64);
7495   //   }
7496   //
7497 
7498   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
7499 
7500   movl(jdx, idx);
7501   andl(jdx, 0xFFFFFFFC);
7502   shrl(jdx, 2);
7503 
7504   bind(L_third_loop);
7505   subl(jdx, 1);
7506   jcc(Assembler::negative, L_third_loop_exit);
7507   subl(idx, 4);
7508 
7509   movq(yz_idx1,  Address(y, idx, Address::times_4,  8));
7510   rorxq(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
7511   movq(yz_idx2, Address(y, idx, Address::times_4,  0));
7512   rorxq(yz_idx2, yz_idx2, 32);
7513 
7514   mulxq(tmp4, tmp3, yz_idx1);  //  yz_idx1 * rdx -> tmp4:tmp3
7515   mulxq(carry2, tmp, yz_idx2); //  yz_idx2 * rdx -> carry2:tmp
7516 
7517   movq(yz_idx1,  Address(z, idx, Address::times_4,  8));
7518   rorxq(yz_idx1, yz_idx1, 32);
7519   movq(yz_idx2, Address(z, idx, Address::times_4,  0));
7520   rorxq(yz_idx2, yz_idx2, 32);
7521 
7522   if (VM_Version::supports_adx()) {
7523     adcxq(tmp3, carry);
7524     adoxq(tmp3, yz_idx1);
7525 
7526     adcxq(tmp4, tmp);
7527     adoxq(tmp4, yz_idx2);
7528 
7529     movl(carry, 0); // does not affect flags
7530     adcxq(carry2, carry);
7531     adoxq(carry2, carry);
7532   } else {
7533     add2_with_carry(tmp4, tmp3, carry, yz_idx1);
7534     add2_with_carry(carry2, tmp4, tmp, yz_idx2);
7535   }
7536   movq(carry, carry2);
7537 
7538   movl(Address(z, idx, Address::times_4, 12), tmp3);
7539   shrq(tmp3, 32);
7540   movl(Address(z, idx, Address::times_4,  8), tmp3);
7541 
7542   movl(Address(z, idx, Address::times_4,  4), tmp4);
7543   shrq(tmp4, 32);
7544   movl(Address(z, idx, Address::times_4,  0), tmp4);
7545 
7546   jmp(L_third_loop);
7547 
7548   bind (L_third_loop_exit);
7549 
7550   andl (idx, 0x3);
7551   jcc(Assembler::zero, L_post_third_loop_done);
7552 
7553   Label L_check_1;
7554   subl(idx, 2);
7555   jcc(Assembler::negative, L_check_1);
7556 
7557   movq(yz_idx1, Address(y, idx, Address::times_4,  0));
7558   rorxq(yz_idx1, yz_idx1, 32);
7559   mulxq(tmp4, tmp3, yz_idx1); //  yz_idx1 * rdx -> tmp4:tmp3
7560   movq(yz_idx2, Address(z, idx, Address::times_4,  0));
7561   rorxq(yz_idx2, yz_idx2, 32);
7562 
7563   add2_with_carry(tmp4, tmp3, carry, yz_idx2);
7564 
7565   movl(Address(z, idx, Address::times_4,  4), tmp3);
7566   shrq(tmp3, 32);
7567   movl(Address(z, idx, Address::times_4,  0), tmp3);
7568   movq(carry, tmp4);
7569 
7570   bind (L_check_1);
7571   addl (idx, 0x2);
7572   andl (idx, 0x1);
7573   subl(idx, 1);
7574   jcc(Assembler::negative, L_post_third_loop_done);
7575   movl(tmp4, Address(y, idx, Address::times_4,  0));
7576   mulxq(carry2, tmp3, tmp4);  //  tmp4 * rdx -> carry2:tmp3
7577   movl(tmp4, Address(z, idx, Address::times_4,  0));
7578 
7579   add2_with_carry(carry2, tmp3, tmp4, carry);
7580 
7581   movl(Address(z, idx, Address::times_4,  0), tmp3);
7582   shrq(tmp3, 32);
7583 
7584   shlq(carry2, 32);
7585   orq(tmp3, carry2);
7586   movq(carry, tmp3);
7587 
7588   bind(L_post_third_loop_done);
7589 }
7590 
7591 /**
7592  * Code for BigInteger::multiplyToLen() intrinsic.
7593  *
7594  * rdi: x
7595  * rax: xlen
7596  * rsi: y
7597  * rcx: ylen
7598  * r8:  z
7599  * r11: zlen
7600  * r12: tmp1
7601  * r13: tmp2
7602  * r14: tmp3
7603  * r15: tmp4
7604  * rbx: tmp5
7605  *
7606  */
7607 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, Register zlen,
7608                                      Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) {
7609   ShortBranchVerifier sbv(this);
7610   assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, rdx);
7611 
7612   push(tmp1);
7613   push(tmp2);
7614   push(tmp3);
7615   push(tmp4);
7616   push(tmp5);
7617 
7618   push(xlen);
7619   push(zlen);
7620 
7621   const Register idx = tmp1;
7622   const Register kdx = tmp2;
7623   const Register xstart = tmp3;
7624 
7625   const Register y_idx = tmp4;
7626   const Register carry = tmp5;
7627   const Register product  = xlen;
7628   const Register x_xstart = zlen;  // reuse register
7629 
7630   // First Loop.
7631   //
7632   //  final static long LONG_MASK = 0xffffffffL;
7633   //  int xstart = xlen - 1;
7634   //  int ystart = ylen - 1;
7635   //  long carry = 0;
7636   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
7637   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
7638   //    z[kdx] = (int)product;
7639   //    carry = product >>> 32;
7640   //  }
7641   //  z[xstart] = (int)carry;
7642   //
7643 
7644   movl(idx, ylen);      // idx = ylen;
7645   movl(kdx, zlen);      // kdx = xlen+ylen;
7646   xorq(carry, carry);   // carry = 0;
7647 
7648   Label L_done;
7649 
7650   movl(xstart, xlen);
7651   decrementl(xstart);
7652   jcc(Assembler::negative, L_done);
7653 
7654   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
7655 
7656   Label L_second_loop;
7657   testl(kdx, kdx);
7658   jcc(Assembler::zero, L_second_loop);
7659 
7660   Label L_carry;
7661   subl(kdx, 1);
7662   jcc(Assembler::zero, L_carry);
7663 
7664   movl(Address(z, kdx, Address::times_4,  0), carry);
7665   shrq(carry, 32);
7666   subl(kdx, 1);
7667 
7668   bind(L_carry);
7669   movl(Address(z, kdx, Address::times_4,  0), carry);
7670 
7671   // Second and third (nested) loops.
7672   //
7673   // for (int i = xstart-1; i >= 0; i--) { // Second loop
7674   //   carry = 0;
7675   //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
7676   //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
7677   //                    (z[k] & LONG_MASK) + carry;
7678   //     z[k] = (int)product;
7679   //     carry = product >>> 32;
7680   //   }
7681   //   z[i] = (int)carry;
7682   // }
7683   //
7684   // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx
7685 
7686   const Register jdx = tmp1;
7687 
7688   bind(L_second_loop);
7689   xorl(carry, carry);    // carry = 0;
7690   movl(jdx, ylen);       // j = ystart+1
7691 
7692   subl(xstart, 1);       // i = xstart-1;
7693   jcc(Assembler::negative, L_done);
7694 
7695   push (z);
7696 
7697   Label L_last_x;
7698   lea(z, Address(z, xstart, Address::times_4, 4)); // z = z + k - j
7699   subl(xstart, 1);       // i = xstart-1;
7700   jcc(Assembler::negative, L_last_x);
7701 
7702   if (UseBMI2Instructions) {
7703     movq(rdx,  Address(x, xstart, Address::times_4,  0));
7704     rorxq(rdx, rdx, 32); // convert big-endian to little-endian
7705   } else {
7706     movq(x_xstart, Address(x, xstart, Address::times_4,  0));
7707     rorq(x_xstart, 32);  // convert big-endian to little-endian
7708   }
7709 
7710   Label L_third_loop_prologue;
7711   bind(L_third_loop_prologue);
7712 
7713   push (x);
7714   push (xstart);
7715   push (ylen);
7716 
7717 
7718   if (UseBMI2Instructions) {
7719     multiply_128_x_128_bmi2_loop(y, z, carry, x, jdx, ylen, product, tmp2, x_xstart, tmp3, tmp4);
7720   } else { // !UseBMI2Instructions
7721     multiply_128_x_128_loop(x_xstart, y, z, y_idx, jdx, ylen, carry, product, x);
7722   }
7723 
7724   pop(ylen);
7725   pop(xlen);
7726   pop(x);
7727   pop(z);
7728 
7729   movl(tmp3, xlen);
7730   addl(tmp3, 1);
7731   movl(Address(z, tmp3, Address::times_4,  0), carry);
7732   subl(tmp3, 1);
7733   jccb(Assembler::negative, L_done);
7734 
7735   shrq(carry, 32);
7736   movl(Address(z, tmp3, Address::times_4,  0), carry);
7737   jmp(L_second_loop);
7738 
7739   // The following infrequently executed code is placed outside the loops.
7740   bind(L_last_x);
7741   if (UseBMI2Instructions) {
7742     movl(rdx, Address(x,  0));
7743   } else {
7744     movl(x_xstart, Address(x,  0));
7745   }
7746   jmp(L_third_loop_prologue);
7747 
7748   bind(L_done);
7749 
7750   pop(zlen);
7751   pop(xlen);
7752 
7753   pop(tmp5);
7754   pop(tmp4);
7755   pop(tmp3);
7756   pop(tmp2);
7757   pop(tmp1);
7758 }
7759 
7760 // Helper functions for square_to_len()
7761 
7762 /**
7763  * Store the squares of x[], right shifted one bit (divided by 2) into z[]
7764  * Preserves x and z and modifies rest of the registers.
7765  */
7766 
7767 void MacroAssembler::square_rshift(Register x, Register xlen, Register z, Register tmp1, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
7768   // Perform square and right shift by 1
7769   // Handle odd xlen case first, then for even xlen do the following
7770   // jlong carry = 0;
7771   // for (int j=0, i=0; j < xlen; j+=2, i+=4) {
7772   //     huge_128 product = x[j:j+1] * x[j:j+1];
7773   //     z[i:i+1] = (carry << 63) | (jlong)(product >>> 65);
7774   //     z[i+2:i+3] = (jlong)(product >>> 1);
7775   //     carry = (jlong)product;
7776   // }
7777 
7778   xorq(tmp5, tmp5);     // carry
7779   xorq(rdxReg, rdxReg);
7780   xorl(tmp1, tmp1);     // index for x
7781   xorl(tmp4, tmp4);     // index for z
7782 
7783   Label L_first_loop, L_first_loop_exit;
7784 
7785   testl(xlen, 1);
7786   jccb(Assembler::zero, L_first_loop); //jump if xlen is even
7787 
7788   // Square and right shift by 1 the odd element using 32 bit multiply
7789   movl(raxReg, Address(x, tmp1, Address::times_4, 0));
7790   imulq(raxReg, raxReg);
7791   shrq(raxReg, 1);
7792   adcq(tmp5, 0);
7793   movq(Address(z, tmp4, Address::times_4, 0), raxReg);
7794   incrementl(tmp1);
7795   addl(tmp4, 2);
7796 
7797   // Square and right shift by 1 the rest using 64 bit multiply
7798   bind(L_first_loop);
7799   cmpptr(tmp1, xlen);
7800   jccb(Assembler::equal, L_first_loop_exit);
7801 
7802   // Square
7803   movq(raxReg, Address(x, tmp1, Address::times_4,  0));
7804   rorq(raxReg, 32);    // convert big-endian to little-endian
7805   mulq(raxReg);        // 64-bit multiply rax * rax -> rdx:rax
7806 
7807   // Right shift by 1 and save carry
7808   shrq(tmp5, 1);       // rdx:rax:tmp5 = (tmp5:rdx:rax) >>> 1
7809   rcrq(rdxReg, 1);
7810   rcrq(raxReg, 1);
7811   adcq(tmp5, 0);
7812 
7813   // Store result in z
7814   movq(Address(z, tmp4, Address::times_4, 0), rdxReg);
7815   movq(Address(z, tmp4, Address::times_4, 8), raxReg);
7816 
7817   // Update indices for x and z
7818   addl(tmp1, 2);
7819   addl(tmp4, 4);
7820   jmp(L_first_loop);
7821 
7822   bind(L_first_loop_exit);
7823 }
7824 
7825 
7826 /**
7827  * Perform the following multiply add operation using BMI2 instructions
7828  * carry:sum = sum + op1*op2 + carry
7829  * op2 should be in rdx
7830  * op2 is preserved, all other registers are modified
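      *
      * For illustration only (a C-like sketch of the result, not emitted code):
      *   unsigned __int128 t = (unsigned __int128)op1 * op2 + sum + carry;
      *   sum   = (uint64_t)t;          // low 64 bits written back to sum
      *   carry = (uint64_t)(t >> 64);  // high 64 bits become the new carry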
7831  */
7832 void MacroAssembler::multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry, Register tmp2) {
7833   // assert op2 is rdx
7834   mulxq(tmp2, op1, op1);  //  op1 * op2 -> tmp2:op1
7835   addq(sum, carry);
7836   adcq(tmp2, 0);
7837   addq(sum, op1);
7838   adcq(tmp2, 0);
7839   movq(carry, tmp2);
7840 }
7841 
7842 /**
7843  * Perform the following multiply add operation:
7844  * carry:sum = sum + op1*op2 + carry
7845  * Preserves op1, op2 and modifies rest of registers
7846  */
7847 void MacroAssembler::multiply_add_64(Register sum, Register op1, Register op2, Register carry, Register rdxReg, Register raxReg) {
7848   // rdx:rax = op1 * op2
7849   movq(raxReg, op2);
7850   mulq(op1);
7851 
7852   //  rdx:rax = sum + carry + rdx:rax
7853   addq(sum, carry);
7854   adcq(rdxReg, 0);
7855   addq(sum, raxReg);
7856   adcq(rdxReg, 0);
7857 
7858   // carry:sum = rdx:sum
7859   movq(carry, rdxReg);
7860 }
7861 
7862 /**
7863  * Add 64 bit long carry into z[] with carry propagation.
7864  * Preserves z and carry register values and modifies rest of registers.
7865  *
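      * Roughly equivalent to the following sketch (addAt() is a hypothetical
      * helper that adds a value into the 64-bit word at int index i of z[] and
      * reports whether the addition overflowed):
      *   boolean ovf = addAt(z, zlen - 2, carry);
      *   for (int i = zlen - 4; ovf && i >= 0; i -= 2)
      *     ovf = addAt(z, i, 1);        // propagate the carry outward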
7866  */
7867 void MacroAssembler::add_one_64(Register z, Register zlen, Register carry, Register tmp1) {
7868   Label L_fourth_loop, L_fourth_loop_exit;
7869 
7870   movl(tmp1, 1);
7871   subl(zlen, 2);
7872   addq(Address(z, zlen, Address::times_4, 0), carry);
7873 
7874   bind(L_fourth_loop);
7875   jccb(Assembler::carryClear, L_fourth_loop_exit);
7876   subl(zlen, 2);
7877   jccb(Assembler::negative, L_fourth_loop_exit);
7878   addq(Address(z, zlen, Address::times_4, 0), tmp1);
7879   jmp(L_fourth_loop);
7880   bind(L_fourth_loop_exit);
7881 }
7882 
7883 /**
7884  * Shift z[] left by 1 bit.
7885  * Preserves x, len, z and zlen registers and modifies rest of the registers.
7886  *
7887  */
7888 void MacroAssembler::lshift_by_1(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4) {
7889 
7890   Label L_fifth_loop, L_fifth_loop_exit;
7891 
7892   // Fifth loop
7893   // Perform primitiveLeftShift(z, zlen, 1)
7894 
7895   const Register prev_carry = tmp1;
7896   const Register new_carry = tmp4;
7897   const Register value = tmp2;
7898   const Register zidx = tmp3;
7899 
7900   // int zidx, carry;
7901   // long value;
7902   // carry = 0;
7903   // for (zidx = zlen-2; zidx >= 0; zidx -= 2) {
7904   //    (carry:value)  = (z[zidx] << 1) | carry ;
7905   //    z[zidx] = value;
7906   // }
7907 
7908   movl(zidx, zlen);
7909   xorl(prev_carry, prev_carry); // clear carry flag and prev_carry register
7910 
7911   bind(L_fifth_loop);
7912   decl(zidx);  // Use decl to preserve carry flag
7913   decl(zidx);
7914   jccb(Assembler::negative, L_fifth_loop_exit);
7915 
7916   if (UseBMI2Instructions) {
7917      movq(value, Address(z, zidx, Address::times_4, 0));
7918      rclq(value, 1);
7919      rorxq(value, value, 32);
7920      movq(Address(z, zidx, Address::times_4,  0), value);  // Store back in big endian form
7921   }
7922   else {
7923     // clear new_carry
7924     xorl(new_carry, new_carry);
7925 
7926     // Shift z[i] by 1, or in previous carry and save new carry
7927     movq(value, Address(z, zidx, Address::times_4, 0));
7928     shlq(value, 1);
7929     adcl(new_carry, 0);
7930 
7931     orq(value, prev_carry);
7932     rorq(value, 0x20);
7933     movq(Address(z, zidx, Address::times_4,  0), value);  // Store back in big endian form
7934 
7935     // Set previous carry = new carry
7936     movl(prev_carry, new_carry);
7937   }
7938   jmp(L_fifth_loop);
7939 
7940   bind(L_fifth_loop_exit);
7941 }
7942 
7943 
7944 /**
7945  * Code for BigInteger::squareToLen() intrinsic
7946  *
7947  * rdi: x
7948  * rsi: len
7949  * r8:  z
7950  * rcx: zlen
7951  * r12: tmp1
7952  * r13: tmp2
7953  * r14: tmp3
7954  * r15: tmp4
7955  * rbx: tmp5
7956  *
7957  */
7958 void MacroAssembler::square_to_len(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
7959 
7960   Label L_second_loop, L_second_loop_exit, L_third_loop, L_third_loop_exit, fifth_loop, fifth_loop_exit, L_last_x, L_multiply;
7961   push(tmp1);
7962   push(tmp2);
7963   push(tmp3);
7964   push(tmp4);
7965   push(tmp5);
7966 
7967   // First loop
7968   // Store the squares, right shifted one bit (i.e., divided by 2).
7969   square_rshift(x, len, z, tmp1, tmp3, tmp4, tmp5, rdxReg, raxReg);
7970 
7971   // Add in off-diagonal sums.
7972   //
7973   // Second, third (nested) and fourth loops.
7974   // zlen +=2;
7975   // for (int xidx=len-2,zidx=zlen-4; xidx > 0; xidx-=2,zidx-=4) {
7976   //    carry = 0;
7977   //    long op2 = x[xidx:xidx+1];
7978   //    for (int j=xidx-2,k=zidx; j >= 0; j-=2) {
7979   //       k -= 2;
7980   //       long op1 = x[j:j+1];
7981   //       long sum = z[k:k+1];
7982   //       carry:sum = multiply_add_64(sum, op1, op2, carry, tmp_regs);
7983   //       z[k:k+1] = sum;
7984   //    }
7985   //    add_one_64(z, k, carry, tmp_regs);
7986   // }
7987 
7988   const Register carry = tmp5;
7989   const Register sum = tmp3;
7990   const Register op1 = tmp4;
7991   Register op2 = tmp2;
7992 
7993   push(zlen);
7994   push(len);
7995   addl(zlen,2);
7996   bind(L_second_loop);
7997   xorq(carry, carry);
7998   subl(zlen, 4);
7999   subl(len, 2);
8000   push(zlen);
8001   push(len);
8002   cmpl(len, 0);
8003   jccb(Assembler::lessEqual, L_second_loop_exit);
8004 
8005   // Multiply an array by one 64 bit long.
8006   if (UseBMI2Instructions) {
8007     op2 = rdxReg;
8008     movq(op2, Address(x, len, Address::times_4,  0));
8009     rorxq(op2, op2, 32);
8010   }
8011   else {
8012     movq(op2, Address(x, len, Address::times_4,  0));
8013     rorq(op2, 32);
8014   }
8015 
8016   bind(L_third_loop);
8017   decrementl(len);
8018   jccb(Assembler::negative, L_third_loop_exit);
8019   decrementl(len);
8020   jccb(Assembler::negative, L_last_x);
8021 
8022   movq(op1, Address(x, len, Address::times_4,  0));
8023   rorq(op1, 32);
8024 
8025   bind(L_multiply);
8026   subl(zlen, 2);
8027   movq(sum, Address(z, zlen, Address::times_4,  0));
8028 
8029   // Multiply 64 bit by 64 bit; add the lower 64 bits of the product and propagate the upper 64 bits into the carry.
8030   if (UseBMI2Instructions) {
8031     multiply_add_64_bmi2(sum, op1, op2, carry, tmp2);
8032   }
8033   else {
8034     multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
8035   }
8036 
8037   movq(Address(z, zlen, Address::times_4, 0), sum);
8038 
8039   jmp(L_third_loop);
8040   bind(L_third_loop_exit);
8041 
8042   // Fourth loop
8043   // Add the 64 bit long carry into z with carry propagation.
8044   // Uses the already-adjusted zlen.
8045   add_one_64(z, zlen, carry, tmp1);
8046 
8047   pop(len);
8048   pop(zlen);
8049   jmp(L_second_loop);
8050 
8051   // The following infrequently executed code is placed outside the loops.
8052   bind(L_last_x);
8053   movl(op1, Address(x, 0));
8054   jmp(L_multiply);
8055 
8056   bind(L_second_loop_exit);
8057   pop(len);
8058   pop(zlen);
8059   pop(len);
8060   pop(zlen);
8061 
8062   // Fifth loop
8063   // Shift z left 1 bit.
8064   lshift_by_1(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4);
8065 
8066   // z[zlen-1] |= x[len-1] & 1;
8067   movl(tmp3, Address(x, len, Address::times_4, -4));
8068   andl(tmp3, 1);
8069   orl(Address(z, zlen, Address::times_4,  -4), tmp3);
8070 
8071   pop(tmp5);
8072   pop(tmp4);
8073   pop(tmp3);
8074   pop(tmp2);
8075   pop(tmp1);
8076 }
8077 
8078 /**
8079  * Helper function for mul_add()
8080  * Multiply in[] by the int k and add the result to out[] starting at offset
8081  * offs, using 128 bit by 32 bit multiplies; the carry is returned in tmp5.
8082  * Only a quad-int-aligned number of elements of in[] is processed here.
8083  * k is in rdxReg when BMI2 instructions are used, otherwise it is in tmp2.
8084  * This function preserves the out, in and k registers.
8085  * len and offset point to the appropriate indices in "in" and "out" respectively.
8086  * tmp5 holds the carry.
8087  * The other registers are temporaries and are modified.
8088  *
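      * For illustration (a sketch, not emitted code; limb(a, i) denotes the
      * big-endian 64-bit value a[i]*2^32 + a[i+1]):
      *   while (len >= 4) {
      *     len -= 4; offset -= 4;
      *     // less significant limb of the group first, then the next one
      *     carry:limb(out, offset+2) = limb(in, len+2) * k + limb(out, offset+2) + carry;
      *     carry:limb(out, offset)   = limb(in, len)   * k + limb(out, offset)   + carry;
      *   }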
8089  */
8090 void MacroAssembler::mul_add_128_x_32_loop(Register out, Register in,
8091   Register offset, Register len, Register tmp1, Register tmp2, Register tmp3,
8092   Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
8093 
8094   Label L_first_loop, L_first_loop_exit;
8095 
8096   movl(tmp1, len);
8097   shrl(tmp1, 2);
8098 
8099   bind(L_first_loop);
8100   subl(tmp1, 1);
8101   jccb(Assembler::negative, L_first_loop_exit);
8102 
8103   subl(len, 4);
8104   subl(offset, 4);
8105 
8106   Register op2 = tmp2;
8107   const Register sum = tmp3;
8108   const Register op1 = tmp4;
8109   const Register carry = tmp5;
8110 
8111   if (UseBMI2Instructions) {
8112     op2 = rdxReg;
8113   }
8114 
8115   movq(op1, Address(in, len, Address::times_4,  8));
8116   rorq(op1, 32);
8117   movq(sum, Address(out, offset, Address::times_4,  8));
8118   rorq(sum, 32);
8119   if (UseBMI2Instructions) {
8120     multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
8121   }
8122   else {
8123     multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
8124   }
8125   // Store back in big endian from little endian
8126   rorq(sum, 0x20);
8127   movq(Address(out, offset, Address::times_4,  8), sum);
8128 
8129   movq(op1, Address(in, len, Address::times_4,  0));
8130   rorq(op1, 32);
8131   movq(sum, Address(out, offset, Address::times_4,  0));
8132   rorq(sum, 32);
8133   if (UseBMI2Instructions) {
8134     multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
8135   }
8136   else {
8137     multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
8138   }
8139   // Store back in big endian from little endian
8140   rorq(sum, 0x20);
8141   movq(Address(out, offset, Address::times_4,  0), sum);
8142 
8143   jmp(L_first_loop);
8144   bind(L_first_loop_exit);
8145 }
8146 
8147 /**
8148  * Code for BigInteger::mulAdd() intrinsic
8149  *
8150  * rdi: out
8151  * rsi: in
8152  * r11: offs (out.length - offset)
8153  * rcx: len
8154  * r8:  k
8155  * r12: tmp1
8156  * r13: tmp2
8157  * r14: tmp3
8158  * r15: tmp4
8159  * rbx: tmp5
8160  * Multiply the in[] by word k and add to out[], return the carry in rax
8161  */
8162 void MacroAssembler::mul_add(Register out, Register in, Register offs,
8163    Register len, Register k, Register tmp1, Register tmp2, Register tmp3,
8164    Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
8165 
8166   Label L_carry, L_last_in, L_done;
8167 
8168 // carry = 0;
8169 // for (int j=len-1; j >= 0; j--) {
8170 //    long product = (in[j] & LONG_MASK) * kLong +
8171 //                   (out[offs] & LONG_MASK) + carry;
8172 //    out[offs--] = (int)product;
8173 //    carry = product >>> 32;
8174 // }
8175 //
8176   push(tmp1);
8177   push(tmp2);
8178   push(tmp3);
8179   push(tmp4);
8180   push(tmp5);
8181 
8182   Register op2 = tmp2;
8183   const Register sum = tmp3;
8184   const Register op1 = tmp4;
8185   const Register carry =  tmp5;
8186 
8187   if (UseBMI2Instructions) {
8188     op2 = rdxReg;
8189     movl(op2, k);
8190   }
8191   else {
8192     movl(op2, k);
8193   }
8194 
8195   xorq(carry, carry);
8196 
8197   // First loop
8198 
8199   // Multiply in[] by k in a 4-way unrolled loop using 128 bit by 32 bit multiply
8200   // The carry is in tmp5
8201   mul_add_128_x_32_loop(out, in, offs, len, tmp1, tmp2, tmp3, tmp4, tmp5, rdxReg, raxReg);
8202 
8203   // Multiply the trailing in[] entry using 64 bit by 32 bit, if any
8204   decrementl(len);
8205   jccb(Assembler::negative, L_carry);
8206   decrementl(len);
8207   jccb(Assembler::negative, L_last_in);
8208 
8209   movq(op1, Address(in, len, Address::times_4,  0));
8210   rorq(op1, 32);
8211 
8212   subl(offs, 2);
8213   movq(sum, Address(out, offs, Address::times_4,  0));
8214   rorq(sum, 32);
8215 
8216   if (UseBMI2Instructions) {
8217     multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
8218   }
8219   else {
8220     multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
8221   }
8222 
8223   // Store back in big endian from little endian
8224   rorq(sum, 0x20);
8225   movq(Address(out, offs, Address::times_4,  0), sum);
8226 
8227   testl(len, len);
8228   jccb(Assembler::zero, L_carry);
8229 
8230   // Multiply the last in[] entry, if any
8231   bind(L_last_in);
8232   movl(op1, Address(in, 0));
8233   movl(sum, Address(out, offs, Address::times_4,  -4));
8234 
8235   movl(raxReg, k);
8236   mull(op1); //tmp4 * eax -> edx:eax
8237   addl(sum, carry);
8238   adcl(rdxReg, 0);
8239   addl(sum, raxReg);
8240   adcl(rdxReg, 0);
8241   movl(carry, rdxReg);
8242 
8243   movl(Address(out, offs, Address::times_4,  -4), sum);
8244 
8245   bind(L_carry);
8246   // return tmp5/carry as carry in rax
8247   movl(rax, carry);
8248 
8249   bind(L_done);
8250   pop(tmp5);
8251   pop(tmp4);
8252   pop(tmp3);
8253   pop(tmp2);
8254   pop(tmp1);
8255 }
8256 #endif
8257 
8258 /**
8259  * Emits code to update CRC-32 with a byte value according to constants in table
8260  *
8261  * @param [in,out]crc   Register containing the crc.
8262  * @param [in]val       Register containing the byte to fold into the CRC.
8263  * @param [in]table     Register containing the table of crc constants.
8264  *
8265  * uint32_t crc;
8266  * val = crc_table[(val ^ crc) & 0xFF];
8267  * crc = val ^ (crc >> 8);
8268  *
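      * As a usage sketch, a plain byte-at-a-time CRC-32 loop built from this
      * update step would look like (illustrative C, not emitted code):
      *   while (len--) {
      *     crc = crc_table[(crc ^ *buf++) & 0xFF] ^ (crc >> 8);
      *   }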
8269  */
8270 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
8271   xorl(val, crc);
8272   andl(val, 0xFF);
8273   shrl(crc, 8); // unsigned shift
8274   xorl(crc, Address(table, val, Address::times_4, 0));
8275 }
8276 
8277 /**
8278  * Fold 128-bit data chunk
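      *
      * A sketch of the math (clmul() is a 64x64 -> 128 bit carry-less multiply;
      * hi/lo denote the upper/lower 64-bit halves):
      *   xcrc = clmul(xK.hi, xcrc.hi) ^ clmul(xK.lo, xcrc.lo) ^ buf[offset .. offset+15]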
8279  */
8280 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset) {
8281   if (UseAVX > 0) {
8282     vpclmulhdq(xtmp, xK, xcrc); // [123:64]
8283     vpclmulldq(xcrc, xK, xcrc); // [63:0]
8284     vpxor(xcrc, xcrc, Address(buf, offset), 0 /* vector_len */);
8285     pxor(xcrc, xtmp);
8286   } else {
8287     movdqa(xtmp, xcrc);
8288     pclmulhdq(xtmp, xK);   // [123:64]
8289     pclmulldq(xcrc, xK);   // [63:0]
8290     pxor(xcrc, xtmp);
8291     movdqu(xtmp, Address(buf, offset));
8292     pxor(xcrc, xtmp);
8293   }
8294 }
8295 
8296 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, XMMRegister xbuf) {
8297   if (UseAVX > 0) {
8298     vpclmulhdq(xtmp, xK, xcrc);
8299     vpclmulldq(xcrc, xK, xcrc);
8300     pxor(xcrc, xbuf);
8301     pxor(xcrc, xtmp);
8302   } else {
8303     movdqa(xtmp, xcrc);
8304     pclmulhdq(xtmp, xK);
8305     pclmulldq(xcrc, xK);
8306     pxor(xcrc, xbuf);
8307     pxor(xcrc, xtmp);
8308   }
8309 }
8310 
8311 /**
8312  * 8-bit folds to compute 32-bit CRC
8313  *
8314  * uint64_t xcrc;
8315  * timesXtoThe32[xcrc & 0xFF] ^ (xcrc >> 8);
8316  */
8317 void MacroAssembler::fold_8bit_crc32(XMMRegister xcrc, Register table, XMMRegister xtmp, Register tmp) {
8318   movdl(tmp, xcrc);
8319   andl(tmp, 0xFF);
8320   movdl(xtmp, Address(table, tmp, Address::times_4, 0));
8321   psrldq(xcrc, 1); // unsigned shift one byte
8322   pxor(xcrc, xtmp);
8323 }
8324 
8325 /**
8326  * uint32_t crc;
8327  * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
8328  */
8329 void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
8330   movl(tmp, crc);
8331   andl(tmp, 0xFF);
8332   shrl(crc, 8);
8333   xorl(crc, Address(table, tmp, Address::times_4, 0));
8334 }
8335 
8336 /**
8337  * @param crc   register containing existing CRC (32-bit)
8338  * @param buf   register pointing to input byte buffer (byte*)
8339  * @param len   register containing number of bytes
8340  * @param table register that will contain address of CRC table
8341  * @param tmp   scratch register
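      *
      * Rough outline of the emitted code: process bytes one at a time until buf
      * is 16-byte aligned, fold 64 bytes per iteration into four 128-bit
      * accumulators using carry-less multiplies, reduce 512 -> 128 bits, fold any
      * remaining 16-byte blocks, reduce 128 -> 32 bits, then finish the remaining
      * (< 16) bytes byte by byte.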
8342  */
8343 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, Register table, Register tmp) {
8344   assert_different_registers(crc, buf, len, table, tmp, rax);
8345 
8346   Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned;
8347   Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop;
8348 
8349   lea(table, ExternalAddress(StubRoutines::crc_table_addr()));
8350   notl(crc); // ~crc
8351   cmpl(len, 16);
8352   jcc(Assembler::less, L_tail);
8353 
8354   // Align buffer to 16 bytes
8355   movl(tmp, buf);
8356   andl(tmp, 0xF);
8357   jccb(Assembler::zero, L_aligned);
8358   subl(tmp,  16);
8359   addl(len, tmp);
8360 
8361   align(4);
8362   BIND(L_align_loop);
8363   movsbl(rax, Address(buf, 0)); // load byte with sign extension
8364   update_byte_crc32(crc, rax, table);
8365   increment(buf);
8366   incrementl(tmp);
8367   jccb(Assembler::less, L_align_loop);
8368 
8369   BIND(L_aligned);
8370   movl(tmp, len); // save
8371   shrl(len, 4);
8372   jcc(Assembler::zero, L_tail_restore);
8373 
8374   // Fold crc into first bytes of vector
8375   movdqa(xmm1, Address(buf, 0));
8376   movdl(rax, xmm1);
8377   xorl(crc, rax);
8378   pinsrd(xmm1, crc, 0);
8379   addptr(buf, 16);
8380   subl(len, 4); // len > 0
8381   jcc(Assembler::less, L_fold_tail);
8382 
8383   movdqa(xmm2, Address(buf,  0));
8384   movdqa(xmm3, Address(buf, 16));
8385   movdqa(xmm4, Address(buf, 32));
8386   addptr(buf, 48);
8387   subl(len, 3);
8388   jcc(Assembler::lessEqual, L_fold_512b);
8389 
8390   // Fold a total of 512 bits of polynomial on each iteration,
8391   // 128 bits in each of 4 parallel streams.
8392   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 32));
8393 
8394   align(32);
8395   BIND(L_fold_512b_loop);
8396   fold_128bit_crc32(xmm1, xmm0, xmm5, buf,  0);
8397   fold_128bit_crc32(xmm2, xmm0, xmm5, buf, 16);
8398   fold_128bit_crc32(xmm3, xmm0, xmm5, buf, 32);
8399   fold_128bit_crc32(xmm4, xmm0, xmm5, buf, 48);
8400   addptr(buf, 64);
8401   subl(len, 4);
8402   jcc(Assembler::greater, L_fold_512b_loop);
8403 
8404   // Fold 512 bits to 128 bits.
8405   BIND(L_fold_512b);
8406   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16));
8407   fold_128bit_crc32(xmm1, xmm0, xmm5, xmm2);
8408   fold_128bit_crc32(xmm1, xmm0, xmm5, xmm3);
8409   fold_128bit_crc32(xmm1, xmm0, xmm5, xmm4);
8410 
8411   // Fold the remaining 128-bit data chunks
8412   BIND(L_fold_tail);
8413   addl(len, 3);
8414   jccb(Assembler::lessEqual, L_fold_128b);
8415   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16));
8416 
8417   BIND(L_fold_tail_loop);
8418   fold_128bit_crc32(xmm1, xmm0, xmm5, buf,  0);
8419   addptr(buf, 16);
8420   decrementl(len);
8421   jccb(Assembler::greater, L_fold_tail_loop);
8422 
8423   // Fold 128 bits in xmm1 down into 32 bits in crc register.
8424   BIND(L_fold_128b);
8425   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr()));
8426   if (UseAVX > 0) {
8427     vpclmulqdq(xmm2, xmm0, xmm1, 0x1);
8428     vpand(xmm3, xmm0, xmm2, 0 /* vector_len */);
8429     vpclmulqdq(xmm0, xmm0, xmm3, 0x1);
8430   } else {
8431     movdqa(xmm2, xmm0);
8432     pclmulqdq(xmm2, xmm1, 0x1);
8433     movdqa(xmm3, xmm0);
8434     pand(xmm3, xmm2);
8435     pclmulqdq(xmm0, xmm3, 0x1);
8436   }
8437   psrldq(xmm1, 8);
8438   psrldq(xmm2, 4);
8439   pxor(xmm0, xmm1);
8440   pxor(xmm0, xmm2);
8441 
8442   // 8 8-bit folds to compute 32-bit CRC.
8443   for (int j = 0; j < 4; j++) {
8444     fold_8bit_crc32(xmm0, table, xmm1, rax);
8445   }
8446   movdl(crc, xmm0); // mov 32 bits to general register
8447   for (int j = 0; j < 4; j++) {
8448     fold_8bit_crc32(crc, table, rax);
8449   }
8450 
8451   BIND(L_tail_restore);
8452   movl(len, tmp); // restore
8453   BIND(L_tail);
8454   andl(len, 0xf);
8455   jccb(Assembler::zero, L_exit);
8456 
8457   // Fold the remaining bytes
8458   align(4);
8459   BIND(L_tail_loop);
8460   movsbl(rax, Address(buf, 0)); // load byte with sign extension
8461   update_byte_crc32(crc, rax, table);
8462   increment(buf);
8463   decrementl(len);
8464   jccb(Assembler::greater, L_tail_loop);
8465 
8466   BIND(L_exit);
8467   notl(crc); // ~crc
8468 }
8469 
8470 namespace CRC32C {
8471 #include "crc32c.h"
8472 
8473 #define Nehalem(x) x
8474 #define Westmere(x) x
8475 
8476 #undef IN
8477 #define IN(x) x
8478 #define INOUT(x) x
8479 #undef OUT
8480 #define OUT(x) x
8481 #define Scratch(x) x
8482 
8483 #undef D
8484 
8485 #ifdef _LP64
8486 // S. Gueron / Information Processing Letters 112 (2012) 184
8487 // Algorithm 4: Computing carry-less multiplication using a precomputed lookup table.
8488 // Input: A 32 bit value B = [byte3, byte2, byte1, byte0].
8489 // Output: the 64-bit carry-less product of B * CONST
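     // Sketch (it is assumed here that TABLEExt[n][b] holds the precomputed
     // carry-less product of the byte value b with the chunk constant):
     //   result = TABLEExt[n][byte0] ^ (TABLEExt[n][byte1] << 8)
     //          ^ (TABLEExt[n][byte2] << 16) ^ (TABLEExt[n][byte3] << 24)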
8490   void IPL_Alg4(INOUT(Register B), uint32_t n,
8491   Scratch(Register C), Scratch(Register D), Scratch(Register Z),
8492   MacroAssembler * This) {
8493     This->lea(Z, ExternalAddress(StubRoutines::crc32c_table_addr()));
8494     if (n > 0) {
8495       This->addq(Z, n * 256 * 8);
8496     }
8497     //    Q1 = TABLEExt[n][B & 0xFF];
8498     This->movl(C, B);
8499     This->andl(C, 0x000000FF);
8500     This->shll(C, 3);
8501     This->addq(C, Z);
8502     This->movq(C, Address(C, 0));
8503 
8504     //    Q2 = TABLEExt[n][B >> 8 & 0xFF];
8505     This->movl(D, B);
8506     This->shrl(D, 8);
8507     This->andl(D, 0x000000FF);
8508     This->shll(D, 3);
8509     This->addq(D, Z);
8510     This->movq(D, Address(D, 0));
8511 
8512     This->shlq(D, 8);
8513     This->xorq(C, D);
8514 
8515     //    Q3 = TABLEExt[n][B >> 16 & 0xFF];
8516     This->movl(D, B);
8517     This->shrl(D, 16);
8518     This->andl(D, 0x000000FF);
8519     This->shll(D, 3);
8520     This->addq(D, Z);
8521     This->movq(D, Address(D, 0));
8522 
8523     This->shlq(D, 16);
8524     This->xorq(C, D);
8525 
8526     //    Q4 = TABLEExt[n][B >> 24 & 0xFF];
8527     This->shrl(B, 24);
8528     This->andl(B, 0x000000FF);
8529     This->shll(B, 3);
8530     This->addq(B, Z);
8531     This->movq(B, Address(B, 0));
8532  
8533     This->shlq(B, 24);
8534     This->xorq(B, C);
8535     //    return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24;
8536   }
8537 
8538   void PCLMULQDQ(Westmere(Scratch(XMMRegister crcXMM)),
8539     INOUT(Register crc),
8540     uint32_t CONSTOrPreCompConstIndex, bool IsPclmulqdqSupported,
8541     Westmere(Scratch(XMMRegister DXMM)),
8542     Scratch(Register A),
8543     Nehalem(Scratch(Register B)), Nehalem(Scratch(Register C)),
8544     MacroAssembler * This) {
8545     if (IsPclmulqdqSupported) {
8546       This->movdl(crcXMM, crc); // modified blindly
8547 
8548       This->movl(A, CONSTOrPreCompConstIndex);
8549       This->movdl(DXMM, A);
8550       This->pclmulqdq(crcXMM, DXMM, 0);
8551 
8552       This->movdq(crc, crcXMM);
8553     } else {
8554       IPL_Alg4(crc, CONSTOrPreCompConstIndex, A, B, C, This);
8555     }
8556   }
8557 
8558   // Recombination Alternative 2: No bit-reflections
8559   // T1 = (CRC_A * U1) << 1
8560   // T2 = (CRC_B * U2) << 1
8561   // C1 = T1 >> 32
8562   // C2 = T2 >> 32
8563   // T1 = T1 & 0xFFFFFFFF
8564   // T2 = T2 & 0xFFFFFFFF
8565   // T1 = CRC32(0, T1)
8566   // T2 = CRC32(0, T2)
8567   // C1 = C1 ^ T1
8568   // C2 = C2 ^ T2
8569   // CRC = C1 ^ C2 ^ CRC_C
8570   void RecAlt2(uint32_t CONSTOrPreCompConstIndexU1, uint32_t CONSTOrPreCompConstIndexU2, bool IsPclmulqdqSupported, INOUT(Register crcA), IN(Scratch(Register crcB)), IN(Register crcC),
8571     Westmere(Scratch(XMMRegister AXMM)), Westmere(Scratch(XMMRegister BXMM)), Westmere(Scratch(XMMRegister CXMM)),
8572     Scratch(Register E), Scratch(Register F),
8573     Nehalem(Scratch(Register G)),
8574     MacroAssembler * This) {
8575     PCLMULQDQ(AXMM, crcA, CONSTOrPreCompConstIndexU1, IsPclmulqdqSupported, CXMM, E, F, G, This);
8576     PCLMULQDQ(BXMM, crcB, CONSTOrPreCompConstIndexU2, IsPclmulqdqSupported, CXMM, E, F, G, This);
8577     This->shlq(crcA, 1);
8578     This->movl(E, crcA);
8579     This->shrq(crcA, 32);
8580     This->xorl(F, F);
8581     This->crc32(F, E, 4);
8582     This->xorl(crcA, F); // we don't care about upper 32 bit contents here
8583     This->shlq(crcB, 1);
8584     This->movl(E, crcB);
8585     This->shrq(crcB, 32);
8586     This->xorl(F, F);
8587     This->crc32(F, E, 4);
8588     This->xorl(crcB, F);
8589     This->xorl(crcA, crcB);
8590     This->xorl(crcA, crcC);
8591   }
8592 
8593   // Set N to a predefined value
8594   // Subtract it from the length of the buffer
8595   // Execute in a loop:
8596   // CRC_A = 0xFFFFFFFF, CRC_B = 0, CRC_C = 0
8597   // for i = 1 to N do
8598   //  CRC_A = CRC32(CRC_A, A[i])
8599   //  CRC_B = CRC32(CRC_B, B[i])
8600   //  CRC_C = CRC32(CRC_C, C[i])
8601   // end for
8602   // Recombine
8603   void ProcChunk(uint32_t size, uint32_t CONSTOrPreCompConstIndexU1, uint32_t CONSTOrPreCompConstIndexU2, bool IsPclmulqdqSupported,
8604     INOUT(Register len), INOUT(Register buf), INOUT(Register crc),
8605     Scratch(Register E), Scratch(Register F), Scratch(Register end), 
8606     Westmere(Scratch(XMMRegister AXMM)), Westmere(Scratch(XMMRegister BXMM)), Westmere(Scratch(XMMRegister CXMM)),
8607     Scratch(Register G), Scratch(Register H), 
8608     Nehalem(Scratch(Register I)),
8609     MacroAssembler * This) {
8610     Label L_processPartitions;
8611     Label L_processPartition;
8612     Label L_exit;
8613     
8614     This->bind(L_processPartitions);
8615     This->cmpl(len, 3 * size);
8616     This->jcc(Assembler::less, L_exit);
8617       This->xorl(E, E);
8618       This->xorl(F, F);
8619       This->movq(end, buf);
8620       This->addq(end, size);
8621 
8622       This->bind(L_processPartition);
8623         This->crc32(crc, Address(buf, 0), 8);
8624         This->crc32(E, Address(buf, size), 8);
8625         This->crc32(F, Address(buf, size * 2), 8);
8626         This->addq(buf, 8);
8627         This->cmpq(buf, end);
8628         This->jcc(Assembler::less, L_processPartition);
8629       RecAlt2(CONSTOrPreCompConstIndexU1, CONSTOrPreCompConstIndexU2, IsPclmulqdqSupported, crc, E, F, 
8630       AXMM, BXMM, CXMM,
8631       G, H,
8632       I, 
8633       This);
8634       This->addq(buf, 2 * size);
8635       This->subl(len, 3 * size);
8636       This->jmp(L_processPartitions);
8637 
8638     This->bind(L_exit);
8639   }
8640 #else
8641 void IPL_Alg4(INOUT(Register B), uint32_t n,
8642   Scratch(Register C), Scratch(Register D), Scratch(Register Z),
8643   Scratch(XMMRegister CXMM), Scratch(XMMRegister DXMM),
8644   MacroAssembler * This) {
8645   This->lea(Z, ExternalAddress(StubRoutines::crc32c_table_addr()));
8646   if (n > 0) {
8647     This->addl(Z, n * 256 * 8);
8648   }
8649   //    Q1 = TABLEExt[n][B & 0xFF];
8650   This->movl(C, B);
8651   This->andl(C, 0x000000FF);
8652   This->shll(C, 3);
8653   This->addl(C, Z);
8654   This->movq(CXMM, Address(C, 0));
8655 
8656   //    Q2 = TABLEExt[n][B >> 8 & 0xFF];
8657   This->movl(D, B);
8658   This->shrl(D, 8);
8659   This->andl(D, 0x000000FF);
8660   This->shll(D, 3);
8661   This->addl(D, Z);
8662   This->movq(DXMM, Address(D, 0));
8663 
8664   This->psllq(DXMM, 8);
8665   This->pxor(CXMM, DXMM);
8666 
8667   //    Q3 = TABLEExt[n][B >> 16 & 0xFF];
8668   This->movl(D, B);
8669   This->shrl(D, 16);
8670   This->andl(D, 0x000000FF);
8671   This->shll(D, 3);
8672   This->addl(D, Z);
8673   This->movq(DXMM, Address(D, 0));
8674 
8675   This->psllq(DXMM, 16);
8676   This->pxor(CXMM, DXMM);
8677 
8678   //    Q4 = TABLEExt[n][B >> 24 & 0xFF];
8679   This->shrl(B, 24);
8680   This->andl(B, 0x000000FF);
8681   This->shll(B, 3);
8682   This->addl(B, Z);
8683   This->movq(DXMM, Address(B, 0));
8684 
8685   This->psllq(DXMM, 24);
8686   This->pxor(CXMM, DXMM); // Result in CXMM
8687   //    return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24;
8688 }
8689 
8690 void PCLMULQDQ(Westmere(Scratch(XMMRegister crcXMM)),
8691   INOUT(Register crc),
8692   uint32_t CONSTOrPreCompConstIndex, bool IsPclmulqdqSupported,
8693   Westmere(Scratch(XMMRegister DXMM)),
8694   Scratch(Register A),
8695   Nehalem(Scratch(Register B)), Nehalem(Scratch(Register C)),
8696   MacroAssembler * This) {
8697   if (IsPclmulqdqSupported) {
8698     This->movdl(crcXMM, crc);
8699 
8700     This->movl(A, CONSTOrPreCompConstIndex);
8701     This->movdl(DXMM, A);
8702     This->pclmulqdq(crcXMM, DXMM, 0);
8703     // Keep result in XMM since GPR is 32 bit in length
8704   } else {
8705     IPL_Alg4(crc, CONSTOrPreCompConstIndex, A, B, C, crcXMM, DXMM, This);
8706   }
8707 }
8708 
8709 void RecAlt2(uint32_t CONSTOrPreCompConstIndexU1, uint32_t CONSTOrPreCompConstIndexU2, bool IsPclmulqdqSupported, INOUT(Register crcA), IN(Scratch(Register crcB)), IN(Register crcC),
8710   Westmere(Scratch(XMMRegister AXMM)), Westmere(Scratch(XMMRegister BXMM)), Westmere(Scratch(XMMRegister CXMM)),
8711   Scratch(Register E), Scratch(Register F),
8712   Nehalem(Scratch(Register G)),
8713   MacroAssembler * This) {
8714   PCLMULQDQ(AXMM, crcA, CONSTOrPreCompConstIndexU1, IsPclmulqdqSupported, CXMM, E, F, G, This);
8715   PCLMULQDQ(BXMM, crcB, CONSTOrPreCompConstIndexU2, IsPclmulqdqSupported, CXMM, E, F, G, This);
8716   
8717   This->psllq(AXMM, 1);
8718   This->movdl(E, AXMM);
8719   This->psrlq(AXMM, 32);
8720   This->movdl(crcA, AXMM);
8721 
8722   This->xorl(F, F);
8723   This->crc32(F, E, 4);
8724   This->xorl(crcA, F);
8725   
8726   This->psllq(BXMM, 1);
8727   This->movdl(E, BXMM);
8728   This->psrlq(BXMM, 32);
8729   This->movdl(crcB, BXMM);
8730 
8731   This->xorl(F, F);
8732   This->crc32(F, E, 4);
8733   This->xorl(crcB, F);
8734   This->xorl(crcA, crcB);
8735   This->xorl(crcA, crcC);
8736 }
8737 
8738 void ProcChunk(uint32_t size, uint32_t CONSTOrPreCompConstIndexU1, uint32_t CONSTOrPreCompConstIndexU2, bool IsPclmulqdqSupported,
8739   INOUT(Register len), INOUT(Register buf), INOUT(Register crc),
8740   Scratch(Register E), Scratch(Register F), Scratch(Register end),
8741   Westmere(Scratch(XMMRegister AXMM)), Westmere(Scratch(XMMRegister BXMM)), Westmere(Scratch(XMMRegister CXMM)),
8742   Scratch(Register G), Scratch(Register H),
8743   Nehalem(Scratch(Register I)),
8744   MacroAssembler * This) {
8745   Label L_processPartitions;
8746   Label L_processPartition;
8747   Label L_exit;
8748 
8749   This->bind(L_processPartitions);
8750   This->cmpl(len, 3 * size);
8751   This->jcc(Assembler::less, L_exit);
8752     This->xorl(E, E);
8753     This->xorl(F, F);
8754     This->movl(end, buf);
8755     This->addl(end, size);
8756 
8757     This->bind(L_processPartition);
8758       This->crc32(crc, Address(buf, 0), 4);
8759       This->crc32(E, Address(buf, size), 4);
8760       This->crc32(F, Address(buf, size*2), 4);
8761       This->crc32(crc, Address(buf, 0+4), 4);
8762       This->crc32(E, Address(buf, size+4), 4);
8763       This->crc32(F, Address(buf, size*2+4), 4);
8764       This->addl(buf, 8);
8765       This->cmpl(buf, end);
8766       This->jcc(Assembler::less, L_processPartition);
8767 
8768         This->push(end);
8769         This->push(len);
8770         This->push(buf);
8771         G = end;
8772         H = len;
8773         I = buf;
8774 
8775     RecAlt2(CONSTOrPreCompConstIndexU1, CONSTOrPreCompConstIndexU2, IsPclmulqdqSupported, crc, E, F,
8776       AXMM, BXMM, CXMM,
8777       G, H,
8778       I,
8779       This);
8780 
8781         This->pop(buf);
8782         This->pop(len);
8783         This->pop(end);
8784 
8785     This->addl(buf, 2 * size);
8786     This->subl(len, 3 * size);
8787     This->jmp(L_processPartitions);
8788 
8789   This->bind(L_exit);
8790 }
8791 #endif //LP64
8792 }
8793 #undef D
8794 
8795 #ifdef _LP64
8796 // Algorithm 2: Pipelined usage of the CRC32 instruction.
8797 // Input: A buffer I of L bytes.
8798 // Output: the CRC32C value of the buffer.
8799 // Notations:
8800 // Write L = 24N + r, with N = floor (L/24).
8801 // r = L mod 24 (0 <= r < 24).
8802 // Consider I as the concatenation of A|B|C|R, where A, B and C each consist
8803 // of N quadwords, and R consists of r bytes.
8804 // A[j] = I [8j+7 : 8j],             j = 0, 1, ..., N-1
8805 // B[j] = I [8N + 8j+7 : 8N + 8j],   j = 0, 1, ..., N-1
8806 // C[j] = I [16N + 8j+7 : 16N + 8j], j = 0, 1, ..., N-1
8807 // if r > 0, R[j] = I [24N + j],     j = 0, 1, ..., r-1
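     // A high-level sketch of the scheme (not emitted code; CHUNK stands for the
     // HIGH/MIDDLE/LOW partition sizes used below):
     //   while (len >= 3*CHUNK) {
     //     crcA = crc; crcB = 0; crcC = 0;
     //     for (j = 0; j < CHUNK/8; j++) {
     //       crcA = crc32c(crcA, A[j]); crcB = crc32c(crcB, B[j]); crcC = crc32c(crcC, C[j]);
     //     }
     //     crc = recombine(crcA, crcB, crcC);   // see CRC32C::RecAlt2 above
     //     buf += 3*CHUNK; len -= 3*CHUNK;
     //   }
     //   ... then the remaining bytes are handled word by word and byte by byte.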
8808 void MacroAssembler::crc32c_IPL_Alg2Alt2Fast(Register crc, Register buf, Register len,
8809   Scratch(Register A), Scratch(Register  B), Scratch(Register C),
8810   Scratch(Register D), Scratch(Register  E), Scratch(Register F),
8811   Westmere(Scratch(XMMRegister AXMM)), Westmere(Scratch(XMMRegister BXMM)), Westmere(Scratch(XMMRegister CXMM)),
8812   bool IsPclmulqdqSupported) {
8813   uint32_t CONSTOrPreCompConstIndex[CRC32C::NUM_PRECOMPUTED_CONSTANTS];
8814   Label L_wordByWord;
8815   Label L_byteByByteProlog;
8816   Label L_byteByByte;
8817   Label L_exit;
8818 
8819   if (IsPclmulqdqSupported ) {
8820     CONSTOrPreCompConstIndex[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr;
8821     CONSTOrPreCompConstIndex[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr+1);
8822 
8823     CONSTOrPreCompConstIndex[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2);
8824     CONSTOrPreCompConstIndex[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3);
8825 
8826     CONSTOrPreCompConstIndex[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4);
8827     CONSTOrPreCompConstIndex[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5);
8828     assert((CRC32C::NUM_PRECOMPUTED_CONSTANTS - 1 ) == 5, "Checking whether you declared all of the constants based on the number of \"chunks\"");
8829   } else {
8830     CONSTOrPreCompConstIndex[0] = 1;
8831     CONSTOrPreCompConstIndex[1] = 0;
8832 
8833     CONSTOrPreCompConstIndex[2] = 3;
8834     CONSTOrPreCompConstIndex[3] = 2;
8835 
8836     CONSTOrPreCompConstIndex[4] = 5;
8837     CONSTOrPreCompConstIndex[5] = 4;
8838    }
8839   CRC32C::ProcChunk(CRC32C::HIGH, CONSTOrPreCompConstIndex[0], CONSTOrPreCompConstIndex[1], IsPclmulqdqSupported,
8840     len, buf, crc, 
8841     A, B, C,
8842     AXMM, BXMM, CXMM,
8843     D, E, 
8844     F, 
8845     this);
8846   CRC32C::ProcChunk(CRC32C::MIDDLE, CONSTOrPreCompConstIndex[2], CONSTOrPreCompConstIndex[3], IsPclmulqdqSupported,
8847     len, buf, crc,
8848     A, B, C,
8849     AXMM, BXMM, CXMM,
8850     D, E, 
8851     F,
8852     this);
8853   CRC32C::ProcChunk(CRC32C::LOW, CONSTOrPreCompConstIndex[4], CONSTOrPreCompConstIndex[5], IsPclmulqdqSupported,
8854     len, buf, crc,
8855     A, B, C,
8856     AXMM, BXMM, CXMM,
8857     D, E, 
8858     F, 
8859     this);
8860   movl(A, len);
8861   andl(A, 0x00000007);
8862   negl(A);
8863   addl(A, len);
8864   addq(A, buf);
8865 
8866   BIND(L_wordByWord);
8867   cmpq(buf, A);
8868   jcc(Assembler::greaterEqual, L_byteByByteProlog);
8869     crc32(crc, Address(buf, 0), 4);
8870     addq(buf, 4);
8871     jmp(L_wordByWord);
8872   
8873   BIND(L_byteByByteProlog);
8874   andl(len, 0x00000007);
8875   movl(B, 1);
8876 
8877   BIND(L_byteByByte);
8878   cmpl(B, len);
8879   jccb(Assembler::greater, L_exit);
8880     crc32(crc, Address(buf, 0), 1);
8881     incq(buf);
8882     incl(B);
8883     jmp(L_byteByByte);
8884 
8885   BIND(L_exit);
8886 }
8887 #else
8888 void MacroAssembler::crc32c_IPL_Alg2Alt2Fast(Register crc, Register buf, Register len,
8889   Scratch(Register A), Scratch(Register  B), Scratch(Register C),
8890   Scratch(Register D), Scratch(Register  E), Scratch(Register F),
8891   Westmere(Scratch(XMMRegister AXMM)), Westmere(Scratch(XMMRegister BXMM)), Westmere(Scratch(XMMRegister CXMM)),
8892   bool IsPclmulqdqSupported) {
8893   uint32_t CONSTOrPreCompConstIndex[CRC32C::NUM_PRECOMPUTED_CONSTANTS];
8894   Label L_wordByWord;
8895   Label L_byteByByteProlog;
8896   Label L_byteByByte;
8897   Label L_exit;
8898 
8899   if (IsPclmulqdqSupported) {
8900     CONSTOrPreCompConstIndex[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr;
8901     CONSTOrPreCompConstIndex[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 1);
8902 
8903     CONSTOrPreCompConstIndex[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2);
8904     CONSTOrPreCompConstIndex[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3);
8905 
8906     CONSTOrPreCompConstIndex[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4);
8907     CONSTOrPreCompConstIndex[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5);
8908   } else {
8909     CONSTOrPreCompConstIndex[0] = 1;
8910     CONSTOrPreCompConstIndex[1] = 0;
8911 
8912     CONSTOrPreCompConstIndex[2] = 3;
8913     CONSTOrPreCompConstIndex[3] = 2;
8914 
8915     CONSTOrPreCompConstIndex[4] = 5;
8916     CONSTOrPreCompConstIndex[5] = 4;
8917   }
8918   CRC32C::ProcChunk(CRC32C::HIGH, CONSTOrPreCompConstIndex[0], CONSTOrPreCompConstIndex[1], IsPclmulqdqSupported,
8919     len, buf, crc,
8920     A, B, C,
8921     AXMM, BXMM, CXMM,
8922     D, E,
8923     F,
8924     this);
8925   CRC32C::ProcChunk(CRC32C::MIDDLE, CONSTOrPreCompConstIndex[2], CONSTOrPreCompConstIndex[3], IsPclmulqdqSupported,
8926     len, buf, crc,
8927     A, B, C,
8928     AXMM, BXMM, CXMM,
8929     D, E,
8930     F,
8931     this);
8932   CRC32C::ProcChunk(CRC32C::LOW, CONSTOrPreCompConstIndex[4], CONSTOrPreCompConstIndex[5], IsPclmulqdqSupported,
8933     len, buf, crc,
8934     A, B, C,
8935     AXMM, BXMM, CXMM,
8936     D, E,
8937     F,
8938     this);
8939   movl(A, len);
8940   andl(A, 0x00000007);
8941   negl(A);
8942   addl(A, len);
8943   addl(A, buf);
8944 
8945   BIND(L_wordByWord);
8946   cmpl(buf, A);
8947   jcc(Assembler::greaterEqual, L_byteByByteProlog);
8948     crc32(crc, Address(buf,0), 4);
8949     addl(buf, 4);
8950     jmp(L_wordByWord);
8951 
8952   BIND(L_byteByByteProlog);
8953   andl(len, 0x00000007);
8954   movl(B, 1);
8955 
8956   BIND(L_byteByByte);
8957   cmpl(B, len);
8958   jccb(Assembler::greater, L_exit);
8959     movb(A, Address(buf, 0));
8960     crc32(crc, A, 1);
8961     incl(buf);
8962     incl(B);
8963     jmp(L_byteByByte);
8964 
8965   BIND(L_exit);
8966 }
8967 #endif // LP64
8968 
8969 
8970 Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) {
8971   switch (cond) {
8972     // Note some conditions are synonyms for others
8973     case Assembler::zero:         return Assembler::notZero;
8974     case Assembler::notZero:      return Assembler::zero;
8975     case Assembler::less:         return Assembler::greaterEqual;
8976     case Assembler::lessEqual:    return Assembler::greater;
8977     case Assembler::greater:      return Assembler::lessEqual;
8978     case Assembler::greaterEqual: return Assembler::less;
8979     case Assembler::below:        return Assembler::aboveEqual;
8980     case Assembler::belowEqual:   return Assembler::above;
8981     case Assembler::above:        return Assembler::belowEqual;
8982     case Assembler::aboveEqual:   return Assembler::below;
8983     case Assembler::overflow:     return Assembler::noOverflow;
8984     case Assembler::noOverflow:   return Assembler::overflow;
8985     case Assembler::negative:     return Assembler::positive;
8986     case Assembler::positive:     return Assembler::negative;
8987     case Assembler::parity:       return Assembler::noParity;
8988     case Assembler::noParity:     return Assembler::parity;
8989   }
8990   ShouldNotReachHere(); return Assembler::overflow;
8991 }
8992 
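     // SkipIfEqual emits a compare and a conditional branch around the code
     // generated between its construction and destruction: at run time that code
     // is skipped when *flag_addr == value. Hypothetical usage sketch (the flag
     // name below is illustrative only):
     //   {
     //     SkipIfEqual skip(masm, &SomeBoolFlag, false);
     //     // ... code emitted here executes only when SomeBoolFlag is true ...
     //   }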
8993 SkipIfEqual::SkipIfEqual(
8994     MacroAssembler* masm, const bool* flag_addr, bool value) {
8995   _masm = masm;
8996   _masm->cmp8(ExternalAddress((address)flag_addr), value);
8997   _masm->jcc(Assembler::equal, _label);
8998 }
8999 
9000 SkipIfEqual::~SkipIfEqual() {
9001   _masm->bind(_label);
9002 }