1 /*
   2  * Copyright (c) 1997, 2019, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "jvm.h"
  27 #include "asm/assembler.hpp"
  28 #include "asm/assembler.inline.hpp"
  29 #include "compiler/disassembler.hpp"
  30 #include "gc/shared/barrierSet.hpp"
  31 #include "gc/shared/barrierSetAssembler.hpp"
  32 #include "gc/shared/collectedHeap.inline.hpp"
  33 #include "interpreter/interpreter.hpp"
  34 #include "memory/resourceArea.hpp"
  35 #include "memory/universe.hpp"
  36 #include "oops/accessDecorators.hpp"
  37 #include "oops/compressedOops.inline.hpp"
  38 #include "oops/klass.inline.hpp"
  39 #include "prims/methodHandles.hpp"
  40 #include "runtime/biasedLocking.hpp"
  41 #include "runtime/flags/flagSetting.hpp"
  42 #include "runtime/interfaceSupport.inline.hpp"
  43 #include "runtime/objectMonitor.hpp"
  44 #include "runtime/os.hpp"
  45 #include "runtime/safepoint.hpp"
  46 #include "runtime/safepointMechanism.hpp"
  47 #include "runtime/sharedRuntime.hpp"
  48 #include "runtime/signature_cc.hpp"
  49 #include "runtime/stubRoutines.hpp"
  50 #include "runtime/thread.hpp"
  51 #include "utilities/macros.hpp"
  52 #include "vmreg_x86.inline.hpp"
  53 #include "crc32c.h"
  54 #ifdef COMPILER2
  55 #include "opto/intrinsicnode.hpp"
  56 #endif
  57 
  58 #ifdef PRODUCT
  59 #define BLOCK_COMMENT(str) /* nothing */
  60 #define STOP(error) stop(error)
  61 #else
  62 #define BLOCK_COMMENT(str) block_comment(str)
  63 #define STOP(error) block_comment(error); stop(error)
  64 #endif
  65 
  66 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  67 
  68 #ifdef ASSERT
  69 bool AbstractAssembler::pd_check_instruction_mark() { return true; }
  70 #endif
  71 
  72 static Assembler::Condition reverse[] = {
  73     Assembler::noOverflow     /* overflow      = 0x0 */ ,
  74     Assembler::overflow       /* noOverflow    = 0x1 */ ,
  75     Assembler::aboveEqual     /* carrySet      = 0x2, below         = 0x2 */ ,
  76     Assembler::below          /* aboveEqual    = 0x3, carryClear    = 0x3 */ ,
  77     Assembler::notZero        /* zero          = 0x4, equal         = 0x4 */ ,
  78     Assembler::zero           /* notZero       = 0x5, notEqual      = 0x5 */ ,
  79     Assembler::above          /* belowEqual    = 0x6 */ ,
  80     Assembler::belowEqual     /* above         = 0x7 */ ,
  81     Assembler::positive       /* negative      = 0x8 */ ,
  82     Assembler::negative       /* positive      = 0x9 */ ,
  83     Assembler::noParity       /* parity        = 0xa */ ,
  84     Assembler::parity         /* noParity      = 0xb */ ,
  85     Assembler::greaterEqual   /* less          = 0xc */ ,
  86     Assembler::less           /* greaterEqual  = 0xd */ ,
  87     Assembler::greater        /* lessEqual     = 0xe */ ,
  88     Assembler::lessEqual      /* greater       = 0xf, */
  89 
  90 };
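
// Illustrative note (not part of the original file): reverse[] maps an x86
// condition code to its logical negation, so a conditional branch can be
// flipped without recomputing flags.  A minimal sketch of a caller; the
// helper name emit_jcc_unless is hypothetical:
//
//   // Jump to 'skip' when condition 'cc' does NOT hold.
//   static void emit_jcc_unless(MacroAssembler* masm, Assembler::Condition cc, Label& skip) {
//     masm->jcc(reverse[cc], skip);   // e.g. reverse[Assembler::zero] == Assembler::notZero
//   }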
  91 
  92 
  93 // Implementation of MacroAssembler
  94 
  95 // First, all the versions that differ between 32-bit and 64-bit,
  96 // unless the difference is trivial (a line or so).
  97 
  98 #ifndef _LP64
  99 
 100 // 32bit versions
 101 
 102 Address MacroAssembler::as_Address(AddressLiteral adr) {
 103   return Address(adr.target(), adr.rspec());
 104 }
 105 
 106 Address MacroAssembler::as_Address(ArrayAddress adr) {
 107   return Address::make_array(adr);
 108 }
 109 
 110 void MacroAssembler::call_VM_leaf_base(address entry_point,
 111                                        int number_of_arguments) {
 112   call(RuntimeAddress(entry_point));
 113   increment(rsp, number_of_arguments * wordSize);
 114 }
 115 
 116 void MacroAssembler::cmpklass(Address src1, Metadata* obj) {
 117   cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate());
 118 }
 119 
 120 void MacroAssembler::cmpklass(Register src1, Metadata* obj) {
 121   cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate());
 122 }
 123 
 124 void MacroAssembler::cmpoop_raw(Address src1, jobject obj) {
 125   cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
 126 }
 127 
 128 void MacroAssembler::cmpoop_raw(Register src1, jobject obj) {
 129   cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
 130 }
 131 
 132 void MacroAssembler::cmpoop(Address src1, jobject obj) {
 133   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
 134   bs->obj_equals(this, src1, obj);
 135 }
 136 
 137 void MacroAssembler::cmpoop(Register src1, jobject obj) {
 138   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
 139   bs->obj_equals(this, src1, obj);
 140 }
 141 
 142 void MacroAssembler::extend_sign(Register hi, Register lo) {
 143   // According to Intel Doc. AP-526, "Integer Divide", p.18.
 144   if (VM_Version::is_P6() && hi == rdx && lo == rax) {
 145     cdql();
 146   } else {
 147     movl(hi, lo);
 148     sarl(hi, 31);
 149   }
 150 }
 151 
 152 void MacroAssembler::jC2(Register tmp, Label& L) {
 153   // set parity bit if FPU flag C2 is set (via rax)
 154   save_rax(tmp);
 155   fwait(); fnstsw_ax();
 156   sahf();
 157   restore_rax(tmp);
 158   // branch
 159   jcc(Assembler::parity, L);
 160 }
 161 
 162 void MacroAssembler::jnC2(Register tmp, Label& L) {
 163   // set parity bit if FPU flag C2 is set (via rax)
 164   save_rax(tmp);
 165   fwait(); fnstsw_ax();
 166   sahf();
 167   restore_rax(tmp);
 168   // branch
 169   jcc(Assembler::noParity, L);
 170 }
 171 
 172 // 32bit can do a case table jump in one instruction but we no longer allow the base
 173 // to be installed in the Address class
 174 void MacroAssembler::jump(ArrayAddress entry) {
 175   jmp(as_Address(entry));
 176 }
 177 
 178 // Note: y_lo will be destroyed
 179 void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
 180   // Long compare for Java (semantics as described in JVM spec.)
 181   Label high, low, done;
 182 
 183   cmpl(x_hi, y_hi);
 184   jcc(Assembler::less, low);
 185   jcc(Assembler::greater, high);
 186   // x_hi is the return register
 187   xorl(x_hi, x_hi);
 188   cmpl(x_lo, y_lo);
 189   jcc(Assembler::below, low);
 190   jcc(Assembler::equal, done);
 191 
 192   bind(high);
 193   xorl(x_hi, x_hi);
 194   increment(x_hi);
 195   jmp(done);
 196 
 197   bind(low);
 198   xorl(x_hi, x_hi);
 199   decrementl(x_hi);
 200 
 201   bind(done);
 202 }
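
// Illustrative sketch (assumption, not part of the original file): lcmp2int
// above implements Java's 'lcmp' semantics, which in plain C++ would be:
//
//   static int lcmp(int64_t x, int64_t y) {
//     if (x > y) return  1;   // the 'high' path above: result := +1
//     if (x < y) return -1;   // the 'low'  path above: result := -1
//     return 0;               // fall through to 'done': result == 0
//   }
//
// On 32-bit the operands arrive split into hi/lo register pairs, so the high
// words are compared signed and the low words unsigned, with x_hi holding the result.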
 203 
 204 void MacroAssembler::lea(Register dst, AddressLiteral src) {
 205     mov_literal32(dst, (int32_t)src.target(), src.rspec());
 206 }
 207 
 208 void MacroAssembler::lea(Address dst, AddressLiteral adr) {
 209   // leal(dst, as_Address(adr));
 210   // see note in movl as to why we must use a move
 211   mov_literal32(dst, (int32_t) adr.target(), adr.rspec());
 212 }
 213 
 214 void MacroAssembler::leave() {
 215   mov(rsp, rbp);
 216   pop(rbp);
 217 }
 218 
 219 void MacroAssembler::lmul(int x_rsp_offset, int y_rsp_offset) {
 220   // Multiplication of two Java long values stored on the stack
 221   // as illustrated below. Result is in rdx:rax.
 222   //
 223   // rsp ---> [  ??  ] \               \
 224   //            ....    | y_rsp_offset  |
 225   //          [ y_lo ] /  (in bytes)    | x_rsp_offset
 226   //          [ y_hi ]                  | (in bytes)
 227   //            ....                    |
 228   //          [ x_lo ]                 /
 229   //          [ x_hi ]
 230   //            ....
 231   //
 232   // Basic idea: lo(result) = lo(x_lo * y_lo)
 233   //             hi(result) = hi(x_lo * y_lo) + lo(x_hi * y_lo) + lo(x_lo * y_hi)
 234   Address x_hi(rsp, x_rsp_offset + wordSize); Address x_lo(rsp, x_rsp_offset);
 235   Address y_hi(rsp, y_rsp_offset + wordSize); Address y_lo(rsp, y_rsp_offset);
 236   Label quick;
 237   // load x_hi, y_hi and check if quick
 238   // multiplication is possible
 239   movl(rbx, x_hi);
 240   movl(rcx, y_hi);
 241   movl(rax, rbx);
 242   orl(rbx, rcx);                                 // rbx = 0 <=> x_hi = 0 and y_hi = 0
 243   jcc(Assembler::zero, quick);                   // if rbx = 0 do quick multiply
 244   // do full multiplication
 245   // 1st step
 246   mull(y_lo);                                    // x_hi * y_lo
 247   movl(rbx, rax);                                // save lo(x_hi * y_lo) in rbx
 248   // 2nd step
 249   movl(rax, x_lo);
 250   mull(rcx);                                     // x_lo * y_hi
 251   addl(rbx, rax);                                // add lo(x_lo * y_hi) to rbx
 252   // 3rd step
 253   bind(quick);                                   // note: rbx = 0 if quick multiply!
 254   movl(rax, x_lo);
 255   mull(y_lo);                                    // x_lo * y_lo
 256   addl(rdx, rbx);                                // correct hi(x_lo * y_lo)
 257 }
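
// Illustrative sketch (assumption, not part of the original file): the
// decomposition used by lmul above, written out for two Java longs held as
// 32-bit halves.  Only the low 64 bits of the product are kept, so
// hi(x_hi * y_hi) and any carries out of bit 63 can be ignored:
//
//   static uint64_t lmul_sketch(uint32_t x_hi, uint32_t x_lo,
//                               uint32_t y_hi, uint32_t y_lo) {
//     uint64_t lo_lo = (uint64_t)x_lo * y_lo;       // full 64-bit partial product
//     uint32_t cross = x_hi * y_lo + x_lo * y_hi;   // low 32 bits of the cross terms
//     return lo_lo + ((uint64_t)cross << 32);       // corrects hi(result), as in the 3rd step
//   }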
 258 
 259 void MacroAssembler::lneg(Register hi, Register lo) {
 260   negl(lo);
 261   adcl(hi, 0);
 262   negl(hi);
 263 }
 264 
 265 void MacroAssembler::lshl(Register hi, Register lo) {
 266   // Java shift left long support (semantics as described in JVM spec., p.305)
 267   // (basic idea for shift counts s >= n: x << s == (x << n) << (s - n))
 268   // shift value is in rcx !
 269   assert(hi != rcx, "must not use rcx");
 270   assert(lo != rcx, "must not use rcx");
 271   const Register s = rcx;                        // shift count
 272   const int      n = BitsPerWord;
 273   Label L;
 274   andl(s, 0x3f);                                 // s := s & 0x3f (s < 0x40)
 275   cmpl(s, n);                                    // if (s < n)
 276   jcc(Assembler::less, L);                       // else (s >= n)
 277   movl(hi, lo);                                  // x := x << n
 278   xorl(lo, lo);
 279   // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n!
 280   bind(L);                                       // s (mod n) < n
 281   shldl(hi, lo);                                 // x := x << s
 282   shll(lo);
 283 }
 284 
 285 
 286 void MacroAssembler::lshr(Register hi, Register lo, bool sign_extension) {
 287   // Java shift right long support (semantics as described in JVM spec., p.306 & p.310)
 288   // (basic idea for shift counts s >= n: x >> s == (x >> n) >> (s - n))
 289   assert(hi != rcx, "must not use rcx");
 290   assert(lo != rcx, "must not use rcx");
 291   const Register s = rcx;                        // shift count
 292   const int      n = BitsPerWord;
 293   Label L;
 294   andl(s, 0x3f);                                 // s := s & 0x3f (s < 0x40)
 295   cmpl(s, n);                                    // if (s < n)
 296   jcc(Assembler::less, L);                       // else (s >= n)
 297   movl(lo, hi);                                  // x := x >> n
 298   if (sign_extension) sarl(hi, 31);
 299   else                xorl(hi, hi);
 300   // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n!
 301   bind(L);                                       // s (mod n) < n
 302   shrdl(lo, hi);                                 // x := x >> s
 303   if (sign_extension) sarl(hi);
 304   else                shrl(hi);
 305 }
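
// Illustrative sketch (assumption, not part of the original file): lshl/lshr
// above rely on the identity  x << s == (x << 32) << (s - 32)  for s >= 32
// (and the analogous identity for right shifts), because shld/shrd only
// handle counts below the word size.  Roughly, for the left shift:
//
//   static uint64_t lshl_sketch(uint64_t x, unsigned s) {
//     s &= 0x3f;              // Java masks long shift counts to six bits
//     if (s >= 32) {          // the "(s >= n)" path: hi := lo, lo := 0
//       x <<= 32;
//       s -= 32;              // not emitted in asm: the CPU already uses cl mod 32
//     }
//     return x << s;          // the shld/shll step
//   }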
 306 
 307 void MacroAssembler::movoop(Register dst, jobject obj) {
 308   mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
 309 }
 310 
 311 void MacroAssembler::movoop(Address dst, jobject obj) {
 312   mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
 313 }
 314 
 315 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
 316   mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate());
 317 }
 318 
 319 void MacroAssembler::mov_metadata(Address dst, Metadata* obj) {
 320   mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate());
 321 }
 322 
 323 void MacroAssembler::movptr(Register dst, AddressLiteral src, Register scratch) {
 324   // scratch register is not used,
 325   // it is defined to match parameters of 64-bit version of this method.
 326   if (src.is_lval()) {
 327     mov_literal32(dst, (intptr_t)src.target(), src.rspec());
 328   } else {
 329     movl(dst, as_Address(src));
 330   }
 331 }
 332 
 333 void MacroAssembler::movptr(ArrayAddress dst, Register src) {
 334   movl(as_Address(dst), src);
 335 }
 336 
 337 void MacroAssembler::movptr(Register dst, ArrayAddress src) {
 338   movl(dst, as_Address(src));
 339 }
 340 
 341 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
 342 void MacroAssembler::movptr(Address dst, intptr_t src) {
 343   movl(dst, src);
 344 }
 345 
 346 
 347 void MacroAssembler::pop_callee_saved_registers() {
 348   pop(rcx);
 349   pop(rdx);
 350   pop(rdi);
 351   pop(rsi);
 352 }
 353 
 354 void MacroAssembler::pop_fTOS() {
 355   fld_d(Address(rsp, 0));
 356   addl(rsp, 2 * wordSize);
 357 }
 358 
 359 void MacroAssembler::push_callee_saved_registers() {
 360   push(rsi);
 361   push(rdi);
 362   push(rdx);
 363   push(rcx);
 364 }
 365 
 366 void MacroAssembler::push_fTOS() {
 367   subl(rsp, 2 * wordSize);
 368   fstp_d(Address(rsp, 0));
 369 }
 370 
 371 
 372 void MacroAssembler::pushoop(jobject obj) {
 373   push_literal32((int32_t)obj, oop_Relocation::spec_for_immediate());
 374 }
 375 
 376 void MacroAssembler::pushklass(Metadata* obj) {
 377   push_literal32((int32_t)obj, metadata_Relocation::spec_for_immediate());
 378 }
 379 
 380 void MacroAssembler::pushptr(AddressLiteral src) {
 381   if (src.is_lval()) {
 382     push_literal32((int32_t)src.target(), src.rspec());
 383   } else {
 384     pushl(as_Address(src));
 385   }
 386 }
 387 
 388 void MacroAssembler::set_word_if_not_zero(Register dst) {
 389   xorl(dst, dst);
 390   set_byte_if_not_zero(dst);
 391 }
 392 
 393 static void pass_arg0(MacroAssembler* masm, Register arg) {
 394   masm->push(arg);
 395 }
 396 
 397 static void pass_arg1(MacroAssembler* masm, Register arg) {
 398   masm->push(arg);
 399 }
 400 
 401 static void pass_arg2(MacroAssembler* masm, Register arg) {
 402   masm->push(arg);
 403 }
 404 
 405 static void pass_arg3(MacroAssembler* masm, Register arg) {
 406   masm->push(arg);
 407 }
 408 
 409 #ifndef PRODUCT
 410 extern "C" void findpc(intptr_t x);
 411 #endif
 412 
 413 void MacroAssembler::debug32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip, char* msg) {
 414   // In order to get locks to work, we need to fake an in_VM state
 415   JavaThread* thread = JavaThread::current();
 416   JavaThreadState saved_state = thread->thread_state();
 417   thread->set_thread_state(_thread_in_vm);
 418   if (ShowMessageBoxOnError) {
 419     JavaThread* thread = JavaThread::current();
 420     JavaThreadState saved_state = thread->thread_state();
 421     thread->set_thread_state(_thread_in_vm);
 422     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
 423       ttyLocker ttyl;
 424       BytecodeCounter::print();
 425     }
 426     // To see where a verify_oop failed, get $ebx+40/X for this frame.
 427     // This is the value of eip which points to where verify_oop will return.
 428     if (os::message_box(msg, "Execution stopped, print registers?")) {
 429       print_state32(rdi, rsi, rbp, rsp, rbx, rdx, rcx, rax, eip);
 430       BREAKPOINT;
 431     }
 432   }
 433   fatal("DEBUG MESSAGE: %s", msg);
 434 }
 435 
 436 void MacroAssembler::print_state32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip) {
 437   ttyLocker ttyl;
 438   FlagSetting fs(Debugging, true);
 439   tty->print_cr("eip = 0x%08x", eip);
 440 #ifndef PRODUCT
 441   if ((WizardMode || Verbose) && PrintMiscellaneous) {
 442     tty->cr();
 443     findpc(eip);
 444     tty->cr();
 445   }
 446 #endif
 447 #define PRINT_REG(rax) \
 448   { tty->print("%s = ", #rax); os::print_location(tty, rax); }
 449   PRINT_REG(rax);
 450   PRINT_REG(rbx);
 451   PRINT_REG(rcx);
 452   PRINT_REG(rdx);
 453   PRINT_REG(rdi);
 454   PRINT_REG(rsi);
 455   PRINT_REG(rbp);
 456   PRINT_REG(rsp);
 457 #undef PRINT_REG
 458   // Print some words near the top of the stack.
 459   int* dump_sp = (int*) rsp;
 460   for (int col1 = 0; col1 < 8; col1++) {
 461     tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
 462     os::print_location(tty, *dump_sp++);
 463   }
 464   for (int row = 0; row < 16; row++) {
 465     tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
 466     for (int col = 0; col < 8; col++) {
 467       tty->print(" 0x%08x", *dump_sp++);
 468     }
 469     tty->cr();
 470   }
 471   // Print some instructions around pc:
 472   Disassembler::decode((address)eip-64, (address)eip);
 473   tty->print_cr("--------");
 474   Disassembler::decode((address)eip, (address)eip+32);
 475 }
 476 
 477 void MacroAssembler::stop(const char* msg) {
 478   ExternalAddress message((address)msg);
 479   // push address of message
 480   pushptr(message.addr());
 481   { Label L; call(L, relocInfo::none); bind(L); }     // push eip
 482   pusha();                                            // push registers
 483   call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug32)));
 484   hlt();
 485 }
 486 
 487 void MacroAssembler::warn(const char* msg) {
 488   push_CPU_state();
 489 
 490   ExternalAddress message((address) msg);
 491   // push address of message
 492   pushptr(message.addr());
 493 
 494   call(RuntimeAddress(CAST_FROM_FN_PTR(address, warning)));
 495   addl(rsp, wordSize);       // discard argument
 496   pop_CPU_state();
 497 }
 498 
 499 void MacroAssembler::print_state() {
 500   { Label L; call(L, relocInfo::none); bind(L); }     // push eip
 501   pusha();                                            // push registers
 502 
 503   push_CPU_state();
 504   call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::print_state32)));
 505   pop_CPU_state();
 506 
 507   popa();
 508   addl(rsp, wordSize);
 509 }
 510 
 511 #else // _LP64
 512 
 513 // 64 bit versions
 514 
 515 Address MacroAssembler::as_Address(AddressLiteral adr) {
 516   // amd64 always does this as a pc-relative address;
 517   // we can be absolute or displacement-based depending on the instruction type:
 518   // jmp/call use displacements, others are absolute
 519   assert(!adr.is_lval(), "must be rval");
 520   assert(reachable(adr), "must be");
 521   return Address((int32_t)(intptr_t)(adr.target() - pc()), adr.target(), adr.reloc());
 522 
 523 }
 524 
 525 Address MacroAssembler::as_Address(ArrayAddress adr) {
 526   AddressLiteral base = adr.base();
 527   lea(rscratch1, base);
 528   Address index = adr.index();
 529   assert(index._disp == 0, "must not have disp"); // maybe it can?
 530   Address array(rscratch1, index._index, index._scale, index._disp);
 531   return array;
 532 }
 533 
 534 void MacroAssembler::call_VM_leaf_base(address entry_point, int num_args) {
 535   Label L, E;
 536 
 537 #ifdef _WIN64
 538   // Windows always allocates space for its register args
 539   assert(num_args <= 4, "only register arguments supported");
 540   subq(rsp,  frame::arg_reg_save_area_bytes);
 541 #endif
 542 
 543   // Align stack if necessary
 544   testl(rsp, 15);
 545   jcc(Assembler::zero, L);
 546 
 547   subq(rsp, 8);
 548   {
 549     call(RuntimeAddress(entry_point));
 550   }
 551   addq(rsp, 8);
 552   jmp(E);
 553 
 554   bind(L);
 555   {
 556     call(RuntimeAddress(entry_point));
 557   }
 558 
 559   bind(E);
 560 
 561 #ifdef _WIN64
 562   // restore stack pointer
 563   addq(rsp, frame::arg_reg_save_area_bytes);
 564 #endif
 565 
 566 }
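
// Illustrative note (assumption, not part of the original file): the branch
// above keeps rsp 16-byte aligned at the call, as the x86-64 ABI requires.
// Since frames are always 8-byte aligned, testl(rsp, 15) can only see two
// states, and a single extra 8-byte slot fixes the misaligned one:
//
//   static bool needs_alignment_slot(uintptr_t rsp) {
//     return (rsp & 15) != 0;   // misaligned by 8 -> subq(rsp, 8) before the call
//   }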
 567 
 568 void MacroAssembler::cmp64(Register src1, AddressLiteral src2) {
 569   assert(!src2.is_lval(), "should use cmpptr");
 570 
 571   if (reachable(src2)) {
 572     cmpq(src1, as_Address(src2));
 573   } else {
 574     lea(rscratch1, src2);
 575     Assembler::cmpq(src1, Address(rscratch1, 0));
 576   }
 577 }
 578 
 579 int MacroAssembler::corrected_idivq(Register reg) {
 580   // Full implementation of Java ldiv and lrem; checks for special
 581   // case as described in JVM spec., p.243 & p.271.  The function
 582   // returns the (pc) offset of the idivl instruction - may be needed
 583   // for implicit exceptions.
 584   //
 585   //         normal case                           special case
 586   //
 587   // input : rax: dividend                         min_long
 588   //         reg: divisor   (may not be eax/edx)   -1
 589   //
 590   // output: rax: quotient  (= rax idiv reg)       min_long
 591   //         rdx: remainder (= rax irem reg)       0
 592   assert(reg != rax && reg != rdx, "reg cannot be rax or rdx register");
 593   static const int64_t min_long = 0x8000000000000000;
 594   Label normal_case, special_case;
 595 
 596   // check for special case
 597   cmp64(rax, ExternalAddress((address) &min_long));
 598   jcc(Assembler::notEqual, normal_case);
 599   xorl(rdx, rdx); // prepare rdx for possible special case (where
 600                   // remainder = 0)
 601   cmpq(reg, -1);
 602   jcc(Assembler::equal, special_case);
 603 
 604   // handle normal case
 605   bind(normal_case);
 606   cdqq();
 607   int idivq_offset = offset();
 608   idivq(reg);
 609 
 610   // normal and special case exit
 611   bind(special_case);
 612 
 613   return idivq_offset;
 614 }
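
// Illustrative sketch (assumption, not part of the original file): the special
// case handled above exists because min_long / -1 overflows the hardware idiv
// (the mathematical result 2^63 is not representable), while Java defines
// min_long / -1 == min_long and min_long % -1 == 0:
//
//   static void ldiv_sketch(int64_t dividend, int64_t divisor,
//                           int64_t* quotient, int64_t* remainder) {
//     if (dividend == INT64_MIN && divisor == -1) {
//       *quotient  = INT64_MIN;            // rax on the special-case path
//       *remainder = 0;                    // rdx was cleared above
//     } else {
//       *quotient  = dividend / divisor;   // rax after idivq
//       *remainder = dividend % divisor;   // rdx after idivq
//     }
//   }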
 615 
 616 void MacroAssembler::decrementq(Register reg, int value) {
 617   if (value == min_jint) { subq(reg, value); return; }
 618   if (value <  0) { incrementq(reg, -value); return; }
 619   if (value == 0) {                        ; return; }
 620   if (value == 1 && UseIncDec) { decq(reg) ; return; }
 621   /* else */      { subq(reg, value)       ; return; }
 622 }
 623 
 624 void MacroAssembler::decrementq(Address dst, int value) {
 625   if (value == min_jint) { subq(dst, value); return; }
 626   if (value <  0) { incrementq(dst, -value); return; }
 627   if (value == 0) {                        ; return; }
 628   if (value == 1 && UseIncDec) { decq(dst) ; return; }
 629   /* else */      { subq(dst, value)       ; return; }
 630 }
 631 
 632 void MacroAssembler::incrementq(AddressLiteral dst) {
 633   if (reachable(dst)) {
 634     incrementq(as_Address(dst));
 635   } else {
 636     lea(rscratch1, dst);
 637     incrementq(Address(rscratch1, 0));
 638   }
 639 }
 640 
 641 void MacroAssembler::incrementq(Register reg, int value) {
 642   if (value == min_jint) { addq(reg, value); return; }
 643   if (value <  0) { decrementq(reg, -value); return; }
 644   if (value == 0) {                        ; return; }
 645   if (value == 1 && UseIncDec) { incq(reg) ; return; }
 646   /* else */      { addq(reg, value)       ; return; }
 647 }
 648 
 649 void MacroAssembler::incrementq(Address dst, int value) {
 650   if (value == min_jint) { addq(dst, value); return; }
 651   if (value <  0) { decrementq(dst, -value); return; }
 652   if (value == 0) {                        ; return; }
 653   if (value == 1 && UseIncDec) { incq(dst) ; return; }
 654   /* else */      { addq(dst, value)       ; return; }
 655 }
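
// Illustrative note (assumption, not part of the original file): the
// increment/decrement helpers above special-case value == min_jint because
// -min_jint overflows a 32-bit int, so "decrement by min_jint" cannot be
// rewritten as "increment by -value".  The shape of the logic in plain C++:
//
//   static void decrement_sketch(int64_t& reg, int value) {
//     if (value == INT32_MIN) { reg -= value; return; }   // cannot negate safely
//     if (value <  0)         { reg += -value; return; }
//     reg -= value;                                       // covers value == 0 and == 1 too
//   }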
 656 
 657 // 32bit can do a case table jump in one instruction but we no longer allow the base
 658 // to be installed in the Address class
 659 void MacroAssembler::jump(ArrayAddress entry) {
 660   lea(rscratch1, entry.base());
 661   Address dispatch = entry.index();
 662   assert(dispatch._base == noreg, "must be");
 663   dispatch._base = rscratch1;
 664   jmp(dispatch);
 665 }
 666 
 667 void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
 668   ShouldNotReachHere(); // 64bit doesn't use two regs
 669   cmpq(x_lo, y_lo);
 670 }
 671 
 672 void MacroAssembler::lea(Register dst, AddressLiteral src) {
 673     mov_literal64(dst, (intptr_t)src.target(), src.rspec());
 674 }
 675 
 676 void MacroAssembler::lea(Address dst, AddressLiteral adr) {
 677   mov_literal64(rscratch1, (intptr_t)adr.target(), adr.rspec());
 678   movptr(dst, rscratch1);
 679 }
 680 
 681 void MacroAssembler::leave() {
 682   // %%% is this really better? Why not on 32bit too?
 683   emit_int8((unsigned char)0xC9); // LEAVE
 684 }
 685 
 686 void MacroAssembler::lneg(Register hi, Register lo) {
 687   ShouldNotReachHere(); // 64bit doesn't use two regs
 688   negq(lo);
 689 }
 690 
 691 void MacroAssembler::movoop(Register dst, jobject obj) {
 692   mov_literal64(dst, (intptr_t)obj, oop_Relocation::spec_for_immediate());
 693 }
 694 
 695 void MacroAssembler::movoop(Address dst, jobject obj) {
 696   mov_literal64(rscratch1, (intptr_t)obj, oop_Relocation::spec_for_immediate());
 697   movq(dst, rscratch1);
 698 }
 699 
 700 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
 701   mov_literal64(dst, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
 702 }
 703 
 704 void MacroAssembler::mov_metadata(Address dst, Metadata* obj) {
 705   mov_literal64(rscratch1, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
 706   movq(dst, rscratch1);
 707 }
 708 
 709 void MacroAssembler::movptr(Register dst, AddressLiteral src, Register scratch) {
 710   if (src.is_lval()) {
 711     mov_literal64(dst, (intptr_t)src.target(), src.rspec());
 712   } else {
 713     if (reachable(src)) {
 714       movq(dst, as_Address(src));
 715     } else {
 716       lea(scratch, src);
 717       movq(dst, Address(scratch, 0));
 718     }
 719   }
 720 }
 721 
 722 void MacroAssembler::movptr(ArrayAddress dst, Register src) {
 723   movq(as_Address(dst), src);
 724 }
 725 
 726 void MacroAssembler::movptr(Register dst, ArrayAddress src) {
 727   movq(dst, as_Address(src));
 728 }
 729 
 730 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
 731 void MacroAssembler::movptr(Address dst, intptr_t src) {
 732   mov64(rscratch1, src);
 733   movq(dst, rscratch1);
 734 }
 735 
 736 // These are mostly for initializing NULL
 737 void MacroAssembler::movptr(Address dst, int32_t src) {
 738   movslq(dst, src);
 739 }
 740 
 741 void MacroAssembler::movptr(Register dst, int32_t src) {
 742   mov64(dst, (intptr_t)src);
 743 }
 744 
 745 void MacroAssembler::pushoop(jobject obj) {
 746   movoop(rscratch1, obj);
 747   push(rscratch1);
 748 }
 749 
 750 void MacroAssembler::pushklass(Metadata* obj) {
 751   mov_metadata(rscratch1, obj);
 752   push(rscratch1);
 753 }
 754 
 755 void MacroAssembler::pushptr(AddressLiteral src) {
 756   lea(rscratch1, src);
 757   if (src.is_lval()) {
 758     push(rscratch1);
 759   } else {
 760     pushq(Address(rscratch1, 0));
 761   }
 762 }
 763 
 764 void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
 765   // we must set sp to zero to clear frame
 766   movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), NULL_WORD);
 767   // must clear fp, so that compiled frames are not confused; it is
 768   // possible that we need it only for debugging
 769   if (clear_fp) {
 770     movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
 771   }
 772 
 773   // Always clear the pc because it could have been set by make_walkable()
 774   movptr(Address(r15_thread, JavaThread::last_Java_pc_offset()), NULL_WORD);
 775   vzeroupper();
 776 }
 777 
 778 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 779                                          Register last_java_fp,
 780                                          address  last_java_pc) {
 781   vzeroupper();
 782   // determine last_java_sp register
 783   if (!last_java_sp->is_valid()) {
 784     last_java_sp = rsp;
 785   }
 786 
 787   // last_java_fp is optional
 788   if (last_java_fp->is_valid()) {
 789     movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()),
 790            last_java_fp);
 791   }
 792 
 793   // last_java_pc is optional
 794   if (last_java_pc != NULL) {
 795     Address java_pc(r15_thread,
 796                     JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset());
 797     lea(rscratch1, InternalAddress(last_java_pc));
 798     movptr(java_pc, rscratch1);
 799   }
 800 
 801   movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
 802 }
 803 
 804 static void pass_arg0(MacroAssembler* masm, Register arg) {
 805   if (c_rarg0 != arg ) {
 806     masm->mov(c_rarg0, arg);
 807   }
 808 }
 809 
 810 static void pass_arg1(MacroAssembler* masm, Register arg) {
 811   if (c_rarg1 != arg ) {
 812     masm->mov(c_rarg1, arg);
 813   }
 814 }
 815 
 816 static void pass_arg2(MacroAssembler* masm, Register arg) {
 817   if (c_rarg2 != arg ) {
 818     masm->mov(c_rarg2, arg);
 819   }
 820 }
 821 
 822 static void pass_arg3(MacroAssembler* masm, Register arg) {
 823   if (c_rarg3 != arg ) {
 824     masm->mov(c_rarg3, arg);
 825   }
 826 }
 827 
 828 void MacroAssembler::stop(const char* msg) {
 829   if (ShowMessageBoxOnError) {
 830     address rip = pc();
 831     pusha(); // get regs on stack
 832     lea(c_rarg1, InternalAddress(rip));
 833     movq(c_rarg2, rsp); // pass pointer to regs array
 834   }
 835   lea(c_rarg0, ExternalAddress((address) msg));
 836   andq(rsp, -16); // align stack as required by ABI
 837   call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64)));
 838   hlt();
 839 }
 840 
 841 void MacroAssembler::warn(const char* msg) {
 842   push(rbp);
 843   movq(rbp, rsp);
 844   andq(rsp, -16);     // align stack as required by push_CPU_state and call
 845   push_CPU_state();   // keeps alignment at 16 bytes
 846   lea(c_rarg0, ExternalAddress((address) msg));
 847   lea(rax, ExternalAddress(CAST_FROM_FN_PTR(address, warning)));
 848   call(rax);
 849   pop_CPU_state();
 850   mov(rsp, rbp);
 851   pop(rbp);
 852 }
 853 
 854 void MacroAssembler::print_state() {
 855   address rip = pc();
 856   pusha();            // get regs on stack
 857   push(rbp);
 858   movq(rbp, rsp);
 859   andq(rsp, -16);     // align stack as required by push_CPU_state and call
 860   push_CPU_state();   // keeps alignment at 16 bytes
 861 
 862   lea(c_rarg0, InternalAddress(rip));
 863   lea(c_rarg1, Address(rbp, wordSize)); // pass pointer to regs array
 864   call_VM_leaf(CAST_FROM_FN_PTR(address, MacroAssembler::print_state64), c_rarg0, c_rarg1);
 865 
 866   pop_CPU_state();
 867   mov(rsp, rbp);
 868   pop(rbp);
 869   popa();
 870 }
 871 
 872 #ifndef PRODUCT
 873 extern "C" void findpc(intptr_t x);
 874 #endif
 875 
 876 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[]) {
 877   // In order to get locks to work, we need to fake an in_VM state
 878   if (ShowMessageBoxOnError) {
 879     JavaThread* thread = JavaThread::current();
 880     JavaThreadState saved_state = thread->thread_state();
 881     thread->set_thread_state(_thread_in_vm);
 882 #ifndef PRODUCT
 883     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
 884       ttyLocker ttyl;
 885       BytecodeCounter::print();
 886     }
 887 #endif
 888     // To see where a verify_oop failed, get $ebx+40/X for this frame.
 889     // XXX correct this offset for amd64
 890     // This is the value of eip which points to where verify_oop will return.
 891     if (os::message_box(msg, "Execution stopped, print registers?")) {
 892       print_state64(pc, regs);
 893       BREAKPOINT;
 894     }
 895   }
 896   fatal("DEBUG MESSAGE: %s", msg);
 897 }
 898 
 899 void MacroAssembler::print_state64(int64_t pc, int64_t regs[]) {
 900   ttyLocker ttyl;
 901   FlagSetting fs(Debugging, true);
 902   tty->print_cr("rip = 0x%016lx", (intptr_t)pc);
 903 #ifndef PRODUCT
 904   tty->cr();
 905   findpc(pc);
 906   tty->cr();
 907 #endif
 908 #define PRINT_REG(rax, value) \
 909   { tty->print("%s = ", #rax); os::print_location(tty, value); }
 910   PRINT_REG(rax, regs[15]);
 911   PRINT_REG(rbx, regs[12]);
 912   PRINT_REG(rcx, regs[14]);
 913   PRINT_REG(rdx, regs[13]);
 914   PRINT_REG(rdi, regs[8]);
 915   PRINT_REG(rsi, regs[9]);
 916   PRINT_REG(rbp, regs[10]);
 917   PRINT_REG(rsp, regs[11]);
 918   PRINT_REG(r8 , regs[7]);
 919   PRINT_REG(r9 , regs[6]);
 920   PRINT_REG(r10, regs[5]);
 921   PRINT_REG(r11, regs[4]);
 922   PRINT_REG(r12, regs[3]);
 923   PRINT_REG(r13, regs[2]);
 924   PRINT_REG(r14, regs[1]);
 925   PRINT_REG(r15, regs[0]);
 926 #undef PRINT_REG
 927   // Print some words near the top of the stack.
 928   int64_t* rsp = (int64_t*) regs[11];
 929   int64_t* dump_sp = rsp;
 930   for (int col1 = 0; col1 < 8; col1++) {
 931     tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
 932     os::print_location(tty, *dump_sp++);
 933   }
 934   for (int row = 0; row < 25; row++) {
 935     tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
 936     for (int col = 0; col < 4; col++) {
 937       tty->print(" 0x%016lx", (intptr_t)*dump_sp++);
 938     }
 939     tty->cr();
 940   }
 941   // Print some instructions around pc:
 942   Disassembler::decode((address)pc-64, (address)pc);
 943   tty->print_cr("--------");
 944   Disassembler::decode((address)pc, (address)pc+32);
 945 }
 946 
 947 #endif // _LP64
 948 
 949 // Now versions that are common to 32/64 bit
 950 
 951 void MacroAssembler::addptr(Register dst, int32_t imm32) {
 952   LP64_ONLY(addq(dst, imm32)) NOT_LP64(addl(dst, imm32));
 953 }
 954 
 955 void MacroAssembler::addptr(Register dst, Register src) {
 956   LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
 957 }
 958 
 959 void MacroAssembler::addptr(Address dst, Register src) {
 960   LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
 961 }
 962 
 963 void MacroAssembler::addsd(XMMRegister dst, AddressLiteral src) {
 964   if (reachable(src)) {
 965     Assembler::addsd(dst, as_Address(src));
 966   } else {
 967     lea(rscratch1, src);
 968     Assembler::addsd(dst, Address(rscratch1, 0));
 969   }
 970 }
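
// Illustrative note (assumption, not part of the original file): this
// reachable()/lea() pattern, used by most AddressLiteral helpers below, picks
// a rip-relative operand when the target is close enough and otherwise
// materializes the absolute address in rscratch1 first.  Roughly, "reachable"
// means the displacement fits a signed 32-bit rip-relative offset (the real
// check lives in the AddressLiteral/code-cache machinery):
//
//   static bool fits_rip_relative(uintptr_t target, uintptr_t next_instruction_pc) {
//     intptr_t disp = (intptr_t)target - (intptr_t)next_instruction_pc;
//     return disp == (int32_t)disp;
//   }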
 971 
 972 void MacroAssembler::addss(XMMRegister dst, AddressLiteral src) {
 973   if (reachable(src)) {
 974     addss(dst, as_Address(src));
 975   } else {
 976     lea(rscratch1, src);
 977     addss(dst, Address(rscratch1, 0));
 978   }
 979 }
 980 
 981 void MacroAssembler::addpd(XMMRegister dst, AddressLiteral src) {
 982   if (reachable(src)) {
 983     Assembler::addpd(dst, as_Address(src));
 984   } else {
 985     lea(rscratch1, src);
 986     Assembler::addpd(dst, Address(rscratch1, 0));
 987   }
 988 }
 989 
 990 void MacroAssembler::align(int modulus) {
 991   align(modulus, offset());
 992 }
 993 
 994 void MacroAssembler::align(int modulus, int target) {
 995   if (target % modulus != 0) {
 996     nop(modulus - (target % modulus));
 997   }
 998 }
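
// Illustrative note (assumption, not part of the original file): align() pads
// the instruction stream with nops up to the next multiple of 'modulus'; for
// example, at code offset 13, align(16) emits 16 - (13 % 16) = 3 nop bytes:
//
//   static int padding_for(int modulus, int target) {
//     return (target % modulus == 0) ? 0 : modulus - (target % modulus);
//   }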
 999 
1000 void MacroAssembler::andpd(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
1001   // Used in sign-masking with aligned address.
1002   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
1003   if (reachable(src)) {
1004     Assembler::andpd(dst, as_Address(src));
1005   } else {
1006     lea(scratch_reg, src);
1007     Assembler::andpd(dst, Address(scratch_reg, 0));
1008   }
1009 }
1010 
1011 void MacroAssembler::andps(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
1012   // Used in sign-masking with aligned address.
1013   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
1014   if (reachable(src)) {
1015     Assembler::andps(dst, as_Address(src));
1016   } else {
1017     lea(scratch_reg, src);
1018     Assembler::andps(dst, Address(scratch_reg, 0));
1019   }
1020 }
1021 
1022 void MacroAssembler::andptr(Register dst, int32_t imm32) {
1023   LP64_ONLY(andq(dst, imm32)) NOT_LP64(andl(dst, imm32));
1024 }
1025 
1026 void MacroAssembler::atomic_incl(Address counter_addr) {
1027   lock();
1028   incrementl(counter_addr);
1029 }
1030 
1031 void MacroAssembler::atomic_incl(AddressLiteral counter_addr, Register scr) {
1032   if (reachable(counter_addr)) {
1033     atomic_incl(as_Address(counter_addr));
1034   } else {
1035     lea(scr, counter_addr);
1036     atomic_incl(Address(scr, 0));
1037   }
1038 }
1039 
1040 #ifdef _LP64
1041 void MacroAssembler::atomic_incq(Address counter_addr) {
1042   lock();
1043   incrementq(counter_addr);
1044 }
1045 
1046 void MacroAssembler::atomic_incq(AddressLiteral counter_addr, Register scr) {
1047   if (reachable(counter_addr)) {
1048     atomic_incq(as_Address(counter_addr));
1049   } else {
1050     lea(scr, counter_addr);
1051     atomic_incq(Address(scr, 0));
1052   }
1053 }
1054 #endif
1055 
1056 // Writes successive pages to the stack until the given offset is reached, to
1057 // check for stack overflow + shadow pages.  This clobbers tmp.
1058 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
1059   movptr(tmp, rsp);
1060   // Bang stack for total size given plus shadow page size.
1061   // Bang one page at a time because large size can bang beyond yellow and
1062   // red zones.
1063   Label loop;
1064   bind(loop);
1065   movl(Address(tmp, (-os::vm_page_size())), size );
1066   subptr(tmp, os::vm_page_size());
1067   subl(size, os::vm_page_size());
1068   jcc(Assembler::greater, loop);
1069 
1070   // Bang down shadow pages too.
1071   // At this point, (tmp-0) is the last address touched, so don't
1072   // touch it again.  (It was touched as (tmp-pagesize) but then tmp
1073   // was post-decremented.)  Skip this address by starting at i=1, and
1074   // touch a few more pages below.  N.B.  It is important to touch all
1075   // the way down including all pages in the shadow zone.
1076   for (int i = 1; i < ((int)JavaThread::stack_shadow_zone_size() / os::vm_page_size()); i++) {
1077     // this could be any sized move but it can serve as a debugging crumb,
1078     // so the bigger the better.
1079     movptr(Address(tmp, (-i*os::vm_page_size())), size );
1080   }
1081 }
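
// Illustrative sketch (assumption, not part of the original file): the banging
// loop above touches one word per page between rsp and rsp - size - shadow so
// that the OS commits (or faults on) every page before it is actually used:
//
//   static void bang_stack_sketch(char* sp, intptr_t size, intptr_t page, intptr_t shadow) {
//     char* tmp = sp;
//     while (size > 0) {
//       *(volatile intptr_t*)(tmp - page) = size;       // movl(Address(tmp, -page_size), size)
//       tmp  -= page;
//       size -= page;
//     }
//     for (intptr_t i = 1; i < shadow / page; i++) {
//       *(volatile intptr_t*)(tmp - i * page) = size;   // bang the shadow pages too
//     }
//   }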
1082 
1083 void MacroAssembler::reserved_stack_check() {
1084     // testing if reserved zone needs to be enabled
1085     Label no_reserved_zone_enabling;
1086     Register thread = NOT_LP64(rsi) LP64_ONLY(r15_thread);
1087     NOT_LP64(get_thread(rsi);)
1088 
1089     cmpptr(rsp, Address(thread, JavaThread::reserved_stack_activation_offset()));
1090     jcc(Assembler::below, no_reserved_zone_enabling);
1091 
1092     call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), thread);
1093     jump(RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry()));
1094     should_not_reach_here();
1095 
1096     bind(no_reserved_zone_enabling);
1097 }
1098 
1099 int MacroAssembler::biased_locking_enter(Register lock_reg,
1100                                          Register obj_reg,
1101                                          Register swap_reg,
1102                                          Register tmp_reg,
1103                                          bool swap_reg_contains_mark,
1104                                          Label& done,
1105                                          Label* slow_case,
1106                                          BiasedLockingCounters* counters) {
1107   assert(UseBiasedLocking, "why call this otherwise?");
1108   assert(swap_reg == rax, "swap_reg must be rax for cmpxchgq");
1109   assert(tmp_reg != noreg, "tmp_reg must be supplied");
1110   assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg);
1111   assert(markWord::age_shift == markWord::lock_bits + markWord::biased_lock_bits, "biased locking makes assumptions about bit layout");
1112   Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
1113   NOT_LP64( Address saved_mark_addr(lock_reg, 0); )
1114 
1115   if (PrintBiasedLockingStatistics && counters == NULL) {
1116     counters = BiasedLocking::counters();
1117   }
1118   // Biased locking
1119   // See whether the lock is currently biased toward our thread and
1120   // whether the epoch is still valid
1121   // Note that the runtime guarantees sufficient alignment of JavaThread
1122   // pointers to allow age to be placed into low bits
1123   // First check to see whether biasing is even enabled for this object
1124   Label cas_label;
1125   int null_check_offset = -1;
1126   if (!swap_reg_contains_mark) {
1127     null_check_offset = offset();
1128     movptr(swap_reg, mark_addr);
1129   }
1130   movptr(tmp_reg, swap_reg);
1131   andptr(tmp_reg, markWord::biased_lock_mask_in_place);
1132   cmpptr(tmp_reg, markWord::biased_lock_pattern);
1133   jcc(Assembler::notEqual, cas_label);
1134   // The bias pattern is present in the object's header. Need to check
1135   // whether the bias owner and the epoch are both still current.
1136 #ifndef _LP64
1137   // Note that because there is no current thread register on x86_32 we
1138   // need to store off the mark word we read out of the object to
1139   // avoid reloading it and needing to recheck invariants below. This
1140   // store is unfortunate but it makes the overall code shorter and
1141   // simpler.
1142   movptr(saved_mark_addr, swap_reg);
1143 #endif
1144   if (swap_reg_contains_mark) {
1145     null_check_offset = offset();
1146   }
1147   load_prototype_header(tmp_reg, obj_reg);
1148 #ifdef _LP64
1149   orptr(tmp_reg, r15_thread);
1150   xorptr(tmp_reg, swap_reg);
1151   Register header_reg = tmp_reg;
1152 #else
1153   xorptr(tmp_reg, swap_reg);
1154   get_thread(swap_reg);
1155   xorptr(swap_reg, tmp_reg);
1156   Register header_reg = swap_reg;
1157 #endif
1158   andptr(header_reg, ~((int) markWord::age_mask_in_place));
1159   if (counters != NULL) {
1160     cond_inc32(Assembler::zero,
1161                ExternalAddress((address) counters->biased_lock_entry_count_addr()));
1162   }
1163   jcc(Assembler::equal, done);
1164 
1165   Label try_revoke_bias;
1166   Label try_rebias;
1167 
1168   // At this point we know that the header has the bias pattern and
1169   // that we are not the bias owner in the current epoch. We need to
1170   // figure out more details about the state of the header in order to
1171   // know what operations can be legally performed on the object's
1172   // header.
1173 
1174   // If the low three bits in the xor result aren't clear, that means
1175   // the prototype header is no longer biased and we have to revoke
1176   // the bias on this object.
1177   testptr(header_reg, markWord::biased_lock_mask_in_place);
1178   jccb(Assembler::notZero, try_revoke_bias);
1179 
1180   // Biasing is still enabled for this data type. See whether the
1181   // epoch of the current bias is still valid, meaning that the epoch
1182   // bits of the mark word are equal to the epoch bits of the
1183   // prototype header. (Note that the prototype header's epoch bits
1184   // only change at a safepoint.) If not, attempt to rebias the object
1185   // toward the current thread. Note that we must be absolutely sure
1186   // that the current epoch is invalid in order to do this because
1187   // otherwise the manipulations it performs on the mark word are
1188   // illegal.
1189   testptr(header_reg, markWord::epoch_mask_in_place);
1190   jccb(Assembler::notZero, try_rebias);
1191 
1192   // The epoch of the current bias is still valid but we know nothing
1193   // about the owner; it might be set or it might be clear. Try to
1194   // acquire the bias of the object using an atomic operation. If this
1195   // fails we will go in to the runtime to revoke the object's bias.
1196   // Note that we first construct the presumed unbiased header so we
1197   // don't accidentally blow away another thread's valid bias.
1198   NOT_LP64( movptr(swap_reg, saved_mark_addr); )
1199   andptr(swap_reg,
1200          markWord::biased_lock_mask_in_place | markWord::age_mask_in_place | markWord::epoch_mask_in_place);
1201 #ifdef _LP64
1202   movptr(tmp_reg, swap_reg);
1203   orptr(tmp_reg, r15_thread);
1204 #else
1205   get_thread(tmp_reg);
1206   orptr(tmp_reg, swap_reg);
1207 #endif
1208   lock();
1209   cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
1210   // If the biasing toward our thread failed, this means that
1211   // another thread succeeded in biasing it toward itself and we
1212   // need to revoke that bias. The revocation will occur in the
1213   // interpreter runtime in the slow case.
1214   if (counters != NULL) {
1215     cond_inc32(Assembler::zero,
1216                ExternalAddress((address) counters->anonymously_biased_lock_entry_count_addr()));
1217   }
1218   if (slow_case != NULL) {
1219     jcc(Assembler::notZero, *slow_case);
1220   }
1221   jmp(done);
1222 
1223   bind(try_rebias);
1224   // At this point we know the epoch has expired, meaning that the
1225   // current "bias owner", if any, is actually invalid. Under these
1226   // circumstances _only_, we are allowed to use the current header's
1227   // value as the comparison value when doing the cas to acquire the
1228   // bias in the current epoch. In other words, we allow transfer of
1229   // the bias from one thread to another directly in this situation.
1230   //
1231   // FIXME: due to a lack of registers we currently blow away the age
1232   // bits in this situation. Should attempt to preserve them.
1233   load_prototype_header(tmp_reg, obj_reg);
1234 #ifdef _LP64
1235   orptr(tmp_reg, r15_thread);
1236 #else
1237   get_thread(swap_reg);
1238   orptr(tmp_reg, swap_reg);
1239   movptr(swap_reg, saved_mark_addr);
1240 #endif
1241   lock();
1242   cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
1243   // If the biasing toward our thread failed, then another thread
1244   // succeeded in biasing it toward itself and we need to revoke that
1245   // bias. The revocation will occur in the runtime in the slow case.
1246   if (counters != NULL) {
1247     cond_inc32(Assembler::zero,
1248                ExternalAddress((address) counters->rebiased_lock_entry_count_addr()));
1249   }
1250   if (slow_case != NULL) {
1251     jcc(Assembler::notZero, *slow_case);
1252   }
1253   jmp(done);
1254 
1255   bind(try_revoke_bias);
1256   // The prototype mark in the klass doesn't have the bias bit set any
1257   // more, indicating that objects of this data type are not supposed
1258   // to be biased any more. We are going to try to reset the mark of
1259   // this object to the prototype value and fall through to the
1260   // CAS-based locking scheme. Note that if our CAS fails, it means
1261   // that another thread raced us for the privilege of revoking the
1262   // bias of this particular object, so it's okay to continue in the
1263   // normal locking code.
1264   //
1265   // FIXME: due to a lack of registers we currently blow away the age
1266   // bits in this situation. Should attempt to preserve them.
1267   NOT_LP64( movptr(swap_reg, saved_mark_addr); )
1268   load_prototype_header(tmp_reg, obj_reg);
1269   lock();
1270   cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
1271   // Fall through to the normal CAS-based lock, because no matter what
1272   // the result of the above CAS, some thread must have succeeded in
1273   // removing the bias bit from the object's header.
1274   if (counters != NULL) {
1275     cond_inc32(Assembler::zero,
1276                ExternalAddress((address) counters->revoked_lock_entry_count_addr()));
1277   }
1278 
1279   bind(cas_label);
1280 
1281   return null_check_offset;
1282 }
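
// Illustrative note (assumption, not part of the original file): the bias
// checks above all hinge on the low bits of the mark word, laid out roughly
// (low bits first) as [lock:2 | biased_lock:1 | age:4 | ... | epoch:2 | thread bits]:
//
//   static bool looks_biased(uintptr_t mark) {
//     // biased_lock_mask_in_place == 0x7, biased_lock_pattern == 0x5 (0b101)
//     return (mark & 0x7) == 0x5;
//   }
//
// The xor against (prototype header | thread) is then zero, ignoring the age
// bits, exactly when the object is already biased to the current thread in
// the current epoch, which is the fast "done" path.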
1283 
1284 void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
1285   assert(UseBiasedLocking, "why call this otherwise?");
1286 
1287   // Check for biased locking unlock case, which is a no-op
1288   // Note: we do not have to check the thread ID for two reasons.
1289   // First, the interpreter checks for IllegalMonitorStateException at
1290   // a higher level. Second, if the bias was revoked while we held the
1291   // lock, the object could not be rebiased toward another thread, so
1292   // the bias bit would be clear.
1293   movptr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
1294   andptr(temp_reg, markWord::biased_lock_mask_in_place);
1295   cmpptr(temp_reg, markWord::biased_lock_pattern);
1296   jcc(Assembler::equal, done);
1297 }
1298 
1299 #ifdef COMPILER2
1300 
1301 #if INCLUDE_RTM_OPT
1302 
1303 // Update rtm_counters based on abort status
1304 // input: abort_status
1305 //        rtm_counters (RTMLockingCounters*)
1306 // flags are killed
1307 void MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {
1308 
1309   atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
1310   if (PrintPreciseRTMLockingStatistics) {
1311     for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
1312       Label check_abort;
1313       testl(abort_status, (1<<i));
1314       jccb(Assembler::equal, check_abort);
1315       atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
1316       bind(check_abort);
1317     }
1318   }
1319 }
1320 
1321 // Branch if (random & (count-1) != 0), count is 2^n
1322 // tmp, scr and flags are killed
1323 void MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
1324   assert(tmp == rax, "");
1325   assert(scr == rdx, "");
1326   rdtsc(); // modifies EDX:EAX
1327   andptr(tmp, count-1);
1328   jccb(Assembler::notZero, brLabel);
1329 }
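
// Illustrative note (assumption, not part of the original file): the low bits
// of the rdtsc value act as a cheap pseudo-random source here.  With 'count'
// a power of two, (tsc & (count - 1)) is zero roughly once every 'count'
// calls, so the branch is taken about (count - 1) / count of the time:
//
//   static bool take_branch(uint64_t tsc, int count) {   // count must be 2^n
//     return (tsc & (uint64_t)(count - 1)) != 0;
//   }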
1330 
1331 // Perform abort ratio calculation, set no_rtm bit if high ratio
1332 // input:  rtm_counters_Reg (RTMLockingCounters* address)
1333 // tmpReg, rtm_counters_Reg and flags are killed
1334 void MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
1335                                                  Register rtm_counters_Reg,
1336                                                  RTMLockingCounters* rtm_counters,
1337                                                  Metadata* method_data) {
1338   Label L_done, L_check_always_rtm1, L_check_always_rtm2;
1339 
1340   if (RTMLockingCalculationDelay > 0) {
1341     // Delay calculation
1342     movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()), tmpReg);
1343     testptr(tmpReg, tmpReg);
1344     jccb(Assembler::equal, L_done);
1345   }
1346   // Abort ratio calculation only if abort_count > RTMAbortThreshold
1347   //   Aborted transactions = abort_count * 100
1348   //   All transactions = total_count *  RTMTotalCountIncrRate
1349   //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
1350 
1351   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
1352   cmpptr(tmpReg, RTMAbortThreshold);
1353   jccb(Assembler::below, L_check_always_rtm2);
1354   imulptr(tmpReg, tmpReg, 100);
1355 
1356   Register scrReg = rtm_counters_Reg;
1357   movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
1358   imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
1359   imulptr(scrReg, scrReg, RTMAbortRatio);
1360   cmpptr(tmpReg, scrReg);
1361   jccb(Assembler::below, L_check_always_rtm1);
1362   if (method_data != NULL) {
1363     // set rtm_state to "no rtm" in MDO
1364     mov_metadata(tmpReg, method_data);
1365     lock();
1366     orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), NoRTM);
1367   }
1368   jmpb(L_done);
1369   bind(L_check_always_rtm1);
1370   // Reload RTMLockingCounters* address
1371   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
1372   bind(L_check_always_rtm2);
1373   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
1374   cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
1375   jccb(Assembler::below, L_done);
1376   if (method_data != NULL) {
1377     // set rtm_state to "always rtm" in MDO
1378     mov_metadata(tmpReg, method_data);
1379     lock();
1380     orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), UseRTM);
1381   }
1382   bind(L_done);
1383 }
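
// Illustrative worked example (assumption, not part of the original file),
// plugging hypothetical values into the formula commented above, with
// RTMAbortRatio = 50, RTMTotalCountIncrRate = 1 and abort_count already past
// RTMAbortThreshold:
//
//   //   abort_count = 600, total_count = 1000
//   //   Aborted transactions = 600 * 100      = 60000
//   //   All transactions     = 1000 * 1       = 1000
//   //   60000 >= 1000 * 50 (= 50000)  =>  set the no_rtm bit in the MDO.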
1384 
1385 // Update counters and perform abort ratio calculation
1386 // input:  abort_status_Reg
1387 // rtm_counters_Reg, flags are killed
1388 void MacroAssembler::rtm_profiling(Register abort_status_Reg,
1389                                    Register rtm_counters_Reg,
1390                                    RTMLockingCounters* rtm_counters,
1391                                    Metadata* method_data,
1392                                    bool profile_rtm) {
1393 
1394   assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
1395   // update rtm counters based on rax value at abort
1396   // reads abort_status_Reg, updates flags
1397   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
1398   rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
1399   if (profile_rtm) {
1400     // Save abort status because abort_status_Reg is used by following code.
1401     if (RTMRetryCount > 0) {
1402       push(abort_status_Reg);
1403     }
1404     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
1405     rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
1406     // restore abort status
1407     if (RTMRetryCount > 0) {
1408       pop(abort_status_Reg);
1409     }
1410   }
1411 }
1412 
1413 // Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
1414 // inputs: retry_count_Reg
1415 //       : abort_status_Reg
1416 // output: retry_count_Reg decremented by 1
1417 // flags are killed
1418 void MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
1419   Label doneRetry;
1420   assert(abort_status_Reg == rax, "");
1421   // The abort reason bits are in eax (see all states in rtmLocking.hpp)
1422   // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
1423   // if reason is in 0x6 and retry count != 0 then retry
1424   andptr(abort_status_Reg, 0x6);
1425   jccb(Assembler::zero, doneRetry);
1426   testl(retry_count_Reg, retry_count_Reg);
1427   jccb(Assembler::zero, doneRetry);
1428   pause();
1429   decrementl(retry_count_Reg);
1430   jmp(retryLabel);
1431   bind(doneRetry);
1432 }
1433 
1434 // Spin and retry if lock is busy,
1435 // inputs: box_Reg (monitor address)
1436 //       : retry_count_Reg
1437 // output: retry_count_Reg decremented by 1
1438 //       : clear z flag if retry count exceeded
1439 // tmp_Reg, scr_Reg, flags are killed
1440 void MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
1441                                             Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
1442   Label SpinLoop, SpinExit, doneRetry;
1443   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
1444 
1445   testl(retry_count_Reg, retry_count_Reg);
1446   jccb(Assembler::zero, doneRetry);
1447   decrementl(retry_count_Reg);
1448   movptr(scr_Reg, RTMSpinLoopCount);
1449 
1450   bind(SpinLoop);
1451   pause();
1452   decrementl(scr_Reg);
1453   jccb(Assembler::lessEqual, SpinExit);
1454   movptr(tmp_Reg, Address(box_Reg, owner_offset));
1455   testptr(tmp_Reg, tmp_Reg);
1456   jccb(Assembler::notZero, SpinLoop);
1457 
1458   bind(SpinExit);
1459   jmp(retryLabel);
1460   bind(doneRetry);
1461   incrementl(retry_count_Reg); // clear z flag
1462 }
1463 
1464 // Use RTM for normal stack locks
1465 // Input: objReg (object to lock)
1466 void MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
1467                                        Register retry_on_abort_count_Reg,
1468                                        RTMLockingCounters* stack_rtm_counters,
1469                                        Metadata* method_data, bool profile_rtm,
1470                                        Label& DONE_LABEL, Label& IsInflated) {
1471   assert(UseRTMForStackLocks, "why call this otherwise?");
1472   assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
1473   assert(tmpReg == rax, "");
1474   assert(scrReg == rdx, "");
1475   Label L_rtm_retry, L_decrement_retry, L_on_abort;
1476 
1477   if (RTMRetryCount > 0) {
1478     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
1479     bind(L_rtm_retry);
1480   }
1481   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
1482   testptr(tmpReg, markWord::monitor_value);  // inflated vs stack-locked|neutral|biased
1483   jcc(Assembler::notZero, IsInflated);
1484 
1485   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
1486     Label L_noincrement;
1487     if (RTMTotalCountIncrRate > 1) {
1488       // tmpReg, scrReg and flags are killed
1489       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
1490     }
1491     assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
1492     atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
1493     bind(L_noincrement);
1494   }
1495   xbegin(L_on_abort);
1496   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));       // fetch markword
1497   andptr(tmpReg, markWord::biased_lock_mask_in_place); // look at 3 lock bits
1498   cmpptr(tmpReg, markWord::unlocked_value);            // bits = 001 unlocked
1499   jcc(Assembler::equal, DONE_LABEL);        // all done if unlocked
1500 
1501   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
1502   if (UseRTMXendForLockBusy) {
1503     xend();
1504     movptr(abort_status_Reg, 0x2);   // Set the abort status to 2 (so we can retry)
1505     jmp(L_decrement_retry);
1506   }
1507   else {
1508     xabort(0);
1509   }
1510   bind(L_on_abort);
1511   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
1512     rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
1513   }
1514   bind(L_decrement_retry);
1515   if (RTMRetryCount > 0) {
1516     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
1517     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
1518   }
1519 }
1520 
// Use RTM for inflated locks
1522 // inputs: objReg (object to lock)
1523 //         boxReg (on-stack box address (displaced header location) - KILLED)
1524 //         tmpReg (ObjectMonitor address + markWord::monitor_value)
1525 void MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
1526                                           Register scrReg, Register retry_on_busy_count_Reg,
1527                                           Register retry_on_abort_count_Reg,
1528                                           RTMLockingCounters* rtm_counters,
1529                                           Metadata* method_data, bool profile_rtm,
1530                                           Label& DONE_LABEL) {
1531   assert(UseRTMLocking, "why call this otherwise?");
1532   assert(tmpReg == rax, "");
1533   assert(scrReg == rdx, "");
1534   Label L_rtm_retry, L_decrement_retry, L_on_abort;
1535   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
1536 
1537   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
1538   movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
1539   movptr(boxReg, tmpReg); // Save ObjectMonitor address
1540 
1541   if (RTMRetryCount > 0) {
1542     movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
1543     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
1544     bind(L_rtm_retry);
1545   }
1546   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
1547     Label L_noincrement;
1548     if (RTMTotalCountIncrRate > 1) {
1549       // tmpReg, scrReg and flags are killed
1550       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
1551     }
1552     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
1553     atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
1554     bind(L_noincrement);
1555   }
1556   xbegin(L_on_abort);
1557   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
1558   movptr(tmpReg, Address(tmpReg, owner_offset));
1559   testptr(tmpReg, tmpReg);
1560   jcc(Assembler::zero, DONE_LABEL);
1561   if (UseRTMXendForLockBusy) {
1562     xend();
1563     jmp(L_decrement_retry);
1564   }
1565   else {
1566     xabort(0);
1567   }
1568   bind(L_on_abort);
1569   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
1570   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
1571     rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
1572   }
1573   if (RTMRetryCount > 0) {
1574     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
1575     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
1576   }
1577 
1578   movptr(tmpReg, Address(boxReg, owner_offset)) ;
1579   testptr(tmpReg, tmpReg) ;
1580   jccb(Assembler::notZero, L_decrement_retry) ;
1581 
1582   // Appears unlocked - try to swing _owner from null to non-null.
1583   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
1584 #ifdef _LP64
1585   Register threadReg = r15_thread;
1586 #else
1587   get_thread(scrReg);
1588   Register threadReg = scrReg;
1589 #endif
1590   lock();
1591   cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg
1592 
1593   if (RTMRetryCount > 0) {
1594     // success done else retry
1595     jccb(Assembler::equal, DONE_LABEL) ;
1596     bind(L_decrement_retry);
1597     // Spin and retry if lock is busy.
1598     rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
1599   }
1600   else {
1601     bind(L_decrement_retry);
1602   }
1603 }
1604 
1605 #endif //  INCLUDE_RTM_OPT
1606 
1607 // fast_lock and fast_unlock used by C2
1608 
1609 // Because the transitions from emitted code to the runtime
1610 // monitorenter/exit helper stubs are so slow it's critical that
1611 // we inline both the stack-locking fast path and the inflated fast path.
1612 //
1613 // See also: cmpFastLock and cmpFastUnlock.
1614 //
1615 // What follows is a specialized inline transliteration of the code
// in enter() and exit(). If we're concerned about I$ bloat, another
1617 // option would be to emit TrySlowEnter and TrySlowExit methods
1618 // at startup-time.  These methods would accept arguments as
// (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
1620 // indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
1621 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
1622 // In practice, however, the # of lock sites is bounded and is usually small.
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
1627 //
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
// to those specialized methods.  That'd give us a mostly platform-independent
// implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
// to park() or unpark() threads.  We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) provide explicit barriers or fence operations.
1636 //
1637 // TODO:
1638 //
1639 // *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
1640 //    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
1641 //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
1642 //    the lock operators would typically be faster than reifying Self.
1643 //
1644 // *  Ideally I'd define the primitives as:
1645 //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
1646 //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
1647 //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
//    Instead, we're stuck with the rather awkward and brittle register assignments below.
//    Furthermore, the register assignments are overconstrained, possibly resulting in
//    sub-optimal code near the synchronization site.
1651 //
1652 // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
1653 //    Alternately, use a better sp-proximity test.
1654 //
1655 // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
1656 //    Either one is sufficient to uniquely identify a thread.
1657 //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
1658 //
1659 // *  Intrinsify notify() and notifyAll() for the common cases where the
//    object is locked by the calling thread but the waitlist is empty.
//    This avoids the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
1662 //
// *  Use jccb and jmpb instead of jcc and jmp to improve code density.
1664 //    But beware of excessive branch density on AMD Opterons.
1665 //
1666 // *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
1667 //    or failure of the fast path.  If the fast path fails then we pass
1668 //    control to the slow path, typically in C.  In fast_lock and
1669 //    fast_unlock we often branch to DONE_LABEL, just to find that C2
1670 //    will emit a conditional branch immediately after the node.
1671 //    So we have branches to branches and lots of ICC.ZF games.
1672 //    Instead, it might be better to have C2 pass a "FailureLabel"
1673 //    into fast_lock and fast_unlock.  In the case of success, control
1674 //    will drop through the node.  ICC.ZF is undefined at exit.
1675 //    In the case of failure, the node will branch directly to the
//    FailureLabel.
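//
// Illustrative sketch (not emitted code; register choices and node names are
// only assumptions): a C2 cmpFastLock/cmpFastUnlock node expands to roughly
//
//     fast_lock(obj, box, rax, scr, ...)   // sets ICC.ZF: 1 = success, 0 = failure
//     jne  slow_path_call                  // the branch-to-branch noted above
//     ...                                  // critical section
//
// so every lock site pairs the emitted fast path with a conditional branch
// keyed on ICC.ZF -- exactly the redundancy a FailureLabel argument would remove.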
1677 
1678 
1679 // obj: object to lock
1680 // box: on-stack box address (displaced header location) - KILLED
// rax: tmp -- KILLED
1682 // scr: tmp -- KILLED
1683 void MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
1684                                Register scrReg, Register cx1Reg, Register cx2Reg,
1685                                BiasedLockingCounters* counters,
1686                                RTMLockingCounters* rtm_counters,
1687                                RTMLockingCounters* stack_rtm_counters,
1688                                Metadata* method_data,
1689                                bool use_rtm, bool profile_rtm) {
1690   // Ensure the register assignments are disjoint
1691   assert(tmpReg == rax, "");
1692 
1693   if (use_rtm) {
1694     assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
1695   } else {
1696     assert(cx1Reg == noreg, "");
1697     assert(cx2Reg == noreg, "");
1698     assert_different_registers(objReg, boxReg, tmpReg, scrReg);
1699   }
1700 
1701   if (counters != NULL) {
1702     atomic_incl(ExternalAddress((address)counters->total_entry_count_addr()), scrReg);
1703   }
1704 
1705   // Possible cases that we'll encounter in fast_lock
1706   // ------------------------------------------------
1707   // * Inflated
1708   //    -- unlocked
1709   //    -- Locked
1710   //       = by self
1711   //       = by other
1712   // * biased
1713   //    -- by Self
1714   //    -- by other
1715   // * neutral
1716   // * stack-locked
1717   //    -- by self
1718   //       = sp-proximity test hits
1719   //       = sp-proximity test generates false-negative
1720   //    -- by other
1721   //
1722 
1723   Label IsInflated, DONE_LABEL;
1724 
1725   // it's stack-locked, biased or neutral
1726   // TODO: optimize away redundant LDs of obj->mark and improve the markword triage
1727   // order to reduce the number of conditional branches in the most common cases.
1728   // Beware -- there's a subtle invariant that fetch of the markword
1729   // at [FETCH], below, will never observe a biased encoding (*101b).
1730   // If this invariant is not held we risk exclusion (safety) failure.
1731   if (UseBiasedLocking && !UseOptoBiasInlining) {
1732     biased_locking_enter(boxReg, objReg, tmpReg, scrReg, false, DONE_LABEL, NULL, counters);
1733   }
1734 
1735 #if INCLUDE_RTM_OPT
1736   if (UseRTMForStackLocks && use_rtm) {
1737     rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
1738                       stack_rtm_counters, method_data, profile_rtm,
1739                       DONE_LABEL, IsInflated);
1740   }
1741 #endif // INCLUDE_RTM_OPT
1742 
1743   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));          // [FETCH]
1744   testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral|biased
1745   jccb(Assembler::notZero, IsInflated);
1746 
1747   // Attempt stack-locking ...
1748   orptr (tmpReg, markWord::unlocked_value);
1749   if (EnableValhalla && !UseBiasedLocking) {
1750     // Mask always_locked bit such that we go to the slow path if object is a value type
1751     andptr(tmpReg, ~((int) markWord::biased_lock_bit_in_place));
1752   }
1753   movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
1754   lock();
1755   cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes()));      // Updates tmpReg
1756   if (counters != NULL) {
1757     cond_inc32(Assembler::equal,
1758                ExternalAddress((address)counters->fast_path_entry_count_addr()));
1759   }
1760   jcc(Assembler::equal, DONE_LABEL);           // Success
1761 
1762   // Recursive locking.
1763   // The object is stack-locked: markword contains stack pointer to BasicLock.
1764   // Locked by current thread if difference with current SP is less than one page.
1765   subptr(tmpReg, rsp);
  // Next instruction sets ZFlag == 1 (Success) if the difference is less than one page.
1767   andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) );
1768   movptr(Address(boxReg, 0), tmpReg);
1769   if (counters != NULL) {
1770     cond_inc32(Assembler::equal,
1771                ExternalAddress((address)counters->fast_path_entry_count_addr()));
1772   }
1773   jmp(DONE_LABEL);
1774 
1775   bind(IsInflated);
1776   // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value
1777 
1778 #if INCLUDE_RTM_OPT
1779   // Use the same RTM locking code in 32- and 64-bit VM.
1780   if (use_rtm) {
1781     rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
1782                          rtm_counters, method_data, profile_rtm, DONE_LABEL);
1783   } else {
1784 #endif // INCLUDE_RTM_OPT
1785 
1786 #ifndef _LP64
1787   // The object is inflated.
1788 
1789   // boxReg refers to the on-stack BasicLock in the current frame.
1790   // We'd like to write:
1791   //   set box->_displaced_header = markWord::unused_mark().  Any non-0 value suffices.
  // This is convenient but results in an ST-before-CAS penalty.  The following CAS suffers
1793   // additional latency as we have another ST in the store buffer that must drain.
1794 
1795   // avoid ST-before-CAS
1796   // register juggle because we need tmpReg for cmpxchgptr below
1797   movptr(scrReg, boxReg);
1798   movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]
1799 
1800   // Optimistic form: consider XORL tmpReg,tmpReg
1801   movptr(tmpReg, NULL_WORD);
1802 
1803   // Appears unlocked - try to swing _owner from null to non-null.
1804   // Ideally, I'd manifest "Self" with get_thread and then attempt
1805   // to CAS the register containing Self into m->Owner.
1806   // But we don't have enough registers, so instead we can either try to CAS
1807   // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
1808   // we later store "Self" into m->Owner.  Transiently storing a stack address
1809   // (rsp or the address of the box) into  m->owner is harmless.
1810   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
1811   lock();
1812   cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
1813   movptr(Address(scrReg, 0), 3);          // box->_displaced_header = 3
1814   // If we weren't able to swing _owner from NULL to the BasicLock
1815   // then take the slow path.
1816   jccb  (Assembler::notZero, DONE_LABEL);
1817   // update _owner from BasicLock to thread
1818   get_thread (scrReg);                    // beware: clobbers ICCs
1819   movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
1820   xorptr(boxReg, boxReg);                 // set icc.ZFlag = 1 to indicate success
1821 
1822   // If the CAS fails we can either retry or pass control to the slow path.
1823   // We use the latter tactic.
1824   // Pass the CAS result in the icc.ZFlag into DONE_LABEL
1825   // If the CAS was successful ...
1826   //   Self has acquired the lock
1827   //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
1828   // Intentional fall-through into DONE_LABEL ...
1829 #else // _LP64
1830   // It's inflated and we use scrReg for ObjectMonitor* in this section.
1831   movq(scrReg, tmpReg);
1832   xorq(tmpReg, tmpReg);
1833   lock();
1834   cmpxchgptr(r15_thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
1835   // Unconditionally set box->_displaced_header = markWord::unused_mark().
1836   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
1837   movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
1838   // Intentional fall-through into DONE_LABEL ...
1839   // Propagate ICC.ZF from CAS above into DONE_LABEL.
1840 #endif // _LP64
1841 #if INCLUDE_RTM_OPT
1842   } // use_rtm()
1843 #endif
1844   // DONE_LABEL is a hot target - we'd really like to place it at the
  // start of a cache line by padding with NOPs.
1846   // See the AMD and Intel software optimization manuals for the
1847   // most efficient "long" NOP encodings.
1848   // Unfortunately none of our alignment mechanisms suffice.
1849   bind(DONE_LABEL);
1850 
1851   // At DONE_LABEL the icc ZFlag is set as follows ...
1852   // fast_unlock uses the same protocol.
1853   // ZFlag == 1 -> Success
1854   // ZFlag == 0 -> Failure - force control through the slow path
1855 }
1856 
1857 // obj: object to unlock
1858 // box: box address (displaced header location), killed.  Must be EAX.
1859 // tmp: killed, cannot be obj nor box.
1860 //
1861 // Some commentary on balanced locking:
1862 //
1863 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
1864 // Methods that don't have provably balanced locking are forced to run in the
1865 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
1866 // The interpreter provides two properties:
1867 // I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired by the current activation (frame).  Recall that the
1869 //      interpreter maintains an on-stack list of locks currently held by
1870 //      a frame.
1871 // I2:  If a method attempts to unlock an object that is not held by the
//      frame, the interpreter throws IMSX.
1873 //
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
1875 // B() doesn't have provably balanced locking so it runs in the interpreter.
1876 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
1877 // is still locked by A().
1878 //
1879 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
1880 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
1881 // should not be unlocked by "normal" java-level locking and vice-versa.  The specification
1882 // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably, given that the spec legislates the JNI case as undefined, our implementation
// could reasonably *avoid* checking the owner in fast_unlock().
// In the interest of performance we elide the m->Owner==Self check in unlock.
1886 // A perfectly viable alternative is to elide the owner check except when
1887 // Xcheck:jni is enabled.
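//
// Illustrative Java-level sketch (assumed example) of the guarantee above:
//   void A() { synchronized (O) { B(); } }  // provably balanced -> compiled, uses fast_lock/fast_unlock
//   void B() { ... }                        // not provably balanced -> stays interpreted
// When B() returns, I1 has quietly unlocked anything B()'s frame still held, and
// by I2 any attempt by B() to unlock an object its frame does not hold throws.
// Either way O is still locked by A() when A() reaches its fast_unlock.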
1888 
1889 void MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
1890   assert(boxReg == rax, "");
1891   assert_different_registers(objReg, boxReg, tmpReg);
1892 
1893   Label DONE_LABEL, Stacked, CheckSucc;
1894 
1895   // Critically, the biased locking test must have precedence over
1896   // and appear before the (box->dhw == 0) recursive stack-lock test.
1897   if (UseBiasedLocking && !UseOptoBiasInlining) {
1898     biased_locking_exit(objReg, tmpReg, DONE_LABEL);
1899   }
1900 
1901 #if INCLUDE_RTM_OPT
1902   if (UseRTMForStackLocks && use_rtm) {
1903     assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
1904     Label L_regular_unlock;
1905     movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
1906     andptr(tmpReg, markWord::biased_lock_mask_in_place);              // look at 3 lock bits
1907     cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 001 unlocked
1908     jccb(Assembler::notEqual, L_regular_unlock);                      // if !HLE RegularLock
1909     xend();                                                           // otherwise end...
1910     jmp(DONE_LABEL);                                                  // ... and we're done
1911     bind(L_regular_unlock);
1912   }
1913 #endif
1914 
1915   cmpptr(Address(boxReg, 0), (int32_t)NULL_WORD);                   // Examine the displaced header
1916   jcc   (Assembler::zero, DONE_LABEL);                              // 0 indicates recursive stack-lock
1917   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Examine the object's markword
1918   testptr(tmpReg, markWord::monitor_value);                         // Inflated?
1919   jccb  (Assembler::zero, Stacked);
1920 
1921   // It's inflated.
1922 #if INCLUDE_RTM_OPT
1923   if (use_rtm) {
1924     Label L_regular_inflated_unlock;
1925     int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
1926     movptr(boxReg, Address(tmpReg, owner_offset));
1927     testptr(boxReg, boxReg);
1928     jccb(Assembler::notZero, L_regular_inflated_unlock);
1929     xend();
1930     jmpb(DONE_LABEL);
1931     bind(L_regular_inflated_unlock);
1932   }
1933 #endif
1934 
1935   // Despite our balanced locking property we still check that m->_owner == Self
1936   // as java routines or native JNI code called by this thread might
1937   // have released the lock.
1938   // Refer to the comments in synchronizer.cpp for how we might encode extra
1939   // state in _succ so we can avoid fetching EntryList|cxq.
1940   //
1941   // I'd like to add more cases in fast_lock() and fast_unlock() --
1942   // such as recursive enter and exit -- but we have to be wary of
1943   // I$ bloat, T$ effects and BP$ effects.
1944   //
1945   // If there's no contention try a 1-0 exit.  That is, exit without
1946   // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
1947   // we detect and recover from the race that the 1-0 exit admits.
1948   //
1949   // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
1950   // before it STs null into _owner, releasing the lock.  Updates
1951   // to data protected by the critical section must be visible before
1952   // we drop the lock (and thus before any other thread could acquire
1953   // the lock and observe the fields protected by the lock).
  // IA32's memory model is TSO (total store order), so STs are ordered with respect to
1955   // each other and there's no need for an explicit barrier (fence).
1956   // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
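  //
  // Rough C-like sketch (for exposition only) of the 1-0 exit that the LP64
  // path below implements:
  //
  //   if (m->_recursions != 0)            goto slow_path;             // recursive exit
  //   if ((m->_cxq | m->_EntryList) == 0) { m->_owner = NULL; return success; }
  //   if (m->_succ == NULL)               goto slow_path;             // contended, no heir apparent
  //   m->_owner = NULL;                   // drop the lock: plain ST, no CAS
  //   fence();                            // ST _owner; MEMBAR; LD _succ
  //   if (m->_succ != NULL)               return success;             // successor will make progress
  //   // Race: the successor vanished.  Try to re-acquire with a CAS; if the CAS
  //   // fails the new owner inherits the succession duty and we exit successfully,
  //   // otherwise fall into the slow path and provide succession ourselves.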
1957 #ifndef _LP64
1958   get_thread (boxReg);
1959 
1960   // Note that we could employ various encoding schemes to reduce
1961   // the number of loads below (currently 4) to just 2 or 3.
1962   // Refer to the comments in synchronizer.cpp.
1963   // In practice the chain of fetches doesn't seem to impact performance, however.
1964   xorptr(boxReg, boxReg);
1965   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
1966   jccb  (Assembler::notZero, DONE_LABEL);
1967   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
1968   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
1969   jccb  (Assembler::notZero, CheckSucc);
1970   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
1971   jmpb  (DONE_LABEL);
1972 
1973   bind (Stacked);
1974   // It's not inflated and it's not recursively stack-locked and it's not biased.
1975   // It must be stack-locked.
1976   // Try to reset the header to displaced header.
1977   // The "box" value on the stack is stable, so we can reload
1978   // and be assured we observe the same value as above.
1979   movptr(tmpReg, Address(boxReg, 0));
1980   lock();
1981   cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
  // Intentional fall-through into DONE_LABEL
1983 
1984   // DONE_LABEL is a hot target - we'd really like to place it at the
  // start of a cache line by padding with NOPs.
1986   // See the AMD and Intel software optimization manuals for the
1987   // most efficient "long" NOP encodings.
1988   // Unfortunately none of our alignment mechanisms suffice.
1989   bind (CheckSucc);
1990 #else // _LP64
1991   // It's inflated
1992   xorptr(boxReg, boxReg);
1993   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
1994   jccb  (Assembler::notZero, DONE_LABEL);
1995   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
1996   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
1997   jccb  (Assembler::notZero, CheckSucc);
1998   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
1999   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
2000   jmpb  (DONE_LABEL);
2001 
2002   // Try to avoid passing control into the slow_path ...
2003   Label LSuccess, LGoSlowPath ;
2004   bind  (CheckSucc);
2005 
2006   // The following optional optimization can be elided if necessary
2007   // Effectively: if (succ == null) goto slow path
2008   // The code reduces the window for a race, however,
2009   // and thus benefits performance.
2010   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
2011   jccb  (Assembler::zero, LGoSlowPath);
2012 
2013   xorptr(boxReg, boxReg);
2014   // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
2015   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
2016 
2017   // Memory barrier/fence
2018   // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
2019   // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
2020   // This is faster on Nehalem and AMD Shanghai/Barcelona.
2021   // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
2022   // We might also restructure (ST Owner=0;barrier;LD _Succ) to
  // (mov box,0; xchgq box, &m->Owner; LD _succ).
2024   lock(); addl(Address(rsp, 0), 0);
2025 
2026   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
2027   jccb  (Assembler::notZero, LSuccess);
2028 
2029   // Rare inopportune interleaving - race.
2030   // The successor vanished in the small window above.
2031   // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
2032   // We need to ensure progress and succession.
2033   // Try to reacquire the lock.
2034   // If that fails then the new owner is responsible for succession and this
2035   // thread needs to take no further action and can exit via the fast path (success).
2036   // If the re-acquire succeeds then pass control into the slow path.
  // As implemented, this latter mode is horrible because we generate more
  // coherence traffic on the lock *and* artificially extend the critical section
  // length by virtue of passing control into the slow path.
2040 
2041   // box is really RAX -- the following CMPXCHG depends on that binding
2042   // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
2043   lock();
2044   cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
2045   // There's no successor so we tried to regrab the lock.
2046   // If that didn't work, then another thread grabbed the
2047   // lock so we're done (and exit was a success).
2048   jccb  (Assembler::notEqual, LSuccess);
2049   // Intentional fall-through into slow path
2050 
2051   bind  (LGoSlowPath);
2052   orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
2053   jmpb  (DONE_LABEL);
2054 
2055   bind  (LSuccess);
2056   testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
2057   jmpb  (DONE_LABEL);
2058 
2059   bind  (Stacked);
2060   movptr(tmpReg, Address (boxReg, 0));      // re-fetch
2061   lock();
2062   cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
2063 
2064 #endif
2065   bind(DONE_LABEL);
2066 }
2067 #endif // COMPILER2
2068 
2069 void MacroAssembler::c2bool(Register x) {
2070   // implements x == 0 ? 0 : 1
  // note: must only look at the least-significant byte of x
2072   //       since C-style booleans are stored in one byte
2073   //       only! (was bug)
2074   andl(x, 0xFF);
2075   setb(Assembler::notZero, x);
2076 }
2077 
// Wouldn't be needed if the AddressLiteral version had a new name
2079 void MacroAssembler::call(Label& L, relocInfo::relocType rtype) {
2080   Assembler::call(L, rtype);
2081 }
2082 
2083 void MacroAssembler::call(Register entry) {
2084   Assembler::call(entry);
2085 }
2086 
2087 void MacroAssembler::call(AddressLiteral entry) {
2088   if (reachable(entry)) {
2089     Assembler::call_literal(entry.target(), entry.rspec());
2090   } else {
2091     lea(rscratch1, entry);
2092     Assembler::call(rscratch1);
2093   }
2094 }
2095 
2096 void MacroAssembler::ic_call(address entry, jint method_index) {
2097   RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
2098   movptr(rax, (intptr_t)Universe::non_oop_word());
2099   call(AddressLiteral(entry, rh));
2100 }
2101 
2102 // Implementation of call_VM versions
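//
// The variants below that do not take an explicit last_java_sp emit a small
// local trampoline:
//
//     call C     // pushes the address of the following jmp; call_VM_helper
//                // recovers it as last_Java_pc (see call_VM_helper below)
//     jmp  E
//   C: pass_arg*(...)
//      call_VM_helper(...)
//      ret 0     // returns to the jmp, which skips over the out-of-line body
//   E: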
2103 
2104 void MacroAssembler::call_VM(Register oop_result,
2105                              address entry_point,
2106                              bool check_exceptions) {
2107   Label C, E;
2108   call(C, relocInfo::none);
2109   jmp(E);
2110 
2111   bind(C);
2112   call_VM_helper(oop_result, entry_point, 0, check_exceptions);
2113   ret(0);
2114 
2115   bind(E);
2116 }
2117 
2118 void MacroAssembler::call_VM(Register oop_result,
2119                              address entry_point,
2120                              Register arg_1,
2121                              bool check_exceptions) {
2122   Label C, E;
2123   call(C, relocInfo::none);
2124   jmp(E);
2125 
2126   bind(C);
2127   pass_arg1(this, arg_1);
2128   call_VM_helper(oop_result, entry_point, 1, check_exceptions);
2129   ret(0);
2130 
2131   bind(E);
2132 }
2133 
2134 void MacroAssembler::call_VM(Register oop_result,
2135                              address entry_point,
2136                              Register arg_1,
2137                              Register arg_2,
2138                              bool check_exceptions) {
2139   Label C, E;
2140   call(C, relocInfo::none);
2141   jmp(E);
2142 
2143   bind(C);
2144 
2145   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2146 
2147   pass_arg2(this, arg_2);
2148   pass_arg1(this, arg_1);
2149   call_VM_helper(oop_result, entry_point, 2, check_exceptions);
2150   ret(0);
2151 
2152   bind(E);
2153 }
2154 
2155 void MacroAssembler::call_VM(Register oop_result,
2156                              address entry_point,
2157                              Register arg_1,
2158                              Register arg_2,
2159                              Register arg_3,
2160                              bool check_exceptions) {
2161   Label C, E;
2162   call(C, relocInfo::none);
2163   jmp(E);
2164 
2165   bind(C);
2166 
2167   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
2168   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
2169   pass_arg3(this, arg_3);
2170 
2171   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2172   pass_arg2(this, arg_2);
2173 
2174   pass_arg1(this, arg_1);
2175   call_VM_helper(oop_result, entry_point, 3, check_exceptions);
2176   ret(0);
2177 
2178   bind(E);
2179 }
2180 
2181 void MacroAssembler::call_VM(Register oop_result,
2182                              Register last_java_sp,
2183                              address entry_point,
2184                              int number_of_arguments,
2185                              bool check_exceptions) {
2186   Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
2187   call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
2188 }
2189 
2190 void MacroAssembler::call_VM(Register oop_result,
2191                              Register last_java_sp,
2192                              address entry_point,
2193                              Register arg_1,
2194                              bool check_exceptions) {
2195   pass_arg1(this, arg_1);
2196   call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
2197 }
2198 
2199 void MacroAssembler::call_VM(Register oop_result,
2200                              Register last_java_sp,
2201                              address entry_point,
2202                              Register arg_1,
2203                              Register arg_2,
2204                              bool check_exceptions) {
2205 
2206   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2207   pass_arg2(this, arg_2);
2208   pass_arg1(this, arg_1);
2209   call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
2210 }
2211 
2212 void MacroAssembler::call_VM(Register oop_result,
2213                              Register last_java_sp,
2214                              address entry_point,
2215                              Register arg_1,
2216                              Register arg_2,
2217                              Register arg_3,
2218                              bool check_exceptions) {
2219   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
2220   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
2221   pass_arg3(this, arg_3);
2222   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2223   pass_arg2(this, arg_2);
2224   pass_arg1(this, arg_1);
2225   call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
2226 }
2227 
2228 void MacroAssembler::super_call_VM(Register oop_result,
2229                                    Register last_java_sp,
2230                                    address entry_point,
2231                                    int number_of_arguments,
2232                                    bool check_exceptions) {
2233   Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
2234   MacroAssembler::call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
2235 }
2236 
2237 void MacroAssembler::super_call_VM(Register oop_result,
2238                                    Register last_java_sp,
2239                                    address entry_point,
2240                                    Register arg_1,
2241                                    bool check_exceptions) {
2242   pass_arg1(this, arg_1);
2243   super_call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
2244 }
2245 
2246 void MacroAssembler::super_call_VM(Register oop_result,
2247                                    Register last_java_sp,
2248                                    address entry_point,
2249                                    Register arg_1,
2250                                    Register arg_2,
2251                                    bool check_exceptions) {
2252 
2253   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2254   pass_arg2(this, arg_2);
2255   pass_arg1(this, arg_1);
2256   super_call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
2257 }
2258 
2259 void MacroAssembler::super_call_VM(Register oop_result,
2260                                    Register last_java_sp,
2261                                    address entry_point,
2262                                    Register arg_1,
2263                                    Register arg_2,
2264                                    Register arg_3,
2265                                    bool check_exceptions) {
2266   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
2267   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
2268   pass_arg3(this, arg_3);
2269   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2270   pass_arg2(this, arg_2);
2271   pass_arg1(this, arg_1);
2272   super_call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
2273 }
2274 
2275 void MacroAssembler::call_VM_base(Register oop_result,
2276                                   Register java_thread,
2277                                   Register last_java_sp,
2278                                   address  entry_point,
2279                                   int      number_of_arguments,
2280                                   bool     check_exceptions) {
2281   // determine java_thread register
2282   if (!java_thread->is_valid()) {
2283 #ifdef _LP64
2284     java_thread = r15_thread;
2285 #else
2286     java_thread = rdi;
2287     get_thread(java_thread);
2288 #endif // LP64
2289   }
2290   // determine last_java_sp register
2291   if (!last_java_sp->is_valid()) {
2292     last_java_sp = rsp;
2293   }
2294   // debugging support
2295   assert(number_of_arguments >= 0   , "cannot have negative number of arguments");
2296   LP64_ONLY(assert(java_thread == r15_thread, "unexpected register"));
2297 #ifdef ASSERT
2298   // TraceBytecodes does not use r12 but saves it over the call, so don't verify
2299   // r12 is the heapbase.
2300   LP64_ONLY(if ((UseCompressedOops || UseCompressedClassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");)
2301 #endif // ASSERT
2302 
2303   assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
2304   assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");
2305 
2306   // push java thread (becomes first argument of C function)
2307 
2308   NOT_LP64(push(java_thread); number_of_arguments++);
2309   LP64_ONLY(mov(c_rarg0, r15_thread));
2310 
2311   // set last Java frame before call
2312   assert(last_java_sp != rbp, "can't use ebp/rbp");
2313 
2314   // Only interpreter should have to set fp
2315   set_last_Java_frame(java_thread, last_java_sp, rbp, NULL);
2316 
2317   // do the call, remove parameters
2318   MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments);
2319 
2320   // restore the thread (cannot use the pushed argument since arguments
2321   // may be overwritten by C code generated by an optimizing compiler);
  // however, we can use the register value directly if it is callee saved.
2323   if (LP64_ONLY(true ||) java_thread == rdi || java_thread == rsi) {
2324     // rdi & rsi (also r15) are callee saved -> nothing to do
2325 #ifdef ASSERT
2326     guarantee(java_thread != rax, "change this code");
2327     push(rax);
2328     { Label L;
2329       get_thread(rax);
2330       cmpptr(java_thread, rax);
2331       jcc(Assembler::equal, L);
2332       STOP("MacroAssembler::call_VM_base: rdi not callee saved?");
2333       bind(L);
2334     }
2335     pop(rax);
2336 #endif
2337   } else {
2338     get_thread(java_thread);
2339   }
2340   // reset last Java frame
2341   // Only interpreter should have to clear fp
2342   reset_last_Java_frame(java_thread, true);
2343 
2344    // C++ interp handles this in the interpreter
2345   check_and_handle_popframe(java_thread);
2346   check_and_handle_earlyret(java_thread);
2347 
2348   if (check_exceptions) {
2349     // check for pending exceptions (java_thread is set upon return)
2350     cmpptr(Address(java_thread, Thread::pending_exception_offset()), (int32_t) NULL_WORD);
2351 #ifndef _LP64
2352     jump_cc(Assembler::notEqual,
2353             RuntimeAddress(StubRoutines::forward_exception_entry()));
2354 #else
    // This used to conditionally jump to forward_exception; however, it is
    // possible that if we relocate, the branch will not reach.  So we must
    // jump around it so that we can always reach the target.
2358 
2359     Label ok;
2360     jcc(Assembler::equal, ok);
2361     jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2362     bind(ok);
2363 #endif // LP64
2364   }
2365 
2366   // get oop result if there is one and reset the value in the thread
2367   if (oop_result->is_valid()) {
2368     get_vm_result(oop_result, java_thread);
2369   }
2370 }
2371 
2372 void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
2373 
  // Calculating the value for last_Java_sp is somewhat subtle.  call_VM does an
  // intermediate call which places a return address on the stack just under the
  // stack pointer as the user finished with it.  This allows us to retrieve
  // last_Java_pc from last_Java_sp[-1].
  // On 32bit we then have to push additional args on the stack to accomplish
  // the actual requested call.  On 64bit call_VM can only use register args
  // so the only extra space is the return address that call_VM created.
2382   // This hopefully explains the calculations here.
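  //
  // Stack picture at this point (return address pushed by the call emitted in call_VM):
  //
  //   64-bit:  [rsp]                        = return address (last_Java_pc)
  //            last_Java_sp                 = rsp + wordSize
  //   32-bit:  [rsp .. rsp+(n-1)*wordSize]  = the n pushed arguments
  //            [rsp + n*wordSize]           = return address (last_Java_pc)
  //            last_Java_sp                 = rsp + (1 + n)*wordSize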
2383 
2384 #ifdef _LP64
2385   // We've pushed one address, correct last_Java_sp
2386   lea(rax, Address(rsp, wordSize));
2387 #else
2388   lea(rax, Address(rsp, (1 + number_of_arguments) * wordSize));
2389 #endif // LP64
2390 
2391   call_VM_base(oop_result, noreg, rax, entry_point, number_of_arguments, check_exceptions);
2392 
2393 }
2394 
// Use this method when the MacroAssembler version of call_VM_leaf_base() should be called from the Interpreter.
2396 void MacroAssembler::call_VM_leaf0(address entry_point) {
2397   MacroAssembler::call_VM_leaf_base(entry_point, 0);
2398 }
2399 
2400 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
2401   call_VM_leaf_base(entry_point, number_of_arguments);
2402 }
2403 
2404 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
2405   pass_arg0(this, arg_0);
2406   call_VM_leaf(entry_point, 1);
2407 }
2408 
2409 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
2410 
2411   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
2412   pass_arg1(this, arg_1);
2413   pass_arg0(this, arg_0);
2414   call_VM_leaf(entry_point, 2);
2415 }
2416 
2417 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
2418   LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
2419   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2420   pass_arg2(this, arg_2);
2421   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
2422   pass_arg1(this, arg_1);
2423   pass_arg0(this, arg_0);
2424   call_VM_leaf(entry_point, 3);
2425 }
2426 
2427 void MacroAssembler::super_call_VM_leaf(address entry_point) {
2428   MacroAssembler::call_VM_leaf_base(entry_point, 1);
2429 }
2430 
2431 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
2432   pass_arg0(this, arg_0);
2433   MacroAssembler::call_VM_leaf_base(entry_point, 1);
2434 }
2435 
2436 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
2437 
2438   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
2439   pass_arg1(this, arg_1);
2440   pass_arg0(this, arg_0);
2441   MacroAssembler::call_VM_leaf_base(entry_point, 2);
2442 }
2443 
2444 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
2445   LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
2446   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2447   pass_arg2(this, arg_2);
2448   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
2449   pass_arg1(this, arg_1);
2450   pass_arg0(this, arg_0);
2451   MacroAssembler::call_VM_leaf_base(entry_point, 3);
2452 }
2453 
2454 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
2455   LP64_ONLY(assert(arg_0 != c_rarg3, "smashed arg"));
2456   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
2457   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
2458   pass_arg3(this, arg_3);
2459   LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
2460   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2461   pass_arg2(this, arg_2);
2462   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
2463   pass_arg1(this, arg_1);
2464   pass_arg0(this, arg_0);
2465   MacroAssembler::call_VM_leaf_base(entry_point, 4);
2466 }
2467 
2468 void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
2469   movptr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
2470   movptr(Address(java_thread, JavaThread::vm_result_offset()), NULL_WORD);
2471   verify_oop(oop_result, "broken oop in call_VM_base");
2472 }
2473 
2474 void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
2475   movptr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
2476   movptr(Address(java_thread, JavaThread::vm_result_2_offset()), NULL_WORD);
2477 }
2478 
2479 void MacroAssembler::check_and_handle_earlyret(Register java_thread) {
2480 }
2481 
2482 void MacroAssembler::check_and_handle_popframe(Register java_thread) {
2483 }
2484 
2485 void MacroAssembler::cmp32(AddressLiteral src1, int32_t imm) {
2486   if (reachable(src1)) {
2487     cmpl(as_Address(src1), imm);
2488   } else {
2489     lea(rscratch1, src1);
2490     cmpl(Address(rscratch1, 0), imm);
2491   }
2492 }
2493 
2494 void MacroAssembler::cmp32(Register src1, AddressLiteral src2) {
2495   assert(!src2.is_lval(), "use cmpptr");
2496   if (reachable(src2)) {
2497     cmpl(src1, as_Address(src2));
2498   } else {
2499     lea(rscratch1, src2);
2500     cmpl(src1, Address(rscratch1, 0));
2501   }
2502 }
2503 
2504 void MacroAssembler::cmp32(Register src1, int32_t imm) {
2505   Assembler::cmpl(src1, imm);
2506 }
2507 
2508 void MacroAssembler::cmp32(Register src1, Address src2) {
2509   Assembler::cmpl(src1, src2);
2510 }
2511 
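// cmpsd2int and cmpss2int materialize the result of a floating-point compare as
// an integer in dst: -1 if opr1 < opr2, 0 if equal, +1 if opr1 > opr2.  An
// unordered result (NaN operand) yields -1 when unordered_is_less is true and
// +1 otherwise, matching Java's fcmpl/fcmpg and dcmpl/dcmpg semantics.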
2512 void MacroAssembler::cmpsd2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
2513   ucomisd(opr1, opr2);
2514 
2515   Label L;
2516   if (unordered_is_less) {
2517     movl(dst, -1);
2518     jcc(Assembler::parity, L);
2519     jcc(Assembler::below , L);
2520     movl(dst, 0);
2521     jcc(Assembler::equal , L);
2522     increment(dst);
2523   } else { // unordered is greater
2524     movl(dst, 1);
2525     jcc(Assembler::parity, L);
2526     jcc(Assembler::above , L);
2527     movl(dst, 0);
2528     jcc(Assembler::equal , L);
2529     decrementl(dst);
2530   }
2531   bind(L);
2532 }
2533 
2534 void MacroAssembler::cmpss2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
2535   ucomiss(opr1, opr2);
2536 
2537   Label L;
2538   if (unordered_is_less) {
2539     movl(dst, -1);
2540     jcc(Assembler::parity, L);
2541     jcc(Assembler::below , L);
2542     movl(dst, 0);
2543     jcc(Assembler::equal , L);
2544     increment(dst);
2545   } else { // unordered is greater
2546     movl(dst, 1);
2547     jcc(Assembler::parity, L);
2548     jcc(Assembler::above , L);
2549     movl(dst, 0);
2550     jcc(Assembler::equal , L);
2551     decrementl(dst);
2552   }
2553   bind(L);
2554 }
2555 
2556 
2557 void MacroAssembler::cmp8(AddressLiteral src1, int imm) {
2558   if (reachable(src1)) {
2559     cmpb(as_Address(src1), imm);
2560   } else {
2561     lea(rscratch1, src1);
2562     cmpb(Address(rscratch1, 0), imm);
2563   }
2564 }
2565 
2566 void MacroAssembler::cmpptr(Register src1, AddressLiteral src2) {
2567 #ifdef _LP64
2568   if (src2.is_lval()) {
2569     movptr(rscratch1, src2);
2570     Assembler::cmpq(src1, rscratch1);
2571   } else if (reachable(src2)) {
2572     cmpq(src1, as_Address(src2));
2573   } else {
2574     lea(rscratch1, src2);
2575     Assembler::cmpq(src1, Address(rscratch1, 0));
2576   }
2577 #else
2578   if (src2.is_lval()) {
2579     cmp_literal32(src1, (int32_t) src2.target(), src2.rspec());
2580   } else {
2581     cmpl(src1, as_Address(src2));
2582   }
2583 #endif // _LP64
2584 }
2585 
2586 void MacroAssembler::cmpptr(Address src1, AddressLiteral src2) {
2587   assert(src2.is_lval(), "not a mem-mem compare");
2588 #ifdef _LP64
2589   // moves src2's literal address
2590   movptr(rscratch1, src2);
2591   Assembler::cmpq(src1, rscratch1);
2592 #else
2593   cmp_literal32(src1, (int32_t) src2.target(), src2.rspec());
2594 #endif // _LP64
2595 }
2596 
2597 void MacroAssembler::cmpoop(Register src1, Register src2) {
2598   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
2599   bs->obj_equals(this, src1, src2);
2600 }
2601 
2602 void MacroAssembler::cmpoop(Register src1, Address src2) {
2603   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
2604   bs->obj_equals(this, src1, src2);
2605 }
2606 
2607 #ifdef _LP64
2608 void MacroAssembler::cmpoop(Register src1, jobject src2) {
2609   movoop(rscratch1, src2);
2610   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
2611   bs->obj_equals(this, src1, rscratch1);
2612 }
2613 #endif
2614 
2615 void MacroAssembler::locked_cmpxchgptr(Register reg, AddressLiteral adr) {
2616   if (reachable(adr)) {
2617     lock();
2618     cmpxchgptr(reg, as_Address(adr));
2619   } else {
2620     lea(rscratch1, adr);
2621     lock();
2622     cmpxchgptr(reg, Address(rscratch1, 0));
2623   }
2624 }
2625 
2626 void MacroAssembler::cmpxchgptr(Register reg, Address adr) {
2627   LP64_ONLY(cmpxchgq(reg, adr)) NOT_LP64(cmpxchgl(reg, adr));
2628 }
2629 
2630 void MacroAssembler::comisd(XMMRegister dst, AddressLiteral src) {
2631   if (reachable(src)) {
2632     Assembler::comisd(dst, as_Address(src));
2633   } else {
2634     lea(rscratch1, src);
2635     Assembler::comisd(dst, Address(rscratch1, 0));
2636   }
2637 }
2638 
2639 void MacroAssembler::comiss(XMMRegister dst, AddressLiteral src) {
2640   if (reachable(src)) {
2641     Assembler::comiss(dst, as_Address(src));
2642   } else {
2643     lea(rscratch1, src);
2644     Assembler::comiss(dst, Address(rscratch1, 0));
2645   }
2646 }
2647 
2648 
2649 void MacroAssembler::cond_inc32(Condition cond, AddressLiteral counter_addr) {
2650   Condition negated_cond = negate_condition(cond);
2651   Label L;
2652   jcc(negated_cond, L);
2653   pushf(); // Preserve flags
2654   atomic_incl(counter_addr);
2655   popf();
2656   bind(L);
2657 }
2658 
2659 int MacroAssembler::corrected_idivl(Register reg) {
2660   // Full implementation of Java idiv and irem; checks for
2661   // special case as described in JVM spec., p.243 & p.271.
2662   // The function returns the (pc) offset of the idivl
2663   // instruction - may be needed for implicit exceptions.
2664   //
2665   //         normal case                           special case
2666   //
  // input : rax: dividend                          min_int
  //         reg: divisor   (may not be rax/rdx)    -1
  //
  // output: rax: quotient  (= rax idiv reg)        min_int
  //         rdx: remainder (= rax irem reg)        0
2672   assert(reg != rax && reg != rdx, "reg cannot be rax, or rdx register");
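  // For example, min_int / -1 would overflow (the true quotient 2^31 is not
  // representable in 32 bits) and idivl would raise #DE, so we instead produce
  // the Java-specified result directly: quotient = min_int, remainder = 0.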
2673   const int min_int = 0x80000000;
2674   Label normal_case, special_case;
2675 
2676   // check for special case
2677   cmpl(rax, min_int);
2678   jcc(Assembler::notEqual, normal_case);
2679   xorl(rdx, rdx); // prepare rdx for possible special case (where remainder = 0)
2680   cmpl(reg, -1);
2681   jcc(Assembler::equal, special_case);
2682 
2683   // handle normal case
2684   bind(normal_case);
2685   cdql();
2686   int idivl_offset = offset();
2687   idivl(reg);
2688 
2689   // normal and special case exit
2690   bind(special_case);
2691 
2692   return idivl_offset;
2693 }
2694 
2695 
2696 
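// Note (assumed rationale): value == min_jint must go straight to subl below;
// negating it to forward to incrementl() would overflow, since -min_jint is not
// representable as an int.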
2697 void MacroAssembler::decrementl(Register reg, int value) {
2698   if (value == min_jint) {subl(reg, value) ; return; }
2699   if (value <  0) { incrementl(reg, -value); return; }
2700   if (value == 0) {                        ; return; }
2701   if (value == 1 && UseIncDec) { decl(reg) ; return; }
2702   /* else */      { subl(reg, value)       ; return; }
2703 }
2704 
2705 void MacroAssembler::decrementl(Address dst, int value) {
2706   if (value == min_jint) {subl(dst, value) ; return; }
2707   if (value <  0) { incrementl(dst, -value); return; }
2708   if (value == 0) {                        ; return; }
2709   if (value == 1 && UseIncDec) { decl(dst) ; return; }
2710   /* else */      { subl(dst, value)       ; return; }
2711 }
2712 
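// Signed division by 2^shift_value with rounding toward zero: an arithmetic
// shift alone rounds toward negative infinity, so negative dividends are first
// biased by (2^shift_value - 1).  E.g. with shift_value == 2:
//   -7 >> 2       == -2   (floor of -7/4)
//   (-7 + 3) >> 2 == -1   (-7/4 rounded toward zero)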
2713 void MacroAssembler::division_with_shift (Register reg, int shift_value) {
2714   assert (shift_value > 0, "illegal shift value");
2715   Label _is_positive;
2716   testl (reg, reg);
2717   jcc (Assembler::positive, _is_positive);
2718   int offset = (1 << shift_value) - 1 ;
2719 
2720   if (offset == 1) {
2721     incrementl(reg);
2722   } else {
2723     addl(reg, offset);
2724   }
2725 
2726   bind (_is_positive);
2727   sarl(reg, shift_value);
2728 }
2729 
2730 void MacroAssembler::divsd(XMMRegister dst, AddressLiteral src) {
2731   if (reachable(src)) {
2732     Assembler::divsd(dst, as_Address(src));
2733   } else {
2734     lea(rscratch1, src);
2735     Assembler::divsd(dst, Address(rscratch1, 0));
2736   }
2737 }
2738 
2739 void MacroAssembler::divss(XMMRegister dst, AddressLiteral src) {
2740   if (reachable(src)) {
2741     Assembler::divss(dst, as_Address(src));
2742   } else {
2743     lea(rscratch1, src);
2744     Assembler::divss(dst, Address(rscratch1, 0));
2745   }
2746 }
2747 
2748 // !defined(COMPILER2) is because of stupid core builds
2749 #if !defined(_LP64) || defined(COMPILER1) || !defined(COMPILER2) || INCLUDE_JVMCI
2750 void MacroAssembler::empty_FPU_stack() {
2751   if (VM_Version::supports_mmx()) {
2752     emms();
2753   } else {
2754     for (int i = 8; i-- > 0; ) ffree(i);
2755   }
2756 }
2757 #endif // !LP64 || C1 || !C2 || INCLUDE_JVMCI
2758 
2759 
2760 void MacroAssembler::enter() {
2761   push(rbp);
2762   mov(rbp, rsp);
2763 }
2764 
2765 // A 5 byte nop that is safe for patching (see patch_verified_entry)
2766 void MacroAssembler::fat_nop() {
2767   if (UseAddressNop) {
2768     addr_nop_5();
2769   } else {
2770     emit_int8(0x26); // es:
2771     emit_int8(0x2e); // cs:
2772     emit_int8(0x64); // fs:
2773     emit_int8(0x65); // gs:
2774     emit_int8((unsigned char)0x90);
2775   }
2776 }
2777 
2778 void MacroAssembler::fcmp(Register tmp) {
2779   fcmp(tmp, 1, true, true);
2780 }
2781 
2782 void MacroAssembler::fcmp(Register tmp, int index, bool pop_left, bool pop_right) {
2783   assert(!pop_right || pop_left, "usage error");
2784   if (VM_Version::supports_cmov()) {
2785     assert(tmp == noreg, "unneeded temp");
2786     if (pop_left) {
2787       fucomip(index);
2788     } else {
2789       fucomi(index);
2790     }
2791     if (pop_right) {
2792       fpop();
2793     }
2794   } else {
2795     assert(tmp != noreg, "need temp");
2796     if (pop_left) {
2797       if (pop_right) {
2798         fcompp();
2799       } else {
2800         fcomp(index);
2801       }
2802     } else {
2803       fcom(index);
2804     }
    // convert FPU condition into eflags condition via rax
2806     save_rax(tmp);
2807     fwait(); fnstsw_ax();
2808     sahf();
2809     restore_rax(tmp);
2810   }
2811   // condition codes set as follows:
2812   //
2813   // CF (corresponds to C0) if x < y
2814   // PF (corresponds to C2) if unordered
2815   // ZF (corresponds to C3) if x = y
2816 }
2817 
2818 void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less) {
2819   fcmp2int(dst, unordered_is_less, 1, true, true);
2820 }
2821 
2822 void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less, int index, bool pop_left, bool pop_right) {
2823   fcmp(VM_Version::supports_cmov() ? noreg : dst, index, pop_left, pop_right);
2824   Label L;
2825   if (unordered_is_less) {
2826     movl(dst, -1);
2827     jcc(Assembler::parity, L);
2828     jcc(Assembler::below , L);
2829     movl(dst, 0);
2830     jcc(Assembler::equal , L);
2831     increment(dst);
2832   } else { // unordered is greater
2833     movl(dst, 1);
2834     jcc(Assembler::parity, L);
2835     jcc(Assembler::above , L);
2836     movl(dst, 0);
2837     jcc(Assembler::equal , L);
2838     decrementl(dst);
2839   }
2840   bind(L);
2841 }
2842 
2843 void MacroAssembler::fld_d(AddressLiteral src) {
2844   fld_d(as_Address(src));
2845 }
2846 
2847 void MacroAssembler::fld_s(AddressLiteral src) {
2848   fld_s(as_Address(src));
2849 }
2850 
2851 void MacroAssembler::fld_x(AddressLiteral src) {
2852   Assembler::fld_x(as_Address(src));
2853 }
2854 
2855 void MacroAssembler::fldcw(AddressLiteral src) {
2856   Assembler::fldcw(as_Address(src));
2857 }
2858 
2859 void MacroAssembler::mulpd(XMMRegister dst, AddressLiteral src) {
2860   if (reachable(src)) {
2861     Assembler::mulpd(dst, as_Address(src));
2862   } else {
2863     lea(rscratch1, src);
2864     Assembler::mulpd(dst, Address(rscratch1, 0));
2865   }
2866 }
2867 
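     // The next two helpers bracket an x87 computation: increase_precision() sets the
     // precision-control field (control-word bits 8..9) to 11b, i.e. 64-bit extended
     // precision, and leaves the original control word on the stack so that
     // restore_precision() can reload it and release the slot.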
2868 void MacroAssembler::increase_precision() {
2869   subptr(rsp, BytesPerWord);
2870   fnstcw(Address(rsp, 0));
2871   movl(rax, Address(rsp, 0));
2872   orl(rax, 0x300);
2873   push(rax);
2874   fldcw(Address(rsp, 0));
2875   pop(rax);
2876 }
2877 
2878 void MacroAssembler::restore_precision() {
2879   fldcw(Address(rsp, 0));
2880   addptr(rsp, BytesPerWord);
2881 }
2882 
2883 void MacroAssembler::fpop() {
2884   ffree();
2885   fincstp();
2886 }
2887 
2888 void MacroAssembler::load_float(Address src) {
2889   if (UseSSE >= 1) {
2890     movflt(xmm0, src);
2891   } else {
2892     LP64_ONLY(ShouldNotReachHere());
2893     NOT_LP64(fld_s(src));
2894   }
2895 }
2896 
2897 void MacroAssembler::store_float(Address dst) {
2898   if (UseSSE >= 1) {
2899     movflt(dst, xmm0);
2900   } else {
2901     LP64_ONLY(ShouldNotReachHere());
2902     NOT_LP64(fstp_s(dst));
2903   }
2904 }
2905 
2906 void MacroAssembler::load_double(Address src) {
2907   if (UseSSE >= 2) {
2908     movdbl(xmm0, src);
2909   } else {
2910     LP64_ONLY(ShouldNotReachHere());
2911     NOT_LP64(fld_d(src));
2912   }
2913 }
2914 
2915 void MacroAssembler::store_double(Address dst) {
2916   if (UseSSE >= 2) {
2917     movdbl(dst, xmm0);
2918   } else {
2919     LP64_ONLY(ShouldNotReachHere());
2920     NOT_LP64(fstp_d(dst));
2921   }
2922 }
2923 
2924 void MacroAssembler::fremr(Register tmp) {
2925   save_rax(tmp);
2926   { Label L;
2927     bind(L);
2928     fprem();
2929     fwait(); fnstsw_ax();
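         // fprem sets C2 (bit 10 of the FPU status word) while the reduction is still
         // partial; keep looping until it clears (on 32-bit, sahf maps C2 to PF).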
2930 #ifdef _LP64
2931     testl(rax, 0x400);
2932     jcc(Assembler::notEqual, L);
2933 #else
2934     sahf();
2935     jcc(Assembler::parity, L);
2936 #endif // _LP64
2937   }
2938   restore_rax(tmp);
2939   // Result is in ST0.
2940   // Note: fxch & fpop to get rid of ST1
2941   // (otherwise FPU stack could overflow eventually)
2942   fxch(1);
2943   fpop();
2944 }
2945 
2946 // dst = c = a * b + c
2947 void MacroAssembler::fmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) {
2948   Assembler::vfmadd231sd(c, a, b);
2949   if (dst != c) {
2950     movdbl(dst, c);
2951   }
2952 }
2953 
2954 // dst = c = a * b + c
2955 void MacroAssembler::fmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) {
2956   Assembler::vfmadd231ss(c, a, b);
2957   if (dst != c) {
2958     movflt(dst, c);
2959   }
2960 }
2961 
2962 // dst = c = a * b + c
2963 void MacroAssembler::vfmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len) {
2964   Assembler::vfmadd231pd(c, a, b, vector_len);
2965   if (dst != c) {
2966     vmovdqu(dst, c);
2967   }
2968 }
2969 
2970 // dst = c = a * b + c
2971 void MacroAssembler::vfmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len) {
2972   Assembler::vfmadd231ps(c, a, b, vector_len);
2973   if (dst != c) {
2974     vmovdqu(dst, c);
2975   }
2976 }
2977 
2978 // dst = c = a * b + c
2979 void MacroAssembler::vfmad(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len) {
2980   Assembler::vfmadd231pd(c, a, b, vector_len);
2981   if (dst != c) {
2982     vmovdqu(dst, c);
2983   }
2984 }
2985 
2986 // dst = c = a * b + c
2987 void MacroAssembler::vfmaf(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len) {
2988   Assembler::vfmadd231ps(c, a, b, vector_len);
2989   if (dst != c) {
2990     vmovdqu(dst, c);
2991   }
2992 }
2993 
2994 void MacroAssembler::incrementl(AddressLiteral dst) {
2995   if (reachable(dst)) {
2996     incrementl(as_Address(dst));
2997   } else {
2998     lea(rscratch1, dst);
2999     incrementl(Address(rscratch1, 0));
3000   }
3001 }
3002 
3003 void MacroAssembler::incrementl(ArrayAddress dst) {
3004   incrementl(as_Address(dst));
3005 }
3006 
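     // In incrementl/decrementl, value == min_jint is handled before the sign test
     // because negating min_jint overflows; that case falls straight through to a
     // plain addl/subl with the immediate.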
3007 void MacroAssembler::incrementl(Register reg, int value) {
3008   if (value == min_jint) {addl(reg, value) ; return; }
3009   if (value <  0) { decrementl(reg, -value); return; }
3010   if (value == 0) {                        ; return; }
3011   if (value == 1 && UseIncDec) { incl(reg) ; return; }
3012   /* else */      { addl(reg, value)       ; return; }
3013 }
3014 
3015 void MacroAssembler::incrementl(Address dst, int value) {
3016   if (value == min_jint) {addl(dst, value) ; return; }
3017   if (value <  0) { decrementl(dst, -value); return; }
3018   if (value == 0) {                        ; return; }
3019   if (value == 1 && UseIncDec) { incl(dst) ; return; }
3020   /* else */      { addl(dst, value)       ; return; }
3021 }
3022 
3023 void MacroAssembler::jump(AddressLiteral dst) {
3024   if (reachable(dst)) {
3025     jmp_literal(dst.target(), dst.rspec());
3026   } else {
3027     lea(rscratch1, dst);
3028     jmp(rscratch1);
3029   }
3030 }
3031 
3032 void MacroAssembler::jump_cc(Condition cc, AddressLiteral dst) {
3033   if (reachable(dst)) {
3034     InstructionMark im(this);
3035     relocate(dst.reloc());
3036     const int short_size = 2;
3037     const int long_size = 6;
3038     int offs = (intptr_t)dst.target() - ((intptr_t)pc());
3039     if (dst.reloc() == relocInfo::none && is8bit(offs - short_size)) {
3040       // 0111 tttn #8-bit disp
3041       emit_int8(0x70 | cc);
3042       emit_int8((offs - short_size) & 0xFF);
3043     } else {
3044       // 0000 1111 1000 tttn #32-bit disp
3045       emit_int8(0x0F);
3046       emit_int8((unsigned char)(0x80 | cc));
3047       emit_int32(offs - long_size);
3048     }
3049   } else {
3050 #ifdef ASSERT
3051     warning("reversing conditional branch");
3052 #endif /* ASSERT */
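         // jcc only takes a rel8/rel32 displacement, so a target that is out of
         // rip-relative range is reached by inverting the condition, jumping over an
         // indirect jmp through rscratch1, and binding the skip label after it.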
3053     Label skip;
3054     jccb(reverse[cc], skip);
3055     lea(rscratch1, dst);
3056     Assembler::jmp(rscratch1);
3057     bind(skip);
3058   }
3059 }
3060 
3061 void MacroAssembler::ldmxcsr(AddressLiteral src) {
3062   if (reachable(src)) {
3063     Assembler::ldmxcsr(as_Address(src));
3064   } else {
3065     lea(rscratch1, src);
3066     Assembler::ldmxcsr(Address(rscratch1, 0));
3067   }
3068 }
3069 
3070 int MacroAssembler::load_signed_byte(Register dst, Address src) {
3071   int off;
3072   if (LP64_ONLY(true ||) VM_Version::is_P6()) {
3073     off = offset();
3074     movsbl(dst, src); // movsxb
3075   } else {
3076     off = load_unsigned_byte(dst, src);
3077     shll(dst, 24);
3078     sarl(dst, 24);
3079   }
3080   return off;
3081 }
3082 
3083 // Note: load_signed_short used to be called load_signed_word.
3084 // Although the 'w' in x86 opcodes refers to the term "word" in the assembler
3085 // manual, which means 16 bits, that usage is found nowhere in HotSpot code.
3086 // The term "word" in HotSpot means a 32- or 64-bit machine word.
3087 int MacroAssembler::load_signed_short(Register dst, Address src) {
3088   int off;
3089   if (LP64_ONLY(true ||) VM_Version::is_P6()) {
3090     // This is dubious to me since it seems safe to do a signed 16 => 64 bit
3091     // version, but this is what 64-bit has always done. This seems to imply
3092     // that callers only rely on the low 32 bits of the result.
3093     off = offset();
3094     movswl(dst, src); // movsxw
3095   } else {
3096     off = load_unsigned_short(dst, src);
3097     shll(dst, 16);
3098     sarl(dst, 16);
3099   }
3100   return off;
3101 }
3102 
3103 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
3104   // According to Intel Doc. AP-526, "Zero-Extension of Short", p. 16,
3105   // and "3.9 Partial Register Penalties", p. 22.
3106   int off;
3107   if (LP64_ONLY(true || ) VM_Version::is_P6() || src.uses(dst)) {
3108     off = offset();
3109     movzbl(dst, src); // movzxb
3110   } else {
3111     xorl(dst, dst);
3112     off = offset();
3113     movb(dst, src);
3114   }
3115   return off;
3116 }
3117 
3118 // Note: load_unsigned_short used to be called load_unsigned_word.
3119 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
3120   // According to Intel Doc. AP-526, "Zero-Extension of Short", p. 16,
3121   // and "3.9 Partial Register Penalties", p. 22.
3122   int off;
3123   if (LP64_ONLY(true ||) VM_Version::is_P6() || src.uses(dst)) {
3124     off = offset();
3125     movzwl(dst, src); // movzxw
3126   } else {
3127     xorl(dst, dst);
3128     off = offset();
3129     movw(dst, src);
3130   }
3131   return off;
3132 }
3133 
3134 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
3135   switch (size_in_bytes) {
3136 #ifndef _LP64
3137   case  8:
3138     assert(dst2 != noreg, "second dest register required");
3139     movl(dst,  src);
3140     movl(dst2, src.plus_disp(BytesPerInt));
3141     break;
3142 #else
3143   case  8:  movq(dst, src); break;
3144 #endif
3145   case  4:  movl(dst, src); break;
3146   case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
3147   case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
3148   default:  ShouldNotReachHere();
3149   }
3150 }
3151 
3152 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
3153   switch (size_in_bytes) {
3154 #ifndef _LP64
3155   case  8:
3156     assert(src2 != noreg, "second source register required");
3157     movl(dst,                        src);
3158     movl(dst.plus_disp(BytesPerInt), src2);
3159     break;
3160 #else
3161   case  8:  movq(dst, src); break;
3162 #endif
3163   case  4:  movl(dst, src); break;
3164   case  2:  movw(dst, src); break;
3165   case  1:  movb(dst, src); break;
3166   default:  ShouldNotReachHere();
3167   }
3168 }
3169 
3170 void MacroAssembler::mov32(AddressLiteral dst, Register src) {
3171   if (reachable(dst)) {
3172     movl(as_Address(dst), src);
3173   } else {
3174     lea(rscratch1, dst);
3175     movl(Address(rscratch1, 0), src);
3176   }
3177 }
3178 
3179 void MacroAssembler::mov32(Register dst, AddressLiteral src) {
3180   if (reachable(src)) {
3181     movl(dst, as_Address(src));
3182   } else {
3183     lea(rscratch1, src);
3184     movl(dst, Address(rscratch1, 0));
3185   }
3186 }
3187 
3188 // C++ bool manipulation
3189 
3190 void MacroAssembler::movbool(Register dst, Address src) {
3191   if (sizeof(bool) == 1)
3192     movb(dst, src);
3193   else if (sizeof(bool) == 2)
3194     movw(dst, src);
3195   else if (sizeof(bool) == 4)
3196     movl(dst, src);
3197   else
3198     // unsupported
3199     ShouldNotReachHere();
3200 }
3201 
3202 void MacroAssembler::movbool(Address dst, bool boolconst) {
3203   if (sizeof(bool) == 1)
3204     movb(dst, (int) boolconst);
3205   else if (sizeof(bool) == 2)
3206     movw(dst, (int) boolconst);
3207   else if (sizeof(bool) == 4)
3208     movl(dst, (int) boolconst);
3209   else
3210     // unsupported
3211     ShouldNotReachHere();
3212 }
3213 
3214 void MacroAssembler::movbool(Address dst, Register src) {
3215   if (sizeof(bool) == 1)
3216     movb(dst, src);
3217   else if (sizeof(bool) == 2)
3218     movw(dst, src);
3219   else if (sizeof(bool) == 4)
3220     movl(dst, src);
3221   else
3222     // unsupported
3223     ShouldNotReachHere();
3224 }
3225 
3226 void MacroAssembler::movbyte(ArrayAddress dst, int src) {
3227   movb(as_Address(dst), src);
3228 }
3229 
3230 void MacroAssembler::movdl(XMMRegister dst, AddressLiteral src) {
3231   if (reachable(src)) {
3232     movdl(dst, as_Address(src));
3233   } else {
3234     lea(rscratch1, src);
3235     movdl(dst, Address(rscratch1, 0));
3236   }
3237 }
3238 
3239 void MacroAssembler::movq(XMMRegister dst, AddressLiteral src) {
3240   if (reachable(src)) {
3241     movq(dst, as_Address(src));
3242   } else {
3243     lea(rscratch1, src);
3244     movq(dst, Address(rscratch1, 0));
3245   }
3246 }
3247 
3248 #ifdef COMPILER2
3249 void MacroAssembler::setvectmask(Register dst, Register src) {
3250   guarantee(PostLoopMultiversioning, "must be");
3251   Assembler::movl(dst, 1);
3252   Assembler::shlxl(dst, dst, src);
3253   Assembler::decl(dst);
3254   Assembler::kmovdl(k1, dst);
3255   Assembler::movl(dst, src);
3256 }
3257 
3258 void MacroAssembler::restorevectmask() {
3259   guarantee(PostLoopMultiversioning, "must be");
3260   Assembler::knotwl(k1, k0);
3261 }
3262 #endif // COMPILER2
3263 
3264 void MacroAssembler::movdbl(XMMRegister dst, AddressLiteral src) {
3265   if (reachable(src)) {
3266     if (UseXmmLoadAndClearUpper) {
3267       movsd (dst, as_Address(src));
3268     } else {
3269       movlpd(dst, as_Address(src));
3270     }
3271   } else {
3272     lea(rscratch1, src);
3273     if (UseXmmLoadAndClearUpper) {
3274       movsd (dst, Address(rscratch1, 0));
3275     } else {
3276       movlpd(dst, Address(rscratch1, 0));
3277     }
3278   }
3279 }
3280 
3281 void MacroAssembler::movflt(XMMRegister dst, AddressLiteral src) {
3282   if (reachable(src)) {
3283     movss(dst, as_Address(src));
3284   } else {
3285     lea(rscratch1, src);
3286     movss(dst, Address(rscratch1, 0));
3287   }
3288 }
3289 
3290 void MacroAssembler::movptr(Register dst, Register src) {
3291   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
3292 }
3293 
3294 void MacroAssembler::movptr(Register dst, Address src) {
3295   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
3296 }
3297 
3298 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
3299 void MacroAssembler::movptr(Register dst, intptr_t src) {
3300   LP64_ONLY(mov64(dst, src)) NOT_LP64(movl(dst, src));
3301 }
3302 
3303 void MacroAssembler::movptr(Address dst, Register src) {
3304   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
3305 }
3306 
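     // The movdqu/vmovdqu wrappers below assert encoding() < 16 unless AVX-512VL is
     // available: xmm16..xmm31 are only reachable through EVEX encodings, and the
     // 128/256-bit forms of these instructions require AVX-512VL for that.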
3307 void MacroAssembler::movdqu(Address dst, XMMRegister src) {
3308     assert(((src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
3309     Assembler::movdqu(dst, src);
3310 }
3311 
3312 void MacroAssembler::movdqu(XMMRegister dst, Address src) {
3313     assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
3314     Assembler::movdqu(dst, src);
3315 }
3316 
3317 void MacroAssembler::movdqu(XMMRegister dst, XMMRegister src) {
3318     assert(((dst->encoding() < 16  && src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
3319     Assembler::movdqu(dst, src);
3320 }
3321 
3322 void MacroAssembler::movdqu(XMMRegister dst, AddressLiteral src, Register scratchReg) {
3323   if (reachable(src)) {
3324     movdqu(dst, as_Address(src));
3325   } else {
3326     lea(scratchReg, src);
3327     movdqu(dst, Address(scratchReg, 0));
3328   }
3329 }
3330 
3331 void MacroAssembler::vmovdqu(Address dst, XMMRegister src) {
3332     assert(((src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
3333     Assembler::vmovdqu(dst, src);
3334 }
3335 
3336 void MacroAssembler::vmovdqu(XMMRegister dst, Address src) {
3337     assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
3338     Assembler::vmovdqu(dst, src);
3339 }
3340 
3341 void MacroAssembler::vmovdqu(XMMRegister dst, XMMRegister src) {
3342     assert(((dst->encoding() < 16  && src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
3343     Assembler::vmovdqu(dst, src);
3344 }
3345 
3346 void MacroAssembler::vmovdqu(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
3347   if (reachable(src)) {
3348     vmovdqu(dst, as_Address(src));
3349   }
3350   else {
3351     lea(scratch_reg, src);
3352     vmovdqu(dst, Address(scratch_reg, 0));
3353   }
3354 }
3355 
3356 void MacroAssembler::evmovdquq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
3357   if (reachable(src)) {
3358     Assembler::evmovdquq(dst, as_Address(src), vector_len);
3359   } else {
3360     lea(rscratch, src);
3361     Assembler::evmovdquq(dst, Address(rscratch, 0), vector_len);
3362   }
3363 }
3364 
3365 void MacroAssembler::movdqa(XMMRegister dst, AddressLiteral src) {
3366   if (reachable(src)) {
3367     Assembler::movdqa(dst, as_Address(src));
3368   } else {
3369     lea(rscratch1, src);
3370     Assembler::movdqa(dst, Address(rscratch1, 0));
3371   }
3372 }
3373 
3374 void MacroAssembler::movsd(XMMRegister dst, AddressLiteral src) {
3375   if (reachable(src)) {
3376     Assembler::movsd(dst, as_Address(src));
3377   } else {
3378     lea(rscratch1, src);
3379     Assembler::movsd(dst, Address(rscratch1, 0));
3380   }
3381 }
3382 
3383 void MacroAssembler::movss(XMMRegister dst, AddressLiteral src) {
3384   if (reachable(src)) {
3385     Assembler::movss(dst, as_Address(src));
3386   } else {
3387     lea(rscratch1, src);
3388     Assembler::movss(dst, Address(rscratch1, 0));
3389   }
3390 }
3391 
3392 void MacroAssembler::mulsd(XMMRegister dst, AddressLiteral src) {
3393   if (reachable(src)) {
3394     Assembler::mulsd(dst, as_Address(src));
3395   } else {
3396     lea(rscratch1, src);
3397     Assembler::mulsd(dst, Address(rscratch1, 0));
3398   }
3399 }
3400 
3401 void MacroAssembler::mulss(XMMRegister dst, AddressLiteral src) {
3402   if (reachable(src)) {
3403     Assembler::mulss(dst, as_Address(src));
3404   } else {
3405     lea(rscratch1, src);
3406     Assembler::mulss(dst, Address(rscratch1, 0));
3407   }
3408 }
3409 
3410 void MacroAssembler::null_check(Register reg, int offset) {
3411   if (needs_explicit_null_check(offset)) {
3412     // provoke OS NULL exception if reg = NULL by
3413     // accessing M[reg] w/o changing any (non-CC) registers
3414     // NOTE: cmpl is plenty here to provoke a segv
3415     cmpptr(rax, Address(reg, 0));
3416     // Note: should probably use testl(rax, Address(reg, 0));
3417     //       may be shorter code (however, this version of
3418     //       testl needs to be implemented first)
3419   } else {
3420     // nothing to do, (later) access of M[reg + offset]
3421     // will provoke OS NULL exception if reg = NULL
3422   }
3423 }
3424 
3425 void MacroAssembler::test_klass_is_value(Register klass, Register temp_reg, Label& is_value) {
3426   movl(temp_reg, Address(klass, Klass::access_flags_offset()));
3427   testl(temp_reg, JVM_ACC_VALUE);
3428   jcc(Assembler::notZero, is_value);
3429 }
3430 
3431 void MacroAssembler::test_field_is_flattenable(Register flags, Register temp_reg, Label& is_flattenable) {
3432   movl(temp_reg, flags);
3433   shrl(temp_reg, ConstantPoolCacheEntry::is_flattenable_field_shift);
3434   andl(temp_reg, 0x1);
3435   testl(temp_reg, temp_reg);
3436   jcc(Assembler::notZero, is_flattenable);
3437 }
3438 
3439 void MacroAssembler::test_field_is_not_flattenable(Register flags, Register temp_reg, Label& notFlattenable) {
3440   movl(temp_reg, flags);
3441   shrl(temp_reg, ConstantPoolCacheEntry::is_flattenable_field_shift);
3442   andl(temp_reg, 0x1);
3443   testl(temp_reg, temp_reg);
3444   jcc(Assembler::zero, notFlattenable);
3445 }
3446 
3447 void MacroAssembler::test_field_is_flattened(Register flags, Register temp_reg, Label& is_flattened) {
3448   movl(temp_reg, flags);
3449   shrl(temp_reg, ConstantPoolCacheEntry::is_flattened_field_shift);
3450   andl(temp_reg, 0x1);
3451   testl(temp_reg, temp_reg);
3452   jcc(Assembler::notZero, is_flattened);
3453 }
3454 
3455 void MacroAssembler::test_flattened_array_oop(Register oop, Register temp_reg,
3456                                               Label& is_flattened_array) {
3457   load_storage_props(temp_reg, oop);
3458   testb(temp_reg, ArrayStorageProperties::flattened_value);
3459   jcc(Assembler::notZero, is_flattened_array);
3460 }
3461 
3462 void MacroAssembler::test_non_flattened_array_oop(Register oop, Register temp_reg,
3463                                                   Label& is_non_flattened_array) {
3464   load_storage_props(temp_reg, oop);
3465   testb(temp_reg, ArrayStorageProperties::flattened_value);
3466   jcc(Assembler::zero, is_non_flattened_array);
3467 }
3468 
3469 void MacroAssembler::test_null_free_array_oop(Register oop, Register temp_reg, Label& is_null_free_array) {
3470   load_storage_props(temp_reg, oop);
3471   testb(temp_reg, ArrayStorageProperties::null_free_value);
3472   jcc(Assembler::notZero, is_null_free_array);
3473 }
3474 
3475 void MacroAssembler::test_non_null_free_array_oop(Register oop, Register temp_reg, Label& is_non_null_free_array) {
3476   load_storage_props(temp_reg, oop);
3477   testb(temp_reg, ArrayStorageProperties::null_free_value);
3478   jcc(Assembler::zero, is_non_null_free_array);
3479 }
3480 
3481 void MacroAssembler::os_breakpoint() {
3482   // instead of directly emitting a breakpoint, call os::breakpoint for better debuggability
3483   // (e.g., MSVC can't call ps() otherwise)
3484   call(RuntimeAddress(CAST_FROM_FN_PTR(address, os::breakpoint)));
3485 }
3486 
3487 void MacroAssembler::unimplemented(const char* what) {
3488   const char* buf = NULL;
3489   {
3490     ResourceMark rm;
3491     stringStream ss;
3492     ss.print("unimplemented: %s", what);
3493     buf = code_string(ss.as_string());
3494   }
3495   stop(buf);
3496 }
3497 
3498 #ifdef _LP64
3499 #define XSTATE_BV 0x200
3500 #endif
3501 
3502 void MacroAssembler::pop_CPU_state() {
3503   pop_FPU_state();
3504   pop_IU_state();
3505 }
3506 
3507 void MacroAssembler::pop_FPU_state() {
3508 #ifndef _LP64
3509   frstor(Address(rsp, 0));
3510 #else
3511   fxrstor(Address(rsp, 0));
3512 #endif
3513   addptr(rsp, FPUStateSizeInWords * wordSize);
3514 }
3515 
3516 void MacroAssembler::pop_IU_state() {
3517   popa();
3518   LP64_ONLY(addq(rsp, 8));
3519   popf();
3520 }
3521 
3522 // Save Integer and Float state
3523 // Warning: Stack must be 16 byte aligned (64bit)
3524 void MacroAssembler::push_CPU_state() {
3525   push_IU_state();
3526   push_FPU_state();
3527 }
3528 
3529 void MacroAssembler::push_FPU_state() {
3530   subptr(rsp, FPUStateSizeInWords * wordSize);
3531 #ifndef _LP64
3532   fnsave(Address(rsp, 0));
3533   fwait();
3534 #else
3535   fxsave(Address(rsp, 0));
3536 #endif // LP64
3537 }
3538 
3539 void MacroAssembler::push_IU_state() {
3540   // Push flags first because pusha kills them
3541   pushf();
3542   // Make sure rsp stays 16-byte aligned
3543   LP64_ONLY(subq(rsp, 8));
3544   pusha();
3545 }
3546 
3547 void MacroAssembler::reset_last_Java_frame(Register java_thread, bool clear_fp) { // determine java_thread register
3548   if (!java_thread->is_valid()) {
3549     java_thread = rdi;
3550     get_thread(java_thread);
3551   }
3552   // we must set sp to zero to clear frame
3553   movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), NULL_WORD);
3554   if (clear_fp) {
3555     movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
3556   }
3557 
3558   // Always clear the pc because it could have been set by make_walkable()
3559   movptr(Address(java_thread, JavaThread::last_Java_pc_offset()), NULL_WORD);
3560 
3561   vzeroupper();
3562 }
3563 
3564 void MacroAssembler::restore_rax(Register tmp) {
3565   if (tmp == noreg) pop(rax);
3566   else if (tmp != rax) mov(rax, tmp);
3567 }
3568 
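     // round_to() rounds reg up to the next multiple of modulus; with the add/and pair
     // below this is only correct when modulus is a power of two, e.g. round_to(r, 16)
     // computes r = (r + 15) & -16.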
3569 void MacroAssembler::round_to(Register reg, int modulus) {
3570   addptr(reg, modulus - 1);
3571   andptr(reg, -modulus);
3572 }
3573 
3574 void MacroAssembler::save_rax(Register tmp) {
3575   if (tmp == noreg) push(rax);
3576   else if (tmp != rax) mov(tmp, rax);
3577 }
3578 
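     // safepoint_poll() supports both polling schemes: with thread-local polling it tests
     // the poll bit in the per-thread polling page value (armed by safepoints/handshakes);
     // otherwise it compares the global safepoint state against _not_synchronized.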
3579 void MacroAssembler::safepoint_poll(Label& slow_path, Register thread_reg, Register temp_reg) {
3580   if (SafepointMechanism::uses_thread_local_poll()) {
3581 #ifdef _LP64
3582     assert(thread_reg == r15_thread, "should be");
3583 #else
3584     if (thread_reg == noreg) {
3585       thread_reg = temp_reg;
3586       get_thread(thread_reg);
3587     }
3588 #endif
3589     testb(Address(thread_reg, Thread::polling_page_offset()), SafepointMechanism::poll_bit());
3590     jcc(Assembler::notZero, slow_path); // handshake bit set implies poll
3591   } else {
3592     cmp32(ExternalAddress(SafepointSynchronize::address_of_state()),
3593         SafepointSynchronize::_not_synchronized);
3594     jcc(Assembler::notEqual, slow_path);
3595   }
3596 }
3597 
3598 // Calls to C land
3599 //
3600 // When entering C land, the rbp and rsp of the last Java frame have to be recorded
3601 // in the (thread-local) JavaThread object. When leaving C land, the last Java fp
3602 // has to be reset to 0. This is required to allow proper stack traversal.
3603 void MacroAssembler::set_last_Java_frame(Register java_thread,
3604                                          Register last_java_sp,
3605                                          Register last_java_fp,
3606                                          address  last_java_pc) {
3607   vzeroupper();
3608   // determine java_thread register
3609   if (!java_thread->is_valid()) {
3610     java_thread = rdi;
3611     get_thread(java_thread);
3612   }
3613   // determine last_java_sp register
3614   if (!last_java_sp->is_valid()) {
3615     last_java_sp = rsp;
3616   }
3617 
3618   // last_java_fp is optional
3619 
3620   if (last_java_fp->is_valid()) {
3621     movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), last_java_fp);
3622   }
3623 
3624   // last_java_pc is optional
3625 
3626   if (last_java_pc != NULL) {
3627     lea(Address(java_thread,
3628                  JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()),
3629         InternalAddress(last_java_pc));
3630 
3631   }
3632   movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
3633 }
3634 
3635 void MacroAssembler::shlptr(Register dst, int imm8) {
3636   LP64_ONLY(shlq(dst, imm8)) NOT_LP64(shll(dst, imm8));
3637 }
3638 
3639 void MacroAssembler::shrptr(Register dst, int imm8) {
3640   LP64_ONLY(shrq(dst, imm8)) NOT_LP64(shrl(dst, imm8));
3641 }
3642 
3643 void MacroAssembler::sign_extend_byte(Register reg) {
3644   if (LP64_ONLY(true ||) (VM_Version::is_P6() && reg->has_byte_register())) {
3645     movsbl(reg, reg); // movsxb
3646   } else {
3647     shll(reg, 24);
3648     sarl(reg, 24);
3649   }
3650 }
3651 
3652 void MacroAssembler::sign_extend_short(Register reg) {
3653   if (LP64_ONLY(true ||) VM_Version::is_P6()) {
3654     movswl(reg, reg); // movsxw
3655   } else {
3656     shll(reg, 16);
3657     sarl(reg, 16);
3658   }
3659 }
3660 
3661 void MacroAssembler::testl(Register dst, AddressLiteral src) {
3662   assert(reachable(src), "Address should be reachable");
3663   testl(dst, as_Address(src));
3664 }
3665 
3666 void MacroAssembler::pcmpeqb(XMMRegister dst, XMMRegister src) {
3667   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3668   Assembler::pcmpeqb(dst, src);
3669 }
3670 
3671 void MacroAssembler::pcmpeqw(XMMRegister dst, XMMRegister src) {
3672   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3673   Assembler::pcmpeqw(dst, src);
3674 }
3675 
3676 void MacroAssembler::pcmpestri(XMMRegister dst, Address src, int imm8) {
3677   assert((dst->encoding() < 16),"XMM register should be 0-15");
3678   Assembler::pcmpestri(dst, src, imm8);
3679 }
3680 
3681 void MacroAssembler::pcmpestri(XMMRegister dst, XMMRegister src, int imm8) {
3682   assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
3683   Assembler::pcmpestri(dst, src, imm8);
3684 }
3685 
3686 void MacroAssembler::pmovzxbw(XMMRegister dst, XMMRegister src) {
3687   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3688   Assembler::pmovzxbw(dst, src);
3689 }
3690 
3691 void MacroAssembler::pmovzxbw(XMMRegister dst, Address src) {
3692   assert(((dst->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3693   Assembler::pmovzxbw(dst, src);
3694 }
3695 
3696 void MacroAssembler::pmovmskb(Register dst, XMMRegister src) {
3697   assert((src->encoding() < 16),"XMM register should be 0-15");
3698   Assembler::pmovmskb(dst, src);
3699 }
3700 
3701 void MacroAssembler::ptest(XMMRegister dst, XMMRegister src) {
3702   assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
3703   Assembler::ptest(dst, src);
3704 }
3705 
3706 void MacroAssembler::sqrtsd(XMMRegister dst, AddressLiteral src) {
3707   if (reachable(src)) {
3708     Assembler::sqrtsd(dst, as_Address(src));
3709   } else {
3710     lea(rscratch1, src);
3711     Assembler::sqrtsd(dst, Address(rscratch1, 0));
3712   }
3713 }
3714 
3715 void MacroAssembler::sqrtss(XMMRegister dst, AddressLiteral src) {
3716   if (reachable(src)) {
3717     Assembler::sqrtss(dst, as_Address(src));
3718   } else {
3719     lea(rscratch1, src);
3720     Assembler::sqrtss(dst, Address(rscratch1, 0));
3721   }
3722 }
3723 
3724 void MacroAssembler::subsd(XMMRegister dst, AddressLiteral src) {
3725   if (reachable(src)) {
3726     Assembler::subsd(dst, as_Address(src));
3727   } else {
3728     lea(rscratch1, src);
3729     Assembler::subsd(dst, Address(rscratch1, 0));
3730   }
3731 }
3732 
3733 void MacroAssembler::roundsd(XMMRegister dst, AddressLiteral src, int32_t rmode, Register scratch_reg) {
3734   if (reachable(src)) {
3735     Assembler::roundsd(dst, as_Address(src), rmode);
3736   } else {
3737     lea(scratch_reg, src);
3738     Assembler::roundsd(dst, Address(scratch_reg, 0), rmode);
3739   }
3740 }
3741 
3742 void MacroAssembler::subss(XMMRegister dst, AddressLiteral src) {
3743   if (reachable(src)) {
3744     Assembler::subss(dst, as_Address(src));
3745   } else {
3746     lea(rscratch1, src);
3747     Assembler::subss(dst, Address(rscratch1, 0));
3748   }
3749 }
3750 
3751 void MacroAssembler::ucomisd(XMMRegister dst, AddressLiteral src) {
3752   if (reachable(src)) {
3753     Assembler::ucomisd(dst, as_Address(src));
3754   } else {
3755     lea(rscratch1, src);
3756     Assembler::ucomisd(dst, Address(rscratch1, 0));
3757   }
3758 }
3759 
3760 void MacroAssembler::ucomiss(XMMRegister dst, AddressLiteral src) {
3761   if (reachable(src)) {
3762     Assembler::ucomiss(dst, as_Address(src));
3763   } else {
3764     lea(rscratch1, src);
3765     Assembler::ucomiss(dst, Address(rscratch1, 0));
3766   }
3767 }
3768 
3769 void MacroAssembler::xorpd(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
3770   // Used in sign-bit flipping with aligned address.
3771   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
3772   if (reachable(src)) {
3773     Assembler::xorpd(dst, as_Address(src));
3774   } else {
3775     lea(scratch_reg, src);
3776     Assembler::xorpd(dst, Address(scratch_reg, 0));
3777   }
3778 }
3779 
3780 void MacroAssembler::xorpd(XMMRegister dst, XMMRegister src) {
3781   if (UseAVX > 2 && !VM_Version::supports_avx512dq() && (dst->encoding() == src->encoding())) {
3782     Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit);
3783   }
3784   else {
3785     Assembler::xorpd(dst, src);
3786   }
3787 }
3788 
3789 void MacroAssembler::xorps(XMMRegister dst, XMMRegister src) {
3790   if (UseAVX > 2 && !VM_Version::supports_avx512dq() && (dst->encoding() == src->encoding())) {
3791     Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit);
3792   } else {
3793     Assembler::xorps(dst, src);
3794   }
3795 }
3796 
3797 void MacroAssembler::xorps(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
3798   // Used in sign-bit flipping with aligned address.
3799   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
3800   if (reachable(src)) {
3801     Assembler::xorps(dst, as_Address(src));
3802   } else {
3803     lea(scratch_reg, src);
3804     Assembler::xorps(dst, Address(scratch_reg, 0));
3805   }
3806 }
3807 
3808 void MacroAssembler::pshufb(XMMRegister dst, AddressLiteral src) {
3809   // Used in sign-bit flipping with aligned address.
3810   bool aligned_adr = (((intptr_t)src.target() & 15) == 0);
3811   assert((UseAVX > 0) || aligned_adr, "SSE mode requires address alignment 16 bytes");
3812   if (reachable(src)) {
3813     Assembler::pshufb(dst, as_Address(src));
3814   } else {
3815     lea(rscratch1, src);
3816     Assembler::pshufb(dst, Address(rscratch1, 0));
3817   }
3818 }
3819 
3820 // AVX 3-operands instructions
3821 
3822 void MacroAssembler::vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3823   if (reachable(src)) {
3824     vaddsd(dst, nds, as_Address(src));
3825   } else {
3826     lea(rscratch1, src);
3827     vaddsd(dst, nds, Address(rscratch1, 0));
3828   }
3829 }
3830 
3831 void MacroAssembler::vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3832   if (reachable(src)) {
3833     vaddss(dst, nds, as_Address(src));
3834   } else {
3835     lea(rscratch1, src);
3836     vaddss(dst, nds, Address(rscratch1, 0));
3837   }
3838 }
3839 
3840 void MacroAssembler::vpaddd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3841   assert(UseAVX > 0, "requires some form of AVX");
3842   if (reachable(src)) {
3843     Assembler::vpaddd(dst, nds, as_Address(src), vector_len);
3844   } else {
3845     lea(rscratch, src);
3846     Assembler::vpaddd(dst, nds, Address(rscratch, 0), vector_len);
3847   }
3848 }
3849 
3850 void MacroAssembler::vabsss(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len) {
3851   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3852   vandps(dst, nds, negate_field, vector_len);
3853 }
3854 
3855 void MacroAssembler::vabssd(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len) {
3856   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3857   vandpd(dst, nds, negate_field, vector_len);
3858 }
3859 
3860 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3861   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3862   Assembler::vpaddb(dst, nds, src, vector_len);
3863 }
3864 
3865 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3866   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3867   Assembler::vpaddb(dst, nds, src, vector_len);
3868 }
3869 
3870 void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3871   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3872   Assembler::vpaddw(dst, nds, src, vector_len);
3873 }
3874 
3875 void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3876   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3877   Assembler::vpaddw(dst, nds, src, vector_len);
3878 }
3879 
3880 void MacroAssembler::vpand(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3881   if (reachable(src)) {
3882     Assembler::vpand(dst, nds, as_Address(src), vector_len);
3883   } else {
3884     lea(scratch_reg, src);
3885     Assembler::vpand(dst, nds, Address(scratch_reg, 0), vector_len);
3886   }
3887 }
3888 
3889 void MacroAssembler::vpbroadcastw(XMMRegister dst, XMMRegister src, int vector_len) {
3890   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3891   Assembler::vpbroadcastw(dst, src, vector_len);
3892 }
3893 
3894 void MacroAssembler::vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3895   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3896   Assembler::vpcmpeqb(dst, nds, src, vector_len);
3897 }
3898 
3899 void MacroAssembler::vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3900   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3901   Assembler::vpcmpeqw(dst, nds, src, vector_len);
3902 }
3903 
3904 void MacroAssembler::vpmovzxbw(XMMRegister dst, Address src, int vector_len) {
3905   assert(((dst->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3906   Assembler::vpmovzxbw(dst, src, vector_len);
3907 }
3908 
3909 void MacroAssembler::vpmovmskb(Register dst, XMMRegister src) {
3910   assert((src->encoding() < 16),"XMM register should be 0-15");
3911   Assembler::vpmovmskb(dst, src);
3912 }
3913 
3914 void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3915   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3916   Assembler::vpmullw(dst, nds, src, vector_len);
3917 }
3918 
3919 void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3920   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3921   Assembler::vpmullw(dst, nds, src, vector_len);
3922 }
3923 
3924 void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3925   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3926   Assembler::vpsubb(dst, nds, src, vector_len);
3927 }
3928 
3929 void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3930   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3931   Assembler::vpsubb(dst, nds, src, vector_len);
3932 }
3933 
3934 void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3935   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3936   Assembler::vpsubw(dst, nds, src, vector_len);
3937 }
3938 
3939 void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3940   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3941   Assembler::vpsubw(dst, nds, src, vector_len);
3942 }
3943 
3944 void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3945   assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3946   Assembler::vpsraw(dst, nds, shift, vector_len);
3947 }
3948 
3949 void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3950   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3951   Assembler::vpsraw(dst, nds, shift, vector_len);
3952 }
3953 
3954 void MacroAssembler::evpsraq(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3955   assert(UseAVX > 2,"");
3956   if (!VM_Version::supports_avx512vl() && vector_len < 2) {
3957      vector_len = 2;
3958   }
3959   Assembler::evpsraq(dst, nds, shift, vector_len);
3960 }
3961 
3962 void MacroAssembler::evpsraq(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3963   assert(UseAVX > 2,"");
3964   if (!VM_Version::supports_avx512vl() && vector_len < 2) {
3965      vector_len = 2;
3966   }
3967   Assembler::evpsraq(dst, nds, shift, vector_len);
3968 }
3969 
3970 void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3971   assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3972   Assembler::vpsrlw(dst, nds, shift, vector_len);
3973 }
3974 
3975 void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3976   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3977   Assembler::vpsrlw(dst, nds, shift, vector_len);
3978 }
3979 
3980 void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3981   assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3982   Assembler::vpsllw(dst, nds, shift, vector_len);
3983 }
3984 
3985 void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3986   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3987   Assembler::vpsllw(dst, nds, shift, vector_len);
3988 }
3989 
3990 void MacroAssembler::vptest(XMMRegister dst, XMMRegister src) {
3991   assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
3992   Assembler::vptest(dst, src);
3993 }
3994 
3995 void MacroAssembler::punpcklbw(XMMRegister dst, XMMRegister src) {
3996   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3997   Assembler::punpcklbw(dst, src);
3998 }
3999 
4000 void MacroAssembler::pshufd(XMMRegister dst, Address src, int mode) {
4001   assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
4002   Assembler::pshufd(dst, src, mode);
4003 }
4004 
4005 void MacroAssembler::pshuflw(XMMRegister dst, XMMRegister src, int mode) {
4006   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
4007   Assembler::pshuflw(dst, src, mode);
4008 }
4009 
4010 void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
4011   if (reachable(src)) {
4012     vandpd(dst, nds, as_Address(src), vector_len);
4013   } else {
4014     lea(scratch_reg, src);
4015     vandpd(dst, nds, Address(scratch_reg, 0), vector_len);
4016   }
4017 }
4018 
4019 void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
4020   if (reachable(src)) {
4021     vandps(dst, nds, as_Address(src), vector_len);
4022   } else {
4023     lea(scratch_reg, src);
4024     vandps(dst, nds, Address(scratch_reg, 0), vector_len);
4025   }
4026 }
4027 
4028 void MacroAssembler::vdivsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4029   if (reachable(src)) {
4030     vdivsd(dst, nds, as_Address(src));
4031   } else {
4032     lea(rscratch1, src);
4033     vdivsd(dst, nds, Address(rscratch1, 0));
4034   }
4035 }
4036 
4037 void MacroAssembler::vdivss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4038   if (reachable(src)) {
4039     vdivss(dst, nds, as_Address(src));
4040   } else {
4041     lea(rscratch1, src);
4042     vdivss(dst, nds, Address(rscratch1, 0));
4043   }
4044 }
4045 
4046 void MacroAssembler::vmulsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4047   if (reachable(src)) {
4048     vmulsd(dst, nds, as_Address(src));
4049   } else {
4050     lea(rscratch1, src);
4051     vmulsd(dst, nds, Address(rscratch1, 0));
4052   }
4053 }
4054 
4055 void MacroAssembler::vmulss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4056   if (reachable(src)) {
4057     vmulss(dst, nds, as_Address(src));
4058   } else {
4059     lea(rscratch1, src);
4060     vmulss(dst, nds, Address(rscratch1, 0));
4061   }
4062 }
4063 
4064 void MacroAssembler::vsubsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4065   if (reachable(src)) {
4066     vsubsd(dst, nds, as_Address(src));
4067   } else {
4068     lea(rscratch1, src);
4069     vsubsd(dst, nds, Address(rscratch1, 0));
4070   }
4071 }
4072 
4073 void MacroAssembler::vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4074   if (reachable(src)) {
4075     vsubss(dst, nds, as_Address(src));
4076   } else {
4077     lea(rscratch1, src);
4078     vsubss(dst, nds, Address(rscratch1, 0));
4079   }
4080 }
4081 
4082 void MacroAssembler::vnegatess(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4083   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
4084   vxorps(dst, nds, src, Assembler::AVX_128bit);
4085 }
4086 
4087 void MacroAssembler::vnegatesd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4088   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
4089   vxorpd(dst, nds, src, Assembler::AVX_128bit);
4090 }
4091 
4092 void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
4093   if (reachable(src)) {
4094     vxorpd(dst, nds, as_Address(src), vector_len);
4095   } else {
4096     lea(scratch_reg, src);
4097     vxorpd(dst, nds, Address(scratch_reg, 0), vector_len);
4098   }
4099 }
4100 
4101 void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
4102   if (reachable(src)) {
4103     vxorps(dst, nds, as_Address(src), vector_len);
4104   } else {
4105     lea(scratch_reg, src);
4106     vxorps(dst, nds, Address(scratch_reg, 0), vector_len);
4107   }
4108 }
4109 
4110 void MacroAssembler::vpxor(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
4111   if (UseAVX > 1 || (vector_len < 1)) {
4112     if (reachable(src)) {
4113       Assembler::vpxor(dst, nds, as_Address(src), vector_len);
4114     } else {
4115       lea(scratch_reg, src);
4116       Assembler::vpxor(dst, nds, Address(scratch_reg, 0), vector_len);
4117     }
4118   }
4119   else {
4120     MacroAssembler::vxorpd(dst, nds, src, vector_len, scratch_reg);
4121   }
4122 }
4123 
4124 //-------------------------------------------------------------------------------------------
4125 #ifdef COMPILER2
4126 // Generic instructions support for use in .ad files C2 code generation
4127 
4128 void MacroAssembler::vabsnegd(int opcode, XMMRegister dst, Register scr) {
4129   if (opcode == Op_AbsVD) {
4130     andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), scr);
4131   } else {
4132     assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
4133     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), scr);
4134   }
4135 }
4136 
4137 void MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
4138   if (opcode == Op_AbsVD) {
4139     vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, scr);
4140   } else {
4141     assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
4142     vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, scr);
4143   }
4144 }
4145 
4146 void MacroAssembler::vabsnegf(int opcode, XMMRegister dst, Register scr) {
4147   if (opcode == Op_AbsVF) {
4148     andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), scr);
4149   } else {
4150     assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
4151     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), scr);
4152   }
4153 }
4154 
4155 void MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
4156   if (opcode == Op_AbsVF) {
4157     vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, scr);
4158   } else {
4159     assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
4160     vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, scr);
4161   }
4162 }
4163 
4164 void MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
4165   if (sign) {
4166     pmovsxbw(dst, src);
4167   } else {
4168     pmovzxbw(dst, src);
4169   }
4170 }
4171 
4172 void MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
4173   if (sign) {
4174     vpmovsxbw(dst, src, vector_len);
4175   } else {
4176     vpmovzxbw(dst, src, vector_len);
4177   }
4178 }
4179 
4180 void MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src) {
4181   if (opcode == Op_RShiftVI) {
4182     psrad(dst, src);
4183   } else if (opcode == Op_LShiftVI) {
4184     pslld(dst, src);
4185   } else {
4186     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
4187     psrld(dst, src);
4188   }
4189 }
4190 
4191 void MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4192   if (opcode == Op_RShiftVI) {
4193     vpsrad(dst, nds, src, vector_len);
4194   } else if (opcode == Op_LShiftVI) {
4195     vpslld(dst, nds, src, vector_len);
4196   } else {
4197     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
4198     vpsrld(dst, nds, src, vector_len);
4199   }
4200 }
4201 
4202 void MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src) {
4203   if ((opcode == Op_RShiftVS) || (opcode == Op_RShiftVB)) {
4204     psraw(dst, src);
4205   } else if ((opcode == Op_LShiftVS) || (opcode == Op_LShiftVB)) {
4206     psllw(dst, src);
4207   } else {
4208     assert(((opcode == Op_URShiftVS) || (opcode == Op_URShiftVB)),"opcode should be one of Op_URShiftVS or Op_URShiftVB");
4209     psrlw(dst, src);
4210   }
4211 }
4212 
4213 void MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4214   if ((opcode == Op_RShiftVS) || (opcode == Op_RShiftVB)) {
4215     vpsraw(dst, nds, src, vector_len);
4216   } else if ((opcode == Op_LShiftVS) || (opcode == Op_LShiftVB)) {
4217     vpsllw(dst, nds, src, vector_len);
4218   } else {
4219     assert(((opcode == Op_URShiftVS) || (opcode == Op_URShiftVB)),"opcode should be one of Op_URShiftVS or Op_URShiftVB");
4220     vpsrlw(dst, nds, src, vector_len);
4221   }
4222 }
4223 
4224 void MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src) {
4225   if (opcode == Op_RShiftVL) {
4226     psrlq(dst, src);  // using srl to implement sra on pre-avx512 systems
4227   } else if (opcode == Op_LShiftVL) {
4228     psllq(dst, src);
4229   } else {
4230     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
4231     psrlq(dst, src);
4232   }
4233 }
4234 
4235 void MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4236   if (opcode == Op_RShiftVL) {
4237     evpsraq(dst, nds, src, vector_len);
4238   } else if (opcode == Op_LShiftVL) {
4239     vpsllq(dst, nds, src, vector_len);
4240   } else {
4241     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
4242     vpsrlq(dst, nds, src, vector_len);
4243   }
4244 }
4245 #endif
4246 //-------------------------------------------------------------------------------------------
4247 
4248 void MacroAssembler::clear_jweak_tag(Register possibly_jweak) {
4249   const int32_t inverted_jweak_mask = ~static_cast<int32_t>(JNIHandles::weak_tag_mask);
4250   STATIC_ASSERT(inverted_jweak_mask == -2); // otherwise check this code
4251   // The inverted mask is sign-extended
4252   andptr(possibly_jweak, inverted_jweak_mask);
4253 }
4254 
4255 void MacroAssembler::resolve_jobject(Register value,
4256                                      Register thread,
4257                                      Register tmp) {
4258   assert_different_registers(value, thread, tmp);
4259   Label done, not_weak;
4260   testptr(value, value);
4261   jcc(Assembler::zero, done);                // Use NULL as-is.
4262   testptr(value, JNIHandles::weak_tag_mask); // Test for jweak tag.
4263   jcc(Assembler::zero, not_weak);
4264   // Resolve jweak.
4265   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
4266                  value, Address(value, -JNIHandles::weak_tag_value), tmp, thread);
4267   verify_oop(value);
4268   jmp(done);
4269   bind(not_weak);
4270   // Resolve (untagged) jobject.
4271   access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, 0), tmp, thread);
4272   verify_oop(value);
4273   bind(done);
4274 }
4275 
4276 void MacroAssembler::subptr(Register dst, int32_t imm32) {
4277   LP64_ONLY(subq(dst, imm32)) NOT_LP64(subl(dst, imm32));
4278 }
4279 
4280 // Force generation of a 4 byte immediate value even if it fits into 8bit
4281 void MacroAssembler::subptr_imm32(Register dst, int32_t imm32) {
4282   LP64_ONLY(subq_imm32(dst, imm32)) NOT_LP64(subl_imm32(dst, imm32));
4283 }
4284 
4285 void MacroAssembler::subptr(Register dst, Register src) {
4286   LP64_ONLY(subq(dst, src)) NOT_LP64(subl(dst, src));
4287 }
4288 
4289 // C++ bool manipulation
4290 void MacroAssembler::testbool(Register dst) {
4291   if (sizeof(bool) == 1)
4292     testb(dst, 0xff);
4293   else if (sizeof(bool) == 2) {
4294     // testw implementation needed for two byte bools
4295     ShouldNotReachHere();
4296   } else if (sizeof(bool) == 4)
4297     testl(dst, dst);
4298   else
4299     // unsupported
4300     ShouldNotReachHere();
4301 }
4302 
4303 void MacroAssembler::testptr(Register dst, Register src) {
4304   LP64_ONLY(testq(dst, src)) NOT_LP64(testl(dst, src));
4305 }
4306 
4307 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
4308 void MacroAssembler::tlab_allocate(Register thread, Register obj,
4309                                    Register var_size_in_bytes,
4310                                    int con_size_in_bytes,
4311                                    Register t1,
4312                                    Register t2,
4313                                    Label& slow_case) {
4314   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
4315   bs->tlab_allocate(this, thread, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case);
4316 }
4317 
4318 // Defines obj, preserves var_size_in_bytes
4319 void MacroAssembler::eden_allocate(Register thread, Register obj,
4320                                    Register var_size_in_bytes,
4321                                    int con_size_in_bytes,
4322                                    Register t1,
4323                                    Label& slow_case) {
4324   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
4325   bs->eden_allocate(this, thread, obj, var_size_in_bytes, con_size_in_bytes, t1, slow_case);
4326 }
4327 
4328 // Preserves the contents of address, destroys the contents of length_in_bytes and temp.
4329 void MacroAssembler::zero_memory(Register address, Register length_in_bytes, int offset_in_bytes, Register temp) {
4330   assert(address != length_in_bytes && address != temp && temp != length_in_bytes, "registers must be different");
4331   assert((offset_in_bytes & (BytesPerWord - 1)) == 0, "offset must be a multiple of BytesPerWord");
4332   Label done;
4333 
4334   testptr(length_in_bytes, length_in_bytes);
4335   jcc(Assembler::zero, done);
4336 
4337   // initialize topmost word, divide index by 2, check if odd and test if zero
4338   // note: for the remaining code to work, index must be a multiple of BytesPerWord
4339 #ifdef ASSERT
4340   {
4341     Label L;
4342     testptr(length_in_bytes, BytesPerWord - 1);
4343     jcc(Assembler::zero, L);
4344     stop("length must be a multiple of BytesPerWord");
4345     bind(L);
4346   }
4347 #endif
4348   Register index = length_in_bytes;
4349   xorptr(temp, temp);    // use _zero reg to clear memory (shorter code)
4350   if (UseIncDec) {
4351     shrptr(index, 3);  // divide by 8/16 and set carry flag if bit 2 was set
4352   } else {
4353     shrptr(index, 2);  // use 2 instructions to avoid partial flag stall
4354     shrptr(index, 1);
4355   }
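       // index now counts 8-byte strides: one word on 64-bit, a pair of words on
       // 32-bit.  On 32-bit the carry flag from the shift records a leftover odd
       // word, which is handled just below.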
4356 #ifndef _LP64
4357   // index might not have been a multiple of 8 (i.e., bit 2 was set)
4358   {
4359     Label even;
4360     // note: if index was a multiple of 8, then it cannot
4361     //       be 0 now; otherwise it would have been 0 before
4362     //       => if it is even, we don't need to check for 0 again
4363     jcc(Assembler::carryClear, even);
4364     // clear topmost word (no jump would be needed if conditional assignment worked here)
4365     movptr(Address(address, index, Address::times_8, offset_in_bytes - 0*BytesPerWord), temp);
4366     // index could be 0 now, must check again
4367     jcc(Assembler::zero, done);
4368     bind(even);
4369   }
4370 #endif // !_LP64
4371   // initialize remaining object fields: index is a multiple of 2 now
4372   {
4373     Label loop;
4374     bind(loop);
4375     movptr(Address(address, index, Address::times_8, offset_in_bytes - 1*BytesPerWord), temp);
4376     NOT_LP64(movptr(Address(address, index, Address::times_8, offset_in_bytes - 2*BytesPerWord), temp);)
4377     decrement(index);
4378     jcc(Assembler::notZero, loop);
4379   }
4380 
4381   bind(done);
4382 }
4383 
4384 // Look up the method for a megamorphic invokeinterface call.
4385 // The target method is determined by <intf_klass, itable_index>.
4386 // The receiver klass is in recv_klass.
4387 // On success, the result will be in method_result, and execution falls through.
4388 // On failure, execution transfers to the given label.
4389 void MacroAssembler::lookup_interface_method(Register recv_klass,
4390                                              Register intf_klass,
4391                                              RegisterOrConstant itable_index,
4392                                              Register method_result,
4393                                              Register scan_temp,
4394                                              Label& L_no_such_interface,
4395                                              bool return_method) {
4396   assert_different_registers(recv_klass, intf_klass, scan_temp);
4397   assert_different_registers(method_result, intf_klass, scan_temp);
4398   assert(recv_klass != method_result || !return_method,
4399          "recv_klass can be destroyed when method isn't needed");
4400 
4401   assert(itable_index.is_constant() || itable_index.as_register() == method_result,
4402          "caller must use same register for non-constant itable index as for method");
4403 
4404   // Compute start of first itableOffsetEntry (which is at the end of the vtable)
4405   int vtable_base = in_bytes(Klass::vtable_start_offset());
4406   int itentry_off = itableMethodEntry::method_offset_in_bytes();
4407   int scan_step   = itableOffsetEntry::size() * wordSize;
4408   int vte_size    = vtableEntry::size_in_bytes();
4409   Address::ScaleFactor times_vte_scale = Address::times_ptr;
4410   assert(vte_size == wordSize, "else adjust times_vte_scale");
4411 
4412   movl(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));
4413 
4414   // %%% Could store the aligned, prescaled offset in the klassoop.
4415   lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
4416 
4417   if (return_method) {
4418     // Adjust recv_klass by scaled itable_index, so we can free itable_index.
4419     assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
4420     lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
4421   }
4422 
4423   // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
4424   //   if (scan->interface() == intf) {
4425   //     result = (klass + scan->offset() + itable_index);
4426   //   }
4427   // }
4428   Label search, found_method;
4429 
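       // The scan loop is peeled once: the first probe (peel == 1) is emitted
       // straight-line so the common case of an immediate hit reaches found_method
       // with a single short branch; the second pass (peel == 0) emits the loop
       // proper, which falls through to found_method when an entry matches.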
4430   for (int peel = 1; peel >= 0; peel--) {
4431     movptr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
4432     cmpptr(intf_klass, method_result);
4433 
4434     if (peel) {
4435       jccb(Assembler::equal, found_method);
4436     } else {
4437       jccb(Assembler::notEqual, search);
4438       // (invert the test to fall through to found_method...)
4439     }
4440 
4441     if (!peel)  break;
4442 
4443     bind(search);
4444 
4445     // Check that the previous entry is non-null.  A null entry means that
4446     // the receiver class doesn't implement the interface, and wasn't the
4447     // same as when the caller was compiled.
4448     testptr(method_result, method_result);
4449     jcc(Assembler::zero, L_no_such_interface);
4450     addptr(scan_temp, scan_step);
4451   }
4452 
4453   bind(found_method);
4454 
4455   if (return_method) {
4456     // Got a hit.
4457     movl(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
4458     movptr(method_result, Address(recv_klass, scan_temp, Address::times_1));
4459   }
4460 }
4461 
4462 
4463 // virtual method calling
4464 void MacroAssembler::lookup_virtual_method(Register recv_klass,
4465                                            RegisterOrConstant vtable_index,
4466                                            Register method_result) {
4467   const int base = in_bytes(Klass::vtable_start_offset());
4468   assert(vtableEntry::size() * wordSize == wordSize, "else adjust the scaling in the code below");
4469   Address vtable_entry_addr(recv_klass,
4470                             vtable_index, Address::times_ptr,
4471                             base + vtableEntry::method_offset_in_bytes());
4472   movptr(method_result, vtable_entry_addr);
4473 }
4474 
4475 
4476 void MacroAssembler::check_klass_subtype(Register sub_klass,
4477                            Register super_klass,
4478                            Register temp_reg,
4479                            Label& L_success) {
4480   Label L_failure;
4481   check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, NULL);
4482   check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
4483   bind(L_failure);
4484 }
4485 
4486 
4487 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
4488                                                    Register super_klass,
4489                                                    Register temp_reg,
4490                                                    Label* L_success,
4491                                                    Label* L_failure,
4492                                                    Label* L_slow_path,
4493                                         RegisterOrConstant super_check_offset) {
4494   assert_different_registers(sub_klass, super_klass, temp_reg);
4495   bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
4496   if (super_check_offset.is_register()) {
4497     assert_different_registers(sub_klass, super_klass,
4498                                super_check_offset.as_register());
4499   } else if (must_load_sco) {
4500     assert(temp_reg != noreg, "supply either a temp or a register offset");
4501   }
4502 
4503   Label L_fallthrough;
4504   int label_nulls = 0;
4505   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
4506   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
4507   if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
4508   assert(label_nulls <= 1, "at most one NULL in the batch");
4509 
4510   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
4511   int sco_offset = in_bytes(Klass::super_check_offset_offset());
4512   Address super_check_offset_addr(super_klass, sco_offset);
4513 
4514   // Hacked jcc, which "knows" that L_fallthrough, at least, is in
4515   // range of a jccb.  If this routine grows larger, reconsider at
4516   // least some of these.
4517 #define local_jcc(assembler_cond, label)                                \
4518   if (&(label) == &L_fallthrough)  jccb(assembler_cond, label);         \
4519   else                             jcc( assembler_cond, label) /*omit semi*/
4520 
4521   // Hacked jmp, which may only be used just before L_fallthrough.
4522 #define final_jmp(label)                                                \
4523   if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
4524   else                            jmp(label)                /*omit semi*/
4525 
4526   // If the pointers are equal, we are done (e.g., String[] elements).
4527   // This self-check enables sharing of secondary supertype arrays among
4528   // non-primary types such as array-of-interface.  Otherwise, each such
4529   // type would need its own customized SSA.
4530   // We move this check to the front of the fast path because many
4531   // type checks are in fact trivially successful in this manner,
4532   // so we get a nicely predicted branch right at the start of the check.
4533   cmpptr(sub_klass, super_klass);
4534   local_jcc(Assembler::equal, *L_success);
4535 
4536   // Check the supertype display:
4537   if (must_load_sco) {
4538     // A positive movl does the right thing on LP64.
4539     movl(temp_reg, super_check_offset_addr);
4540     super_check_offset = RegisterOrConstant(temp_reg);
4541   }
4542   Address super_check_addr(sub_klass, super_check_offset, Address::times_1, 0);
4543   cmpptr(super_klass, super_check_addr); // load displayed supertype
4544 
4545   // This check has worked decisively for primary supers.
4546   // Secondary supers are sought in the super_cache ('super_cache_addr').
4547   // (Secondary supers are interfaces and very deeply nested subtypes.)
4548   // This works in the same check above because of a tricky aliasing
4549   // between the super_cache and the primary super display elements.
4550   // (The 'super_check_addr' can address either, as the case requires.)
4551   // Note that the cache is updated below if it does not help us find
4552   // what we need immediately.
4553   // So if it was a primary super, we can just fail immediately.
4554   // Otherwise, it's the slow path for us (no success at this point).
4555 
4556   if (super_check_offset.is_register()) {
4557     local_jcc(Assembler::equal, *L_success);
4558     cmpl(super_check_offset.as_register(), sc_offset);
4559     if (L_failure == &L_fallthrough) {
4560       local_jcc(Assembler::equal, *L_slow_path);
4561     } else {
4562       local_jcc(Assembler::notEqual, *L_failure);
4563       final_jmp(*L_slow_path);
4564     }
4565   } else if (super_check_offset.as_constant() == sc_offset) {
4566     // Need a slow path; fast failure is impossible.
4567     if (L_slow_path == &L_fallthrough) {
4568       local_jcc(Assembler::equal, *L_success);
4569     } else {
4570       local_jcc(Assembler::notEqual, *L_slow_path);
4571       final_jmp(*L_success);
4572     }
4573   } else {
4574     // No slow path; it's a fast decision.
4575     if (L_failure == &L_fallthrough) {
4576       local_jcc(Assembler::equal, *L_success);
4577     } else {
4578       local_jcc(Assembler::notEqual, *L_failure);
4579       final_jmp(*L_success);
4580     }
4581   }
4582 
4583   bind(L_fallthrough);
4584 
4585 #undef local_jcc
4586 #undef final_jmp
4587 }
4588 
4589 
4590 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
4591                                                    Register super_klass,
4592                                                    Register temp_reg,
4593                                                    Register temp2_reg,
4594                                                    Label* L_success,
4595                                                    Label* L_failure,
4596                                                    bool set_cond_codes) {
4597   assert_different_registers(sub_klass, super_klass, temp_reg);
4598   if (temp2_reg != noreg)
4599     assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg);
4600 #define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)
4601 
4602   Label L_fallthrough;
4603   int label_nulls = 0;
4604   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
4605   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
4606   assert(label_nulls <= 1, "at most one NULL in the batch");
4607 
4608   // a couple of useful fields in sub_klass:
4609   int ss_offset = in_bytes(Klass::secondary_supers_offset());
4610   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
4611   Address secondary_supers_addr(sub_klass, ss_offset);
4612   Address super_cache_addr(     sub_klass, sc_offset);
4613 
4614   // Do a linear scan of the secondary super-klass chain.
4615   // This code is rarely used, so simplicity is a virtue here.
4616   // The repne_scan instruction uses fixed registers, which we must spill.
4617   // Don't worry too much about pre-existing connections with the input regs.
4618 
4619   assert(sub_klass != rax, "killed reg"); // killed by mov(rax, super)
4620   assert(sub_klass != rcx, "killed reg"); // killed by lea(rcx, &pst_counter)
4621 
4622   // Get super_klass value into rax (even if it was in rdi or rcx).
4623   bool pushed_rax = false, pushed_rcx = false, pushed_rdi = false;
4624   if (super_klass != rax || UseCompressedOops) {
4625     if (!IS_A_TEMP(rax)) { push(rax); pushed_rax = true; }
4626     mov(rax, super_klass);
4627   }
4628   if (!IS_A_TEMP(rcx)) { push(rcx); pushed_rcx = true; }
4629   if (!IS_A_TEMP(rdi)) { push(rdi); pushed_rdi = true; }
4630 
4631 #ifndef PRODUCT
4632   int* pst_counter = &SharedRuntime::_partial_subtype_ctr;
4633   ExternalAddress pst_counter_addr((address) pst_counter);
4634   NOT_LP64(  incrementl(pst_counter_addr) );
4635   LP64_ONLY( lea(rcx, pst_counter_addr) );
4636   LP64_ONLY( incrementl(Address(rcx, 0)) );
4637 #endif //PRODUCT
4638 
4639   // We will consult the secondary-super array.
4640   movptr(rdi, secondary_supers_addr);
4641   // Load the array length.  (A positive movl does the right thing on LP64.)
4642   movl(rcx, Address(rdi, Array<Klass*>::length_offset_in_bytes()));
4643   // Skip to start of data.
4644   addptr(rdi, Array<Klass*>::base_offset_in_bytes());
4645 
4646   // Scan RCX words at [RDI] for an occurrence of RAX.
4647   // Set NZ/Z based on last compare.
4648   // The Z flag will not be set by 'repne' if RCX == 0, since 'repne' itself
4649   // does not change flags; only the repeated scas instruction sets them.
4650   // Set Z = 0 (not equal) before 'repne' to indicate that the class was not found.
4651 
4652   testptr(rax, rax); // Set Z = 0
4653   repne_scan();
4654 
4655   // Unspill the temp. registers:
4656   if (pushed_rdi)  pop(rdi);
4657   if (pushed_rcx)  pop(rcx);
4658   if (pushed_rax)  pop(rax);
4659 
4660   if (set_cond_codes) {
4661     // Special hack for the AD files:  rdi is guaranteed non-zero.
4662     assert(!pushed_rdi, "rdi must be left non-NULL");
4663     // Also, the condition codes are properly set Z/NZ on succeed/failure.
4664   }
4665 
4666   if (L_failure == &L_fallthrough)
4667         jccb(Assembler::notEqual, *L_failure);
4668   else  jcc(Assembler::notEqual, *L_failure);
4669 
4670   // Success.  Cache the super we found and proceed in triumph.
4671   movptr(super_cache_addr, super_klass);
4672 
4673   if (L_success != &L_fallthrough) {
4674     jmp(*L_success);
4675   }
4676 
4677 #undef IS_A_TEMP
4678 
4679   bind(L_fallthrough);
4680 }
4681 
4682 void MacroAssembler::clinit_barrier(Register klass, Register thread, Label* L_fast_path, Label* L_slow_path) {
4683   assert(L_fast_path != NULL || L_slow_path != NULL, "at least one is required");
4684 
4685   Label L_fallthrough;
4686   if (L_fast_path == NULL) {
4687     L_fast_path = &L_fallthrough;
4688   } else if (L_slow_path == NULL) {
4689     L_slow_path = &L_fallthrough;
4690   }
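       // Exactly one of the two labels must be NULL; that one becomes the local
       // fall-through bound below (supplying both labels hits Unimplemented()).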
4691 
4692   // Fast path check: class is fully initialized
4693   cmpb(Address(klass, InstanceKlass::init_state_offset()), InstanceKlass::fully_initialized);
4694   jcc(Assembler::equal, *L_fast_path);
4695 
4696   // Fast path check: current thread is initializer thread
4697   cmpptr(thread, Address(klass, InstanceKlass::init_thread_offset()));
4698   if (L_slow_path == &L_fallthrough) {
4699     jcc(Assembler::equal, *L_fast_path);
4700     bind(*L_slow_path);
4701   } else if (L_fast_path == &L_fallthrough) {
4702     jcc(Assembler::notEqual, *L_slow_path);
4703     bind(*L_fast_path);
4704   } else {
4705     Unimplemented();
4706   }
4707 }
4708 
4709 void MacroAssembler::cmov32(Condition cc, Register dst, Address src) {
4710   if (VM_Version::supports_cmov()) {
4711     cmovl(cc, dst, src);
4712   } else {
4713     Label L;
4714     jccb(negate_condition(cc), L);
4715     movl(dst, src);
4716     bind(L);
4717   }
4718 }
4719 
4720 void MacroAssembler::cmov32(Condition cc, Register dst, Register src) {
4721   if (VM_Version::supports_cmov()) {
4722     cmovl(cc, dst, src);
4723   } else {
4724     Label L;
4725     jccb(negate_condition(cc), L);
4726     movl(dst, src);
4727     bind(L);
4728   }
4729 }
4730 
4731 void MacroAssembler::verify_oop(Register reg, const char* s) {
4732   if (!VerifyOops || VerifyAdapterSharing) {
4733     // The address of the code string (see below) confuses VerifyAdapterSharing
4734     // because it may differ between otherwise equivalent adapters.
4735     return;
4736   }
4737 
4738   // Pass register number to verify_oop_subroutine
4739   const char* b = NULL;
4740   {
4741     ResourceMark rm;
4742     stringStream ss;
4743     ss.print("verify_oop: %s: %s", reg->name(), s);
4744     b = code_string(ss.as_string());
4745   }
4746   BLOCK_COMMENT("verify_oop {");
4747 #ifdef _LP64
4748   push(rscratch1);                    // save r10, trashed by movptr()
4749 #endif
4750   push(rax);                          // save rax,
4751   push(reg);                          // pass register argument
4752   ExternalAddress buffer((address) b);
4753   // avoid using pushptr, as it modifies scratch registers
4754   // and our contract is not to modify anything
4755   movptr(rax, buffer.addr());
4756   push(rax);
4757   // call indirectly to solve generation ordering problem
4758   movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
4759   call(rax);
4760   // Caller pops the arguments (oop, message) and restores rax, r10
4761   BLOCK_COMMENT("} verify_oop");
4762 }
4763 
4764 
4765 RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
4766                                                       Register tmp,
4767                                                       int offset) {
4768   intptr_t value = *delayed_value_addr;
4769   if (value != 0)
4770     return RegisterOrConstant(value + offset);
4771 
4772   // load indirectly to solve generation ordering problem
4773   movptr(tmp, ExternalAddress((address) delayed_value_addr));
4774 
4775 #ifdef ASSERT
4776   { Label L;
4777     testptr(tmp, tmp);
4778     if (WizardMode) {
4779       const char* buf = NULL;
4780       {
4781         ResourceMark rm;
4782         stringStream ss;
4783         ss.print("DelayedValue=" INTPTR_FORMAT, delayed_value_addr[1]);
4784         buf = code_string(ss.as_string());
4785       }
4786       jcc(Assembler::notZero, L);
4787       STOP(buf);
4788     } else {
4789       jccb(Assembler::notZero, L);
4790       hlt();
4791     }
4792     bind(L);
4793   }
4794 #endif
4795 
4796   if (offset != 0)
4797     addptr(tmp, offset);
4798 
4799   return RegisterOrConstant(tmp);
4800 }
4801 
4802 
4803 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
4804                                          int extra_slot_offset) {
4805   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
4806   int stackElementSize = Interpreter::stackElementSize;
4807   int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
4808 #ifdef ASSERT
4809   int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
4810   assert(offset1 - offset == stackElementSize, "correct arithmetic");
4811 #endif
4812   Register             scale_reg    = noreg;
4813   Address::ScaleFactor scale_factor = Address::no_scale;
4814   if (arg_slot.is_constant()) {
4815     offset += arg_slot.as_constant() * stackElementSize;
4816   } else {
4817     scale_reg    = arg_slot.as_register();
4818     scale_factor = Address::times(stackElementSize);
4819   }
4820   offset += wordSize;           // return PC is on stack
4821   return Address(rsp, scale_reg, scale_factor, offset);
4822 }
4823 
4824 
4825 void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
4826   if (!VerifyOops || VerifyAdapterSharing) {
4827     // The address of the code string (see below) confuses VerifyAdapterSharing
4828     // because it may differ between otherwise equivalent adapters.
4829     return;
4830   }
4831 
4832   // Address adjust(addr.base(), addr.index(), addr.scale(), addr.disp() + BytesPerWord);
4833   // Pass register number to verify_oop_subroutine
4834   const char* b = NULL;
4835   {
4836     ResourceMark rm;
4837     stringStream ss;
4838     ss.print("verify_oop_addr: %s", s);
4839     b = code_string(ss.as_string());
4840   }
4841 #ifdef _LP64
4842   push(rscratch1);                    // save r10, trashed by movptr()
4843 #endif
4844   push(rax);                          // save rax,
4845   // addr may contain rsp so we will have to adjust it based on the push
4846   // we just did (and on 64 bit we do two pushes)
4847   // NOTE: the 64-bit code appears to have had a bug: it did movq(addr, rax),
4848   // which stores rax into addr, the reverse of what was intended.
4849   if (addr.uses(rsp)) {
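         // The pushes above moved rsp down, so the lea result is low by two words
         // on 64-bit (rscratch1 and rax) or one word on 32-bit (rax); the
         // displacement below adds that back before pushing the oop.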
4850     lea(rax, addr);
4851     pushptr(Address(rax, LP64_ONLY(2 *) BytesPerWord));
4852   } else {
4853     pushptr(addr);
4854   }
4855 
4856   ExternalAddress buffer((address) b);
4857   // pass msg argument
4858   // avoid using pushptr, as it modifies scratch registers
4859   // and our contract is not to modify anything
4860   movptr(rax, buffer.addr());
4861   push(rax);
4862 
4863   // call indirectly to solve generation ordering problem
4864   movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
4865   call(rax);
4866   // Caller pops the arguments (addr, message) and restores rax, r10.
4867 }
4868 
4869 void MacroAssembler::verify_tlab() {
4870 #ifdef ASSERT
4871   if (UseTLAB && VerifyOops) {
4872     Label next, ok;
4873     Register t1 = rsi;
4874     Register thread_reg = NOT_LP64(rbx) LP64_ONLY(r15_thread);
4875 
4876     push(t1);
4877     NOT_LP64(push(thread_reg));
4878     NOT_LP64(get_thread(thread_reg));
4879 
4880     movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
4881     cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_start_offset())));
4882     jcc(Assembler::aboveEqual, next);
4883     STOP("assert(top >= start)");
4884     should_not_reach_here();
4885 
4886     bind(next);
4887     movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_end_offset())));
4888     cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
4889     jcc(Assembler::aboveEqual, ok);
4890     STOP("assert(top <= end)");
4891     should_not_reach_here();
4892 
4893     bind(ok);
4894     NOT_LP64(pop(thread_reg));
4895     pop(t1);
4896   }
4897 #endif
4898 }
4899 
4900 class ControlWord {
4901  public:
4902   int32_t _value;
4903 
4904   int  rounding_control() const        { return  (_value >> 10) & 3      ; }
4905   int  precision_control() const       { return  (_value >>  8) & 3      ; }
4906   bool precision() const               { return ((_value >>  5) & 1) != 0; }
4907   bool underflow() const               { return ((_value >>  4) & 1) != 0; }
4908   bool overflow() const                { return ((_value >>  3) & 1) != 0; }
4909   bool zero_divide() const             { return ((_value >>  2) & 1) != 0; }
4910   bool denormalized() const            { return ((_value >>  1) & 1) != 0; }
4911   bool invalid() const                 { return ((_value >>  0) & 1) != 0; }
4912 
4913   void print() const {
4914     // rounding control
4915     const char* rc;
4916     switch (rounding_control()) {
4917       case 0: rc = "round near"; break;
4918       case 1: rc = "round down"; break;
4919       case 2: rc = "round up  "; break;
4920       case 3: rc = "chop      "; break;
4921     };
4922     // precision control
4923     const char* pc;
4924     switch (precision_control()) {
4925       case 0: pc = "24 bits "; break;
4926       case 1: pc = "reserved"; break;
4927       case 2: pc = "53 bits "; break;
4928       case 3: pc = "64 bits "; break;
4929     };
4930     // flags
4931     char f[9];
4932     f[0] = ' ';
4933     f[1] = ' ';
4934     f[2] = (precision   ()) ? 'P' : 'p';
4935     f[3] = (underflow   ()) ? 'U' : 'u';
4936     f[4] = (overflow    ()) ? 'O' : 'o';
4937     f[5] = (zero_divide ()) ? 'Z' : 'z';
4938     f[6] = (denormalized()) ? 'D' : 'd';
4939     f[7] = (invalid     ()) ? 'I' : 'i';
4940     f[8] = '\x0';
4941     // output
4942     printf("%04x  masks = %s, %s, %s", _value & 0xFFFF, f, rc, pc);
4943   }
4944 
4945 };
4946 
4947 class StatusWord {
4948  public:
4949   int32_t _value;
4950 
4951   bool busy() const                    { return ((_value >> 15) & 1) != 0; }
4952   bool C3() const                      { return ((_value >> 14) & 1) != 0; }
4953   bool C2() const                      { return ((_value >> 10) & 1) != 0; }
4954   bool C1() const                      { return ((_value >>  9) & 1) != 0; }
4955   bool C0() const                      { return ((_value >>  8) & 1) != 0; }
4956   int  top() const                     { return  (_value >> 11) & 7      ; }
4957   bool error_status() const            { return ((_value >>  7) & 1) != 0; }
4958   bool stack_fault() const             { return ((_value >>  6) & 1) != 0; }
4959   bool precision() const               { return ((_value >>  5) & 1) != 0; }
4960   bool underflow() const               { return ((_value >>  4) & 1) != 0; }
4961   bool overflow() const                { return ((_value >>  3) & 1) != 0; }
4962   bool zero_divide() const             { return ((_value >>  2) & 1) != 0; }
4963   bool denormalized() const            { return ((_value >>  1) & 1) != 0; }
4964   bool invalid() const                 { return ((_value >>  0) & 1) != 0; }
4965 
4966   void print() const {
4967     // condition codes
4968     char c[5];
4969     c[0] = (C3()) ? '3' : '-';
4970     c[1] = (C2()) ? '2' : '-';
4971     c[2] = (C1()) ? '1' : '-';
4972     c[3] = (C0()) ? '0' : '-';
4973     c[4] = '\x0';
4974     // flags
4975     char f[9];
4976     f[0] = (error_status()) ? 'E' : '-';
4977     f[1] = (stack_fault ()) ? 'S' : '-';
4978     f[2] = (precision   ()) ? 'P' : '-';
4979     f[3] = (underflow   ()) ? 'U' : '-';
4980     f[4] = (overflow    ()) ? 'O' : '-';
4981     f[5] = (zero_divide ()) ? 'Z' : '-';
4982     f[6] = (denormalized()) ? 'D' : '-';
4983     f[7] = (invalid     ()) ? 'I' : '-';
4984     f[8] = '\x0';
4985     // output
4986     printf("%04x  flags = %s, cc =  %s, top = %d", _value & 0xFFFF, f, c, top());
4987   }
4988 
4989 };
4990 
4991 class TagWord {
4992  public:
4993   int32_t _value;
4994 
4995   int tag_at(int i) const              { return (_value >> (i*2)) & 3; }
4996 
4997   void print() const {
4998     printf("%04x", _value & 0xFFFF);
4999   }
5000 
5001 };
5002 
5003 class FPU_Register {
5004  public:
5005   int32_t _m0;
5006   int32_t _m1;
5007   int16_t _ex;
5008 
5009   bool is_indefinite() const           {
5010     return _ex == -1 && _m1 == (int32_t)0xC0000000 && _m0 == 0;
5011   }
5012 
5013   void print() const {
5014     char  sign = (_ex < 0) ? '-' : '+';
5015     const char* kind = (_ex == 0x7FFF || _ex == (int16_t)-1) ? "NaN" : "   ";
5016     printf("%c%04hx.%08x%08x  %s", sign, _ex, _m1, _m0, kind);
5017   };
5018 
5019 };
5020 
5021 class FPU_State {
5022  public:
5023   enum {
5024     register_size       = 10,
5025     number_of_registers =  8,
5026     register_mask       =  7
5027   };
5028 
5029   ControlWord  _control_word;
5030   StatusWord   _status_word;
5031   TagWord      _tag_word;
5032   int32_t      _error_offset;
5033   int32_t      _error_selector;
5034   int32_t      _data_offset;
5035   int32_t      _data_selector;
5036   int8_t       _register[register_size * number_of_registers];
5037 
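       // The tag word is indexed by physical register number; tag_for_st() maps a
       // logical ST(i) index through the current top-of-stack pointer.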
5038   int tag_for_st(int i) const          { return _tag_word.tag_at((_status_word.top() + i) & register_mask); }
5039   FPU_Register* st(int i) const        { return (FPU_Register*)&_register[register_size * i]; }
5040 
5041   const char* tag_as_string(int tag) const {
5042     switch (tag) {
5043       case 0: return "valid";
5044       case 1: return "zero";
5045       case 2: return "special";
5046       case 3: return "empty";
5047     }
5048     ShouldNotReachHere();
5049     return NULL;
5050   }
5051 
5052   void print() const {
5053     // print computation registers
5054     { int t = _status_word.top();
5055       for (int i = 0; i < number_of_registers; i++) {
5056         int j = (i - t) & register_mask;
5057         printf("%c r%d = ST%d = ", (j == 0 ? '*' : ' '), i, j);
5058         st(j)->print();
5059         printf(" %s\n", tag_as_string(_tag_word.tag_at(i)));
5060       }
5061     }
5062     printf("\n");
5063     // print control registers
5064     printf("ctrl = "); _control_word.print(); printf("\n");
5065     printf("stat = "); _status_word .print(); printf("\n");
5066     printf("tags = "); _tag_word    .print(); printf("\n");
5067   }
5068 
5069 };
5070 
5071 class Flag_Register {
5072  public:
5073   int32_t _value;
5074 
5075   bool overflow() const                { return ((_value >> 11) & 1) != 0; }
5076   bool direction() const               { return ((_value >> 10) & 1) != 0; }
5077   bool sign() const                    { return ((_value >>  7) & 1) != 0; }
5078   bool zero() const                    { return ((_value >>  6) & 1) != 0; }
5079   bool auxiliary_carry() const         { return ((_value >>  4) & 1) != 0; }
5080   bool parity() const                  { return ((_value >>  2) & 1) != 0; }
5081   bool carry() const                   { return ((_value >>  0) & 1) != 0; }
5082 
5083   void print() const {
5084     // flags
5085     char f[8];
5086     f[0] = (overflow       ()) ? 'O' : '-';
5087     f[1] = (direction      ()) ? 'D' : '-';
5088     f[2] = (sign           ()) ? 'S' : '-';
5089     f[3] = (zero           ()) ? 'Z' : '-';
5090     f[4] = (auxiliary_carry()) ? 'A' : '-';
5091     f[5] = (parity         ()) ? 'P' : '-';
5092     f[6] = (carry          ()) ? 'C' : '-';
5093     f[7] = '\x0';
5094     // output
5095     printf("%08x  flags = %s", _value, f);
5096   }
5097 
5098 };
5099 
5100 class IU_Register {
5101  public:
5102   int32_t _value;
5103 
5104   void print() const {
5105     printf("%08x  %11d", _value, _value);
5106   }
5107 
5108 };
5109 
5110 class IU_State {
5111  public:
5112   Flag_Register _eflags;
5113   IU_Register   _rdi;
5114   IU_Register   _rsi;
5115   IU_Register   _rbp;
5116   IU_Register   _rsp;
5117   IU_Register   _rbx;
5118   IU_Register   _rdx;
5119   IU_Register   _rcx;
5120   IU_Register   _rax;
5121 
5122   void print() const {
5123     // computation registers
5124     printf("rax  = "); _rax.print(); printf("\n");
5125     printf("rbx  = "); _rbx.print(); printf("\n");
5126     printf("rcx  = "); _rcx.print(); printf("\n");
5127     printf("rdx  = "); _rdx.print(); printf("\n");
5128     printf("rdi  = "); _rdi.print(); printf("\n");
5129     printf("rsi  = "); _rsi.print(); printf("\n");
5130     printf("rbp  = "); _rbp.print(); printf("\n");
5131     printf("rsp  = "); _rsp.print(); printf("\n");
5132     printf("\n");
5133     // control registers
5134     printf("flgs = "); _eflags.print(); printf("\n");
5135   }
5136 };
5137 
5138 
5139 class CPU_State {
5140  public:
5141   FPU_State _fpu_state;
5142   IU_State  _iu_state;
5143 
5144   void print() const {
5145     printf("--------------------------------------------------\n");
5146     _iu_state .print();
5147     printf("\n");
5148     _fpu_state.print();
5149     printf("--------------------------------------------------\n");
5150   }
5151 
5152 };
5153 
5154 
5155 static void _print_CPU_state(CPU_State* state) {
5156   state->print();
5157 };
5158 
5159 
5160 void MacroAssembler::print_CPU_state() {
5161   push_CPU_state();
5162   push(rsp);                // pass CPU state
5163   call(RuntimeAddress(CAST_FROM_FN_PTR(address, _print_CPU_state)));
5164   addptr(rsp, wordSize);       // discard argument
5165   pop_CPU_state();
5166 }
5167 
5168 
5169 static bool _verify_FPU(int stack_depth, char* s, CPU_State* state) {
5170   static int counter = 0;
5171   FPU_State* fs = &state->_fpu_state;
5172   counter++;
5173   // For leaf calls, only verify that the top few elements remain empty.
5174   // We only need 1 empty at the top for C2 code.
5175   if( stack_depth < 0 ) {
5176     if( fs->tag_for_st(7) != 3 ) {
5177       printf("FPR7 not empty\n");
5178       state->print();
5179       assert(false, "error");
5180       return false;
5181     }
5182     return true;                // All other stack states do not matter
5183   }
5184 
5185   assert((fs->_control_word._value & 0xffff) == StubRoutines::_fpu_cntrl_wrd_std,
5186          "bad FPU control word");
5187 
5188   // compute stack depth
5189   int i = 0;
5190   while (i < FPU_State::number_of_registers && fs->tag_for_st(i)  < 3) i++;
5191   int d = i;
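       // d is the number of in-use registers starting at ST(0); all remaining
       // slots must be tagged empty (3) for the stack to be contiguous.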
5192   while (i < FPU_State::number_of_registers && fs->tag_for_st(i) == 3) i++;
5193   // verify findings
5194   if (i != FPU_State::number_of_registers) {
5195     // stack not contiguous
5196     printf("%s: stack not contiguous at ST%d\n", s, i);
5197     state->print();
5198     assert(false, "error");
5199     return false;
5200   }
5201   // check if computed stack depth corresponds to expected stack depth
5202   if (stack_depth < 0) {
5203     // expected stack depth is -stack_depth or less
5204     if (d > -stack_depth) {
5205       // too many elements on the stack
5206       printf("%s: <= %d stack elements expected but found %d\n", s, -stack_depth, d);
5207       state->print();
5208       assert(false, "error");
5209       return false;
5210     }
5211   } else {
5212     // expected stack depth is stack_depth
5213     if (d != stack_depth) {
5214       // wrong stack depth
5215       printf("%s: %d stack elements expected but found %d\n", s, stack_depth, d);
5216       state->print();
5217       assert(false, "error");
5218       return false;
5219     }
5220   }
5221   // everything is cool
5222   return true;
5223 }
5224 
5225 
5226 void MacroAssembler::verify_FPU(int stack_depth, const char* s) {
5227   if (!VerifyFPU) return;
5228   push_CPU_state();
5229   push(rsp);                // pass CPU state
5230   ExternalAddress msg((address) s);
5231   // pass message string s
5232   pushptr(msg.addr());
5233   push(stack_depth);        // pass stack depth
5234   call(RuntimeAddress(CAST_FROM_FN_PTR(address, _verify_FPU)));
5235   addptr(rsp, 3 * wordSize);   // discard arguments
5236   // check for error
5237   { Label L;
5238     testl(rax, rax);
5239     jcc(Assembler::notZero, L);
5240     int3();                  // break if error condition
5241     bind(L);
5242   }
5243   pop_CPU_state();
5244 }
5245 
5246 void MacroAssembler::restore_cpu_control_state_after_jni() {
5247   // Either restore the MXCSR register after returning from the JNI Call
5248   // or verify that it wasn't changed (with -Xcheck:jni flag).
5249   if (VM_Version::supports_sse()) {
5250     if (RestoreMXCSROnJNICalls) {
5251       ldmxcsr(ExternalAddress(StubRoutines::addr_mxcsr_std()));
5252     } else if (CheckJNICalls) {
5253       call(RuntimeAddress(StubRoutines::x86::verify_mxcsr_entry()));
5254     }
5255   }
5256   // Clear upper bits of YMM registers to avoid SSE <-> AVX transition penalty.
5257   vzeroupper();
5258   // Reset k1 to 0xffff (done below when post-loop multiversioning with EVEX is in use).
5260 #ifdef COMPILER2
5261   if (PostLoopMultiversioning && VM_Version::supports_evex()) {
5262     push(rcx);
5263     movl(rcx, 0xffff);
5264     kmovwl(k1, rcx);
5265     pop(rcx);
5266   }
5267 #endif // COMPILER2
5268 
5269 #ifndef _LP64
5270   // Either restore the x87 floating pointer control word after returning
5271   // from the JNI call or verify that it wasn't changed.
5272   if (CheckJNICalls) {
5273     call(RuntimeAddress(StubRoutines::x86::verify_fpu_cntrl_wrd_entry()));
5274   }
5275 #endif // _LP64
5276 }
5277 
5278 // ((OopHandle)result).resolve();
5279 void MacroAssembler::resolve_oop_handle(Register result, Register tmp) {
5280   assert_different_registers(result, tmp);
5281 
5282   // Only 64 bit platforms support GCs that require a tmp register
5283   // Only IN_HEAP loads require a thread_tmp register
5284   // OopHandle::resolve is an indirection like jobject.
5285   access_load_at(T_OBJECT, IN_NATIVE,
5286                  result, Address(result, 0), tmp, /*tmp_thread*/noreg);
5287 }
5288 
5289 // ((WeakHandle)result).resolve();
5290 void MacroAssembler::resolve_weak_handle(Register rresult, Register rtmp) {
5291   assert_different_registers(rresult, rtmp);
5292   Label resolved;
5293 
5294   // A null weak handle resolves to null.
5295   cmpptr(rresult, 0);
5296   jcc(Assembler::equal, resolved);
5297 
5298   // Only 64 bit platforms support GCs that require a tmp register
5299   // Only IN_HEAP loads require a thread_tmp register
5300   // WeakHandle::resolve is an indirection like jweak.
5301   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
5302                  rresult, Address(rresult, 0), rtmp, /*tmp_thread*/noreg);
5303   bind(resolved);
5304 }
5305 
5306 void MacroAssembler::load_mirror(Register mirror, Register method, Register tmp) {
5307   // get mirror
5308   const int mirror_offset = in_bytes(Klass::java_mirror_offset());
5309   load_method_holder(mirror, method);
5310   movptr(mirror, Address(mirror, mirror_offset));
5311   resolve_oop_handle(mirror, tmp);
5312 }
5313 
5314 void MacroAssembler::load_method_holder_cld(Register rresult, Register rmethod) {
5315   load_method_holder(rresult, rmethod);
5316   movptr(rresult, Address(rresult, InstanceKlass::class_loader_data_offset()));
5317 }
5318 
5319 void MacroAssembler::load_metadata(Register dst, Register src) {
5320   if (UseCompressedClassPointers) {
5321     movl(dst, Address(src, oopDesc::klass_offset_in_bytes()));
5322   } else {
5323     movptr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
5324   }
5325 }
5326 
5327 void MacroAssembler::load_storage_props(Register dst, Register src) {
5328   load_metadata(dst, src);
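       // The storage properties live in the bits above the klass pointer; shift
       // them down so dst holds just the property bits.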
5329   if (UseCompressedClassPointers) {
5330     shrl(dst, oopDesc::narrow_storage_props_shift);
5331   } else {
5332     shrq(dst, oopDesc::wide_storage_props_shift);
5333   }
5334 }
5335 
5336 void MacroAssembler::load_method_holder(Register holder, Register method) {
5337   movptr(holder, Address(method, Method::const_offset()));                      // ConstMethod*
5338   movptr(holder, Address(holder, ConstMethod::constants_offset()));             // ConstantPool*
5339   movptr(holder, Address(holder, ConstantPool::pool_holder_offset_in_bytes())); // InstanceKlass*
5340 }
5341 
5342 void MacroAssembler::load_klass(Register dst, Register src) {
5343   load_metadata(dst, src);
5344 #ifdef _LP64
5345   if (UseCompressedClassPointers) {
5346     andl(dst, oopDesc::compressed_klass_mask());
5347     decode_klass_not_null(dst);
5348   } else
5349 #endif
5350   {
5351 #ifdef _LP64
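         // Shift the storage property bits off the top of the word and back,
         // leaving just the klass pointer.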
5352     shlq(dst, oopDesc::storage_props_nof_bits);
5353     shrq(dst, oopDesc::storage_props_nof_bits);
5354 #else
5355     andl(dst, oopDesc::wide_klass_mask());
5356 #endif
5357   }
5358 }
5359 
5360 void MacroAssembler::load_prototype_header(Register dst, Register src) {
5361   load_klass(dst, src);
5362   movptr(dst, Address(dst, Klass::prototype_header_offset()));
5363 }
5364 
5365 void MacroAssembler::store_klass(Register dst, Register src) {
5366 #ifdef _LP64
5367   if (UseCompressedClassPointers) {
5368     encode_klass_not_null(src);
5369     movl(Address(dst, oopDesc::klass_offset_in_bytes()), src);
5370   } else
5371 #endif
5372     movptr(Address(dst, oopDesc::klass_offset_in_bytes()), src);
5373 }
5374 
5375 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators, Register dst, Address src,
5376                                     Register tmp1, Register thread_tmp) {
5377   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
5378   decorators = AccessInternal::decorator_fixup(decorators);
5379   bool as_raw = (decorators & AS_RAW) != 0;
5380   if (as_raw) {
5381     bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
5382   } else {
5383     bs->load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
5384   }
5385 }
5386 
5387 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators, Address dst, Register src,
5388                                      Register tmp1, Register tmp2, Register tmp3) {
5389   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
5390   decorators = AccessInternal::decorator_fixup(decorators);
5391   bool as_raw = (decorators & AS_RAW) != 0;
5392   if (as_raw) {
5393     bs->BarrierSetAssembler::store_at(this, decorators, type, dst, src, tmp1, tmp2, tmp3);
5394   } else {
5395     bs->store_at(this, decorators, type, dst, src, tmp1, tmp2, tmp3);
5396   }
5397 }
5398 
5399 void MacroAssembler::resolve(DecoratorSet decorators, Register obj) {
5400   // Use stronger ACCESS_WRITE|ACCESS_READ by default.
5401   if ((decorators & (ACCESS_READ | ACCESS_WRITE)) == 0) {
5402     decorators |= ACCESS_READ | ACCESS_WRITE;
5403   }
5404   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
5405   return bs->resolve(this, decorators, obj);
5406 }
5407 
5408 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
5409                                    Register thread_tmp, DecoratorSet decorators) {
5410   access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
5411 }
5412 
5413 // Doesn't do verification, generates fixed size code
5414 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
5415                                             Register thread_tmp, DecoratorSet decorators) {
5416   access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, thread_tmp);
5417 }
5418 
5419 void MacroAssembler::store_heap_oop(Address dst, Register src, Register tmp1,
5420                                     Register tmp2, Register tmp3, DecoratorSet decorators) {
5421   access_store_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, tmp2, tmp3);
5422 }
5423 
5424 // Used for storing NULLs.
5425 void MacroAssembler::store_heap_oop_null(Address dst) {
5426   access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg, noreg);
5427 }
5428 
5429 #ifdef _LP64
5430 void MacroAssembler::store_klass_gap(Register dst, Register src) {
5431   if (UseCompressedClassPointers) {
5432     // Store to klass gap in destination
5433     movl(Address(dst, oopDesc::klass_gap_offset_in_bytes()), src);
5434   }
5435 }
5436 
5437 #ifdef ASSERT
5438 void MacroAssembler::verify_heapbase(const char* msg) {
5439   assert (UseCompressedOops, "should be compressed");
5440   assert (Universe::heap() != NULL, "java heap should be initialized");
5441   if (CheckCompressedOops) {
5442     Label ok;
5443     push(rscratch1); // cmpptr trashes rscratch1
5444     cmpptr(r12_heapbase, ExternalAddress((address)CompressedOops::ptrs_base_addr()));
5445     jcc(Assembler::equal, ok);
5446     STOP(msg);
5447     bind(ok);
5448     pop(rscratch1);
5449   }
5450 }
5451 #endif
5452 
5453 // Algorithm must match oop.inline.hpp encode_heap_oop.
5454 void MacroAssembler::encode_heap_oop(Register r) {
5455 #ifdef ASSERT
5456   verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
5457 #endif
5458   verify_oop(r, "broken oop in encode_heap_oop");
5459   if (CompressedOops::base() == NULL) {
5460     if (CompressedOops::shift() != 0) {
5461       assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
5462       shrq(r, LogMinObjAlignmentInBytes);
5463     }
5464     return;
5465   }
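       // A NULL oop must encode to 0: substitute the heap base for NULL so the
       // subtraction below yields 0 before the shift.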
5466   testq(r, r);
5467   cmovq(Assembler::equal, r, r12_heapbase);
5468   subq(r, r12_heapbase);
5469   shrq(r, LogMinObjAlignmentInBytes);
5470 }
5471 
5472 void MacroAssembler::encode_heap_oop_not_null(Register r) {
5473 #ifdef ASSERT
5474   verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
5475   if (CheckCompressedOops) {
5476     Label ok;
5477     testq(r, r);
5478     jcc(Assembler::notEqual, ok);
5479     STOP("null oop passed to encode_heap_oop_not_null");
5480     bind(ok);
5481   }
5482 #endif
5483   verify_oop(r, "broken oop in encode_heap_oop_not_null");
5484   if (CompressedOops::base() != NULL) {
5485     subq(r, r12_heapbase);
5486   }
5487   if (CompressedOops::shift() != 0) {
5488     assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
5489     shrq(r, LogMinObjAlignmentInBytes);
5490   }
5491 }
5492 
5493 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
5494 #ifdef ASSERT
5495   verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
5496   if (CheckCompressedOops) {
5497     Label ok;
5498     testq(src, src);
5499     jcc(Assembler::notEqual, ok);
5500     STOP("null oop passed to encode_heap_oop_not_null2");
5501     bind(ok);
5502   }
5503 #endif
5504   verify_oop(src, "broken oop in encode_heap_oop_not_null2");
5505   if (dst != src) {
5506     movq(dst, src);
5507   }
5508   if (CompressedOops::base() != NULL) {
5509     subq(dst, r12_heapbase);
5510   }
5511   if (CompressedOops::shift() != 0) {
5512     assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
5513     shrq(dst, LogMinObjAlignmentInBytes);
5514   }
5515 }
5516 
5517 void  MacroAssembler::decode_heap_oop(Register r) {
5518 #ifdef ASSERT
5519   verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
5520 #endif
5521   if (CompressedOops::base() == NULL) {
5522     if (CompressedOops::shift() != 0) {
5523       assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
5524       shlq(r, LogMinObjAlignmentInBytes);
5525     }
5526   } else {
5527     Label done;
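         // shlq sets ZF when the narrow oop is 0, so a NULL oop skips the heap
         // base addition and decodes back to NULL.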
5528     shlq(r, LogMinObjAlignmentInBytes);
5529     jccb(Assembler::equal, done);
5530     addq(r, r12_heapbase);
5531     bind(done);
5532   }
5533   verify_oop(r, "broken oop in decode_heap_oop");
5534 }
5535 
5536 void  MacroAssembler::decode_heap_oop_not_null(Register r) {
5537   // Note: it will change flags
5538   assert (UseCompressedOops, "should only be used for compressed headers");
5539   assert (Universe::heap() != NULL, "java heap should be initialized");
5540   // Cannot assert, unverified entry point counts instructions (see .ad file)
5541   // vtableStubs also counts instructions in pd_code_size_limit.
5542   // Also do not verify_oop as this is called by verify_oop.
5543   if (CompressedOops::shift() != 0) {
5544     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
5545     shlq(r, LogMinObjAlignmentInBytes);
5546     if (CompressedOops::base() != NULL) {
5547       addq(r, r12_heapbase);
5548     }
5549   } else {
5550     assert (CompressedOops::base() == NULL, "sanity");
5551   }
5552 }
5553 
5554 void  MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
5555   // Note: it will change flags
5556   assert (UseCompressedOops, "should only be used for compressed headers");
5557   assert (Universe::heap() != NULL, "java heap should be initialized");
5558   // Cannot assert, unverified entry point counts instructions (see .ad file)
5559   // vtableStubs also counts instructions in pd_code_size_limit.
5560   // Also do not verify_oop as this is called by verify_oop.
5561   if (CompressedOops::shift() != 0) {
5562     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
5563     if (LogMinObjAlignmentInBytes == Address::times_8) {
5564       leaq(dst, Address(r12_heapbase, src, Address::times_8, 0));
5565     } else {
5566       if (dst != src) {
5567         movq(dst, src);
5568       }
5569       shlq(dst, LogMinObjAlignmentInBytes);
5570       if (CompressedOops::base() != NULL) {
5571         addq(dst, r12_heapbase);
5572       }
5573     }
5574   } else {
5575     assert (CompressedOops::base() == NULL, "sanity");
5576     if (dst != src) {
5577       movq(dst, src);
5578     }
5579   }
5580 }
5581 
5582 void MacroAssembler::encode_klass_not_null(Register r) {
5583   if (CompressedKlassPointers::base() != NULL) {
5584     // Use r12 as a scratch register in which to temporarily load the narrow_klass_base.
5585     assert(r != r12_heapbase, "Encoding a klass in r12");
5586     mov64(r12_heapbase, (int64_t)CompressedKlassPointers::base());
5587     subq(r, r12_heapbase);
5588   }
5589   if (CompressedKlassPointers::shift() != 0) {
5590     assert (LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
5591     shrq(r, LogKlassAlignmentInBytes);
5592   }
5593   if (CompressedKlassPointers::base() != NULL) {
5594     reinit_heapbase();
5595   }
5596 }
5597 
5598 void MacroAssembler::encode_klass_not_null(Register dst, Register src) {
5599   if (dst == src) {
5600     encode_klass_not_null(src);
5601   } else {
5602     if (CompressedKlassPointers::base() != NULL) {
5603       mov64(dst, (int64_t)CompressedKlassPointers::base());
5604       negq(dst);
5605       addq(dst, src);
5606     } else {
5607       movptr(dst, src);
5608     }
5609     if (CompressedKlassPointers::shift() != 0) {
5610       assert (LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
5611       shrq(dst, LogKlassAlignmentInBytes);
5612     }
5613   }
5614 }
5615 
5616 // Function instr_size_for_decode_klass_not_null() counts the instructions
5617 // generated by decode_klass_not_null(register r) and reinit_heapbase(),
5618 // when (Universe::heap() != NULL).  Hence, if the instructions they
5619 // generate change, then this method needs to be updated.
5620 int MacroAssembler::instr_size_for_decode_klass_not_null() {
5621   assert (UseCompressedClassPointers, "only for compressed klass ptrs");
5622   if (CompressedKlassPointers::base() != NULL) {
5623     // mov64 + addq + shlq? + mov64  (for reinit_heapbase()).
5624     return (CompressedKlassPointers::shift() == 0 ? 20 : 24);
5625   } else {
5626     // longest load decode klass function, mov64, leaq
5627     return 16;
5628   }
5629 }
5630 
5631 // !!! If the instructions that get generated here change then function
5632 // instr_size_for_decode_klass_not_null() needs to get updated.
5633 void  MacroAssembler::decode_klass_not_null(Register r) {
5634   // Note: it will change flags
5635   assert (UseCompressedClassPointers, "should only be used for compressed headers");
5636   assert(r != r12_heapbase, "Decoding a klass in r12");
5637   // Cannot assert, unverified entry point counts instructions (see .ad file)
5638   // vtableStubs also counts instructions in pd_code_size_limit.
5639   // Also do not verify_oop as this is called by verify_oop.
5640   if (CompressedKlassPointers::shift() != 0) {
5641     assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
5642     shlq(r, LogKlassAlignmentInBytes);
5643   }
5644   // Use r12 as a scratch register in which to temporarily load the narrow_klass_base.
5645   if (CompressedKlassPointers::base() != NULL) {
5646     mov64(r12_heapbase, (int64_t)CompressedKlassPointers::base());
5647     addq(r, r12_heapbase);
5648     reinit_heapbase();
5649   }
5650 }
5651 
5652 void  MacroAssembler::decode_klass_not_null(Register dst, Register src) {
5653   // Note: it will change flags
5654   assert (UseCompressedClassPointers, "should only be used for compressed headers");
5655   if (dst == src) {
5656     decode_klass_not_null(dst);
5657   } else {
5658     // Cannot assert, unverified entry point counts instructions (see .ad file)
5659     // vtableStubs also counts instructions in pd_code_size_limit.
5660     // Also do not verify_oop as this is called by verify_oop.
5661     mov64(dst, (int64_t)CompressedKlassPointers::base());
5662     if (CompressedKlassPointers::shift() != 0) {
5663       assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
5664       assert(LogKlassAlignmentInBytes == Address::times_8, "klass not aligned on 64bits?");
5665       leaq(dst, Address(dst, src, Address::times_8, 0));
5666     } else {
5667       addq(dst, src);
5668     }
5669   }
5670 }
5671 
5672 void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
5673   assert (UseCompressedOops, "should only be used for compressed headers");
5674   assert (Universe::heap() != NULL, "java heap should be initialized");
5675   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5676   int oop_index = oop_recorder()->find_index(obj);
5677   RelocationHolder rspec = oop_Relocation::spec(oop_index);
5678   mov_narrow_oop(dst, oop_index, rspec);
5679 }
5680 
5681 void  MacroAssembler::set_narrow_oop(Address dst, jobject obj) {
5682   assert (UseCompressedOops, "should only be used for compressed headers");
5683   assert (Universe::heap() != NULL, "java heap should be initialized");
5684   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5685   int oop_index = oop_recorder()->find_index(obj);
5686   RelocationHolder rspec = oop_Relocation::spec(oop_index);
5687   mov_narrow_oop(dst, oop_index, rspec);
5688 }
5689 
5690 void  MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
5691   assert (UseCompressedClassPointers, "should only be used for compressed headers");
5692   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5693   int klass_index = oop_recorder()->find_index(k);
5694   RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5695   mov_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5696 }
5697 
5698 void  MacroAssembler::set_narrow_klass(Address dst, Klass* k) {
5699   assert (UseCompressedClassPointers, "should only be used for compressed headers");
5700   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5701   int klass_index = oop_recorder()->find_index(k);
5702   RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5703   mov_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5704 }
5705 
5706 void  MacroAssembler::cmp_narrow_oop(Register dst, jobject obj) {
5707   assert (UseCompressedOops, "should only be used for compressed headers");
5708   assert (Universe::heap() != NULL, "java heap should be initialized");
5709   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5710   int oop_index = oop_recorder()->find_index(obj);
5711   RelocationHolder rspec = oop_Relocation::spec(oop_index);
5712   Assembler::cmp_narrow_oop(dst, oop_index, rspec);
5713 }
5714 
5715 void  MacroAssembler::cmp_narrow_oop(Address dst, jobject obj) {
5716   assert (UseCompressedOops, "should only be used for compressed headers");
5717   assert (Universe::heap() != NULL, "java heap should be initialized");
5718   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5719   int oop_index = oop_recorder()->find_index(obj);
5720   RelocationHolder rspec = oop_Relocation::spec(oop_index);
5721   Assembler::cmp_narrow_oop(dst, oop_index, rspec);
5722 }
5723 
5724 void  MacroAssembler::cmp_narrow_klass(Register dst, Klass* k) {
5725   assert (UseCompressedClassPointers, "should only be used for compressed headers");
5726   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5727   int klass_index = oop_recorder()->find_index(k);
5728   RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5729   Assembler::cmp_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5730 }
5731 
5732 void  MacroAssembler::cmp_narrow_klass(Address dst, Klass* k) {
5733   assert (UseCompressedClassPointers, "should only be used for compressed headers");
5734   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5735   int klass_index = oop_recorder()->find_index(k);
5736   RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5737   Assembler::cmp_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5738 }
5739 
5740 void MacroAssembler::reinit_heapbase() {
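       // r12_heapbase caches the base used for compressed oop/klass decoding.
       // Once the heap exists the base is a constant for this VM run and can be
       // loaded directly; before that it is reloaded indirectly from the
       // runtime's base variable.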
5741   if (UseCompressedOops || UseCompressedClassPointers) {
5742     if (Universe::heap() != NULL) {
5743       if (CompressedOops::base() == NULL) {
5744         MacroAssembler::xorptr(r12_heapbase, r12_heapbase);
5745       } else {
5746         mov64(r12_heapbase, (int64_t)CompressedOops::ptrs_base());
5747       }
5748     } else {
5749       movptr(r12_heapbase, ExternalAddress((address)CompressedOops::ptrs_base_addr()));
5750     }
5751   }
5752 }
5753 
5754 #endif // _LP64
5755 
5756 // C2 compiled method's prolog code.
5757 void MacroAssembler::verified_entry(Compile* C, int sp_inc) {
5758   int framesize = C->frame_size_in_bytes();
5759   int bangsize = C->bang_size_in_bytes();
5760   bool fp_mode_24b = C->in_24_bit_fp_mode();
5761   int stack_bang_size = C->need_stack_bang(bangsize) ? bangsize : 0;
5762 
5763   // WARNING: Initial instruction MUST be 5 bytes or longer so that
5764   // NativeJump::patch_verified_entry will be able to patch out the entry
5765   // code safely. The push to verify stack depth is ok at 5 bytes,
5766   // the frame allocation can be either 3 or 6 bytes. So if we don't do
5767   // stack bang then we must use the 6 byte frame allocation even if
5768   // we have no frame. :-(
5769   assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");
5770 
5771   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
5772   // Remove word for return addr
5773   framesize -= wordSize;
5774   stack_bang_size -= wordSize;
5775 
5776   // Calls to C2R adapters often do not accept exceptional returns.
5777   // We require that their callers must bang for them.  But be careful, because
5778   // some VM calls (such as call site linkage) can use several kilobytes of
5779   // stack.  But the stack safety zone should account for that.
5780   // See bugs 4446381, 4468289, 4497237.
5781   if (stack_bang_size > 0) {
5782     generate_stack_overflow_check(stack_bang_size);
5783 
5784     // We always push rbp, so that on return to interpreter rbp, will be
5785     // restored correctly and we can correct the stack.
5786     push(rbp);
5787     // Save caller's stack pointer into RBP if the frame pointer is preserved.
5788     if (PreserveFramePointer) {
5789       mov(rbp, rsp);
5790     }
5791     // Remove word for ebp
5792     framesize -= wordSize;
5793 
5794     // Create frame
5795     if (framesize) {
5796       subptr(rsp, framesize);
5797     }
5798   } else {
5799     // Create frame (force generation of a 4 byte immediate value)
5800     subptr_imm32(rsp, framesize);
5801 
5802     // Save RBP register now.
5803     framesize -= wordSize;
5804     movptr(Address(rsp, framesize), rbp);
5805     // Save caller's stack pointer into RBP if the frame pointer is preserved.
5806     if (PreserveFramePointer) {
5807       movptr(rbp, rsp);
5808       if (framesize > 0) {
5809         addptr(rbp, framesize);
5810       }
5811     }
5812   }
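       // Both paths above end with the same layout: the caller's return address,
       // the saved rbp immediately below it, and 'framesize' more bytes of frame,
       // with rsp at the bottom. With PreserveFramePointer, rbp points at the
       // saved-rbp slot in either case.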
5813 
5814   if (C->needs_stack_repair()) {
5815     // Save stack increment (also account for fixed framesize and rbp)
5816     assert((sp_inc & (StackAlignmentInBytes-1)) == 0, "stack increment not aligned");
5817     movptr(Address(rsp, C->sp_inc_offset()), sp_inc + framesize + wordSize);
5818   }
5819 
5820   if (VerifyStackAtCalls) { // Magic cookie to verify stack depth
5821     framesize -= wordSize;
5822     movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
5823   }
5824 
5825 #ifndef _LP64
5826   // If method sets FPU control word do it now
5827   if (fp_mode_24b) {
5828     fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_24()));
5829   }
5830   if (UseSSE >= 2 && VerifyFPU) {
5831     verify_FPU(0, "FPU stack must be clean on entry");
5832   }
5833 #endif
5834 
5835 #ifdef ASSERT
5836   if (VerifyStackAtCalls) {
5837     Label L;
5838     push(rax);
5839     mov(rax, rsp);
5840     andptr(rax, StackAlignmentInBytes-1);
5841     cmpptr(rax, StackAlignmentInBytes-wordSize);
5842     pop(rax);
5843     jcc(Assembler::equal, L);
5844     STOP("Stack is not properly aligned!");
5845     bind(L);
5846   }
5847 #endif
5848 }
5849 
5850 // clear memory of size 'cnt' qwords, starting at 'base' using XMM/YMM registers
5851 void MacroAssembler::xmm_clear_mem(Register base, Register cnt, Register val, XMMRegister xtmp) {
5852   // cnt - number of qwords (8-byte words).
5853   // base - start address, qword aligned.
5854   Label L_zero_64_bytes, L_loop, L_sloop, L_tail, L_end;
5855   movdq(xtmp, val);
5856   if (UseAVX >= 2) {
5857     punpcklqdq(xtmp, xtmp);
5858     vinserti128_high(xtmp, xtmp);
5859   } else {
5860     punpcklqdq(xtmp, xtmp);
5861   }
5862   jmp(L_zero_64_bytes);
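       // Loop structure: the main loop below clears 64 bytes per iteration, a
       // single 32-byte step handles four remaining qwords, and L_sloop clears
       // any leftover qwords one at a time.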
5863 
5864   BIND(L_loop);
5865   if (UseAVX >= 2) {
5866     vmovdqu(Address(base,  0), xtmp);
5867     vmovdqu(Address(base, 32), xtmp);
5868   } else {
5869     movdqu(Address(base,  0), xtmp);
5870     movdqu(Address(base, 16), xtmp);
5871     movdqu(Address(base, 32), xtmp);
5872     movdqu(Address(base, 48), xtmp);
5873   }
5874   addptr(base, 64);
5875 
5876   BIND(L_zero_64_bytes);
5877   subptr(cnt, 8);
5878   jccb(Assembler::greaterEqual, L_loop);
5879   addptr(cnt, 4);
5880   jccb(Assembler::less, L_tail);
5881   // Copy trailing 32 bytes
5882   if (UseAVX >= 2) {
5883     vmovdqu(Address(base, 0), xtmp);
5884   } else {
5885     movdqu(Address(base,  0), xtmp);
5886     movdqu(Address(base, 16), xtmp);
5887   }
5888   addptr(base, 32);
5889   subptr(cnt, 4);
5890 
5891   BIND(L_tail);
5892   addptr(cnt, 4);
5893   jccb(Assembler::lessEqual, L_end);
5894   decrement(cnt);
5895 
5896   BIND(L_sloop);
5897   movq(Address(base, 0), xtmp);
5898   addptr(base, 8);
5899   decrement(cnt);
5900   jccb(Assembler::greaterEqual, L_sloop);
5901   BIND(L_end);
5902 }
5903 
5904 int MacroAssembler::store_value_type_fields_to_buf(ciValueKlass* vk, bool from_interpreter) {
5905   // A value type might be returned. If fields are in registers we
5906   // need to allocate a value type instance and initialize it with
5907   // the values of the fields.
5908   Label skip;
5909   // We only need to allocate a new buffered value if one was not already returned
5910   testptr(rax, 1);
5911   jcc(Assembler::zero, skip);
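       // Bit 0 of rax distinguishes the two return shapes: if it is clear, rax
       // already holds an oop to a buffered value and nothing needs to be done;
       // if it is set, the fields are being returned in registers and a buffered
       // value must be allocated (in the interpreter case rax also carries the
       // tagged ValueKlass*, see below).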
5912   int call_offset = -1;
5913 
5914 #ifdef _LP64
5915   Label slow_case;
5916 
5917   // Try to allocate a new buffered value (from the heap)
5918   if (UseTLAB) {
5919     // FIXME -- for smaller code, the inline allocation (and the slow case) should be moved inside the pack handler.
5920     if (vk != NULL) {
5921       // Called from C1, where the return type is statically known.
5922       movptr(rbx, (intptr_t)vk->get_ValueKlass());
5923       jint lh = vk->layout_helper();
5924       assert(lh != Klass::_lh_neutral_value, "inline class in return type must have been resolved");
5925       movl(r14, lh);
5926     } else {
5927       // Call from interpreter. RAX contains ((the ValueKlass* of the return type) | 0x01)
5928       mov(rbx, rax);
5929       andptr(rbx, -2);
5930       movl(r14, Address(rbx, Klass::layout_helper_offset()));
5931     }
5932 
5933     movptr(r13, Address(r15_thread, in_bytes(JavaThread::tlab_top_offset())));
5934     lea(r14, Address(r13, r14, Address::times_1));
5935     cmpptr(r14, Address(r15_thread, in_bytes(JavaThread::tlab_end_offset())));
5936     jcc(Assembler::above, slow_case);
5937     movptr(Address(r15_thread, in_bytes(JavaThread::tlab_top_offset())), r14);
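         // Bump-pointer allocation: r13 is the old TLAB top (the new object) and
         // r14 = r13 + instance size taken from the layout helper; the check above
         // branched to slow_case if r14 exceeded the TLAB end, otherwise the TLAB
         // top has now been advanced to r14.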
5938     movptr(Address(r13, oopDesc::mark_offset_in_bytes()), (intptr_t)markWord::always_locked_prototype().value());
5939 
5940     xorl(rax, rax); // use zero reg to clear memory (shorter code)
5941     store_klass_gap(r13, rax);  // zero klass gap for compressed oops
5942 
5943     if (vk == NULL) {
5944       // store_klass corrupts rbx, so save it in rax for later use (interpreter case only).
5945       mov(rax, rbx);
5946     }
5947     store_klass(r13, rbx);  // klass
5948 
5949     // We have our new buffered value, initialize its fields with a
5950     // value class specific handler
5951     if (vk != NULL) {
5952       // FIXME -- do the packing in-line to avoid the runtime call
5953       mov(rax, r13);
5954       call(RuntimeAddress(vk->pack_handler())); // no need for call info as this will not safepoint.
5955     } else {
5956       movptr(rbx, Address(rax, InstanceKlass::adr_valueklass_fixed_block_offset()));
5957       movptr(rbx, Address(rbx, ValueKlass::pack_handler_offset()));
5958       mov(rax, r13);
5959       call(rbx);
5960     }
5961     jmp(skip);
5962   }
5963 
5964   bind(slow_case);
5965   // We failed to allocate a new value, fall back to a runtime
5966   // call. Some oop field may be live in some registers but we can't
5967   // tell. That runtime call will take care of preserving them
5968   // across a GC if there's one.
5969 #endif
5970 
5971   if (from_interpreter) {
5972     super_call_VM_leaf(StubRoutines::store_value_type_fields_to_buf());
5973   } else {
5974     call(RuntimeAddress(StubRoutines::store_value_type_fields_to_buf()));
5975     call_offset = offset();
5976   }
5977 
5978   bind(skip);
5979   return call_offset;
5980 }
5981 
5982 
5983 // Move a value between registers/stack slots and update the reg_state
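     // Roughly, reg_state[] works as follows: reg_readonly marks a slot that still
     // holds an unconsumed source value and must not be clobbered, reg_writable
     // marks a slot whose contents have been consumed (or were never live) and may
     // be used as a destination, and reg_written marks a slot that already holds
     // its final value.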
5984 bool MacroAssembler::move_helper(VMReg from, VMReg to, BasicType bt, RegState reg_state[], int ret_off, int extra_stack_offset) {
5985   if (reg_state[to->value()] == reg_written) {
5986     return true; // Already written
5987   }
5988   if (from != to && bt != T_VOID) {
5989     if (reg_state[to->value()] == reg_readonly) {
5990       return false; // Not yet writable
5991     }
5992     if (from->is_reg()) {
5993       if (to->is_reg()) {
5994         if (from->is_XMMRegister()) {
5995           if (bt == T_DOUBLE) {
5996             movdbl(to->as_XMMRegister(), from->as_XMMRegister());
5997           } else {
5998             assert(bt == T_FLOAT, "must be float");
5999             movflt(to->as_XMMRegister(), from->as_XMMRegister());
6000           }
6001         } else {
6002           movq(to->as_Register(), from->as_Register());
6003         }
6004       } else {
6005         int st_off = to->reg2stack() * VMRegImpl::stack_slot_size + extra_stack_offset;
6006         assert(st_off != ret_off, "overwriting return address at %d", st_off);
6007         Address to_addr = Address(rsp, st_off);
6008         if (from->is_XMMRegister()) {
6009           if (bt == T_DOUBLE) {
6010             movdbl(to_addr, from->as_XMMRegister());
6011           } else {
6012             assert(bt == T_FLOAT, "must be float");
6013             movflt(to_addr, from->as_XMMRegister());
6014           }
6015         } else {
6016           movq(to_addr, from->as_Register());
6017         }
6018       }
6019     } else {
6020       Address from_addr = Address(rsp, from->reg2stack() * VMRegImpl::stack_slot_size + extra_stack_offset);
6021       if (to->is_reg()) {
6022         if (to->is_XMMRegister()) {
6023           if (bt == T_DOUBLE) {
6024             movdbl(to->as_XMMRegister(), from_addr);
6025           } else {
6026             assert(bt == T_FLOAT, "must be float");
6027             movflt(to->as_XMMRegister(), from_addr);
6028           }
6029         } else {
6030           movq(to->as_Register(), from_addr);
6031         }
6032       } else {
6033         int st_off = to->reg2stack() * VMRegImpl::stack_slot_size + extra_stack_offset;
6034         assert(st_off != ret_off, "overwriting return address at %d", st_off);
6035         movq(r13, from_addr);
6036         movq(Address(rsp, st_off), r13);
6037       }
6038     }
6039   }
6040   // Update register states
6041   reg_state[from->value()] = reg_writable;
6042   reg_state[to->value()] = reg_written;
6043   return true;
6044 }
6045 
6046 // Read all fields from a value type oop and store the values in registers/stack slots
6047 bool MacroAssembler::unpack_value_helper(const GrowableArray<SigEntry>* sig, int& sig_index, VMReg from, VMRegPair* regs_to,
6048                                          int& to_index, RegState reg_state[], int ret_off, int extra_stack_offset) {
6049   Register fromReg = from->is_reg() ? from->as_Register() : noreg;
6050   assert(sig->at(sig_index)._bt == T_VOID, "should be at end delimiter");
6051 
6052   int vt = 1;
6053   bool done = true;
6054   bool mark_done = true;
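       // The extended signature is walked backwards from the trailing T_VOID
       // delimiter. 'vt' tracks value type nesting: a T_VOID entry that is not the
       // second half of a long/double closes a nested value type (vt++) and the
       // matching T_VALUETYPE entry opens it (vt--); the loop stops once the
       // outermost value type has been fully consumed (vt == 0).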
6055   do {
6056     sig_index--;
6057     BasicType bt = sig->at(sig_index)._bt;
6058     if (bt == T_VALUETYPE) {
6059       vt--;
6060     } else if (bt == T_VOID &&
6061                sig->at(sig_index-1)._bt != T_LONG &&
6062                sig->at(sig_index-1)._bt != T_DOUBLE) {
6063       vt++;
6064     } else if (SigEntry::is_reserved_entry(sig, sig_index)) {
6065       to_index--; // Ignore this
6066     } else {
6067       assert(to_index >= 0, "invalid to_index");
6068       VMRegPair pair_to = regs_to[to_index--];
6069       VMReg to = pair_to.first();
6070 
6071       if (bt == T_VOID) continue;
6072 
6073       int idx = (int)to->value();
6074       if (reg_state[idx] == reg_readonly) {
6075          if (idx != from->value()) {
6076            mark_done = false;
6077          }
6078          done = false;
6079          continue;
6080       } else if (reg_state[idx] == reg_written) {
6081         continue;
6082       } else {
6083         assert(reg_state[idx] == reg_writable, "must be writable");
6084         reg_state[idx] = reg_written;
6085        }
6086 
6087       if (fromReg == noreg) {
6088         int st_off = from->reg2stack() * VMRegImpl::stack_slot_size + extra_stack_offset;
6089         movq(r10, Address(rsp, st_off));
6090         fromReg = r10;
6091       }
6092 
6093       int off = sig->at(sig_index)._offset;
6094       assert(off > 0, "offset in object should be positive");
6095       bool is_oop = (bt == T_OBJECT || bt == T_ARRAY);
6096 
6097       Address fromAddr = Address(fromReg, off);
6098       bool is_signed = (bt != T_CHAR) && (bt != T_BOOLEAN);
6099       if (!to->is_XMMRegister()) {
6100         Register dst = to->is_stack() ? r13 : to->as_Register();
6101         if (is_oop) {
6102           load_heap_oop(dst, fromAddr);
6103         } else {
6104           load_sized_value(dst, fromAddr, type2aelembytes(bt), is_signed);
6105         }
6106         if (to->is_stack()) {
6107           int st_off = to->reg2stack() * VMRegImpl::stack_slot_size + extra_stack_offset;
6108           assert(st_off != ret_off, "overwriting return address at %d", st_off);
6109           movq(Address(rsp, st_off), dst);
6110         }
6111       } else {
6112         if (bt == T_DOUBLE) {
6113           movdbl(to->as_XMMRegister(), fromAddr);
6114         } else {
6115           assert(bt == T_FLOAT, "must be float");
6116           movflt(to->as_XMMRegister(), fromAddr);
6117         }
6118       }
6119     }
6120   } while (vt != 0);
6121   if (mark_done && reg_state[from->value()] != reg_written) {
6122     // This is okay because no one else will write to that slot
6123     reg_state[from->value()] = reg_writable;
6124   }
6125   return done;
6126 }
6127 
6128 // Pack fields back into a value type oop
6129 bool MacroAssembler::pack_value_helper(const GrowableArray<SigEntry>* sig, int& sig_index, int vtarg_index,
6130                                        VMReg to, VMRegPair* regs_from, int regs_from_count, int& from_index, RegState reg_state[],
6131                                        int ret_off, int extra_stack_offset) {
6132   assert(sig->at(sig_index)._bt == T_VALUETYPE, "should be at end delimiter");
6133   assert(to->is_valid(), "must be");
6134 
6135   if (reg_state[to->value()] == reg_written) {
6136     skip_unpacked_fields(sig, sig_index, regs_from, regs_from_count, from_index);
6137     return true; // Already written
6138   }
6139 
6140   Register val_array = rax;
6141   Register val_obj_tmp = r11;
6142   Register from_reg_tmp = r10;
6143   Register tmp1 = r14;
6144   Register tmp2 = r13;
6145   Register tmp3 = rbx;
6146   Register val_obj = to->is_stack() ? val_obj_tmp : to->as_Register();
6147 
6148   if (reg_state[to->value()] == reg_readonly) {
6149     if (!is_reg_in_unpacked_fields(sig, sig_index, to, regs_from, regs_from_count, from_index)) {
6150       skip_unpacked_fields(sig, sig_index, regs_from, regs_from_count, from_index);
6151       return false; // Not yet writable
6152     }
6153     val_obj = val_obj_tmp;
6154   }
6155 
6156   int index = arrayOopDesc::base_offset_in_bytes(T_OBJECT) + vtarg_index * type2aelembytes(T_VALUETYPE);
6157   load_heap_oop(val_obj, Address(val_array, index));
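       // Note: val_array (rax) is assumed to hold an object array of pre-allocated
       // buffered values, one element per value type argument; the load above
       // fetches the buffer for argument number vtarg_index.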
6158 
6159   ScalarizedValueArgsStream stream(sig, sig_index, regs_from, regs_from_count, from_index);
6160   VMRegPair from_pair;
6161   BasicType bt;
6162   while (stream.next(from_pair, bt)) {
6163     int off = sig->at(stream.sig_cc_index())._offset;
6164     assert(off > 0, "offset in object should be positive");
6165     bool is_oop = (bt == T_OBJECT || bt == T_ARRAY);
6166     size_t size_in_bytes = is_java_primitive(bt) ? type2aelembytes(bt) : wordSize;
6167 
6168     VMReg from_r1 = from_pair.first();
6169     VMReg from_r2 = from_pair.second();
6170 
6171     // Pack the scalarized field into the value object.
6172     Address dst(val_obj, off);
6173     if (!from_r1->is_XMMRegister()) {
6174       Register from_reg;
6175 
6176       if (from_r1->is_stack()) {
6177         from_reg = from_reg_tmp;
6178         int ld_off = from_r1->reg2stack() * VMRegImpl::stack_slot_size + extra_stack_offset;
6179         load_sized_value(from_reg, Address(rsp, ld_off), size_in_bytes, /* is_signed */ false);
6180       } else {
6181         from_reg = from_r1->as_Register();
6182       }
6183 
6184       if (is_oop) {
6185         DecoratorSet decorators = IN_HEAP | ACCESS_WRITE;
6186         store_heap_oop(dst, from_reg, tmp1, tmp2, tmp3, decorators);
6187       } else {
6188         store_sized_value(dst, from_reg, size_in_bytes);
6189       }
6190     } else {
6191       if (from_r2->is_valid()) {
6192         movdbl(dst, from_r1->as_XMMRegister());
6193       } else {
6194         movflt(dst, from_r1->as_XMMRegister());
6195       }
6196     }
6197     reg_state[from_r1->value()] = reg_writable;
6198   }
6199   sig_index = stream.sig_cc_index();
6200   from_index = stream.regs_cc_index();
6201 
6202   assert(reg_state[to->value()] == reg_writable, "must have already been read");
6203   bool success = move_helper(val_obj->as_VMReg(), to, T_OBJECT, reg_state, ret_off, extra_stack_offset);
6204   assert(success, "to register must be writeable");
6205 
6206   return true;
6207 }
6208 
6209 // Unpack all value type arguments passed as oops
6210 void MacroAssembler::unpack_value_args(Compile* C, bool receiver_only) {
6211   int sp_inc = unpack_value_args_common(C, receiver_only);
6212   // Emit code for verified entry and save increment for stack repair on return
6213   verified_entry(C, sp_inc);
6214 }
6215 
6216 int MacroAssembler::shuffle_value_args(bool is_packing, bool receiver_only, int extra_stack_offset,
6217                                        BasicType* sig_bt, const GrowableArray<SigEntry>* sig_cc,
6218                                        int args_passed, int args_on_stack, VMRegPair* regs,            // from
6219                                        int args_passed_to, int args_on_stack_to, VMRegPair* regs_to) { // to
6220   // Check if we need to extend the stack for packing/unpacking
6221   int sp_inc = (args_on_stack_to - args_on_stack) * VMRegImpl::stack_slot_size;
6222   if (sp_inc > 0) {
6223     sp_inc = align_up(sp_inc, StackAlignmentInBytes);
6224     if (!is_packing) {
6225       // Save the return address, adjust the stack (make sure it is properly
6226       // 16-byte aligned) and copy the return address to the new top of the stack.
6227       // (Note: C1 does this in C1_MacroAssembler::scalarized_entry).
6228       pop(r13);
6229       subptr(rsp, sp_inc);
6230       push(r13);
6231     }
6232   } else {
6233     // The scalarized calling convention needs less stack space than the unscalarized one.
6234     // No need to extend the stack, the caller will take care of these adjustments.
6235     sp_inc = 0;
6236   }
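       // Illustrative example (assuming 4-byte stack slots and 16-byte stack
       // alignment): if the target convention needs 6 more stack slots than the
       // source one, sp_inc = 24 and is rounded up to 32.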
6237 
6238   int ret_off; // make sure we don't overwrite the return address
6239   if (is_packing) {
6240     // For C1 code, the VVEP doesn't have reserved slots, so we store the return address at
6241     // rsp[0] during shuffling.
6242     ret_off = 0;
6243   } else {
6244     // C2 code ensures that sp_inc is a reserved slot.
6245     ret_off = sp_inc;
6246   }
6247 
6248   return shuffle_value_args_common(is_packing, receiver_only, extra_stack_offset,
6249                                    sig_bt, sig_cc,
6250                                    args_passed, args_on_stack, regs,
6251                                    args_passed_to, args_on_stack_to, regs_to,
6252                                    sp_inc, ret_off);
6253 }
6254 
6255 VMReg MacroAssembler::spill_reg_for(VMReg reg) {
6256   return reg->is_XMMRegister() ? xmm8->as_VMReg() : r14->as_VMReg();
6257 }
6258 
6259 // Restores the stack on return
6260 void MacroAssembler::restore_stack(Compile* C) {
6261   int framesize = C->frame_size_in_bytes();
6262   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
6263   // Remove word for return addr already pushed and RBP
6264   framesize -= 2*wordSize;
6265 
6266   if (C->needs_stack_repair()) {
6267     // Restore rbp and repair rsp by adding the stack increment
6268     movq(rbp, Address(rsp, framesize));
6269     addq(rsp, Address(rsp, C->sp_inc_offset()));
6270   } else {
6271     if (framesize > 0) {
6272       addq(rsp, framesize);
6273     }
6274     pop(rbp);
6275   }
6276 }
6277 
6278 void MacroAssembler::clear_mem(Register base, Register cnt, Register val, XMMRegister xtmp, bool is_large, bool word_copy_only) {
6279   // cnt - number of qwords (8-byte words).
6280   // base - start address, qword aligned.
6281   // is_large - if optimizers know cnt is larger than InitArrayShortSize
6282   assert(base==rdi, "base register must be edi for rep stos");
6283   assert(val==rax,   "val register must be eax for rep stos");
6284   assert(cnt==rcx,   "cnt register must be ecx for rep stos");
6285   assert(InitArrayShortSize % BytesPerLong == 0,
6286     "InitArrayShortSize should be a multiple of BytesPerLong");
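       // Strategy: counts up to InitArrayShortSize are cleared with a simple
       // pointer-sized store loop; larger counts use rep stosb (when allowed),
       // XMM/YMM stores or rep stos, depending on the flags checked below.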
6287 
6288   Label DONE;
6289 
6290   if (!is_large) {
6291     Label LOOP, LONG;
6292     cmpptr(cnt, InitArrayShortSize/BytesPerLong);
6293     jccb(Assembler::greater, LONG);
6294 
6295     NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words for 32-bit VM
6296 
6297     decrement(cnt);
6298     jccb(Assembler::negative, DONE); // Zero length
6299 
6300     // Use individual pointer-sized stores for small counts:
6301     BIND(LOOP);
6302     movptr(Address(base, cnt, Address::times_ptr), val);
6303     decrement(cnt);
6304     jccb(Assembler::greaterEqual, LOOP);
6305     jmpb(DONE);
6306 
6307     BIND(LONG);
6308   }
6309 
6310   // Use longer rep-prefixed ops for non-small counts:
6311   if (UseFastStosb && !word_copy_only) {
6312     shlptr(cnt, 3); // convert to number of bytes
6313     rep_stosb();
6314   } else if (UseXMMForObjInit) {
6315     xmm_clear_mem(base, cnt, val, xtmp);
6316   } else {
6317     NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words for 32-bit VM
6318     rep_stos();
6319   }
6320 
6321   BIND(DONE);
6322 }
6323 
6324 #ifdef COMPILER2
6325 
6326 // IndexOf for constant substrings with size >= 8 chars
6327 // which don't need to be loaded through stack.
6328 void MacroAssembler::string_indexofC8(Register str1, Register str2,
6329                                       Register cnt1, Register cnt2,
6330                                       int int_cnt2,  Register result,
6331                                       XMMRegister vec, Register tmp,
6332                                       int ae) {
6333   ShortBranchVerifier sbv(this);
6334   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
6335   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
6336 
6337   // This method uses the pcmpestri instruction with bound registers
6338   //   inputs:
6339   //     xmm - substring
6340   //     rax - substring length (elements count)
6341   //     mem - scanned string
6342   //     rdx - string length (elements count)
6343   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
6344   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
6345   //   outputs:
6346   //     rcx - matched index in string
6347   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
6348   int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
6349   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
6350   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
6351   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
6352 
6353   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
6354         RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
6355         MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
6356 
6357   // Note, inline_string_indexOf() generates checks:
6358   // if (substr.count > string.count) return -1;
6359   // if (substr.count == 0) return 0;
6360   assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
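       // pcmpestri in equal-ordered mode reports candidate matches of the substring
       // head within a 16-byte chunk: CF == 1 means a candidate was found and rcx
       // holds its element index, OF == 1 means the match starts at element 0 (see
       // the flag checks below).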
6361 
6362   // Load substring.
6363   if (ae == StrIntrinsicNode::UL) {
6364     pmovzxbw(vec, Address(str2, 0));
6365   } else {
6366     movdqu(vec, Address(str2, 0));
6367   }
6368   movl(cnt2, int_cnt2);
6369   movptr(result, str1); // string addr
6370 
6371   if (int_cnt2 > stride) {
6372     jmpb(SCAN_TO_SUBSTR);
6373 
6374     // Reload substr for rescan, this code
6375     // is executed only for large substrings (> 8 chars)
6376     bind(RELOAD_SUBSTR);
6377     if (ae == StrIntrinsicNode::UL) {
6378       pmovzxbw(vec, Address(str2, 0));
6379     } else {
6380       movdqu(vec, Address(str2, 0));
6381     }
6382     negptr(cnt2); // Jumped here with negative cnt2, convert to positive
6383 
6384     bind(RELOAD_STR);
6385     // We came here after the beginning of the substring was
6386     // matched but the rest of it was not, so we need to search
6387     // again. Start from the next element after the previous match.
6388 
6389     // cnt2 is the number of remaining substring elements and
6390     // cnt1 is the number of remaining string elements when the compare failed.
6391     // Restored cnt1 = cnt1 - cnt2 + int_cnt2
6392     subl(cnt1, cnt2);
6393     addl(cnt1, int_cnt2);
6394     movl(cnt2, int_cnt2); // Now restore cnt2
6395 
6396     decrementl(cnt1);     // Shift to next element
6397     cmpl(cnt1, cnt2);
6398     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
6399 
6400     addptr(result, (1<<scale1));
6401 
6402   } // (int_cnt2 > 8)
6403 
6404   // Scan string for start of substr in 16-byte vectors
6405   bind(SCAN_TO_SUBSTR);
6406   pcmpestri(vec, Address(result, 0), mode);
6407   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
6408   subl(cnt1, stride);
6409   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
6410   cmpl(cnt1, cnt2);
6411   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
6412   addptr(result, 16);
6413   jmpb(SCAN_TO_SUBSTR);
6414 
6415   // Found a potential substr
6416   bind(FOUND_CANDIDATE);
6417   // Matched whole vector if first element matched (tmp(rcx) == 0).
6418   if (int_cnt2 == stride) {
6419     jccb(Assembler::overflow, RET_FOUND);    // OF == 1
6420   } else { // int_cnt2 > 8
6421     jccb(Assembler::overflow, FOUND_SUBSTR);
6422   }
6423   // After pcmpestri tmp(rcx) contains matched element index
6424   // Compute start addr of substr
6425   lea(result, Address(result, tmp, scale1));
6426 
6427   // Make sure string is still long enough
6428   subl(cnt1, tmp);
6429   cmpl(cnt1, cnt2);
6430   if (int_cnt2 == stride) {
6431     jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
6432   } else { // int_cnt2 > 8
6433     jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
6434   }
6435   // Left less than substring.
6436 
6437   bind(RET_NOT_FOUND);
6438   movl(result, -1);
6439   jmp(EXIT);
6440 
6441   if (int_cnt2 > stride) {
6442     // This code is optimized for the case when whole substring
6443     // is matched if its head is matched.
6444     bind(MATCH_SUBSTR_HEAD);
6445     pcmpestri(vec, Address(result, 0), mode);
6446     // Reload only the string if it does not match
6447     jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
6448 
6449     Label CONT_SCAN_SUBSTR;
6450     // Compare the rest of substring (> 8 chars).
6451     bind(FOUND_SUBSTR);
6452     // First 8 chars are already matched.
6453     negptr(cnt2);
6454     addptr(cnt2, stride);
6455 
6456     bind(SCAN_SUBSTR);
6457     subl(cnt1, stride);
6458     cmpl(cnt2, -stride); // Do not read beyond substring
6459     jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
6460     // Back up strings to avoid reading beyond the substring:
6461     // cnt1 = cnt1 - cnt2 + 8
6462     addl(cnt1, cnt2); // cnt2 is negative
6463     addl(cnt1, stride);
6464     movl(cnt2, stride); negptr(cnt2);
6465     bind(CONT_SCAN_SUBSTR);
6466     if (int_cnt2 < (int)G) {
6467       int tail_off1 = int_cnt2<<scale1;
6468       int tail_off2 = int_cnt2<<scale2;
6469       if (ae == StrIntrinsicNode::UL) {
6470         pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
6471       } else {
6472         movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
6473       }
6474       pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
6475     } else {
6476       // calculate index in register to avoid integer overflow (int_cnt2*2)
6477       movl(tmp, int_cnt2);
6478       addptr(tmp, cnt2);
6479       if (ae == StrIntrinsicNode::UL) {
6480         pmovzxbw(vec, Address(str2, tmp, scale2, 0));
6481       } else {
6482         movdqu(vec, Address(str2, tmp, scale2, 0));
6483       }
6484       pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
6485     }
6486     // Need to reload string pointers if we did not match the whole vector
6487     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
6488     addptr(cnt2, stride);
6489     jcc(Assembler::negative, SCAN_SUBSTR);
6490     // Fall through if found full substring
6491 
6492   } // (int_cnt2 > 8)
6493 
6494   bind(RET_FOUND);
6495   // Found result if we matched full small substring.
6496   // Compute substr offset
6497   subptr(result, str1);
6498   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
6499     shrl(result, 1); // index
6500   }
6501   bind(EXIT);
6502 
6503 } // string_indexofC8
6504 
6505 // Small strings are loaded through the stack if they cross a page boundary.
6506 void MacroAssembler::string_indexof(Register str1, Register str2,
6507                                     Register cnt1, Register cnt2,
6508                                     int int_cnt2,  Register result,
6509                                     XMMRegister vec, Register tmp,
6510                                     int ae) {
6511   ShortBranchVerifier sbv(this);
6512   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
6513   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
6514 
6515   //
6516   // int_cnt2 is the length of a small (< 8 chars) constant substring
6517   // or (-1) for a non-constant substring, in which case its length
6518   // is in the cnt2 register.
6519   //
6520   // Note, inline_string_indexOf() generates checks:
6521   // if (substr.count > string.count) return -1;
6522   // if (substr.count == 0) return 0;
6523   //
6524   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
6525   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
6526   // This method uses the pcmpestri instruction with bound registers
6527   //   inputs:
6528   //     xmm - substring
6529   //     rax - substring length (elements count)
6530   //     mem - scanned string
6531   //     rdx - string length (elements count)
6532   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
6533   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
6534   //   outputs:
6535   //     rcx - matched index in string
6536   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
6537   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
6538   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
6539   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
6540 
6541   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
6542         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
6543         FOUND_CANDIDATE;
6544 
6545   { //========================================================
6546     // We don't know where these strings are located
6547     // and we can't read beyond them. Load them through the stack.
6548     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
6549 
6550     movptr(tmp, rsp); // save old SP
6551 
6552     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
6553       if (int_cnt2 == (1>>scale2)) { // One byte
6554         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
6555         load_unsigned_byte(result, Address(str2, 0));
6556         movdl(vec, result); // move 32 bits
6557       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
6558         // Not enough header space in 32-bit VM: 12+3 = 15.
6559         movl(result, Address(str2, -1));
6560         shrl(result, 8);
6561         movdl(vec, result); // move 32 bits
6562       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
6563         load_unsigned_short(result, Address(str2, 0));
6564         movdl(vec, result); // move 32 bits
6565       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
6566         movdl(vec, Address(str2, 0)); // move 32 bits
6567       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
6568         movq(vec, Address(str2, 0));  // move 64 bits
6569       } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7})
6570         // Array header size is 12 bytes in 32-bit VM
6571         // + 6 bytes for 3 chars == 18 bytes,
6572         // enough space to load vec and shift.
6573         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
6574         if (ae == StrIntrinsicNode::UL) {
6575           int tail_off = int_cnt2-8;
6576           pmovzxbw(vec, Address(str2, tail_off));
6577           psrldq(vec, -2*tail_off);
6578         }
6579         else {
6580           int tail_off = int_cnt2*(1<<scale2);
6581           movdqu(vec, Address(str2, tail_off-16));
6582           psrldq(vec, 16-tail_off);
6583         }
6584       }
6585     } else { // not constant substring
6586       cmpl(cnt2, stride);
6587       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
6588 
6589       // We can read beyond the string if str2+16 does not cross a page boundary
6590       // since heaps are aligned and mapped by pages.
6591       assert(os::vm_page_size() < (int)G, "default page should be small");
6592       movl(result, str2); // We need only low 32 bits
6593       andl(result, (os::vm_page_size()-1));
6594       cmpl(result, (os::vm_page_size()-16));
6595       jccb(Assembler::belowEqual, CHECK_STR);
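           // If the offset of str2 within its page is at most page_size-16, a
           // 16-byte load from str2 cannot cross into the next page and is safe;
           // otherwise fall through and copy the substring onto the stack first.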
6596 
6597       // Move small strings to the stack to allow loading 16 bytes into vec.
6598       subptr(rsp, 16);
6599       int stk_offset = wordSize-(1<<scale2);
6600       push(cnt2);
6601 
6602       bind(COPY_SUBSTR);
6603       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
6604         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
6605         movb(Address(rsp, cnt2, scale2, stk_offset), result);
6606       } else if (ae == StrIntrinsicNode::UU) {
6607         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
6608         movw(Address(rsp, cnt2, scale2, stk_offset), result);
6609       }
6610       decrement(cnt2);
6611       jccb(Assembler::notZero, COPY_SUBSTR);
6612 
6613       pop(cnt2);
6614       movptr(str2, rsp);  // New substring address
6615     } // non constant
6616 
6617     bind(CHECK_STR);
6618     cmpl(cnt1, stride);
6619     jccb(Assembler::aboveEqual, BIG_STRINGS);
6620 
6621     // Check cross page boundary.
6622     movl(result, str1); // We need only low 32 bits
6623     andl(result, (os::vm_page_size()-1));
6624     cmpl(result, (os::vm_page_size()-16));
6625     jccb(Assembler::belowEqual, BIG_STRINGS);
6626 
6627     subptr(rsp, 16);
6628     int stk_offset = -(1<<scale1);
6629     if (int_cnt2 < 0) { // not constant
6630       push(cnt2);
6631       stk_offset += wordSize;
6632     }
6633     movl(cnt2, cnt1);
6634 
6635     bind(COPY_STR);
6636     if (ae == StrIntrinsicNode::LL) {
6637       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
6638       movb(Address(rsp, cnt2, scale1, stk_offset), result);
6639     } else {
6640       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
6641       movw(Address(rsp, cnt2, scale1, stk_offset), result);
6642     }
6643     decrement(cnt2);
6644     jccb(Assembler::notZero, COPY_STR);
6645 
6646     if (int_cnt2 < 0) { // not constant
6647       pop(cnt2);
6648     }
6649     movptr(str1, rsp);  // New string address
6650 
6651     bind(BIG_STRINGS);
6652     // Load substring.
6653     if (int_cnt2 < 0) { // -1
6654       if (ae == StrIntrinsicNode::UL) {
6655         pmovzxbw(vec, Address(str2, 0));
6656       } else {
6657         movdqu(vec, Address(str2, 0));
6658       }
6659       push(cnt2);       // substr count
6660       push(str2);       // substr addr
6661       push(str1);       // string addr
6662     } else {
6663       // Small (< 8 chars) constant substrings are loaded already.
6664       movl(cnt2, int_cnt2);
6665     }
6666     push(tmp);  // original SP
6667 
6668   } // Finished loading
6669 
6670   //========================================================
6671   // Start search
6672   //
6673 
6674   movptr(result, str1); // string addr
6675 
6676   if (int_cnt2  < 0) {  // Only for non constant substring
6677     jmpb(SCAN_TO_SUBSTR);
6678 
6679     // SP saved at sp+0
6680     // String saved at sp+1*wordSize
6681     // Substr saved at sp+2*wordSize
6682     // Substr count saved at sp+3*wordSize
6683 
6684     // Reload substr for rescan, this code
6685     // is executed only for large substrings (> 8 chars)
6686     bind(RELOAD_SUBSTR);
6687     movptr(str2, Address(rsp, 2*wordSize));
6688     movl(cnt2, Address(rsp, 3*wordSize));
6689     if (ae == StrIntrinsicNode::UL) {
6690       pmovzxbw(vec, Address(str2, 0));
6691     } else {
6692       movdqu(vec, Address(str2, 0));
6693     }
6694     // We came here after the beginning of the substring was
6695     // matched but the rest of it was not, so we need to search
6696     // again. Start from the next element after the previous match.
6697     subptr(str1, result); // Restore counter
6698     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
6699       shrl(str1, 1);
6700     }
6701     addl(cnt1, str1);
6702     decrementl(cnt1);   // Shift to next element
6703     cmpl(cnt1, cnt2);
6704     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
6705 
6706     addptr(result, (1<<scale1));
6707   } // non constant
6708 
6709   // Scan string for start of substr in 16-byte vectors
6710   bind(SCAN_TO_SUBSTR);
6711   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
6712   pcmpestri(vec, Address(result, 0), mode);
6713   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
6714   subl(cnt1, stride);
6715   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
6716   cmpl(cnt1, cnt2);
6717   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
6718   addptr(result, 16);
6719 
6720   bind(ADJUST_STR);
6721   cmpl(cnt1, stride); // Do not read beyond string
6722   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
6723   // Back up the string pointer to avoid reading beyond the string.
6724   lea(result, Address(result, cnt1, scale1, -16));
6725   movl(cnt1, stride);
6726   jmpb(SCAN_TO_SUBSTR);
6727 
6728   // Found a potential substr
6729   bind(FOUND_CANDIDATE);
6730   // After pcmpestri tmp(rcx) contains matched element index
6731 
6732   // Make sure string is still long enough
6733   subl(cnt1, tmp);
6734   cmpl(cnt1, cnt2);
6735   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
6736   // Left less than substring.
6737 
6738   bind(RET_NOT_FOUND);
6739   movl(result, -1);
6740   jmp(CLEANUP);
6741 
6742   bind(FOUND_SUBSTR);
6743   // Compute start addr of substr
6744   lea(result, Address(result, tmp, scale1));
6745   if (int_cnt2 > 0) { // Constant substring
6746     // Repeat search for small substring (< 8 chars)
6747     // from new point without reloading substring.
6748     // Have to check that we don't read beyond string.
6749     cmpl(tmp, stride-int_cnt2);
6750     jccb(Assembler::greater, ADJUST_STR);
6751     // Fall through if matched whole substring.
6752   } else { // non constant
6753     assert(int_cnt2 == -1, "should be -1 for non-constant substring");
6754 
6755     addl(tmp, cnt2);
6756     // Found result if we matched whole substring.
6757     cmpl(tmp, stride);
6758     jcc(Assembler::lessEqual, RET_FOUND);
6759 
6760     // Repeat search for small substring (<= 8 chars)
6761     // from new point 'str1' without reloading substring.
6762     cmpl(cnt2, stride);
6763     // Have to check that we don't read beyond string.
6764     jccb(Assembler::lessEqual, ADJUST_STR);
6765 
6766     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
6767     // Compare the rest of substring (> 8 chars).
6768     movptr(str1, result);
6769 
6770     cmpl(tmp, cnt2);
6771     // First 8 chars are already matched.
6772     jccb(Assembler::equal, CHECK_NEXT);
6773 
6774     bind(SCAN_SUBSTR);
6775     pcmpestri(vec, Address(str1, 0), mode);
6776     // Need to reload string pointers if we did not match the whole vector
6777     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
6778 
6779     bind(CHECK_NEXT);
6780     subl(cnt2, stride);
6781     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
6782     addptr(str1, 16);
6783     if (ae == StrIntrinsicNode::UL) {
6784       addptr(str2, 8);
6785     } else {
6786       addptr(str2, 16);
6787     }
6788     subl(cnt1, stride);
6789     cmpl(cnt2, stride); // Do not read beyond substring
6790     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
6791     // Back up strings to avoid reading beyond the substring.
6792 
6793     if (ae == StrIntrinsicNode::UL) {
6794       lea(str2, Address(str2, cnt2, scale2, -8));
6795       lea(str1, Address(str1, cnt2, scale1, -16));
6796     } else {
6797       lea(str2, Address(str2, cnt2, scale2, -16));
6798       lea(str1, Address(str1, cnt2, scale1, -16));
6799     }
6800     subl(cnt1, cnt2);
6801     movl(cnt2, stride);
6802     addl(cnt1, stride);
6803     bind(CONT_SCAN_SUBSTR);
6804     if (ae == StrIntrinsicNode::UL) {
6805       pmovzxbw(vec, Address(str2, 0));
6806     } else {
6807       movdqu(vec, Address(str2, 0));
6808     }
6809     jmp(SCAN_SUBSTR);
6810 
6811     bind(RET_FOUND_LONG);
6812     movptr(str1, Address(rsp, wordSize));
6813   } // non constant
6814 
6815   bind(RET_FOUND);
6816   // Compute substr offset
6817   subptr(result, str1);
6818   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
6819     shrl(result, 1); // index
6820   }
6821   bind(CLEANUP);
6822   pop(rsp); // restore SP
6823 
6824 } // string_indexof
6825 
6826 void MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
6827                                          XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
6828   ShortBranchVerifier sbv(this);
6829   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
6830 
6831   int stride = 8;
6832 
6833   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
6834         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
6835         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
6836         FOUND_SEQ_CHAR, DONE_LABEL;
6837 
6838   movptr(result, str1);
6839   if (UseAVX >= 2) {
6840     cmpl(cnt1, stride);
6841     jcc(Assembler::less, SCAN_TO_CHAR);
6842     cmpl(cnt1, 2*stride);
6843     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
6844     movdl(vec1, ch);
6845     vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
6846     vpxor(vec2, vec2);
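         // vec1 now holds the searched char broadcast to every 16-bit lane and vec2
         // stays zero: vptest(vec2, vec3) below sets CF only when vec3 is all zero,
         // so carryClear signals that at least one lane matched.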
6847     movl(tmp, cnt1);
6848     andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
6849     andl(cnt1,0x0000000F);  //tail count (in chars)
6850 
6851     bind(SCAN_TO_16_CHAR_LOOP);
6852     vmovdqu(vec3, Address(result, 0));
6853     vpcmpeqw(vec3, vec3, vec1, 1);
6854     vptest(vec2, vec3);
6855     jcc(Assembler::carryClear, FOUND_CHAR);
6856     addptr(result, 32);
6857     subl(tmp, 2*stride);
6858     jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
6859     jmp(SCAN_TO_8_CHAR);
6860     bind(SCAN_TO_8_CHAR_INIT);
6861     movdl(vec1, ch);
6862     pshuflw(vec1, vec1, 0x00);
6863     pshufd(vec1, vec1, 0);
6864     pxor(vec2, vec2);
6865   }
6866   bind(SCAN_TO_8_CHAR);
6867   cmpl(cnt1, stride);
6868   jcc(Assembler::less, SCAN_TO_CHAR);
6869   if (UseAVX < 2) {
6870     movdl(vec1, ch);
6871     pshuflw(vec1, vec1, 0x00);
6872     pshufd(vec1, vec1, 0);
6873     pxor(vec2, vec2);
6874   }
6875   movl(tmp, cnt1);
6876   andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
6877   andl(cnt1,0x00000007);  //tail count (in chars)
6878 
6879   bind(SCAN_TO_8_CHAR_LOOP);
6880   movdqu(vec3, Address(result, 0));
6881   pcmpeqw(vec3, vec1);
6882   ptest(vec2, vec3);
6883   jcc(Assembler::carryClear, FOUND_CHAR);
6884   addptr(result, 16);
6885   subl(tmp, stride);
6886   jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
6887   bind(SCAN_TO_CHAR);
6888   testl(cnt1, cnt1);
6889   jcc(Assembler::zero, RET_NOT_FOUND);
6890   bind(SCAN_TO_CHAR_LOOP);
6891   load_unsigned_short(tmp, Address(result, 0));
6892   cmpl(ch, tmp);
6893   jccb(Assembler::equal, FOUND_SEQ_CHAR);
6894   addptr(result, 2);
6895   subl(cnt1, 1);
6896   jccb(Assembler::zero, RET_NOT_FOUND);
6897   jmp(SCAN_TO_CHAR_LOOP);
6898 
6899   bind(RET_NOT_FOUND);
6900   movl(result, -1);
6901   jmpb(DONE_LABEL);
6902 
6903   bind(FOUND_CHAR);
6904   if (UseAVX >= 2) {
6905     vpmovmskb(tmp, vec3);
6906   } else {
6907     pmovmskb(tmp, vec3);
6908   }
6909   bsfl(ch, tmp);
6910   addl(result, ch);
6911 
6912   bind(FOUND_SEQ_CHAR);
6913   subptr(result, str1);
6914   shrl(result, 1);
6915 
6916   bind(DONE_LABEL);
6917 } // string_indexof_char
6918 
6919 // helper function for string_compare
6920 void MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
6921                                         Address::ScaleFactor scale, Address::ScaleFactor scale1,
6922                                         Address::ScaleFactor scale2, Register index, int ae) {
6923   if (ae == StrIntrinsicNode::LL) {
6924     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
6925     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
6926   } else if (ae == StrIntrinsicNode::UU) {
6927     load_unsigned_short(elem1, Address(str1, index, scale, 0));
6928     load_unsigned_short(elem2, Address(str2, index, scale, 0));
6929   } else {
6930     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
6931     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
6932   }
6933 }
6934 
6935 // Compare strings, used for char[] and byte[].
6936 void MacroAssembler::string_compare(Register str1, Register str2,
6937                                     Register cnt1, Register cnt2, Register result,
6938                                     XMMRegister vec1, int ae) {
6939   ShortBranchVerifier sbv(this);
6940   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
6941   Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only _LP64 && AVX3
6942   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
6943   int stride2x2 = 0x40;
6944   Address::ScaleFactor scale = Address::no_scale;
6945   Address::ScaleFactor scale1 = Address::no_scale;
6946   Address::ScaleFactor scale2 = Address::no_scale;
6947 
6948   if (ae != StrIntrinsicNode::LL) {
6949     stride2x2 = 0x20;
6950   }
6951 
6952   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
6953     shrl(cnt2, 1);
6954   }
6955   // Compute the minimum of the string lengths and push the
6956   // difference of the string lengths onto the stack.
6957   // Use a conditional move to compute the minimum.
6958   movl(result, cnt1);
6959   subl(cnt1, cnt2);
6960   push(cnt1);
6961   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
6962 
6963   // Is the minimum length zero?
6964   testl(cnt2, cnt2);
6965   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
6966   if (ae == StrIntrinsicNode::LL) {
6967     // Load first bytes
6968     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
6969     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
6970   } else if (ae == StrIntrinsicNode::UU) {
6971     // Load first characters
6972     load_unsigned_short(result, Address(str1, 0));
6973     load_unsigned_short(cnt1, Address(str2, 0));
6974   } else {
6975     load_unsigned_byte(result, Address(str1, 0));
6976     load_unsigned_short(cnt1, Address(str2, 0));
6977   }
6978   subl(result, cnt1);
6979   jcc(Assembler::notZero,  POP_LABEL);
6980 
6981   if (ae == StrIntrinsicNode::UU) {
6982     // Divide length by 2 to get number of chars
6983     shrl(cnt2, 1);
6984   }
6985   cmpl(cnt2, 1);
6986   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
6987 
6988   // Check if the strings start at the same location and setup scale and stride
6989   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
6990     cmpptr(str1, str2);
6991     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
6992     if (ae == StrIntrinsicNode::LL) {
6993       scale = Address::times_1;
6994       stride = 16;
6995     } else {
6996       scale = Address::times_2;
6997       stride = 8;
6998     }
6999   } else {
7000     scale1 = Address::times_1;
7001     scale2 = Address::times_2;
7002     // scale not used
7003     stride = 8;
7004   }
7005 
7006   if (UseAVX >= 2 && UseSSE42Intrinsics) {
7007     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
7008     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
7009     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
7010     Label COMPARE_TAIL_LONG;
7011     Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only _LP64 && AVX3
7012 
7013     int pcmpmask = 0x19;
7014     if (ae == StrIntrinsicNode::LL) {
7015       pcmpmask &= ~0x01;
7016     }
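         // pcmpmask 0x19 selects an equal-each (string compare) aggregation with
         // negated result on unsigned words; clearing bit 0 (-> 0x18) switches the
         // element size to unsigned bytes for the LL case (see also the pcmpestri
         // notes in the SSE4.2-only path below).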
7017 
7018     // Set up to compare 16-char (32-byte) vectors,
7019     // starting from the first character again because it has an aligned address.
7020     if (ae == StrIntrinsicNode::LL) {
7021       stride2 = 32;
7022     } else {
7023       stride2 = 16;
7024     }
7025     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
7026       adr_stride = stride << scale;
7027     } else {
7028       adr_stride1 = 8;  //stride << scale1;
7029       adr_stride2 = 16; //stride << scale2;
7030     }
7031 
7032     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
7033     // rax and rdx are used by pcmpestri as element counters
7034     movl(result, cnt2);
7035     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
7036     jcc(Assembler::zero, COMPARE_TAIL_LONG);
7037 
7038     // fast path : compare first 2 8-char vectors.
7039     bind(COMPARE_16_CHARS);
7040     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
7041       movdqu(vec1, Address(str1, 0));
7042     } else {
7043       pmovzxbw(vec1, Address(str1, 0));
7044     }
7045     pcmpestri(vec1, Address(str2, 0), pcmpmask);
7046     jccb(Assembler::below, COMPARE_INDEX_CHAR);
7047 
7048     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
7049       movdqu(vec1, Address(str1, adr_stride));
7050       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
7051     } else {
7052       pmovzxbw(vec1, Address(str1, adr_stride1));
7053       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
7054     }
7055     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
7056     addl(cnt1, stride);
7057 
7058     // Compare the characters at index in cnt1
7059     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
7060     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
7061     subl(result, cnt2);
7062     jmp(POP_LABEL);
7063 
7064     // Setup the registers to start vector comparison loop
7065     bind(COMPARE_WIDE_VECTORS);
7066     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
7067       lea(str1, Address(str1, result, scale));
7068       lea(str2, Address(str2, result, scale));
7069     } else {
7070       lea(str1, Address(str1, result, scale1));
7071       lea(str2, Address(str2, result, scale2));
7072     }
7073     subl(result, stride2);
7074     subl(cnt2, stride2);
7075     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
7076     negptr(result);
7077 
7078     // In a loop, compare 16 chars (32 bytes) at once using (vpxor+vptest)
7079     bind(COMPARE_WIDE_VECTORS_LOOP);
7080 
7081 #ifdef _LP64
7082     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
7083       cmpl(cnt2, stride2x2);
7084       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
7085       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
7086       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40
7087 
7088       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
7089       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
7090         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
7091         evpcmpeqb(k7, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
7092       } else {
7093         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
7094         evpcmpeqb(k7, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
7095       }
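           // kortestql sets CF only when k7 is all ones, i.e. every element in the
           // chunk compared equal; carryClear (aboveEqual) therefore means at least
           // one element differed.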
7096       kortestql(k7, k7);
7097       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
7098       addptr(result, stride2x2);  // update since we already compared at this addr
7099       subl(cnt2, stride2x2);      // and sub the size too
7100       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
7101 
7102       vpxor(vec1, vec1);
7103       jmpb(COMPARE_WIDE_TAIL);
7104     }//if (VM_Version::supports_avx512vlbw())
7105 #endif // _LP64
7106 
7107 
7108     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
7109     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
7110       vmovdqu(vec1, Address(str1, result, scale));
7111       vpxor(vec1, Address(str2, result, scale));
7112     } else {
7113       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
7114       vpxor(vec1, Address(str2, result, scale2));
7115     }
7116     vptest(vec1, vec1);
7117     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
7118     addptr(result, stride2);
7119     subl(cnt2, stride2);
7120     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
7121     // clean upper bits of YMM registers
7122     vpxor(vec1, vec1);
7123 
7124     // compare wide vectors tail
7125     bind(COMPARE_WIDE_TAIL);
7126     testptr(result, result);
7127     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
7128 
7129     movl(result, stride2);
7130     movl(cnt2, result);
7131     negptr(result);
7132     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
7133 
    // Identify the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
7135     bind(VECTOR_NOT_EQUAL);
7136     // clean upper bits of YMM registers
7137     vpxor(vec1, vec1);
7138     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
7139       lea(str1, Address(str1, result, scale));
7140       lea(str2, Address(str2, result, scale));
7141     } else {
7142       lea(str1, Address(str1, result, scale1));
7143       lea(str2, Address(str2, result, scale2));
7144     }
7145     jmp(COMPARE_16_CHARS);
7146 
    // Compare tail chars, length between 1 and 15 chars
7148     bind(COMPARE_TAIL_LONG);
7149     movl(cnt2, result);
7150     cmpl(cnt2, stride);
7151     jcc(Assembler::less, COMPARE_SMALL_STR);
7152 
7153     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
7154       movdqu(vec1, Address(str1, 0));
7155     } else {
7156       pmovzxbw(vec1, Address(str1, 0));
7157     }
7158     pcmpestri(vec1, Address(str2, 0), pcmpmask);
7159     jcc(Assembler::below, COMPARE_INDEX_CHAR);
7160     subptr(cnt2, stride);
7161     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
7162     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
7163       lea(str1, Address(str1, result, scale));
7164       lea(str2, Address(str2, result, scale));
7165     } else {
7166       lea(str1, Address(str1, result, scale1));
7167       lea(str2, Address(str2, result, scale2));
7168     }
7169     negptr(cnt2);
7170     jmpb(WHILE_HEAD_LABEL);
7171 
7172     bind(COMPARE_SMALL_STR);
7173   } else if (UseSSE42Intrinsics) {
7174     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
7175     int pcmpmask = 0x19;
7176     // Setup to compare 8-char (16-byte) vectors,
7177     // start from first character again because it has aligned address.
7178     movl(result, cnt2);
7179     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
7180     if (ae == StrIntrinsicNode::LL) {
7181       pcmpmask &= ~0x01;
7182     }
7183     jcc(Assembler::zero, COMPARE_TAIL);
7184     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
7185       lea(str1, Address(str1, result, scale));
7186       lea(str2, Address(str2, result, scale));
7187     } else {
7188       lea(str1, Address(str1, result, scale1));
7189       lea(str2, Address(str2, result, scale2));
7190     }
7191     negptr(result);
7192 
7193     // pcmpestri
7194     //   inputs:
    //     vec1 - substring
7196     //     rax - negative string length (elements count)
7197     //     mem - scanned string
7198     //     rdx - string length (elements count)
7199     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
7200     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
7201     //   outputs:
7202     //     rcx - first mismatched element index
7203     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
7204 
7205     bind(COMPARE_WIDE_VECTORS);
7206     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
7207       movdqu(vec1, Address(str1, result, scale));
7208       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
7209     } else {
7210       pmovzxbw(vec1, Address(str1, result, scale1));
7211       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
7212     }
7213     // After pcmpestri cnt1(rcx) contains mismatched element index
7214 
7215     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
7216     addptr(result, stride);
7217     subptr(cnt2, stride);
7218     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
7219 
7220     // compare wide vectors tail
7221     testptr(result, result);
7222     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
7223 
7224     movl(cnt2, stride);
7225     movl(result, stride);
7226     negptr(result);
7227     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
7228       movdqu(vec1, Address(str1, result, scale));
7229       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
7230     } else {
7231       pmovzxbw(vec1, Address(str1, result, scale1));
7232       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
7233     }
7234     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
7235 
7236     // Mismatched characters in the vectors
7237     bind(VECTOR_NOT_EQUAL);
7238     addptr(cnt1, result);
7239     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
7240     subl(result, cnt2);
7241     jmpb(POP_LABEL);
7242 
7243     bind(COMPARE_TAIL); // limit is zero
7244     movl(cnt2, result);
7245     // Fallthru to tail compare
7246   }
7247   // Shift str2 and str1 to the end of the arrays, negate min
7248   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
7249     lea(str1, Address(str1, cnt2, scale));
7250     lea(str2, Address(str2, cnt2, scale));
7251   } else {
7252     lea(str1, Address(str1, cnt2, scale1));
7253     lea(str2, Address(str2, cnt2, scale2));
7254   }
7255   decrementl(cnt2);  // first character was compared already
7256   negptr(cnt2);
7257 
7258   // Compare the rest of the elements
7259   bind(WHILE_HEAD_LABEL);
7260   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
7261   subl(result, cnt1);
7262   jccb(Assembler::notZero, POP_LABEL);
7263   increment(cnt2);
7264   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
7265 
7266   // Strings are equal up to min length.  Return the length difference.
7267   bind(LENGTH_DIFF_LABEL);
7268   pop(result);
7269   if (ae == StrIntrinsicNode::UU) {
7270     // Divide diff by 2 to get number of chars
7271     sarl(result, 1);
7272   }
7273   jmpb(DONE_LABEL);
7274 
7275 #ifdef _LP64
7276   if (VM_Version::supports_avx512vlbw()) {
7277 
7278     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
7279 
7280     kmovql(cnt1, k7);
7281     notq(cnt1);
7282     bsfq(cnt2, cnt1);
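    // cnt2 now holds the position of the first cleared bit of k7, i.e. the byte
    // index of the first mismatching element within the 64-byte compare.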
7283     if (ae != StrIntrinsicNode::LL) {
7284       // Divide diff by 2 to get number of chars
7285       sarl(cnt2, 1);
7286     }
7287     addq(result, cnt2);
7288     if (ae == StrIntrinsicNode::LL) {
7289       load_unsigned_byte(cnt1, Address(str2, result));
7290       load_unsigned_byte(result, Address(str1, result));
7291     } else if (ae == StrIntrinsicNode::UU) {
7292       load_unsigned_short(cnt1, Address(str2, result, scale));
7293       load_unsigned_short(result, Address(str1, result, scale));
7294     } else {
7295       load_unsigned_short(cnt1, Address(str2, result, scale2));
7296       load_unsigned_byte(result, Address(str1, result, scale1));
7297     }
7298     subl(result, cnt1);
7299     jmpb(POP_LABEL);
7300   }//if (VM_Version::supports_avx512vlbw())
7301 #endif // _LP64
7302 
7303   // Discard the stored length difference
7304   bind(POP_LABEL);
7305   pop(cnt1);
7306 
7307   // That's it
7308   bind(DONE_LABEL);
7309   if(ae == StrIntrinsicNode::UL) {
7310     negl(result);
7311   }
7312 
7313 }
7314 
7315 // Search for Non-ASCII character (Negative byte value) in a byte array,
7316 // return true if it has any and false otherwise.
7317 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
7318 //   @HotSpotIntrinsicCandidate
7319 //   private static boolean hasNegatives(byte[] ba, int off, int len) {
7320 //     for (int i = off; i < off + len; i++) {
7321 //       if (ba[i] < 0) {
7322 //         return true;
7323 //       }
7324 //     }
7325 //     return false;
7326 //   }
7327 void MacroAssembler::has_negatives(Register ary1, Register len,
7328   Register result, Register tmp1,
7329   XMMRegister vec1, XMMRegister vec2) {
7330   // rsi: byte array
7331   // rcx: len
7332   // rax: result
7333   ShortBranchVerifier sbv(this);
7334   assert_different_registers(ary1, len, result, tmp1);
7335   assert_different_registers(vec1, vec2);
7336   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
7337 
7338   // len == 0
7339   testl(len, len);
7340   jcc(Assembler::zero, FALSE_LABEL);
7341 
7342   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
7343     VM_Version::supports_avx512vlbw() &&
7344     VM_Version::supports_bmi2()) {
7345 
7346     Label test_64_loop, test_tail;
7347     Register tmp3_aliased = len;
7348 
7349     movl(tmp1, len);
7350     vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
7351 
7352     andl(tmp1, 64 - 1);   // tail count (in chars) 0x3F
7353     andl(len, ~(64 - 1));    // vector count (in chars)
7354     jccb(Assembler::zero, test_tail);
7355 
7356     lea(ary1, Address(ary1, len, Address::times_1));
7357     negptr(len);
7358 
7359     bind(test_64_loop);
    // Check whether any of the 64 byte-sized elements is negative
7361     evpcmpgtb(k2, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
7362     kortestql(k2, k2);
7363     jcc(Assembler::notZero, TRUE_LABEL);
7364 
7365     addptr(len, 64);
7366     jccb(Assembler::notZero, test_64_loop);
7367 
7368 
7369     bind(test_tail);
7370     // bail out when there is nothing to be done
7371     testl(tmp1, -1);
7372     jcc(Assembler::zero, FALSE_LABEL);
7373 
7374     // ~(~0 << len) applied up to two times (for 32-bit scenario)
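    // e.g. a tail count of 5 yields ~(~0 << 5) == 0x1F: the five least
    // significant mask bits select the valid tail bytes.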
7375 #ifdef _LP64
7376     mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
7377     shlxq(tmp3_aliased, tmp3_aliased, tmp1);
7378     notq(tmp3_aliased);
7379     kmovql(k3, tmp3_aliased);
7380 #else
7381     Label k_init;
7382     jmp(k_init);
7383 
    // We cannot load 64 bits of mask data from a general purpose register here,
    // so the data required to compose the 64 mask bits is placed in the
    // instruction stream: a 64-byte series of values 0..63 that is later used
    // as the compare target against the tail count held in tmp1. The result is
    // a k register with tmp1 consecutive 1's counting from the least
    // significant bit.
7390     address tmp = pc();
7391     emit_int64(0x0706050403020100);
7392     emit_int64(0x0F0E0D0C0B0A0908);
7393     emit_int64(0x1716151413121110);
7394     emit_int64(0x1F1E1D1C1B1A1918);
7395     emit_int64(0x2726252423222120);
7396     emit_int64(0x2F2E2D2C2B2A2928);
7397     emit_int64(0x3736353433323130);
7398     emit_int64(0x3F3E3D3C3B3A3938);
7399 
7400     bind(k_init);
7401     lea(len, InternalAddress(tmp));
7402     // create mask to test for negative byte inside a vector
7403     evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
7404     evpcmpgtb(k3, vec1, Address(len, 0), Assembler::AVX_512bit);
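    // e.g. tmp1 == 5 broadcast into vec1 and compared greater-than against the
    // 0..63 table sets exactly the five least significant bits of k3.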
7405 
7406 #endif
7407     evpcmpgtb(k2, k3, vec2, Address(ary1, 0), Assembler::AVX_512bit);
7408     ktestq(k2, k3);
7409     jcc(Assembler::notZero, TRUE_LABEL);
7410 
7411     jmp(FALSE_LABEL);
7412   } else {
7413     movl(result, len); // copy
7414 
7415     if (UseAVX >= 2 && UseSSE >= 2) {
7416       // With AVX2, use 32-byte vector compare
7417       Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
7418 
7419       // Compare 32-byte vectors
7420       andl(result, 0x0000001f);  //   tail count (in bytes)
7421       andl(len, 0xffffffe0);   // vector count (in bytes)
7422       jccb(Assembler::zero, COMPARE_TAIL);
7423 
7424       lea(ary1, Address(ary1, len, Address::times_1));
7425       negptr(len);
7426 
7427       movl(tmp1, 0x80808080);   // create mask to test for Unicode chars in vector
7428       movdl(vec2, tmp1);
7429       vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
7430 
7431       bind(COMPARE_WIDE_VECTORS);
7432       vmovdqu(vec1, Address(ary1, len, Address::times_1));
7433       vptest(vec1, vec2);
7434       jccb(Assembler::notZero, TRUE_LABEL);
7435       addptr(len, 32);
7436       jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
7437 
7438       testl(result, result);
7439       jccb(Assembler::zero, FALSE_LABEL);
7440 
7441       vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
7442       vptest(vec1, vec2);
7443       jccb(Assembler::notZero, TRUE_LABEL);
7444       jmpb(FALSE_LABEL);
7445 
7446       bind(COMPARE_TAIL); // len is zero
7447       movl(len, result);
7448       // Fallthru to tail compare
7449     } else if (UseSSE42Intrinsics) {
7450       // With SSE4.2, use double quad vector compare
7451       Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
7452 
7453       // Compare 16-byte vectors
7454       andl(result, 0x0000000f);  //   tail count (in bytes)
7455       andl(len, 0xfffffff0);   // vector count (in bytes)
7456       jcc(Assembler::zero, COMPARE_TAIL);
7457 
7458       lea(ary1, Address(ary1, len, Address::times_1));
7459       negptr(len);
7460 
7461       movl(tmp1, 0x80808080);
7462       movdl(vec2, tmp1);
7463       pshufd(vec2, vec2, 0);
7464 
7465       bind(COMPARE_WIDE_VECTORS);
7466       movdqu(vec1, Address(ary1, len, Address::times_1));
7467       ptest(vec1, vec2);
7468       jcc(Assembler::notZero, TRUE_LABEL);
7469       addptr(len, 16);
7470       jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
7471 
7472       testl(result, result);
7473       jcc(Assembler::zero, FALSE_LABEL);
7474 
7475       movdqu(vec1, Address(ary1, result, Address::times_1, -16));
7476       ptest(vec1, vec2);
7477       jccb(Assembler::notZero, TRUE_LABEL);
7478       jmpb(FALSE_LABEL);
7479 
7480       bind(COMPARE_TAIL); // len is zero
7481       movl(len, result);
7482       // Fallthru to tail compare
7483     }
7484   }
7485   // Compare 4-byte vectors
7486   andl(len, 0xfffffffc); // vector count (in bytes)
7487   jccb(Assembler::zero, COMPARE_CHAR);
7488 
7489   lea(ary1, Address(ary1, len, Address::times_1));
7490   negptr(len);
7491 
7492   bind(COMPARE_VECTORS);
7493   movl(tmp1, Address(ary1, len, Address::times_1));
7494   andl(tmp1, 0x80808080);
7495   jccb(Assembler::notZero, TRUE_LABEL);
7496   addptr(len, 4);
7497   jcc(Assembler::notZero, COMPARE_VECTORS);
7498 
7499   // Compare trailing char (final 2 bytes), if any
7500   bind(COMPARE_CHAR);
7501   testl(result, 0x2);   // tail  char
7502   jccb(Assembler::zero, COMPARE_BYTE);
7503   load_unsigned_short(tmp1, Address(ary1, 0));
7504   andl(tmp1, 0x00008080);
7505   jccb(Assembler::notZero, TRUE_LABEL);
7506   subptr(result, 2);
7507   lea(ary1, Address(ary1, 2));
7508 
7509   bind(COMPARE_BYTE);
7510   testl(result, 0x1);   // tail  byte
7511   jccb(Assembler::zero, FALSE_LABEL);
7512   load_unsigned_byte(tmp1, Address(ary1, 0));
7513   andl(tmp1, 0x00000080);
7514   jccb(Assembler::notEqual, TRUE_LABEL);
7515   jmpb(FALSE_LABEL);
7516 
7517   bind(TRUE_LABEL);
7518   movl(result, 1);   // return true
7519   jmpb(DONE);
7520 
7521   bind(FALSE_LABEL);
7522   xorl(result, result); // return false
7523 
7524   // That's it
7525   bind(DONE);
7526   if (UseAVX >= 2 && UseSSE >= 2) {
7527     // clean upper bits of YMM registers
7528     vpxor(vec1, vec1);
7529     vpxor(vec2, vec2);
7530   }
7531 }
7532 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
7533 void MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
7534                                    Register limit, Register result, Register chr,
7535                                    XMMRegister vec1, XMMRegister vec2, bool is_char) {
7536   ShortBranchVerifier sbv(this);
7537   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
7538 
7539   int length_offset  = arrayOopDesc::length_offset_in_bytes();
7540   int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
7541 
7542   if (is_array_equ) {
7543     // Check the input args
7544     cmpoop(ary1, ary2);
7545     jcc(Assembler::equal, TRUE_LABEL);
7546 
7547     // Need additional checks for arrays_equals.
7548     testptr(ary1, ary1);
7549     jcc(Assembler::zero, FALSE_LABEL);
7550     testptr(ary2, ary2);
7551     jcc(Assembler::zero, FALSE_LABEL);
7552 
7553     // Check the lengths
7554     movl(limit, Address(ary1, length_offset));
7555     cmpl(limit, Address(ary2, length_offset));
7556     jcc(Assembler::notEqual, FALSE_LABEL);
7557   }
7558 
7559   // count == 0
7560   testl(limit, limit);
7561   jcc(Assembler::zero, TRUE_LABEL);
7562 
7563   if (is_array_equ) {
7564     // Load array address
7565     lea(ary1, Address(ary1, base_offset));
7566     lea(ary2, Address(ary2, base_offset));
7567   }
7568 
7569   if (is_array_equ && is_char) {
    // arrays_equals when used for char[]: convert the element count to a byte count.
7571     shll(limit, 1);      // byte count != 0
7572   }
7573   movl(result, limit); // copy
7574 
7575   if (UseAVX >= 2) {
7576     // With AVX2, use 32-byte vector compare
7577     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
7578 
7579     // Compare 32-byte vectors
7580     andl(result, 0x0000001f);  //   tail count (in bytes)
7581     andl(limit, 0xffffffe0);   // vector count (in bytes)
7582     jcc(Assembler::zero, COMPARE_TAIL);
7583 
7584     lea(ary1, Address(ary1, limit, Address::times_1));
7585     lea(ary2, Address(ary2, limit, Address::times_1));
7586     negptr(limit);
7587 
7588 #ifdef _LP64
7589     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
7590       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
7591 
7592       cmpl(limit, -64);
7593       jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
7594 
7595       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
7596 
7597       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
7598       evpcmpeqb(k7, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
7599       kortestql(k7, k7);
7600       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
7601       addptr(limit, 64);  // update since we already compared at this addr
7602       cmpl(limit, -64);
7603       jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
7604 
7605       // At this point we may still need to compare -limit+result bytes.
      // We could execute the next two instructions and just continue via the non-wide path:
7607       //  cmpl(limit, 0);
7608       //  jcc(Assembler::equal, COMPARE_TAIL);  // true
7609       // But since we stopped at the points ary{1,2}+limit which are
7610       // not farther than 64 bytes from the ends of arrays ary{1,2}+result
7611       // (|limit| <= 32 and result < 32),
7612       // we may just compare the last 64 bytes.
7613       //
      addptr(result, -64);   // this is safe because we just compared this area
7615       evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
7616       evpcmpeqb(k7, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
7617       kortestql(k7, k7);
7618       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
7619 
7620       jmp(TRUE_LABEL);
7621 
7622       bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
7623 
7624     }//if (VM_Version::supports_avx512vlbw())
7625 #endif //_LP64
7626     bind(COMPARE_WIDE_VECTORS);
7627     vmovdqu(vec1, Address(ary1, limit, Address::times_1));
7628     vmovdqu(vec2, Address(ary2, limit, Address::times_1));
7629     vpxor(vec1, vec2);
7630 
7631     vptest(vec1, vec1);
7632     jcc(Assembler::notZero, FALSE_LABEL);
7633     addptr(limit, 32);
7634     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
7635 
7636     testl(result, result);
7637     jcc(Assembler::zero, TRUE_LABEL);
7638 
7639     vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
7640     vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
7641     vpxor(vec1, vec2);
7642 
7643     vptest(vec1, vec1);
7644     jccb(Assembler::notZero, FALSE_LABEL);
7645     jmpb(TRUE_LABEL);
7646 
7647     bind(COMPARE_TAIL); // limit is zero
7648     movl(limit, result);
7649     // Fallthru to tail compare
7650   } else if (UseSSE42Intrinsics) {
7651     // With SSE4.2, use double quad vector compare
7652     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
7653 
7654     // Compare 16-byte vectors
7655     andl(result, 0x0000000f);  //   tail count (in bytes)
7656     andl(limit, 0xfffffff0);   // vector count (in bytes)
7657     jcc(Assembler::zero, COMPARE_TAIL);
7658 
7659     lea(ary1, Address(ary1, limit, Address::times_1));
7660     lea(ary2, Address(ary2, limit, Address::times_1));
7661     negptr(limit);
7662 
7663     bind(COMPARE_WIDE_VECTORS);
7664     movdqu(vec1, Address(ary1, limit, Address::times_1));
7665     movdqu(vec2, Address(ary2, limit, Address::times_1));
7666     pxor(vec1, vec2);
7667 
7668     ptest(vec1, vec1);
7669     jcc(Assembler::notZero, FALSE_LABEL);
7670     addptr(limit, 16);
7671     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
7672 
7673     testl(result, result);
7674     jcc(Assembler::zero, TRUE_LABEL);
7675 
7676     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
7677     movdqu(vec2, Address(ary2, result, Address::times_1, -16));
7678     pxor(vec1, vec2);
7679 
7680     ptest(vec1, vec1);
7681     jccb(Assembler::notZero, FALSE_LABEL);
7682     jmpb(TRUE_LABEL);
7683 
7684     bind(COMPARE_TAIL); // limit is zero
7685     movl(limit, result);
7686     // Fallthru to tail compare
7687   }
7688 
7689   // Compare 4-byte vectors
7690   andl(limit, 0xfffffffc); // vector count (in bytes)
7691   jccb(Assembler::zero, COMPARE_CHAR);
7692 
7693   lea(ary1, Address(ary1, limit, Address::times_1));
7694   lea(ary2, Address(ary2, limit, Address::times_1));
7695   negptr(limit);
7696 
7697   bind(COMPARE_VECTORS);
7698   movl(chr, Address(ary1, limit, Address::times_1));
7699   cmpl(chr, Address(ary2, limit, Address::times_1));
7700   jccb(Assembler::notEqual, FALSE_LABEL);
7701   addptr(limit, 4);
7702   jcc(Assembler::notZero, COMPARE_VECTORS);
7703 
7704   // Compare trailing char (final 2 bytes), if any
7705   bind(COMPARE_CHAR);
7706   testl(result, 0x2);   // tail  char
7707   jccb(Assembler::zero, COMPARE_BYTE);
7708   load_unsigned_short(chr, Address(ary1, 0));
7709   load_unsigned_short(limit, Address(ary2, 0));
7710   cmpl(chr, limit);
7711   jccb(Assembler::notEqual, FALSE_LABEL);
7712 
7713   if (is_array_equ && is_char) {
7714     bind(COMPARE_BYTE);
7715   } else {
7716     lea(ary1, Address(ary1, 2));
7717     lea(ary2, Address(ary2, 2));
7718 
7719     bind(COMPARE_BYTE);
7720     testl(result, 0x1);   // tail  byte
7721     jccb(Assembler::zero, TRUE_LABEL);
7722     load_unsigned_byte(chr, Address(ary1, 0));
7723     load_unsigned_byte(limit, Address(ary2, 0));
7724     cmpl(chr, limit);
7725     jccb(Assembler::notEqual, FALSE_LABEL);
7726   }
7727   bind(TRUE_LABEL);
7728   movl(result, 1);   // return true
7729   jmpb(DONE);
7730 
7731   bind(FALSE_LABEL);
7732   xorl(result, result); // return false
7733 
7734   // That's it
7735   bind(DONE);
7736   if (UseAVX >= 2) {
7737     // clean upper bits of YMM registers
7738     vpxor(vec1, vec1);
7739     vpxor(vec2, vec2);
7740   }
7741 }
7742 
7743 #endif
7744 
7745 void MacroAssembler::generate_fill(BasicType t, bool aligned,
7746                                    Register to, Register value, Register count,
7747                                    Register rtmp, XMMRegister xtmp) {
7748   ShortBranchVerifier sbv(this);
7749   assert_different_registers(to, value, count, rtmp);
7750   Label L_exit;
7751   Label L_fill_2_bytes, L_fill_4_bytes;
7752 
7753   int shift = -1;
7754   switch (t) {
7755     case T_BYTE:
7756       shift = 2;
7757       break;
7758     case T_SHORT:
7759       shift = 1;
7760       break;
7761     case T_INT:
7762       shift = 0;
7763       break;
7764     default: ShouldNotReachHere();
7765   }
7766 
7767   if (t == T_BYTE) {
7768     andl(value, 0xff);
7769     movl(rtmp, value);
7770     shll(rtmp, 8);
7771     orl(value, rtmp);
7772   }
7773   if (t == T_SHORT) {
7774     andl(value, 0xffff);
7775   }
7776   if (t == T_BYTE || t == T_SHORT) {
7777     movl(rtmp, value);
7778     shll(rtmp, 16);
7779     orl(value, rtmp);
7780   }
7781 
7782   cmpl(count, 2<<shift); // Short arrays (< 8 bytes) fill by element
7783   jcc(Assembler::below, L_fill_4_bytes); // use unsigned cmp
7784   if (!UseUnalignedLoadStores && !aligned && (t == T_BYTE || t == T_SHORT)) {
7785     Label L_skip_align2;
7786     // align source address at 4 bytes address boundary
7787     if (t == T_BYTE) {
7788       Label L_skip_align1;
7789       // One byte misalignment happens only for byte arrays
7790       testptr(to, 1);
7791       jccb(Assembler::zero, L_skip_align1);
7792       movb(Address(to, 0), value);
7793       increment(to);
7794       decrement(count);
7795       BIND(L_skip_align1);
7796     }
7797     // Two bytes misalignment happens only for byte and short (char) arrays
7798     testptr(to, 2);
7799     jccb(Assembler::zero, L_skip_align2);
7800     movw(Address(to, 0), value);
7801     addptr(to, 2);
7802     subl(count, 1<<(shift-1));
7803     BIND(L_skip_align2);
7804   }
7805   if (UseSSE < 2) {
7806     Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
7807     // Fill 32-byte chunks
7808     subl(count, 8 << shift);
7809     jcc(Assembler::less, L_check_fill_8_bytes);
7810     align(16);
7811 
7812     BIND(L_fill_32_bytes_loop);
7813 
7814     for (int i = 0; i < 32; i += 4) {
7815       movl(Address(to, i), value);
7816     }
7817 
7818     addptr(to, 32);
7819     subl(count, 8 << shift);
7820     jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
7821     BIND(L_check_fill_8_bytes);
7822     addl(count, 8 << shift);
7823     jccb(Assembler::zero, L_exit);
7824     jmpb(L_fill_8_bytes);
7825 
7826     //
7827     // length is too short, just fill qwords
7828     //
7829     BIND(L_fill_8_bytes_loop);
7830     movl(Address(to, 0), value);
7831     movl(Address(to, 4), value);
7832     addptr(to, 8);
7833     BIND(L_fill_8_bytes);
7834     subl(count, 1 << (shift + 1));
7835     jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
7836     // fall through to fill 4 bytes
7837   } else {
7838     Label L_fill_32_bytes;
7839     if (!UseUnalignedLoadStores) {
7840       // align to 8 bytes, we know we are 4 byte aligned to start
7841       testptr(to, 4);
7842       jccb(Assembler::zero, L_fill_32_bytes);
7843       movl(Address(to, 0), value);
7844       addptr(to, 4);
7845       subl(count, 1<<shift);
7846     }
7847     BIND(L_fill_32_bytes);
7848     {
7849       assert( UseSSE >= 2, "supported cpu only" );
7850       Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
7851       movdl(xtmp, value);
7852       if (UseAVX >= 2 && UseUnalignedLoadStores) {
7853         Label L_check_fill_32_bytes;
7854         if (UseAVX > 2) {
7855           // Fill 64-byte chunks
7856           Label L_fill_64_bytes_loop_avx3, L_check_fill_64_bytes_avx2;
7857 
7858           // If number of bytes to fill < AVX3Threshold, perform fill using AVX2
7859           cmpl(count, AVX3Threshold);
7860           jccb(Assembler::below, L_check_fill_64_bytes_avx2);
7861 
7862           vpbroadcastd(xtmp, xtmp, Assembler::AVX_512bit);
7863 
7864           subl(count, 16 << shift);
7865           jccb(Assembler::less, L_check_fill_32_bytes);
7866           align(16);
7867 
7868           BIND(L_fill_64_bytes_loop_avx3);
7869           evmovdqul(Address(to, 0), xtmp, Assembler::AVX_512bit);
7870           addptr(to, 64);
7871           subl(count, 16 << shift);
7872           jcc(Assembler::greaterEqual, L_fill_64_bytes_loop_avx3);
7873           jmpb(L_check_fill_32_bytes);
7874 
7875           BIND(L_check_fill_64_bytes_avx2);
7876         }
7877         // Fill 64-byte chunks
7878         Label L_fill_64_bytes_loop;
7879         vpbroadcastd(xtmp, xtmp, Assembler::AVX_256bit);
7880 
7881         subl(count, 16 << shift);
7882         jcc(Assembler::less, L_check_fill_32_bytes);
7883         align(16);
7884 
7885         BIND(L_fill_64_bytes_loop);
7886         vmovdqu(Address(to, 0), xtmp);
7887         vmovdqu(Address(to, 32), xtmp);
7888         addptr(to, 64);
7889         subl(count, 16 << shift);
7890         jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
7891 
7892         BIND(L_check_fill_32_bytes);
7893         addl(count, 8 << shift);
7894         jccb(Assembler::less, L_check_fill_8_bytes);
7895         vmovdqu(Address(to, 0), xtmp);
7896         addptr(to, 32);
7897         subl(count, 8 << shift);
7898 
7899         BIND(L_check_fill_8_bytes);
7900         // clean upper bits of YMM registers
7901         movdl(xtmp, value);
7902         pshufd(xtmp, xtmp, 0);
7903       } else {
7904         // Fill 32-byte chunks
7905         pshufd(xtmp, xtmp, 0);
7906 
7907         subl(count, 8 << shift);
7908         jcc(Assembler::less, L_check_fill_8_bytes);
7909         align(16);
7910 
7911         BIND(L_fill_32_bytes_loop);
7912 
7913         if (UseUnalignedLoadStores) {
7914           movdqu(Address(to, 0), xtmp);
7915           movdqu(Address(to, 16), xtmp);
7916         } else {
7917           movq(Address(to, 0), xtmp);
7918           movq(Address(to, 8), xtmp);
7919           movq(Address(to, 16), xtmp);
7920           movq(Address(to, 24), xtmp);
7921         }
7922 
7923         addptr(to, 32);
7924         subl(count, 8 << shift);
7925         jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
7926 
7927         BIND(L_check_fill_8_bytes);
7928       }
7929       addl(count, 8 << shift);
7930       jccb(Assembler::zero, L_exit);
7931       jmpb(L_fill_8_bytes);
7932 
7933       //
7934       // length is too short, just fill qwords
7935       //
7936       BIND(L_fill_8_bytes_loop);
7937       movq(Address(to, 0), xtmp);
7938       addptr(to, 8);
7939       BIND(L_fill_8_bytes);
7940       subl(count, 1 << (shift + 1));
7941       jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
7942     }
7943   }
7944   // fill trailing 4 bytes
7945   BIND(L_fill_4_bytes);
7946   testl(count, 1<<shift);
7947   jccb(Assembler::zero, L_fill_2_bytes);
7948   movl(Address(to, 0), value);
7949   if (t == T_BYTE || t == T_SHORT) {
7950     Label L_fill_byte;
7951     addptr(to, 4);
7952     BIND(L_fill_2_bytes);
7953     // fill trailing 2 bytes
7954     testl(count, 1<<(shift-1));
7955     jccb(Assembler::zero, L_fill_byte);
7956     movw(Address(to, 0), value);
7957     if (t == T_BYTE) {
7958       addptr(to, 2);
7959       BIND(L_fill_byte);
7960       // fill trailing byte
7961       testl(count, 1);
7962       jccb(Assembler::zero, L_exit);
7963       movb(Address(to, 0), value);
7964     } else {
7965       BIND(L_fill_byte);
7966     }
7967   } else {
7968     BIND(L_fill_2_bytes);
7969   }
7970   BIND(L_exit);
7971 }
7972 
7973 // encode char[] to byte[] in ISO_8859_1
//   @HotSpotIntrinsicCandidate
//   private static int implEncodeISOArray(byte[] sa, int sp,
//                                         byte[] da, int dp, int len) {
//     int i = 0;
//     for (; i < len; i++) {
//       char c = StringUTF16.getChar(sa, sp++);
//       if (c > '\u00FF')
//         break;
//       da[dp++] = (byte)c;
//     }
//     return i;
//   }
7986 void MacroAssembler::encode_iso_array(Register src, Register dst, Register len,
7987   XMMRegister tmp1Reg, XMMRegister tmp2Reg,
7988   XMMRegister tmp3Reg, XMMRegister tmp4Reg,
7989   Register tmp5, Register result) {
7990 
7991   // rsi: src
7992   // rdi: dst
7993   // rdx: len
7994   // rcx: tmp5
7995   // rax: result
7996   ShortBranchVerifier sbv(this);
7997   assert_different_registers(src, dst, len, tmp5, result);
7998   Label L_done, L_copy_1_char, L_copy_1_char_exit;
7999 
8000   // set result
8001   xorl(result, result);
8002   // check for zero length
8003   testl(len, len);
8004   jcc(Assembler::zero, L_done);
8005 
8006   movl(result, len);
8007 
8008   // Setup pointers
8009   lea(src, Address(src, len, Address::times_2)); // char[]
8010   lea(dst, Address(dst, len, Address::times_1)); // byte[]
8011   negptr(len);
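  // src and dst now point just past the end of the data; len is the negative
  // element count, so indexed addressing walks forward as len rises toward zero.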
8012 
8013   if (UseSSE42Intrinsics || UseAVX >= 2) {
8014     Label L_copy_8_chars, L_copy_8_chars_exit;
8015     Label L_chars_16_check, L_copy_16_chars, L_copy_16_chars_exit;
8016 
8017     if (UseAVX >= 2) {
8018       Label L_chars_32_check, L_copy_32_chars, L_copy_32_chars_exit;
8019       movl(tmp5, 0xff00ff00);   // create mask to test for Unicode chars in vector
8020       movdl(tmp1Reg, tmp5);
8021       vpbroadcastd(tmp1Reg, tmp1Reg, Assembler::AVX_256bit);
8022       jmp(L_chars_32_check);
8023 
8024       bind(L_copy_32_chars);
8025       vmovdqu(tmp3Reg, Address(src, len, Address::times_2, -64));
8026       vmovdqu(tmp4Reg, Address(src, len, Address::times_2, -32));
8027       vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
8028       vptest(tmp2Reg, tmp1Reg);       // check for Unicode chars in  vector
8029       jccb(Assembler::notZero, L_copy_32_chars_exit);
8030       vpackuswb(tmp3Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
8031       vpermq(tmp4Reg, tmp3Reg, 0xD8, /* vector_len */ 1);
8032       vmovdqu(Address(dst, len, Address::times_1, -32), tmp4Reg);
8033 
8034       bind(L_chars_32_check);
8035       addptr(len, 32);
8036       jcc(Assembler::lessEqual, L_copy_32_chars);
8037 
8038       bind(L_copy_32_chars_exit);
8039       subptr(len, 16);
8040       jccb(Assembler::greater, L_copy_16_chars_exit);
8041 
8042     } else if (UseSSE42Intrinsics) {
8043       movl(tmp5, 0xff00ff00);   // create mask to test for Unicode chars in vector
8044       movdl(tmp1Reg, tmp5);
8045       pshufd(tmp1Reg, tmp1Reg, 0);
8046       jmpb(L_chars_16_check);
8047     }
8048 
8049     bind(L_copy_16_chars);
8050     if (UseAVX >= 2) {
8051       vmovdqu(tmp2Reg, Address(src, len, Address::times_2, -32));
8052       vptest(tmp2Reg, tmp1Reg);
8053       jcc(Assembler::notZero, L_copy_16_chars_exit);
8054       vpackuswb(tmp2Reg, tmp2Reg, tmp1Reg, /* vector_len */ 1);
8055       vpermq(tmp3Reg, tmp2Reg, 0xD8, /* vector_len */ 1);
8056     } else {
8057       if (UseAVX > 0) {
8058         movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
8059         movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
8060         vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 0);
8061       } else {
8062         movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
8063         por(tmp2Reg, tmp3Reg);
8064         movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
8065         por(tmp2Reg, tmp4Reg);
8066       }
8067       ptest(tmp2Reg, tmp1Reg);       // check for Unicode chars in  vector
8068       jccb(Assembler::notZero, L_copy_16_chars_exit);
8069       packuswb(tmp3Reg, tmp4Reg);
8070     }
8071     movdqu(Address(dst, len, Address::times_1, -16), tmp3Reg);
8072 
8073     bind(L_chars_16_check);
8074     addptr(len, 16);
8075     jcc(Assembler::lessEqual, L_copy_16_chars);
8076 
8077     bind(L_copy_16_chars_exit);
8078     if (UseAVX >= 2) {
8079       // clean upper bits of YMM registers
8080       vpxor(tmp2Reg, tmp2Reg);
8081       vpxor(tmp3Reg, tmp3Reg);
8082       vpxor(tmp4Reg, tmp4Reg);
8083       movdl(tmp1Reg, tmp5);
8084       pshufd(tmp1Reg, tmp1Reg, 0);
8085     }
8086     subptr(len, 8);
8087     jccb(Assembler::greater, L_copy_8_chars_exit);
8088 
8089     bind(L_copy_8_chars);
8090     movdqu(tmp3Reg, Address(src, len, Address::times_2, -16));
8091     ptest(tmp3Reg, tmp1Reg);
8092     jccb(Assembler::notZero, L_copy_8_chars_exit);
8093     packuswb(tmp3Reg, tmp1Reg);
8094     movq(Address(dst, len, Address::times_1, -8), tmp3Reg);
8095     addptr(len, 8);
8096     jccb(Assembler::lessEqual, L_copy_8_chars);
8097 
8098     bind(L_copy_8_chars_exit);
8099     subptr(len, 8);
8100     jccb(Assembler::zero, L_done);
8101   }
8102 
8103   bind(L_copy_1_char);
8104   load_unsigned_short(tmp5, Address(src, len, Address::times_2, 0));
8105   testl(tmp5, 0xff00);      // check if Unicode char
8106   jccb(Assembler::notZero, L_copy_1_char_exit);
8107   movb(Address(dst, len, Address::times_1, 0), tmp5);
8108   addptr(len, 1);
8109   jccb(Assembler::less, L_copy_1_char);
8110 
8111   bind(L_copy_1_char_exit);
8112   addptr(result, len); // len is negative count of not processed elements
8113 
8114   bind(L_done);
8115 }
8116 
8117 #ifdef _LP64
8118 /**
8119  * Helper for multiply_to_len().
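 * Computes dest_hi:dest_lo += src1 + src2, treating each register pair as an
 * unsigned 128-bit value (two 64-bit adds, each propagating its carry into dest_hi).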
8120  */
8121 void MacroAssembler::add2_with_carry(Register dest_hi, Register dest_lo, Register src1, Register src2) {
8122   addq(dest_lo, src1);
8123   adcq(dest_hi, 0);
8124   addq(dest_lo, src2);
8125   adcq(dest_hi, 0);
8126 }
8127 
8128 /**
8129  * Multiply 64 bit by 64 bit first loop.
8130  */
8131 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
8132                                            Register y, Register y_idx, Register z,
8133                                            Register carry, Register product,
8134                                            Register idx, Register kdx) {
8135   //
8136   //  jlong carry, x[], y[], z[];
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
8138   //    huge_128 product = y[idx] * x[xstart] + carry;
8139   //    z[kdx] = (jlong)product;
8140   //    carry  = (jlong)(product >>> 64);
8141   //  }
8142   //  z[xstart] = carry;
8143   //
8144 
8145   Label L_first_loop, L_first_loop_exit;
8146   Label L_one_x, L_one_y, L_multiply;
8147 
8148   decrementl(xstart);
8149   jcc(Assembler::negative, L_one_x);
8150 
8151   movq(x_xstart, Address(x, xstart, Address::times_4,  0));
8152   rorq(x_xstart, 32); // convert big-endian to little-endian
8153 
8154   bind(L_first_loop);
8155   decrementl(idx);
8156   jcc(Assembler::negative, L_first_loop_exit);
8157   decrementl(idx);
8158   jcc(Assembler::negative, L_one_y);
8159   movq(y_idx, Address(y, idx, Address::times_4,  0));
8160   rorq(y_idx, 32); // convert big-endian to little-endian
8161   bind(L_multiply);
8162   movq(product, x_xstart);
8163   mulq(y_idx); // product(rax) * y_idx -> rdx:rax
8164   addq(product, carry);
8165   adcq(rdx, 0);
8166   subl(kdx, 2);
8167   movl(Address(z, kdx, Address::times_4,  4), product);
8168   shrq(product, 32);
8169   movl(Address(z, kdx, Address::times_4,  0), product);
8170   movq(carry, rdx);
8171   jmp(L_first_loop);
8172 
8173   bind(L_one_y);
8174   movl(y_idx, Address(y,  0));
8175   jmp(L_multiply);
8176 
8177   bind(L_one_x);
8178   movl(x_xstart, Address(x,  0));
8179   jmp(L_first_loop);
8180 
8181   bind(L_first_loop_exit);
8182 }
8183 
8184 /**
8185  * Multiply 64 bit by 64 bit and add 128 bit.
8186  */
8187 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y, Register z,
8188                                             Register yz_idx, Register idx,
8189                                             Register carry, Register product, int offset) {
8190   //     huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
8191   //     z[kdx] = (jlong)product;
8192 
8193   movq(yz_idx, Address(y, idx, Address::times_4,  offset));
8194   rorq(yz_idx, 32); // convert big-endian to little-endian
8195   movq(product, x_xstart);
8196   mulq(yz_idx);     // product(rax) * yz_idx -> rdx:product(rax)
8197   movq(yz_idx, Address(z, idx, Address::times_4,  offset));
8198   rorq(yz_idx, 32); // convert big-endian to little-endian
8199 
8200   add2_with_carry(rdx, product, carry, yz_idx);
8201 
8202   movl(Address(z, idx, Address::times_4,  offset+4), product);
8203   shrq(product, 32);
8204   movl(Address(z, idx, Address::times_4,  offset), product);
8205 
8206 }
8207 
8208 /**
8209  * Multiply 128 bit by 128 bit. Unrolled inner loop.
8210  */
8211 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart, Register y, Register z,
8212                                              Register yz_idx, Register idx, Register jdx,
8213                                              Register carry, Register product,
8214                                              Register carry2) {
8215   //   jlong carry, x[], y[], z[];
8216   //   int kdx = ystart+1;
8217   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
8218   //     huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
8219   //     z[kdx+idx+1] = (jlong)product;
8220   //     jlong carry2  = (jlong)(product >>> 64);
8221   //     product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
8222   //     z[kdx+idx] = (jlong)product;
8223   //     carry  = (jlong)(product >>> 64);
8224   //   }
8225   //   idx += 2;
8226   //   if (idx > 0) {
8227   //     product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
8228   //     z[kdx+idx] = (jlong)product;
8229   //     carry  = (jlong)(product >>> 64);
8230   //   }
8231   //
8232 
8233   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
8234 
8235   movl(jdx, idx);
8236   andl(jdx, 0xFFFFFFFC);
8237   shrl(jdx, 2);
8238 
8239   bind(L_third_loop);
8240   subl(jdx, 1);
8241   jcc(Assembler::negative, L_third_loop_exit);
8242   subl(idx, 4);
8243 
8244   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 8);
8245   movq(carry2, rdx);
8246 
8247   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product, 0);
8248   movq(carry, rdx);
8249   jmp(L_third_loop);
8250 
8251   bind (L_third_loop_exit);
8252 
8253   andl (idx, 0x3);
8254   jcc(Assembler::zero, L_post_third_loop_done);
8255 
8256   Label L_check_1;
8257   subl(idx, 2);
8258   jcc(Assembler::negative, L_check_1);
8259 
8260   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 0);
8261   movq(carry, rdx);
8262 
8263   bind (L_check_1);
8264   addl (idx, 0x2);
8265   andl (idx, 0x1);
8266   subl(idx, 1);
8267   jcc(Assembler::negative, L_post_third_loop_done);
8268 
8269   movl(yz_idx, Address(y, idx, Address::times_4,  0));
8270   movq(product, x_xstart);
8271   mulq(yz_idx); // product(rax) * yz_idx -> rdx:product(rax)
8272   movl(yz_idx, Address(z, idx, Address::times_4,  0));
8273 
8274   add2_with_carry(rdx, product, yz_idx, carry);
8275 
8276   movl(Address(z, idx, Address::times_4,  0), product);
8277   shrq(product, 32);
8278 
8279   shlq(rdx, 32);
8280   orq(product, rdx);
8281   movq(carry, product);
8282 
8283   bind(L_post_third_loop_done);
8284 }
8285 
8286 /**
8287  * Multiply 128 bit by 128 bit using BMI2. Unrolled inner loop.
8288  *
8289  */
8290 void MacroAssembler::multiply_128_x_128_bmi2_loop(Register y, Register z,
8291                                                   Register carry, Register carry2,
8292                                                   Register idx, Register jdx,
8293                                                   Register yz_idx1, Register yz_idx2,
8294                                                   Register tmp, Register tmp3, Register tmp4) {
8295   assert(UseBMI2Instructions, "should be used only when BMI2 is available");
8296 
8297   //   jlong carry, x[], y[], z[];
8298   //   int kdx = ystart+1;
8299   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
8300   //     huge_128 tmp3 = (y[idx+1] * rdx) + z[kdx+idx+1] + carry;
8301   //     jlong carry2  = (jlong)(tmp3 >>> 64);
8302   //     huge_128 tmp4 = (y[idx]   * rdx) + z[kdx+idx] + carry2;
8303   //     carry  = (jlong)(tmp4 >>> 64);
8304   //     z[kdx+idx+1] = (jlong)tmp3;
8305   //     z[kdx+idx] = (jlong)tmp4;
8306   //   }
8307   //   idx += 2;
8308   //   if (idx > 0) {
8309   //     yz_idx1 = (y[idx] * rdx) + z[kdx+idx] + carry;
8310   //     z[kdx+idx] = (jlong)yz_idx1;
8311   //     carry  = (jlong)(yz_idx1 >>> 64);
8312   //   }
8313   //
8314 
8315   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
8316 
8317   movl(jdx, idx);
8318   andl(jdx, 0xFFFFFFFC);
8319   shrl(jdx, 2);
8320 
8321   bind(L_third_loop);
8322   subl(jdx, 1);
8323   jcc(Assembler::negative, L_third_loop_exit);
8324   subl(idx, 4);
8325 
8326   movq(yz_idx1,  Address(y, idx, Address::times_4,  8));
8327   rorxq(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
8328   movq(yz_idx2, Address(y, idx, Address::times_4,  0));
8329   rorxq(yz_idx2, yz_idx2, 32);
8330 
8331   mulxq(tmp4, tmp3, yz_idx1);  //  yz_idx1 * rdx -> tmp4:tmp3
8332   mulxq(carry2, tmp, yz_idx2); //  yz_idx2 * rdx -> carry2:tmp
8333 
8334   movq(yz_idx1,  Address(z, idx, Address::times_4,  8));
8335   rorxq(yz_idx1, yz_idx1, 32);
8336   movq(yz_idx2, Address(z, idx, Address::times_4,  0));
8337   rorxq(yz_idx2, yz_idx2, 32);
8338 
8339   if (VM_Version::supports_adx()) {
8340     adcxq(tmp3, carry);
8341     adoxq(tmp3, yz_idx1);
8342 
8343     adcxq(tmp4, tmp);
8344     adoxq(tmp4, yz_idx2);
8345 
8346     movl(carry, 0); // does not affect flags
8347     adcxq(carry2, carry);
8348     adoxq(carry2, carry);
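    // adcx and adox use CF and OF as two independent carry chains, so the two
    // 128-bit accumulations above interleave without serializing on a single flag.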
8349   } else {
8350     add2_with_carry(tmp4, tmp3, carry, yz_idx1);
8351     add2_with_carry(carry2, tmp4, tmp, yz_idx2);
8352   }
8353   movq(carry, carry2);
8354 
8355   movl(Address(z, idx, Address::times_4, 12), tmp3);
8356   shrq(tmp3, 32);
8357   movl(Address(z, idx, Address::times_4,  8), tmp3);
8358 
8359   movl(Address(z, idx, Address::times_4,  4), tmp4);
8360   shrq(tmp4, 32);
8361   movl(Address(z, idx, Address::times_4,  0), tmp4);
8362 
8363   jmp(L_third_loop);
8364 
8365   bind (L_third_loop_exit);
8366 
8367   andl (idx, 0x3);
8368   jcc(Assembler::zero, L_post_third_loop_done);
8369 
8370   Label L_check_1;
8371   subl(idx, 2);
8372   jcc(Assembler::negative, L_check_1);
8373 
8374   movq(yz_idx1, Address(y, idx, Address::times_4,  0));
8375   rorxq(yz_idx1, yz_idx1, 32);
8376   mulxq(tmp4, tmp3, yz_idx1); //  yz_idx1 * rdx -> tmp4:tmp3
8377   movq(yz_idx2, Address(z, idx, Address::times_4,  0));
8378   rorxq(yz_idx2, yz_idx2, 32);
8379 
8380   add2_with_carry(tmp4, tmp3, carry, yz_idx2);
8381 
8382   movl(Address(z, idx, Address::times_4,  4), tmp3);
8383   shrq(tmp3, 32);
8384   movl(Address(z, idx, Address::times_4,  0), tmp3);
8385   movq(carry, tmp4);
8386 
8387   bind (L_check_1);
8388   addl (idx, 0x2);
8389   andl (idx, 0x1);
8390   subl(idx, 1);
8391   jcc(Assembler::negative, L_post_third_loop_done);
8392   movl(tmp4, Address(y, idx, Address::times_4,  0));
8393   mulxq(carry2, tmp3, tmp4);  //  tmp4 * rdx -> carry2:tmp3
8394   movl(tmp4, Address(z, idx, Address::times_4,  0));
8395 
8396   add2_with_carry(carry2, tmp3, tmp4, carry);
8397 
8398   movl(Address(z, idx, Address::times_4,  0), tmp3);
8399   shrq(tmp3, 32);
8400 
8401   shlq(carry2, 32);
8402   orq(tmp3, carry2);
8403   movq(carry, tmp3);
8404 
8405   bind(L_post_third_loop_done);
8406 }
8407 
8408 /**
 * Code for BigInteger::multiplyToLen() intrinsic.
8410  *
8411  * rdi: x
8412  * rax: xlen
8413  * rsi: y
8414  * rcx: ylen
8415  * r8:  z
8416  * r11: zlen
8417  * r12: tmp1
8418  * r13: tmp2
8419  * r14: tmp3
8420  * r15: tmp4
8421  * rbx: tmp5
8422  *
8423  */
8424 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, Register zlen,
8425                                      Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) {
8426   ShortBranchVerifier sbv(this);
8427   assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, rdx);
8428 
8429   push(tmp1);
8430   push(tmp2);
8431   push(tmp3);
8432   push(tmp4);
8433   push(tmp5);
8434 
8435   push(xlen);
8436   push(zlen);
8437 
8438   const Register idx = tmp1;
8439   const Register kdx = tmp2;
8440   const Register xstart = tmp3;
8441 
8442   const Register y_idx = tmp4;
8443   const Register carry = tmp5;
8444   const Register product  = xlen;
8445   const Register x_xstart = zlen;  // reuse register
8446 
8447   // First Loop.
8448   //
8449   //  final static long LONG_MASK = 0xffffffffL;
8450   //  int xstart = xlen - 1;
8451   //  int ystart = ylen - 1;
8452   //  long carry = 0;
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
8454   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
8455   //    z[kdx] = (int)product;
8456   //    carry = product >>> 32;
8457   //  }
8458   //  z[xstart] = (int)carry;
8459   //
8460 
8461   movl(idx, ylen);      // idx = ylen;
8462   movl(kdx, zlen);      // kdx = xlen+ylen;
8463   xorq(carry, carry);   // carry = 0;
8464 
8465   Label L_done;
8466 
8467   movl(xstart, xlen);
8468   decrementl(xstart);
8469   jcc(Assembler::negative, L_done);
8470 
8471   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
8472 
8473   Label L_second_loop;
8474   testl(kdx, kdx);
8475   jcc(Assembler::zero, L_second_loop);
8476 
8477   Label L_carry;
8478   subl(kdx, 1);
8479   jcc(Assembler::zero, L_carry);
8480 
8481   movl(Address(z, kdx, Address::times_4,  0), carry);
8482   shrq(carry, 32);
8483   subl(kdx, 1);
8484 
8485   bind(L_carry);
8486   movl(Address(z, kdx, Address::times_4,  0), carry);
8487 
8488   // Second and third (nested) loops.
8489   //
8490   // for (int i = xstart-1; i >= 0; i--) { // Second loop
8491   //   carry = 0;
8492   //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
8493   //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
8494   //                    (z[k] & LONG_MASK) + carry;
8495   //     z[k] = (int)product;
8496   //     carry = product >>> 32;
8497   //   }
8498   //   z[i] = (int)carry;
8499   // }
8500   //
8501   // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx
8502 
8503   const Register jdx = tmp1;
8504 
8505   bind(L_second_loop);
8506   xorl(carry, carry);    // carry = 0;
8507   movl(jdx, ylen);       // j = ystart+1
8508 
8509   subl(xstart, 1);       // i = xstart-1;
8510   jcc(Assembler::negative, L_done);
8511 
8512   push (z);
8513 
8514   Label L_last_x;
8515   lea(z, Address(z, xstart, Address::times_4, 4)); // z = z + k - j
8516   subl(xstart, 1);       // i = xstart-1;
8517   jcc(Assembler::negative, L_last_x);
8518 
8519   if (UseBMI2Instructions) {
8520     movq(rdx,  Address(x, xstart, Address::times_4,  0));
8521     rorxq(rdx, rdx, 32); // convert big-endian to little-endian
8522   } else {
8523     movq(x_xstart, Address(x, xstart, Address::times_4,  0));
8524     rorq(x_xstart, 32);  // convert big-endian to little-endian
8525   }
8526 
8527   Label L_third_loop_prologue;
8528   bind(L_third_loop_prologue);
8529 
8530   push (x);
8531   push (xstart);
8532   push (ylen);
8533 
8534 
8535   if (UseBMI2Instructions) {
8536     multiply_128_x_128_bmi2_loop(y, z, carry, x, jdx, ylen, product, tmp2, x_xstart, tmp3, tmp4);
8537   } else { // !UseBMI2Instructions
8538     multiply_128_x_128_loop(x_xstart, y, z, y_idx, jdx, ylen, carry, product, x);
8539   }
8540 
8541   pop(ylen);
8542   pop(xlen);
8543   pop(x);
8544   pop(z);
8545 
8546   movl(tmp3, xlen);
8547   addl(tmp3, 1);
8548   movl(Address(z, tmp3, Address::times_4,  0), carry);
8549   subl(tmp3, 1);
8550   jccb(Assembler::negative, L_done);
8551 
8552   shrq(carry, 32);
8553   movl(Address(z, tmp3, Address::times_4,  0), carry);
8554   jmp(L_second_loop);
8555 
  // The following infrequently executed code is placed outside the loops.
8557   bind(L_last_x);
8558   if (UseBMI2Instructions) {
8559     movl(rdx, Address(x,  0));
8560   } else {
8561     movl(x_xstart, Address(x,  0));
8562   }
8563   jmp(L_third_loop_prologue);
8564 
8565   bind(L_done);
8566 
8567   pop(zlen);
8568   pop(xlen);
8569 
8570   pop(tmp5);
8571   pop(tmp4);
8572   pop(tmp3);
8573   pop(tmp2);
8574   pop(tmp1);
8575 }
8576 
8577 void MacroAssembler::vectorized_mismatch(Register obja, Register objb, Register length, Register log2_array_indxscale,
8578   Register result, Register tmp1, Register tmp2, XMMRegister rymm0, XMMRegister rymm1, XMMRegister rymm2){
8579   assert(UseSSE42Intrinsics, "SSE4.2 must be enabled.");
8580   Label VECTOR16_LOOP, VECTOR8_LOOP, VECTOR4_LOOP;
8581   Label VECTOR8_TAIL, VECTOR4_TAIL;
8582   Label VECTOR32_NOT_EQUAL, VECTOR16_NOT_EQUAL, VECTOR8_NOT_EQUAL, VECTOR4_NOT_EQUAL;
8583   Label SAME_TILL_END, DONE;
8584   Label BYTES_LOOP, BYTES_TAIL, BYTES_NOT_EQUAL;
8585 
  // the index scale is in rcx in both the Win64 and Unix calling conventions
8587   ShortBranchVerifier sbv(this);
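  // On exit, result holds the element index of the first mismatch,
  // or -1 if no mismatch was found in the compared range.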
8588 
8589   shlq(length);
8590   xorq(result, result);
8591 
8592   if ((AVX3Threshold == 0) && (UseAVX > 2) &&
8593       VM_Version::supports_avx512vlbw()) {
8594     Label VECTOR64_LOOP, VECTOR64_NOT_EQUAL, VECTOR32_TAIL;
8595 
8596     cmpq(length, 64);
8597     jcc(Assembler::less, VECTOR32_TAIL);
8598 
8599     movq(tmp1, length);
8600     andq(tmp1, 0x3F);      // tail count
8601     andq(length, ~(0x3F)); //vector count
8602 
8603     bind(VECTOR64_LOOP);
8604     // AVX512 code to compare 64 byte vectors.
8605     evmovdqub(rymm0, Address(obja, result), Assembler::AVX_512bit);
8606     evpcmpeqb(k7, rymm0, Address(objb, result), Assembler::AVX_512bit);
8607     kortestql(k7, k7);
8608     jcc(Assembler::aboveEqual, VECTOR64_NOT_EQUAL);     // mismatch
8609     addq(result, 64);
8610     subq(length, 64);
8611     jccb(Assembler::notZero, VECTOR64_LOOP);
8612 
8613     //bind(VECTOR64_TAIL);
8614     testq(tmp1, tmp1);
8615     jcc(Assembler::zero, SAME_TILL_END);
8616 
    // AVX512 code to compare up to 63 tail bytes.
8619     mov64(tmp2, 0xFFFFFFFFFFFFFFFF);
8620     shlxq(tmp2, tmp2, tmp1);
8621     notq(tmp2);
8622     kmovql(k3, tmp2);
8623 
8624     evmovdqub(rymm0, k3, Address(obja, result), Assembler::AVX_512bit);
8625     evpcmpeqb(k7, k3, rymm0, Address(objb, result), Assembler::AVX_512bit);
8626 
8627     ktestql(k7, k3);
8628     jcc(Assembler::below, SAME_TILL_END);     // not mismatch
8629 
8630     bind(VECTOR64_NOT_EQUAL);
8631     kmovql(tmp1, k7);
8632     notq(tmp1);
8633     tzcntq(tmp1, tmp1);
8634     addq(result, tmp1);
8635     shrq(result);
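    // shift right by the index scale (in cl) to convert the byte offset
    // back to an element index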
8636     jmp(DONE);
8637     bind(VECTOR32_TAIL);
8638   }
8639 
8640   cmpq(length, 8);
8641   jcc(Assembler::equal, VECTOR8_LOOP);
8642   jcc(Assembler::less, VECTOR4_TAIL);
8643 
8644   if (UseAVX >= 2) {
8645     Label VECTOR16_TAIL, VECTOR32_LOOP;
8646 
8647     cmpq(length, 16);
8648     jcc(Assembler::equal, VECTOR16_LOOP);
8649     jcc(Assembler::less, VECTOR8_LOOP);
8650 
8651     cmpq(length, 32);
8652     jccb(Assembler::less, VECTOR16_TAIL);
8653 
8654     subq(length, 32);
8655     bind(VECTOR32_LOOP);
8656     vmovdqu(rymm0, Address(obja, result));
8657     vmovdqu(rymm1, Address(objb, result));
8658     vpxor(rymm2, rymm0, rymm1, Assembler::AVX_256bit);
8659     vptest(rymm2, rymm2);
8660     jcc(Assembler::notZero, VECTOR32_NOT_EQUAL);//mismatch found
8661     addq(result, 32);
8662     subq(length, 32);
8663     jcc(Assembler::greaterEqual, VECTOR32_LOOP);
8664     addq(length, 32);
8665     jcc(Assembler::equal, SAME_TILL_END);
    // falling through if less than 32 bytes left; close the branch here.
8667 
8668     bind(VECTOR16_TAIL);
8669     cmpq(length, 16);
8670     jccb(Assembler::less, VECTOR8_TAIL);
8671     bind(VECTOR16_LOOP);
8672     movdqu(rymm0, Address(obja, result));
8673     movdqu(rymm1, Address(objb, result));
8674     vpxor(rymm2, rymm0, rymm1, Assembler::AVX_128bit);
8675     ptest(rymm2, rymm2);
8676     jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found
8677     addq(result, 16);
8678     subq(length, 16);
8679     jcc(Assembler::equal, SAME_TILL_END);
8680     //falling through if less than 16 bytes left
8681   } else {//regular intrinsics
8682 
8683     cmpq(length, 16);
8684     jccb(Assembler::less, VECTOR8_TAIL);
8685 
8686     subq(length, 16);
8687     bind(VECTOR16_LOOP);
8688     movdqu(rymm0, Address(obja, result));
8689     movdqu(rymm1, Address(objb, result));
8690     pxor(rymm0, rymm1);
8691     ptest(rymm0, rymm0);
8692     jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found
8693     addq(result, 16);
8694     subq(length, 16);
8695     jccb(Assembler::greaterEqual, VECTOR16_LOOP);
8696     addq(length, 16);
8697     jcc(Assembler::equal, SAME_TILL_END);
8698     //falling through if less than 16 bytes left
8699   }
8700 
8701   bind(VECTOR8_TAIL);
8702   cmpq(length, 8);
8703   jccb(Assembler::less, VECTOR4_TAIL);
8704   bind(VECTOR8_LOOP);
8705   movq(tmp1, Address(obja, result));
8706   movq(tmp2, Address(objb, result));
8707   xorq(tmp1, tmp2);
8708   testq(tmp1, tmp1);
8709   jcc(Assembler::notZero, VECTOR8_NOT_EQUAL);//mismatch found
8710   addq(result, 8);
8711   subq(length, 8);
8712   jcc(Assembler::equal, SAME_TILL_END);
8713   //falling through if less than 8 bytes left
8714 
8715   bind(VECTOR4_TAIL);
8716   cmpq(length, 4);
8717   jccb(Assembler::less, BYTES_TAIL);
8718   bind(VECTOR4_LOOP);
8719   movl(tmp1, Address(obja, result));
8720   xorl(tmp1, Address(objb, result));
8721   testl(tmp1, tmp1);
8722   jcc(Assembler::notZero, VECTOR4_NOT_EQUAL);//mismatch found
8723   addq(result, 4);
8724   subq(length, 4);
8725   jcc(Assembler::equal, SAME_TILL_END);
8726   //falling through if less than 4 bytes left
8727 
8728   bind(BYTES_TAIL);
8729   bind(BYTES_LOOP);
8730   load_unsigned_byte(tmp1, Address(obja, result));
8731   load_unsigned_byte(tmp2, Address(objb, result));
8732   xorl(tmp1, tmp2);
8733   testl(tmp1, tmp1);
8734   jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
8735   decq(length);
8736   jcc(Assembler::zero, SAME_TILL_END);
8737   incq(result);
8738   load_unsigned_byte(tmp1, Address(obja, result));
8739   load_unsigned_byte(tmp2, Address(objb, result));
8740   xorl(tmp1, tmp2);
8741   testl(tmp1, tmp1);
8742   jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
8743   decq(length);
8744   jcc(Assembler::zero, SAME_TILL_END);
8745   incq(result);
8746   load_unsigned_byte(tmp1, Address(obja, result));
8747   load_unsigned_byte(tmp2, Address(objb, result));
8748   xorl(tmp1, tmp2);
8749   testl(tmp1, tmp1);
8750   jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
8751   jmp(SAME_TILL_END);
8752 
8753   if (UseAVX >= 2) {
8754     bind(VECTOR32_NOT_EQUAL);
8755     vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_256bit);
8756     vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_256bit);
8757     vpxor(rymm0, rymm0, rymm2, Assembler::AVX_256bit);
8758     vpmovmskb(tmp1, rymm0);
8759     bsfq(tmp1, tmp1);
8760     addq(result, tmp1);
8761     shrq(result);
8762     jmp(DONE);
8763   }
8764 
8765   bind(VECTOR16_NOT_EQUAL);
8766   if (UseAVX >= 2) {
8767     vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_128bit);
8768     vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_128bit);
8769     pxor(rymm0, rymm2);
8770   } else {
8771     pcmpeqb(rymm2, rymm2);
8772     pxor(rymm0, rymm1);
8773     pcmpeqb(rymm0, rymm1);
8774     pxor(rymm0, rymm2);
8775   }
8776   pmovmskb(tmp1, rymm0);
8777   bsfq(tmp1, tmp1);
8778   addq(result, tmp1);
8779   shrq(result);
8780   jmpb(DONE);
8781 
8782   bind(VECTOR8_NOT_EQUAL);
8783   bind(VECTOR4_NOT_EQUAL);
8784   bsfq(tmp1, tmp1);
8785   shrq(tmp1, 3);
8786   addq(result, tmp1);
8787   bind(BYTES_NOT_EQUAL);
8788   shrq(result);
8789   jmpb(DONE);
8790 
8791   bind(SAME_TILL_END);
8792   mov64(result, -1);
8793 
8794   bind(DONE);
8795 }
8796 
8797 //Helper functions for square_to_len()
8798 
8799 /**
8800  * Store the squares of x[], right shifted one bit (i.e., divided by 2), into z[].
8801  * Preserves x and z and modifies the rest of the registers.
8802  */
8803 void MacroAssembler::square_rshift(Register x, Register xlen, Register z, Register tmp1, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
8804   // Perform square and right shift by 1
8805   // Handle odd xlen case first, then for even xlen do the following
8806   // jlong carry = 0;
8807   // for (int j=0, i=0; j < xlen; j+=2, i+=4) {
8808   //     huge_128 product = x[j:j+1] * x[j:j+1];
8809   //     z[i:i+1] = (carry << 63) | (jlong)(product >>> 65);
8810   //     z[i+2:i+3] = (jlong)(product >>> 1);
8811   //     carry = (jlong)product;
8812   // }
8813 
8814   xorq(tmp5, tmp5);     // carry
8815   xorq(rdxReg, rdxReg);
8816   xorl(tmp1, tmp1);     // index for x
8817   xorl(tmp4, tmp4);     // index for z
8818 
8819   Label L_first_loop, L_first_loop_exit;
8820 
8821   testl(xlen, 1);
8822   jccb(Assembler::zero, L_first_loop); //jump if xlen is even
8823 
8824   // Square and right shift by 1 the odd element using 32 bit multiply
8825   movl(raxReg, Address(x, tmp1, Address::times_4, 0));
8826   imulq(raxReg, raxReg);
8827   shrq(raxReg, 1);
8828   adcq(tmp5, 0);
8829   movq(Address(z, tmp4, Address::times_4, 0), raxReg);
8830   incrementl(tmp1);
8831   addl(tmp4, 2);
8832 
8833   // Square and  right shift by 1 the rest using 64 bit multiply
8834   bind(L_first_loop);
8835   cmpptr(tmp1, xlen);
8836   jccb(Assembler::equal, L_first_loop_exit);
8837 
8838   // Square
8839   movq(raxReg, Address(x, tmp1, Address::times_4,  0));
8840   rorq(raxReg, 32);    // convert big-endian to little-endian
8841   mulq(raxReg);        // 64-bit multiply rax * rax -> rdx:rax
8842 
8843   // Right shift by 1 and save carry
8844   shrq(tmp5, 1);       // rdx:rax:tmp5 = (tmp5:rdx:rax) >>> 1
8845   rcrq(rdxReg, 1);
8846   rcrq(raxReg, 1);
8847   adcq(tmp5, 0);
8848 
8849   // Store result in z
8850   movq(Address(z, tmp4, Address::times_4, 0), rdxReg);
8851   movq(Address(z, tmp4, Address::times_4, 8), raxReg);
8852 
8853   // Update indices for x and z
8854   addl(tmp1, 2);
8855   addl(tmp4, 4);
8856   jmp(L_first_loop);
8857 
8858   bind(L_first_loop_exit);
8859 }
8860 
8861 
8862 /**
8863  * Perform the following multiply add operation using BMI2 instructions
8864  * carry:sum = sum + op1*op2 + carry
8865  * op2 should be in rdx
8866  * op2 is preserved, all other registers are modified
8867  */
8868 void MacroAssembler::multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry, Register tmp2) {
8869   // assert op2 is rdx
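  // Hedged reference sketch (not the emitted code) of what the sequence below
  // computes, using 128-bit unsigned arithmetic with op2 implicitly in rdx:
  //   uint128 wide = (uint128) op1 * op2 + sum + carry;
  //   sum   = (uint64) wide;          // low half stays in 'sum'
  //   carry = (uint64) (wide >> 64);  // high half, accumulated in tmp2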
8870   mulxq(tmp2, op1, op1);  //  op1 * op2 -> tmp2:op1
8871   addq(sum, carry);
8872   adcq(tmp2, 0);
8873   addq(sum, op1);
8874   adcq(tmp2, 0);
8875   movq(carry, tmp2);
8876 }
8877 
8878 /**
8879  * Perform the following multiply add operation:
8880  * carry:sum = sum + op1*op2 + carry
8881  * Preserves op1, op2 and modifies rest of registers
8882  */
8883 void MacroAssembler::multiply_add_64(Register sum, Register op1, Register op2, Register carry, Register rdxReg, Register raxReg) {
8884   // rdx:rax = op1 * op2
8885   movq(raxReg, op2);
8886   mulq(op1);
8887 
8888   //  rdx:rax = sum + carry + rdx:rax
8889   addq(sum, carry);
8890   adcq(rdxReg, 0);
8891   addq(sum, raxReg);
8892   adcq(rdxReg, 0);
8893 
8894   // carry:sum = rdx:sum
8895   movq(carry, rdxReg);
8896 }
8897 
8898 /**
8899  * Add 64 bit long carry into z[] with carry propagation.
8900  * Preserves z and carry register values and modifies rest of registers.
8901  *
8902  */
8903 void MacroAssembler::add_one_64(Register z, Register zlen, Register carry, Register tmp1) {
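  // Hedged reference sketch (Java-like pseudocode of the loop emitted below):
  //   z[zlen-2 : zlen-1] += carry;                   // add carry to the last 64-bit word
  //   for (i = zlen-4; carry_out && i >= 0; i -= 2)
  //      z[i : i+1] += 1;                            // ripple a carry of one upward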
8904   Label L_fourth_loop, L_fourth_loop_exit;
8905 
8906   movl(tmp1, 1);
8907   subl(zlen, 2);
8908   addq(Address(z, zlen, Address::times_4, 0), carry);
8909 
8910   bind(L_fourth_loop);
8911   jccb(Assembler::carryClear, L_fourth_loop_exit);
8912   subl(zlen, 2);
8913   jccb(Assembler::negative, L_fourth_loop_exit);
8914   addq(Address(z, zlen, Address::times_4, 0), tmp1);
8915   jmp(L_fourth_loop);
8916   bind(L_fourth_loop_exit);
8917 }
8918 
8919 /**
8920  * Shift z[] left by 1 bit.
8921  * Preserves x, len, z and zlen registers and modifies rest of the registers.
8922  *
8923  */
8924 void MacroAssembler::lshift_by_1(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4) {
8925 
8926   Label L_fifth_loop, L_fifth_loop_exit;
8927 
8928   // Fifth loop
8929   // Perform primitiveLeftShift(z, zlen, 1)
8930 
8931   const Register prev_carry = tmp1;
8932   const Register new_carry = tmp4;
8933   const Register value = tmp2;
8934   const Register zidx = tmp3;
8935 
8936   // int zidx, carry;
8937   // long value;
8938   // carry = 0;
8939   // for (zidx = zlen-2; zidx >= 0; zidx -= 2) {
8940   //    (carry:value) = (z[zidx:zidx+1] << 1) | carry ;
8941   //    z[zidx:zidx+1] = value;
8942   // }
8943 
8944   movl(zidx, zlen);
8945   xorl(prev_carry, prev_carry); // clear carry flag and prev_carry register
8946 
8947   bind(L_fifth_loop);
8948   decl(zidx);  // Use decl to preserve carry flag
8949   decl(zidx);
8950   jccb(Assembler::negative, L_fifth_loop_exit);
8951 
8952   if (UseBMI2Instructions) {
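     // Hedged note: rclq shifts left by one through CF, so the bit carried out of
     // the previous iteration (still in CF, since decl/jcc do not change it) is
     // pulled into bit 0; rorxq then swaps the 32-bit halves back to big-endian
     // int order without disturbing CF for the next iteration.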
8953      movq(value, Address(z, zidx, Address::times_4, 0));
8954      rclq(value, 1);
8955      rorxq(value, value, 32);
8956      movq(Address(z, zidx, Address::times_4,  0), value);  // Store back in big endian form
8957   }
8958   else {
8959     // clear new_carry
8960     xorl(new_carry, new_carry);
8961 
8962     // Shift z[i] by 1, or in previous carry and save new carry
8963     movq(value, Address(z, zidx, Address::times_4, 0));
8964     shlq(value, 1);
8965     adcl(new_carry, 0);
8966 
8967     orq(value, prev_carry);
8968     rorq(value, 0x20);
8969     movq(Address(z, zidx, Address::times_4,  0), value);  // Store back in big endian form
8970 
8971     // Set previous carry = new carry
8972     movl(prev_carry, new_carry);
8973   }
8974   jmp(L_fifth_loop);
8975 
8976   bind(L_fifth_loop_exit);
8977 }
8978 
8979 
8980 /**
8981  * Code for BigInteger::squareToLen() intrinsic
8982  *
8983  * rdi: x
8984  * rsi: len
8985  * r8:  z
8986  * rcx: zlen
8987  * r12: tmp1
8988  * r13: tmp2
8989  * r14: tmp3
8990  * r15: tmp4
8991  * rbx: tmp5
8992  *
8993  */
8994 void MacroAssembler::square_to_len(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
8995 
8996   Label L_second_loop, L_second_loop_exit, L_third_loop, L_third_loop_exit, L_last_x, L_multiply;
8997   push(tmp1);
8998   push(tmp2);
8999   push(tmp3);
9000   push(tmp4);
9001   push(tmp5);
9002 
9003   // First loop
9004   // Store the squares, right shifted one bit (i.e., divided by 2).
9005   square_rshift(x, len, z, tmp1, tmp3, tmp4, tmp5, rdxReg, raxReg);
9006 
9007   // Add in off-diagonal sums.
9008   //
9009   // Second, third (nested) and fourth loops.
9010   // zlen +=2;
9011   // for (int xidx=len-2,zidx=zlen-4; xidx > 0; xidx-=2,zidx-=4) {
9012   //    carry = 0;
9013   //    long op2 = x[xidx:xidx+1];
9014   //    for (int j=xidx-2,k=zidx; j >= 0; j-=2) {
9015   //       k -= 2;
9016   //       long op1 = x[j:j+1];
9017   //       long sum = z[k:k+1];
9018   //       carry:sum = multiply_add_64(sum, op1, op2, carry, tmp_regs);
9019   //       z[k:k+1] = sum;
9020   //    }
9021   //    add_one_64(z, k, carry, tmp_regs);
9022   // }
9023 
9024   const Register carry = tmp5;
9025   const Register sum = tmp3;
9026   const Register op1 = tmp4;
9027   Register op2 = tmp2;
9028 
9029   push(zlen);
9030   push(len);
9031   addl(zlen,2);
9032   bind(L_second_loop);
9033   xorq(carry, carry);
9034   subl(zlen, 4);
9035   subl(len, 2);
9036   push(zlen);
9037   push(len);
9038   cmpl(len, 0);
9039   jccb(Assembler::lessEqual, L_second_loop_exit);
9040 
9041   // Multiply an array by one 64 bit long.
9042   if (UseBMI2Instructions) {
9043     op2 = rdxReg;
9044     movq(op2, Address(x, len, Address::times_4,  0));
9045     rorxq(op2, op2, 32);
9046   }
9047   else {
9048     movq(op2, Address(x, len, Address::times_4,  0));
9049     rorq(op2, 32);
9050   }
9051 
9052   bind(L_third_loop);
9053   decrementl(len);
9054   jccb(Assembler::negative, L_third_loop_exit);
9055   decrementl(len);
9056   jccb(Assembler::negative, L_last_x);
9057 
9058   movq(op1, Address(x, len, Address::times_4,  0));
9059   rorq(op1, 32);
9060 
9061   bind(L_multiply);
9062   subl(zlen, 2);
9063   movq(sum, Address(z, zlen, Address::times_4,  0));
9064 
9065   // Multiply 64 bit by 64 bit, add the lower 64 bits of the product to sum, and keep the upper 64 bits as carry.
9066   if (UseBMI2Instructions) {
9067     multiply_add_64_bmi2(sum, op1, op2, carry, tmp2);
9068   }
9069   else {
9070     multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
9071   }
9072 
9073   movq(Address(z, zlen, Address::times_4, 0), sum);
9074 
9075   jmp(L_third_loop);
9076   bind(L_third_loop_exit);
9077 
9078   // Fourth loop
9079   // Add 64 bit long carry into z with carry propagation.
9080   // Uses the adjusted zlen.
9081   add_one_64(z, zlen, carry, tmp1);
9082 
9083   pop(len);
9084   pop(zlen);
9085   jmp(L_second_loop);
9086 
9087   // The following infrequently executed code is placed outside the loops.
9088   bind(L_last_x);
9089   movl(op1, Address(x, 0));
9090   jmp(L_multiply);
9091 
9092   bind(L_second_loop_exit);
9093   pop(len);
9094   pop(zlen);
9095   pop(len);
9096   pop(zlen);
9097 
9098   // Fifth loop
9099   // Shift z left 1 bit.
9100   lshift_by_1(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4);
9101 
9102   // z[zlen-1] |= x[len-1] & 1;
9103   movl(tmp3, Address(x, len, Address::times_4, -4));
9104   andl(tmp3, 1);
9105   orl(Address(z, zlen, Address::times_4,  -4), tmp3);
9106 
9107   pop(tmp5);
9108   pop(tmp4);
9109   pop(tmp3);
9110   pop(tmp2);
9111   pop(tmp1);
9112 }
9113 
9114 /**
9115  * Helper function for mul_add()
9116  * Multiply in[] by int k and add to out[] starting at offset offs, using
9117  * 128 bit by 32 bit multiplies, and return the carry in tmp5.
9118  * Only the quad-int-aligned portion of in[] is operated on in this function.
9119  * k is in rdxReg when UseBMI2Instructions is true, otherwise it is in tmp2.
9120  * This function preserves the out, in and k registers.
9121  * len and offset point to the appropriate index in "in" and "out" respectively.
9122  * tmp5 holds the carry.
9123  * The other registers are temporaries and are modified.
9124  *
9125  */
9126 void MacroAssembler::mul_add_128_x_32_loop(Register out, Register in,
9127   Register offset, Register len, Register tmp1, Register tmp2, Register tmp3,
9128   Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
9129 
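  // Hedged reference sketch (not the emitted code): len/4 iterations, each one
  // folding four ints (two 64-bit limbs) of in[] into out[], with the
  // zero-extended 32-bit multiplier k held in op2:
  //   for (count = len >> 2; count > 0; count--) {
  //     len -= 4; offset -= 4;
  //     carry:out[offset+2 : offset+3] = out[offset+2 : offset+3] + in[len+2 : len+3]*k + carry;
  //     carry:out[offset   : offset+1] = out[offset   : offset+1] + in[len   : len+1]*k + carry;
  //   }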
9130   Label L_first_loop, L_first_loop_exit;
9131 
9132   movl(tmp1, len);
9133   shrl(tmp1, 2);
9134 
9135   bind(L_first_loop);
9136   subl(tmp1, 1);
9137   jccb(Assembler::negative, L_first_loop_exit);
9138 
9139   subl(len, 4);
9140   subl(offset, 4);
9141 
9142   Register op2 = tmp2;
9143   const Register sum = tmp3;
9144   const Register op1 = tmp4;
9145   const Register carry = tmp5;
9146 
9147   if (UseBMI2Instructions) {
9148     op2 = rdxReg;
9149   }
9150 
9151   movq(op1, Address(in, len, Address::times_4,  8));
9152   rorq(op1, 32);
9153   movq(sum, Address(out, offset, Address::times_4,  8));
9154   rorq(sum, 32);
9155   if (UseBMI2Instructions) {
9156     multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
9157   }
9158   else {
9159     multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
9160   }
9161   // Store back in big endian from little endian
9162   rorq(sum, 0x20);
9163   movq(Address(out, offset, Address::times_4,  8), sum);
9164 
9165   movq(op1, Address(in, len, Address::times_4,  0));
9166   rorq(op1, 32);
9167   movq(sum, Address(out, offset, Address::times_4,  0));
9168   rorq(sum, 32);
9169   if (UseBMI2Instructions) {
9170     multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
9171   }
9172   else {
9173     multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
9174   }
9175   // Store back in big endian from little endian
9176   rorq(sum, 0x20);
9177   movq(Address(out, offset, Address::times_4,  0), sum);
9178 
9179   jmp(L_first_loop);
9180   bind(L_first_loop_exit);
9181 }
9182 
9183 /**
9184  * Code for BigInteger::mulAdd() intrinsic
9185  *
9186  * rdi: out
9187  * rsi: in
9188  * r11: offs (out.length - offset)
9189  * rcx: len
9190  * r8:  k
9191  * r12: tmp1
9192  * r13: tmp2
9193  * r14: tmp3
9194  * r15: tmp4
9195  * rbx: tmp5
9196  * Multiply the in[] by word k and add to out[], return the carry in rax
9197  */
9198 void MacroAssembler::mul_add(Register out, Register in, Register offs,
9199    Register len, Register k, Register tmp1, Register tmp2, Register tmp3,
9200    Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
9201 
9202   Label L_carry, L_last_in, L_done;
9203 
9204 // carry = 0;
9205 // for (int j=len-1; j >= 0; j--) {
9206 //    long product = (in[j] & LONG_MASK) * kLong +
9207 //                   (out[offs] & LONG_MASK) + carry;
9208 //    out[offs--] = (int)product;
9209 //    carry = product >>> 32;
9210 // }
9211 //
9212   push(tmp1);
9213   push(tmp2);
9214   push(tmp3);
9215   push(tmp4);
9216   push(tmp5);
9217 
9218   Register op2 = tmp2;
9219   const Register sum = tmp3;
9220   const Register op1 = tmp4;
9221   const Register carry =  tmp5;
9222 
9223   if (UseBMI2Instructions) {
9224     op2 = rdxReg;
9225     movl(op2, k);
9226   }
9227   else {
9228     movl(op2, k);
9229   }
9230 
9231   xorq(carry, carry);
9232 
9233   // First loop
9234 
9235   // Multiply in[] by k in a 4-way unrolled loop using 128 bit by 32 bit multiplies.
9236   // The carry is in tmp5.
9237   mul_add_128_x_32_loop(out, in, offs, len, tmp1, tmp2, tmp3, tmp4, tmp5, rdxReg, raxReg);
9238 
9239   //Multiply the trailing in[] entry using 64 bit by 32 bit, if any
9240   decrementl(len);
9241   jccb(Assembler::negative, L_carry);
9242   decrementl(len);
9243   jccb(Assembler::negative, L_last_in);
9244 
9245   movq(op1, Address(in, len, Address::times_4,  0));
9246   rorq(op1, 32);
9247 
9248   subl(offs, 2);
9249   movq(sum, Address(out, offs, Address::times_4,  0));
9250   rorq(sum, 32);
9251 
9252   if (UseBMI2Instructions) {
9253     multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
9254   }
9255   else {
9256     multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
9257   }
9258 
9259   // Store back in big endian from little endian
9260   rorq(sum, 0x20);
9261   movq(Address(out, offs, Address::times_4,  0), sum);
9262 
9263   testl(len, len);
9264   jccb(Assembler::zero, L_carry);
9265 
9266   //Multiply the last in[] entry, if any
9267   bind(L_last_in);
9268   movl(op1, Address(in, 0));
9269   movl(sum, Address(out, offs, Address::times_4,  -4));
9270 
9271   movl(raxReg, k);
9272   mull(op1); //tmp4 * eax -> edx:eax
9273   addl(sum, carry);
9274   adcl(rdxReg, 0);
9275   addl(sum, raxReg);
9276   adcl(rdxReg, 0);
9277   movl(carry, rdxReg);
9278 
9279   movl(Address(out, offs, Address::times_4,  -4), sum);
9280 
9281   bind(L_carry);
9282   //return tmp5/carry as carry in rax
9283   movl(rax, carry);
9284 
9285   bind(L_done);
9286   pop(tmp5);
9287   pop(tmp4);
9288   pop(tmp3);
9289   pop(tmp2);
9290   pop(tmp1);
9291 }
9292 #endif
9293 
9294 /**
9295  * Emits code to update CRC-32 with a byte value according to constants in table
9296  *
9297  * @param [in,out]crc   Register containing the crc.
9298  * @param [in]val       Register containing the byte to fold into the CRC.
9299  * @param [in]table     Register containing the table of crc constants.
9300  *
9301  * uint32_t crc;
9302  * val = crc_table[(val ^ crc) & 0xFF];
9303  * crc = val ^ (crc >> 8);
9304  *
9305  */
9306 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
9307   xorl(val, crc);
9308   andl(val, 0xFF);
9309   shrl(crc, 8); // unsigned shift
9310   xorl(crc, Address(table, val, Address::times_4, 0));
9311 }
9312 
9313 /**
9314 * Fold four 128-bit data chunks
9315 */
9316 void MacroAssembler::fold_128bit_crc32_avx512(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset) {
9317   evpclmulhdq(xtmp, xK, xcrc, Assembler::AVX_512bit); // [127:64]
9318   evpclmulldq(xcrc, xK, xcrc, Assembler::AVX_512bit); // [63:0]
9319   evpxorq(xcrc, xcrc, Address(buf, offset), Assembler::AVX_512bit /* vector_len */);
9320   evpxorq(xcrc, xcrc, xtmp, Assembler::AVX_512bit /* vector_len */);
9321 }
9322 
9323 /**
9324  * Fold 128-bit data chunk
9325  */
9326 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset) {
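  // Hedged sketch of the fold (clmul = 64x64 carry-less multiply), assuming the
  // *hdq/*ldq helpers select the high/low quadwords of both operands:
  //   hi   = clmul(xK[127:64], xcrc[127:64]);
  //   lo   = clmul(xK[63:0],   xcrc[63:0]);
  //   xcrc = lo ^ hi ^ buf[offset .. offset+15];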
9327   if (UseAVX > 0) {
9328     vpclmulhdq(xtmp, xK, xcrc); // [127:64]
9329     vpclmulldq(xcrc, xK, xcrc); // [63:0]
9330     vpxor(xcrc, xcrc, Address(buf, offset), 0 /* vector_len */);
9331     pxor(xcrc, xtmp);
9332   } else {
9333     movdqa(xtmp, xcrc);
9334     pclmulhdq(xtmp, xK);   // [127:64]
9335     pclmulldq(xcrc, xK);   // [63:0]
9336     pxor(xcrc, xtmp);
9337     movdqu(xtmp, Address(buf, offset));
9338     pxor(xcrc, xtmp);
9339   }
9340 }
9341 
9342 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, XMMRegister xbuf) {
9343   if (UseAVX > 0) {
9344     vpclmulhdq(xtmp, xK, xcrc);
9345     vpclmulldq(xcrc, xK, xcrc);
9346     pxor(xcrc, xbuf);
9347     pxor(xcrc, xtmp);
9348   } else {
9349     movdqa(xtmp, xcrc);
9350     pclmulhdq(xtmp, xK);
9351     pclmulldq(xcrc, xK);
9352     pxor(xcrc, xbuf);
9353     pxor(xcrc, xtmp);
9354   }
9355 }
9356 
9357 /**
9358  * 8-bit folds to compute 32-bit CRC
9359  *
9360  * uint64_t xcrc;
9361  * timesXtoThe32[xcrc & 0xFF] ^ (xcrc >> 8);
9362  */
9363 void MacroAssembler::fold_8bit_crc32(XMMRegister xcrc, Register table, XMMRegister xtmp, Register tmp) {
9364   movdl(tmp, xcrc);
9365   andl(tmp, 0xFF);
9366   movdl(xtmp, Address(table, tmp, Address::times_4, 0));
9367   psrldq(xcrc, 1); // unsigned shift one byte
9368   pxor(xcrc, xtmp);
9369 }
9370 
9371 /**
9372  * uint32_t crc;
9373  * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
9374  */
9375 void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
9376   movl(tmp, crc);
9377   andl(tmp, 0xFF);
9378   shrl(crc, 8);
9379   xorl(crc, Address(table, tmp, Address::times_4, 0));
9380 }
9381 
9382 /**
9383  * @param crc   register containing existing CRC (32-bit)
9384  * @param buf   register pointing to input byte buffer (byte*)
9385  * @param len   register containing number of bytes
9386  * @param table register that will contain address of CRC table
9387  * @param tmp   scratch register
9388  */
9389 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, Register table, Register tmp) {
9390   assert_different_registers(crc, buf, len, table, tmp, rax);
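  // Rough outline (hedged) of the scheme below:
  //   consume bytes one at a time until buf is 16-byte aligned;
  //   fold 4 x 128-bit streams (512 bits) per iteration with carry-less multiplies;
  //   fold the 4 streams down to one 128-bit value, then fold any remaining
  //   16-byte chunks into it;
  //   reduce 128 bits down to 32 bits via a carry-less multiply step followed by
  //   8 byte-wise table folds;
  //   finish the tail with the byte-at-a-time table update.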
9391 
9392   Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned;
9393   Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop;
9394 
9395   // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
9396   // context for the registers used, where all instructions below are using 128-bit mode
9397   // On EVEX without VL and BW, these instructions will all be AVX.
9398   lea(table, ExternalAddress(StubRoutines::crc_table_addr()));
9399   notl(crc); // ~crc
9400   cmpl(len, 16);
9401   jcc(Assembler::less, L_tail);
9402 
9403   // Align buffer to 16 bytes
9404   movl(tmp, buf);
9405   andl(tmp, 0xF);
9406   jccb(Assembler::zero, L_aligned);
9407   subl(tmp,  16);
9408   addl(len, tmp);
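  // Hedged note: tmp now holds (buf & 0xF) - 16, i.e. minus the number of leading
  // bytes needed to reach a 16-byte boundary; len has already been reduced by that
  // count, and the loop below consumes those bytes until tmp counts up to zero.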
9409 
9410   align(4);
9411   BIND(L_align_loop);
9412   movsbl(rax, Address(buf, 0)); // load byte with sign extension
9413   update_byte_crc32(crc, rax, table);
9414   increment(buf);
9415   incrementl(tmp);
9416   jccb(Assembler::less, L_align_loop);
9417 
9418   BIND(L_aligned);
9419   movl(tmp, len); // save
9420   shrl(len, 4);
9421   jcc(Assembler::zero, L_tail_restore);
9422 
9423   // Fold total 512 bits of polynomial on each iteration
9424   if (VM_Version::supports_vpclmulqdq()) {
9425     Label Parallel_loop, L_No_Parallel;
9426 
9427     cmpl(len, 8);
9428     jccb(Assembler::less, L_No_Parallel);
9429 
9430     movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 32));
9431     evmovdquq(xmm1, Address(buf, 0), Assembler::AVX_512bit);
9432     movdl(xmm5, crc);
9433     evpxorq(xmm1, xmm1, xmm5, Assembler::AVX_512bit);
9434     addptr(buf, 64);
9435     subl(len, 7);
9436     evshufi64x2(xmm0, xmm0, xmm0, 0x00, Assembler::AVX_512bit); //propagate the mask from 128 bits to 512 bits
9437 
9438     BIND(Parallel_loop);
9439     fold_128bit_crc32_avx512(xmm1, xmm0, xmm5, buf, 0);
9440     addptr(buf, 64);
9441     subl(len, 4);
9442     jcc(Assembler::greater, Parallel_loop);
9443 
9444     vextracti64x2(xmm2, xmm1, 0x01);
9445     vextracti64x2(xmm3, xmm1, 0x02);
9446     vextracti64x2(xmm4, xmm1, 0x03);
9447     jmp(L_fold_512b);
9448 
9449     BIND(L_No_Parallel);
9450   }
9451   // Fold crc into first bytes of vector
9452   movdqa(xmm1, Address(buf, 0));
9453   movdl(rax, xmm1);
9454   xorl(crc, rax);
9455   if (VM_Version::supports_sse4_1()) {
9456     pinsrd(xmm1, crc, 0);
9457   } else {
9458     pinsrw(xmm1, crc, 0);
9459     shrl(crc, 16);
9460     pinsrw(xmm1, crc, 1);
9461   }
9462   addptr(buf, 16);
9463   subl(len, 4); // len > 0
9464   jcc(Assembler::less, L_fold_tail);
9465 
9466   movdqa(xmm2, Address(buf,  0));
9467   movdqa(xmm3, Address(buf, 16));
9468   movdqa(xmm4, Address(buf, 32));
9469   addptr(buf, 48);
9470   subl(len, 3);
9471   jcc(Assembler::lessEqual, L_fold_512b);
9472 
9473   // Fold total 512 bits of polynomial on each iteration,
9474   // 128 bits per each of 4 parallel streams.
9475   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 32));
9476 
9477   align(32);
9478   BIND(L_fold_512b_loop);
9479   fold_128bit_crc32(xmm1, xmm0, xmm5, buf,  0);
9480   fold_128bit_crc32(xmm2, xmm0, xmm5, buf, 16);
9481   fold_128bit_crc32(xmm3, xmm0, xmm5, buf, 32);
9482   fold_128bit_crc32(xmm4, xmm0, xmm5, buf, 48);
9483   addptr(buf, 64);
9484   subl(len, 4);
9485   jcc(Assembler::greater, L_fold_512b_loop);
9486 
9487   // Fold 512 bits to 128 bits.
9488   BIND(L_fold_512b);
9489   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16));
9490   fold_128bit_crc32(xmm1, xmm0, xmm5, xmm2);
9491   fold_128bit_crc32(xmm1, xmm0, xmm5, xmm3);
9492   fold_128bit_crc32(xmm1, xmm0, xmm5, xmm4);
9493 
9494   // Fold the rest of 128 bits data chunks
9495   BIND(L_fold_tail);
9496   addl(len, 3);
9497   jccb(Assembler::lessEqual, L_fold_128b);
9498   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16));
9499 
9500   BIND(L_fold_tail_loop);
9501   fold_128bit_crc32(xmm1, xmm0, xmm5, buf,  0);
9502   addptr(buf, 16);
9503   decrementl(len);
9504   jccb(Assembler::greater, L_fold_tail_loop);
9505 
9506   // Fold 128 bits in xmm1 down into 32 bits in crc register.
9507   BIND(L_fold_128b);
9508   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr()));
9509   if (UseAVX > 0) {
9510     vpclmulqdq(xmm2, xmm0, xmm1, 0x1);
9511     vpand(xmm3, xmm0, xmm2, 0 /* vector_len */);
9512     vpclmulqdq(xmm0, xmm0, xmm3, 0x1);
9513   } else {
9514     movdqa(xmm2, xmm0);
9515     pclmulqdq(xmm2, xmm1, 0x1);
9516     movdqa(xmm3, xmm0);
9517     pand(xmm3, xmm2);
9518     pclmulqdq(xmm0, xmm3, 0x1);
9519   }
9520   psrldq(xmm1, 8);
9521   psrldq(xmm2, 4);
9522   pxor(xmm0, xmm1);
9523   pxor(xmm0, xmm2);
9524 
9525   // 8 8-bit folds to compute 32-bit CRC.
9526   for (int j = 0; j < 4; j++) {
9527     fold_8bit_crc32(xmm0, table, xmm1, rax);
9528   }
9529   movdl(crc, xmm0); // mov 32 bits to general register
9530   for (int j = 0; j < 4; j++) {
9531     fold_8bit_crc32(crc, table, rax);
9532   }
9533 
9534   BIND(L_tail_restore);
9535   movl(len, tmp); // restore
9536   BIND(L_tail);
9537   andl(len, 0xf);
9538   jccb(Assembler::zero, L_exit);
9539 
9540   // Fold the rest of bytes
9541   align(4);
9542   BIND(L_tail_loop);
9543   movsbl(rax, Address(buf, 0)); // load byte with sign extension
9544   update_byte_crc32(crc, rax, table);
9545   increment(buf);
9546   decrementl(len);
9547   jccb(Assembler::greater, L_tail_loop);
9548 
9549   BIND(L_exit);
9550   notl(crc); // ~crc
9551 }
9552 
9553 #ifdef _LP64
9554 // S. Gueron / Information Processing Letters 112 (2012) 184
9555 // Algorithm 4: Computing carry-less multiplication using a precomputed lookup table.
9556 // Input: A 32 bit value B = [byte3, byte2, byte1, byte0].
9557 // Output: the 64-bit carry-less product of B * CONST
9558 void MacroAssembler::crc32c_ipl_alg4(Register in, uint32_t n,
9559                                      Register tmp1, Register tmp2, Register tmp3) {
9560   lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr()));
9561   if (n > 0) {
9562     addq(tmp3, n * 256 * 8);
9563   }
9564   //    Q1 = TABLEExt[n][B & 0xFF];
9565   movl(tmp1, in);
9566   andl(tmp1, 0x000000FF);
9567   shll(tmp1, 3);
9568   addq(tmp1, tmp3);
9569   movq(tmp1, Address(tmp1, 0));
9570 
9571   //    Q2 = TABLEExt[n][B >> 8 & 0xFF];
9572   movl(tmp2, in);
9573   shrl(tmp2, 8);
9574   andl(tmp2, 0x000000FF);
9575   shll(tmp2, 3);
9576   addq(tmp2, tmp3);
9577   movq(tmp2, Address(tmp2, 0));
9578 
9579   shlq(tmp2, 8);
9580   xorq(tmp1, tmp2);
9581 
9582   //    Q3 = TABLEExt[n][B >> 16 & 0xFF];
9583   movl(tmp2, in);
9584   shrl(tmp2, 16);
9585   andl(tmp2, 0x000000FF);
9586   shll(tmp2, 3);
9587   addq(tmp2, tmp3);
9588   movq(tmp2, Address(tmp2, 0));
9589 
9590   shlq(tmp2, 16);
9591   xorq(tmp1, tmp2);
9592 
9593   //    Q4 = TABLEExt[n][B >> 24 & 0xFF];
9594   shrl(in, 24);
9595   andl(in, 0x000000FF);
9596   shll(in, 3);
9597   addq(in, tmp3);
9598   movq(in, Address(in, 0));
9599 
9600   shlq(in, 24);
9601   xorq(in, tmp1);
9602   //    return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24;
9603 }
9604 
9605 void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1,
9606                                       Register in_out,
9607                                       uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
9608                                       XMMRegister w_xtmp2,
9609                                       Register tmp1,
9610                                       Register n_tmp2, Register n_tmp3) {
9611   if (is_pclmulqdq_supported) {
9612     movdl(w_xtmp1, in_out); // modified blindly
9613 
9614     movl(tmp1, const_or_pre_comp_const_index);
9615     movdl(w_xtmp2, tmp1);
9616     pclmulqdq(w_xtmp1, w_xtmp2, 0);
9617 
9618     movdq(in_out, w_xtmp1);
9619   } else {
9620     crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3);
9621   }
9622 }
9623 
9624 // Recombination Alternative 2: No bit-reflections
9625 // T1 = (CRC_A * U1) << 1
9626 // T2 = (CRC_B * U2) << 1
9627 // C1 = T1 >> 32
9628 // C2 = T2 >> 32
9629 // T1 = T1 & 0xFFFFFFFF
9630 // T2 = T2 & 0xFFFFFFFF
9631 // T1 = CRC32(0, T1)
9632 // T2 = CRC32(0, T2)
9633 // C1 = C1 ^ T1
9634 // C2 = C2 ^ T2
9635 // CRC = C1 ^ C2 ^ CRC_C
9636 void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
9637                                      XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
9638                                      Register tmp1, Register tmp2,
9639                                      Register n_tmp3) {
9640   crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
9641   crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
9642   shlq(in_out, 1);
9643   movl(tmp1, in_out);
9644   shrq(in_out, 32);
9645   xorl(tmp2, tmp2);
9646   crc32(tmp2, tmp1, 4);
9647   xorl(in_out, tmp2); // we don't care about upper 32 bit contents here
9648   shlq(in1, 1);
9649   movl(tmp1, in1);
9650   shrq(in1, 32);
9651   xorl(tmp2, tmp2);
9652   crc32(tmp2, tmp1, 4);
9653   xorl(in1, tmp2);
9654   xorl(in_out, in1);
9655   xorl(in_out, in2);
9656 }
9657 
9658 // Set N to predefined value
9659 // Subtract it from the length of the buffer
9660 // execute in a loop:
9661 // CRC_A = 0xFFFFFFFF, CRC_B = 0, CRC_C = 0
9662 // for i = 1 to N do
9663 //  CRC_A = CRC32(CRC_A, A[i])
9664 //  CRC_B = CRC32(CRC_B, B[i])
9665 //  CRC_C = CRC32(CRC_C, C[i])
9666 // end for
9667 // Recombine
9668 void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
9669                                        Register in_out1, Register in_out2, Register in_out3,
9670                                        Register tmp1, Register tmp2, Register tmp3,
9671                                        XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
9672                                        Register tmp4, Register tmp5,
9673                                        Register n_tmp6) {
9674   Label L_processPartitions;
9675   Label L_processPartition;
9676   Label L_exit;
9677 
9678   bind(L_processPartitions);
9679   cmpl(in_out1, 3 * size);
9680   jcc(Assembler::less, L_exit);
9681     xorl(tmp1, tmp1);
9682     xorl(tmp2, tmp2);
9683     movq(tmp3, in_out2);
9684     addq(tmp3, size);
9685 
9686     bind(L_processPartition);
9687       crc32(in_out3, Address(in_out2, 0), 8);
9688       crc32(tmp1, Address(in_out2, size), 8);
9689       crc32(tmp2, Address(in_out2, size * 2), 8);
9690       addq(in_out2, 8);
9691       cmpq(in_out2, tmp3);
9692       jcc(Assembler::less, L_processPartition);
9693     crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2,
9694             w_xtmp1, w_xtmp2, w_xtmp3,
9695             tmp4, tmp5,
9696             n_tmp6);
9697     addq(in_out2, 2 * size);
9698     subl(in_out1, 3 * size);
9699     jmp(L_processPartitions);
9700 
9701   bind(L_exit);
9702 }
9703 #else
9704 void MacroAssembler::crc32c_ipl_alg4(Register in_out, uint32_t n,
9705                                      Register tmp1, Register tmp2, Register tmp3,
9706                                      XMMRegister xtmp1, XMMRegister xtmp2) {
9707   lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr()));
9708   if (n > 0) {
9709     addl(tmp3, n * 256 * 8);
9710   }
9711   //    Q1 = TABLEExt[n][B & 0xFF];
9712   movl(tmp1, in_out);
9713   andl(tmp1, 0x000000FF);
9714   shll(tmp1, 3);
9715   addl(tmp1, tmp3);
9716   movq(xtmp1, Address(tmp1, 0));
9717 
9718   //    Q2 = TABLEExt[n][B >> 8 & 0xFF];
9719   movl(tmp2, in_out);
9720   shrl(tmp2, 8);
9721   andl(tmp2, 0x000000FF);
9722   shll(tmp2, 3);
9723   addl(tmp2, tmp3);
9724   movq(xtmp2, Address(tmp2, 0));
9725 
9726   psllq(xtmp2, 8);
9727   pxor(xtmp1, xtmp2);
9728 
9729   //    Q3 = TABLEExt[n][B >> 16 & 0xFF];
9730   movl(tmp2, in_out);
9731   shrl(tmp2, 16);
9732   andl(tmp2, 0x000000FF);
9733   shll(tmp2, 3);
9734   addl(tmp2, tmp3);
9735   movq(xtmp2, Address(tmp2, 0));
9736 
9737   psllq(xtmp2, 16);
9738   pxor(xtmp1, xtmp2);
9739 
9740   //    Q4 = TABLEExt[n][B >> 24 & 0xFF];
9741   shrl(in_out, 24);
9742   andl(in_out, 0x000000FF);
9743   shll(in_out, 3);
9744   addl(in_out, tmp3);
9745   movq(xtmp2, Address(in_out, 0));
9746 
9747   psllq(xtmp2, 24);
9748   pxor(xtmp1, xtmp2); // Result in CXMM
9749   //    return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24;
9750 }
9751 
9752 void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1,
9753                                       Register in_out,
9754                                       uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
9755                                       XMMRegister w_xtmp2,
9756                                       Register tmp1,
9757                                       Register n_tmp2, Register n_tmp3) {
9758   if (is_pclmulqdq_supported) {
9759     movdl(w_xtmp1, in_out);
9760 
9761     movl(tmp1, const_or_pre_comp_const_index);
9762     movdl(w_xtmp2, tmp1);
9763     pclmulqdq(w_xtmp1, w_xtmp2, 0);
9764     // Keep result in XMM since GPR is 32 bit in length
9765   } else {
9766     crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3, w_xtmp1, w_xtmp2);
9767   }
9768 }
9769 
9770 void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
9771                                      XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
9772                                      Register tmp1, Register tmp2,
9773                                      Register n_tmp3) {
9774   crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
9775   crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
9776 
9777   psllq(w_xtmp1, 1);
9778   movdl(tmp1, w_xtmp1);
9779   psrlq(w_xtmp1, 32);
9780   movdl(in_out, w_xtmp1);
9781 
9782   xorl(tmp2, tmp2);
9783   crc32(tmp2, tmp1, 4);
9784   xorl(in_out, tmp2);
9785 
9786   psllq(w_xtmp2, 1);
9787   movdl(tmp1, w_xtmp2);
9788   psrlq(w_xtmp2, 32);
9789   movdl(in1, w_xtmp2);
9790 
9791   xorl(tmp2, tmp2);
9792   crc32(tmp2, tmp1, 4);
9793   xorl(in1, tmp2);
9794   xorl(in_out, in1);
9795   xorl(in_out, in2);
9796 }
9797 
9798 void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
9799                                        Register in_out1, Register in_out2, Register in_out3,
9800                                        Register tmp1, Register tmp2, Register tmp3,
9801                                        XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
9802                                        Register tmp4, Register tmp5,
9803                                        Register n_tmp6) {
9804   Label L_processPartitions;
9805   Label L_processPartition;
9806   Label L_exit;
9807 
9808   bind(L_processPartitions);
9809   cmpl(in_out1, 3 * size);
9810   jcc(Assembler::less, L_exit);
9811     xorl(tmp1, tmp1);
9812     xorl(tmp2, tmp2);
9813     movl(tmp3, in_out2);
9814     addl(tmp3, size);
9815 
9816     bind(L_processPartition);
9817       crc32(in_out3, Address(in_out2, 0), 4);
9818       crc32(tmp1, Address(in_out2, size), 4);
9819       crc32(tmp2, Address(in_out2, size*2), 4);
9820       crc32(in_out3, Address(in_out2, 0+4), 4);
9821       crc32(tmp1, Address(in_out2, size+4), 4);
9822       crc32(tmp2, Address(in_out2, size*2+4), 4);
9823       addl(in_out2, 8);
9824       cmpl(in_out2, tmp3);
9825       jcc(Assembler::less, L_processPartition);
9826 
9827         push(tmp3);
9828         push(in_out1);
9829         push(in_out2);
9830         tmp4 = tmp3;
9831         tmp5 = in_out1;
9832         n_tmp6 = in_out2;
9833 
9834       crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2,
9835             w_xtmp1, w_xtmp2, w_xtmp3,
9836             tmp4, tmp5,
9837             n_tmp6);
9838 
9839         pop(in_out2);
9840         pop(in_out1);
9841         pop(tmp3);
9842 
9843     addl(in_out2, 2 * size);
9844     subl(in_out1, 3 * size);
9845     jmp(L_processPartitions);
9846 
9847   bind(L_exit);
9848 }
9849 #endif //LP64
9850 
9851 #ifdef _LP64
9852 // Algorithm 2: Pipelined usage of the CRC32 instruction.
9853 // Input: A buffer I of L bytes.
9854 // Output: the CRC32C value of the buffer.
9855 // Notations:
9856 // Write L = 24N + r, with N = floor (L/24).
9857 // r = L mod 24 (0 <= r < 24).
9858 // Consider I as the concatenation of A|B|C|R, where A, B and C each consist of
9859 // N quadwords, and R consists of r bytes.
9860 // A[j] = I [8j+7:8j], j= 0, 1, ..., N-1
9861 // B[j] = I [N + 8j+7:N + 8j], j= 0, 1, ..., N-1
9862 // C[j] = I [2N + 8j+7:2N + 8j], j= 0, 1, ..., N-1
9863 // if r > 0 R[j] = I [3N +j], j= 0, 1, ...,r-1
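// Hedged sketch of how this maps onto the code below: three crc32c_proc_chunk
// calls consume the A|B|C partitions for the CRC32C_HIGH, CRC32C_MIDDLE and
// CRC32C_LOW chunk sizes (each computing CRC_A/CRC_B/CRC_C in parallel and
// recombining them), then the remaining bytes are folded word by word and
// finally byte by byte with the crc32 instruction.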
9864 void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
9865                                           Register tmp1, Register tmp2, Register tmp3,
9866                                           Register tmp4, Register tmp5, Register tmp6,
9867                                           XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
9868                                           bool is_pclmulqdq_supported) {
9869   uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS];
9870   Label L_wordByWord;
9871   Label L_byteByByteProlog;
9872   Label L_byteByByte;
9873   Label L_exit;
9874 
9875   if (is_pclmulqdq_supported ) {
9876     const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr;
9877     const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr+1);
9878 
9879     const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2);
9880     const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3);
9881 
9882     const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4);
9883     const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5);
9884     assert((CRC32C_NUM_PRECOMPUTED_CONSTANTS - 1 ) == 5, "Checking whether you declared all of the constants based on the number of \"chunks\"");
9885   } else {
9886     const_or_pre_comp_const_index[0] = 1;
9887     const_or_pre_comp_const_index[1] = 0;
9888 
9889     const_or_pre_comp_const_index[2] = 3;
9890     const_or_pre_comp_const_index[3] = 2;
9891 
9892     const_or_pre_comp_const_index[4] = 5;
9893     const_or_pre_comp_const_index[5] = 4;
9894    }
9895   crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported,
9896                     in2, in1, in_out,
9897                     tmp1, tmp2, tmp3,
9898                     w_xtmp1, w_xtmp2, w_xtmp3,
9899                     tmp4, tmp5,
9900                     tmp6);
9901   crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported,
9902                     in2, in1, in_out,
9903                     tmp1, tmp2, tmp3,
9904                     w_xtmp1, w_xtmp2, w_xtmp3,
9905                     tmp4, tmp5,
9906                     tmp6);
9907   crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported,
9908                     in2, in1, in_out,
9909                     tmp1, tmp2, tmp3,
9910                     w_xtmp1, w_xtmp2, w_xtmp3,
9911                     tmp4, tmp5,
9912                     tmp6);
9913   movl(tmp1, in2);
9914   andl(tmp1, 0x00000007);
9915   negl(tmp1);
9916   addl(tmp1, in2);
9917   addq(tmp1, in1);
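  // Hedged note: tmp1 = in1 + (in2 & ~7), the end address for the word-by-word
  // loop; the remaining (in2 & 7) bytes are handled by the byte-by-byte loop.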
9918 
9919   BIND(L_wordByWord);
9920   cmpq(in1, tmp1);
9921   jcc(Assembler::greaterEqual, L_byteByByteProlog);
9922     crc32(in_out, Address(in1, 0), 4);
9923     addq(in1, 4);
9924     jmp(L_wordByWord);
9925 
9926   BIND(L_byteByByteProlog);
9927   andl(in2, 0x00000007);
9928   movl(tmp2, 1);
9929 
9930   BIND(L_byteByByte);
9931   cmpl(tmp2, in2);
9932   jccb(Assembler::greater, L_exit);
9933     crc32(in_out, Address(in1, 0), 1);
9934     incq(in1);
9935     incl(tmp2);
9936     jmp(L_byteByByte);
9937 
9938   BIND(L_exit);
9939 }
9940 #else
9941 void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
9942                                           Register tmp1, Register  tmp2, Register tmp3,
9943                                           Register tmp4, Register  tmp5, Register tmp6,
9944                                           XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
9945                                           bool is_pclmulqdq_supported) {
9946   uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS];
9947   Label L_wordByWord;
9948   Label L_byteByByteProlog;
9949   Label L_byteByByte;
9950   Label L_exit;
9951 
9952   if (is_pclmulqdq_supported) {
9953     const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr;
9954     const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 1);
9955 
9956     const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2);
9957     const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3);
9958 
9959     const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4);
9960     const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5);
9961   } else {
9962     const_or_pre_comp_const_index[0] = 1;
9963     const_or_pre_comp_const_index[1] = 0;
9964 
9965     const_or_pre_comp_const_index[2] = 3;
9966     const_or_pre_comp_const_index[3] = 2;
9967 
9968     const_or_pre_comp_const_index[4] = 5;
9969     const_or_pre_comp_const_index[5] = 4;
9970   }
9971   crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported,
9972                     in2, in1, in_out,
9973                     tmp1, tmp2, tmp3,
9974                     w_xtmp1, w_xtmp2, w_xtmp3,
9975                     tmp4, tmp5,
9976                     tmp6);
9977   crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported,
9978                     in2, in1, in_out,
9979                     tmp1, tmp2, tmp3,
9980                     w_xtmp1, w_xtmp2, w_xtmp3,
9981                     tmp4, tmp5,
9982                     tmp6);
9983   crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported,
9984                     in2, in1, in_out,
9985                     tmp1, tmp2, tmp3,
9986                     w_xtmp1, w_xtmp2, w_xtmp3,
9987                     tmp4, tmp5,
9988                     tmp6);
9989   movl(tmp1, in2);
9990   andl(tmp1, 0x00000007);
9991   negl(tmp1);
9992   addl(tmp1, in2);
9993   addl(tmp1, in1);
9994 
9995   BIND(L_wordByWord);
9996   cmpl(in1, tmp1);
9997   jcc(Assembler::greaterEqual, L_byteByByteProlog);
9998     crc32(in_out, Address(in1,0), 4);
9999     addl(in1, 4);
10000     jmp(L_wordByWord);
10001 
10002   BIND(L_byteByByteProlog);
10003   andl(in2, 0x00000007);
10004   movl(tmp2, 1);
10005 
10006   BIND(L_byteByByte);
10007   cmpl(tmp2, in2);
10008   jccb(Assembler::greater, L_exit);
10009     movb(tmp1, Address(in1, 0));
10010     crc32(in_out, tmp1, 1);
10011     incl(in1);
10012     incl(tmp2);
10013     jmp(L_byteByByte);
10014 
10015   BIND(L_exit);
10016 }
10017 #endif // LP64
10018 #undef BIND
10019 #undef BLOCK_COMMENT
10020 
10021 // Compress char[] array to byte[].
10022 //   ..\jdk\src\java.base\share\classes\java\lang\StringUTF16.java
10023 //   @HotSpotIntrinsicCandidate
10024 //   private static int compress(char[] src, int srcOff, byte[] dst, int dstOff, int len) {
10025 //     for (int i = 0; i < len; i++) {
10026 //       int c = src[srcOff++];
10027 //       if (c >>> 8 != 0) {
10028 //         return 0;
10029 //       }
10030 //       dst[dstOff++] = (byte)c;
10031 //     }
10032 //     return len;
10033 //   }
10034 void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
10035   XMMRegister tmp1Reg, XMMRegister tmp2Reg,
10036   XMMRegister tmp3Reg, XMMRegister tmp4Reg,
10037   Register tmp5, Register result) {
10038   Label copy_chars_loop, return_length, return_zero, done;
10039 
10040   // rsi: src
10041   // rdi: dst
10042   // rdx: len
10043   // rcx: tmp5
10044   // rax: result
10045 
10046   // rsi holds start addr of source char[] to be compressed
10047   // rdi holds start addr of destination byte[]
10048   // rdx holds length
10049 
10050   assert(len != result, "");
10051 
10052   // save length for return
10053   push(len);
10054 
10055   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
10056     VM_Version::supports_avx512vlbw() &&
10057     VM_Version::supports_bmi2()) {
10058 
10059     Label copy_32_loop, copy_loop_tail, below_threshold;
10060 
10061     // alignment
10062     Label post_alignment;
10063 
10064     // if the length of the string is less than 32, handle it the old-fashioned way
10065     testl(len, -32);
10066     jcc(Assembler::zero, below_threshold);
10067 
10068     // First check whether a character is compressible (<= 0xFF).
10069     // Create mask to test for Unicode chars inside zmm vector
10070     movl(result, 0x00FF);
10071     evpbroadcastw(tmp2Reg, result, Assembler::AVX_512bit);
10072 
10073     testl(len, -64);
10074     jcc(Assembler::zero, post_alignment);
10075 
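    // Hedged note: the next four instructions compute tmp5 = (-dst) & 31, i.e. the
    // number of byte elements needed to bring dst up to a 32-byte boundary
    // (0 if dst is already aligned).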
10076     movl(tmp5, dst);
10077     andl(tmp5, (32 - 1));
10078     negl(tmp5);
10079     andl(tmp5, (32 - 1));
10080 
10081     // bail out when there is nothing to be done
10082     testl(tmp5, 0xFFFFFFFF);
10083     jcc(Assembler::zero, post_alignment);
10084 
10085     // ~(~0 << tmp5), where tmp5 is the # of elements to process for alignment
10086     movl(result, 0xFFFFFFFF);
10087     shlxl(result, result, tmp5);
10088     notl(result);
10089     kmovdl(k3, result);
10090 
10091     evmovdquw(tmp1Reg, k3, Address(src, 0), Assembler::AVX_512bit);
10092     evpcmpuw(k2, k3, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
10093     ktestd(k2, k3);
10094     jcc(Assembler::carryClear, return_zero);
10095 
10096     evpmovwb(Address(dst, 0), k3, tmp1Reg, Assembler::AVX_512bit);
10097 
10098     addptr(src, tmp5);
10099     addptr(src, tmp5);
10100     addptr(dst, tmp5);
10101     subl(len, tmp5);
10102 
10103     bind(post_alignment);
10104     // end of alignment
10105 
10106     movl(tmp5, len);
10107     andl(tmp5, (32 - 1));    // tail count (in chars)
10108     andl(len, ~(32 - 1));    // vector count (in chars)
10109     jcc(Assembler::zero, copy_loop_tail);
10110 
10111     lea(src, Address(src, len, Address::times_2));
10112     lea(dst, Address(dst, len, Address::times_1));
10113     negptr(len);
10114 
10115     bind(copy_32_loop);
10116     evmovdquw(tmp1Reg, Address(src, len, Address::times_2), Assembler::AVX_512bit);
10117     evpcmpuw(k2, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
10118     kortestdl(k2, k2);
10119     jcc(Assembler::carryClear, return_zero);
10120 
10121     // All elements in the current processed chunk are valid candidates for
10122     // compression. Write the truncated byte elements to memory.
10123     evpmovwb(Address(dst, len, Address::times_1), tmp1Reg, Assembler::AVX_512bit);
10124     addptr(len, 32);
10125     jcc(Assembler::notZero, copy_32_loop);
10126 
10127     bind(copy_loop_tail);
10128     // bail out when there is nothing to be done
10129     testl(tmp5, 0xFFFFFFFF);
10130     jcc(Assembler::zero, return_length);
10131 
10132     movl(len, tmp5);
10133 
10134     // ~(~0 << len), where len is the # of remaining elements to process
10135     movl(result, 0xFFFFFFFF);
10136     shlxl(result, result, len);
10137     notl(result);
10138 
10139     kmovdl(k3, result);
10140 
10141     evmovdquw(tmp1Reg, k3, Address(src, 0), Assembler::AVX_512bit);
10142     evpcmpuw(k2, k3, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
10143     ktestd(k2, k3);
10144     jcc(Assembler::carryClear, return_zero);
10145 
10146     evpmovwb(Address(dst, 0), k3, tmp1Reg, Assembler::AVX_512bit);
10147     jmp(return_length);
10148 
10149     bind(below_threshold);
10150   }
10151 
10152   if (UseSSE42Intrinsics) {
10153     Label copy_32_loop, copy_16, copy_tail;
10154 
10155     movl(result, len);
10156 
10157     movl(tmp5, 0xff00ff00);   // create mask to test for Unicode chars in vectors
10158 
10159     // vectored compression
10160     andl(len, 0xfffffff0);    // vector count (in chars)
10161     andl(result, 0x0000000f);    // tail count (in chars)
10162     testl(len, len);
10163     jcc(Assembler::zero, copy_16);
10164 
10165     // compress 16 chars per iter
10166     movdl(tmp1Reg, tmp5);
10167     pshufd(tmp1Reg, tmp1Reg, 0);   // store Unicode mask in tmp1Reg
10168     pxor(tmp4Reg, tmp4Reg);
10169 
10170     lea(src, Address(src, len, Address::times_2));
10171     lea(dst, Address(dst, len, Address::times_1));
10172     negptr(len);
10173 
10174     bind(copy_32_loop);
10175     movdqu(tmp2Reg, Address(src, len, Address::times_2));     // load 1st 8 characters
10176     por(tmp4Reg, tmp2Reg);
10177     movdqu(tmp3Reg, Address(src, len, Address::times_2, 16)); // load next 8 characters
10178     por(tmp4Reg, tmp3Reg);
10179     ptest(tmp4Reg, tmp1Reg);       // check for Unicode chars in next vector
10180     jcc(Assembler::notZero, return_zero);
10181     packuswb(tmp2Reg, tmp3Reg);    // only ASCII chars; compress each to 1 byte
10182     movdqu(Address(dst, len, Address::times_1), tmp2Reg);
10183     addptr(len, 16);
10184     jcc(Assembler::notZero, copy_32_loop);
10185 
10186     // compress next vector of 8 chars (if any)
10187     bind(copy_16);
10188     movl(len, result);
10189     andl(len, 0xfffffff8);    // vector count (in chars)
10190     andl(result, 0x00000007);    // tail count (in chars)
10191     testl(len, len);
10192     jccb(Assembler::zero, copy_tail);
10193 
10194     movdl(tmp1Reg, tmp5);
10195     pshufd(tmp1Reg, tmp1Reg, 0);   // store Unicode mask in tmp1Reg
10196     pxor(tmp3Reg, tmp3Reg);
10197 
10198     movdqu(tmp2Reg, Address(src, 0));
10199     ptest(tmp2Reg, tmp1Reg);       // check for Unicode chars in vector
10200     jccb(Assembler::notZero, return_zero);
10201     packuswb(tmp2Reg, tmp3Reg);    // only LATIN1 chars; compress each to 1 byte
10202     movq(Address(dst, 0), tmp2Reg);
10203     addptr(src, 16);
10204     addptr(dst, 8);
10205 
10206     bind(copy_tail);
10207     movl(len, result);
10208   }
10209   // compress 1 char per iter
10210   testl(len, len);
10211   jccb(Assembler::zero, return_length);
10212   lea(src, Address(src, len, Address::times_2));
10213   lea(dst, Address(dst, len, Address::times_1));
10214   negptr(len);
10215 
10216   bind(copy_chars_loop);
10217   load_unsigned_short(result, Address(src, len, Address::times_2));
10218   testl(result, 0xff00);      // check if Unicode char
10219   jccb(Assembler::notZero, return_zero);
10220   movb(Address(dst, len, Address::times_1), result);  // ASCII char; compress to 1 byte
10221   increment(len);
10222   jcc(Assembler::notZero, copy_chars_loop);
10223 
10224   // if compression succeeded, return length
10225   bind(return_length);
10226   pop(result);
10227   jmpb(done);
10228 
10229   // if compression failed, return 0
10230   bind(return_zero);
10231   xorl(result, result);
10232   addptr(rsp, wordSize);
10233 
10234   bind(done);
10235 }
10236 
10237 // Inflate byte[] array to char[].
10238 //   ..\jdk\src\java.base\share\classes\java\lang\StringLatin1.java
10239 //   @HotSpotIntrinsicCandidate
10240 //   private static void inflate(byte[] src, int srcOff, char[] dst, int dstOff, int len) {
10241 //     for (int i = 0; i < len; i++) {
10242 //       dst[dstOff++] = (char)(src[srcOff++] & 0xff);
10243 //     }
10244 //   }
10245 void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
10246   XMMRegister tmp1, Register tmp2) {
10247   Label copy_chars_loop, done, below_threshold, avx3_threshold;
10248   // rsi: src
10249   // rdi: dst
10250   // rdx: len
10251   // rcx: tmp2
10252 
10253   // rsi holds start addr of source byte[] to be inflated
10254   // rdi holds start addr of destination char[]
10255   // rdx holds length
10256   assert_different_registers(src, dst, len, tmp2);
10257   movl(tmp2, len);
10258   if ((UseAVX > 2) && // AVX512
10259     VM_Version::supports_avx512vlbw() &&
10260     VM_Version::supports_bmi2()) {
10261 
10262     Label copy_32_loop, copy_tail;
10263     Register tmp3_aliased = len;
10264 
10265     // if the length of the string is less than 16, handle it the old-fashioned way
10266     testl(len, -16);
10267     jcc(Assembler::zero, below_threshold);
10268 
10269     testl(len, -1 * AVX3Threshold);
10270     jcc(Assembler::zero, avx3_threshold);
10271 
10272     // In order to use only one arithmetic operation for the main loop we use
10273     // this pre-calculation
10274     andl(tmp2, (32 - 1)); // tail count (in chars), 32 element wide loop
10275     andl(len, -32);     // vector count
10276     jccb(Assembler::zero, copy_tail);
10277 
10278     lea(src, Address(src, len, Address::times_1));
10279     lea(dst, Address(dst, len, Address::times_2));
10280     negptr(len);
10281 
10282 
10283     // inflate 32 chars per iter
10284     bind(copy_32_loop);
10285     vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_512bit);
10286     evmovdquw(Address(dst, len, Address::times_2), tmp1, Assembler::AVX_512bit);
10287     addptr(len, 32);
10288     jcc(Assembler::notZero, copy_32_loop);
10289 
10290     bind(copy_tail);
10291     // bail out when there is nothing to be done
10292     testl(tmp2, -1); // we don't destroy the contents of tmp2 here
10293     jcc(Assembler::zero, done);
10294 
10295     // ~(~0 << length), where length is the # of remaining elements to process
10296     movl(tmp3_aliased, -1);
10297     shlxl(tmp3_aliased, tmp3_aliased, tmp2);
10298     notl(tmp3_aliased);
10299     kmovdl(k2, tmp3_aliased);
10300     evpmovzxbw(tmp1, k2, Address(src, 0), Assembler::AVX_512bit);
10301     evmovdquw(Address(dst, 0), k2, tmp1, Assembler::AVX_512bit);
10302 
10303     jmp(done);
10304     bind(avx3_threshold);
10305   }
10306   if (UseSSE42Intrinsics) {
10307     Label copy_16_loop, copy_8_loop, copy_bytes, copy_new_tail, copy_tail;
10308 
10309     if (UseAVX > 1) {
10310       andl(tmp2, (16 - 1));
10311       andl(len, -16);
10312       jccb(Assembler::zero, copy_new_tail);
10313     } else {
10314       andl(tmp2, 0x00000007);   // tail count (in chars)
10315       andl(len, 0xfffffff8);    // vector count (in chars)
10316       jccb(Assembler::zero, copy_tail);
10317     }
10318 
10319     // vectored inflation
10320     lea(src, Address(src, len, Address::times_1));
10321     lea(dst, Address(dst, len, Address::times_2));
10322     negptr(len);
10323 
10324     if (UseAVX > 1) {
10325       bind(copy_16_loop);
10326       vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_256bit);
10327       vmovdqu(Address(dst, len, Address::times_2), tmp1);
10328       addptr(len, 16);
10329       jcc(Assembler::notZero, copy_16_loop);
10330 
10331       bind(below_threshold);
10332       bind(copy_new_tail);
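            // tmp2 holds the remaining length (the full length when arriving
            // via below_threshold); split it again into a multiple of 8 and a
            // tail of fewer than 8 chars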
10333       movl(len, tmp2);
10334       andl(tmp2, 0x00000007);
10335       andl(len, 0xFFFFFFF8);
10336       jccb(Assembler::zero, copy_tail);
10337 
10338       pmovzxbw(tmp1, Address(src, 0));
10339       movdqu(Address(dst, 0), tmp1);
10340       addptr(src, 8);
10341       addptr(dst, 2 * 8);
10342 
10343       jmp(copy_tail, true);
10344     }
10345 
10346     // inflate 8 chars per iter
10347     bind(copy_8_loop);
10348     pmovzxbw(tmp1, Address(src, len, Address::times_1));  // unpack to 8 words
10349     movdqu(Address(dst, len, Address::times_2), tmp1);
10350     addptr(len, 8);
10351     jcc(Assembler::notZero, copy_8_loop);
10352 
10353     bind(copy_tail);
10354     movl(len, tmp2);
10355 
10356     cmpl(len, 4);
10357     jccb(Assembler::less, copy_bytes);
10358 
10359     movdl(tmp1, Address(src, 0));  // load 4 byte chars
10360     pmovzxbw(tmp1, tmp1);
10361     movq(Address(dst, 0), tmp1);
10362     subptr(len, 4);
10363     addptr(src, 4);
10364     addptr(dst, 8);
10365 
10366     bind(copy_bytes);
10367   } else {
10368     bind(below_threshold);
10369   }
10370 
10371   testl(len, len);
10372   jccb(Assembler::zero, done);
10373   lea(src, Address(src, len, Address::times_1));
10374   lea(dst, Address(dst, len, Address::times_2));
10375   negptr(len);
10376 
10377   // inflate 1 char per iter
10378   bind(copy_chars_loop);
10379   load_unsigned_byte(tmp2, Address(src, len, Address::times_1));  // load byte char
10380   movw(Address(dst, len, Address::times_2), tmp2);  // inflate byte char to word
10381   increment(len);
10382   jcc(Assembler::notZero, copy_chars_loop);
10383 
10384   bind(done);
10385 }
10386 
10387 #ifdef _LP64
10388 void MacroAssembler::cache_wb(Address line)
10389 {
10390   // 64-bit CPUs always support clflush
10391   assert(VM_Version::supports_clflush(), "clflush should be available");
10392   bool optimized = VM_Version::supports_clflushopt();
10393   bool no_evict = VM_Version::supports_clwb();
10394 
10395   // prefer clwb (writeback without evict), otherwise prefer clflushopt
10396   // (potentially parallel writeback with evict), otherwise fall back to
10397   // clflush (serial writeback with evict)
10398 
10399   if (optimized) {
10400     if (no_evict) {
10401       clwb(line);
10402     } else {
10403       clflushopt(line);
10404     }
10405   } else {
10406     // no fence needed: clflush is ordered with respect to writes and other clflushes
10407     clflush(line);
10408   }
10409 }
10410 
10411 void MacroAssembler::cache_wbsync(bool is_pre)
10412 {
10413   assert(VM_Version::supports_clflush(), "clflush should be available");
10414   bool optimized = VM_Version::supports_clflushopt();
10415   bool no_evict = VM_Version::supports_clwb();
10416 
10417   // only a post-flush sync needs a fence, and only after a weakly-ordered flush
10418 
10419   if (!is_pre && (optimized || no_evict)) {
10420     // need an sfence for post flush when using clflushopt or clwb,
10421     // otherwise no synchronization is needed
10422 
10423     sfence();
10424   }
10425 }
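
      // A typical writeback sequence (a sketch of caller usage, assuming a stub
      // that flushes an address range; this is not code emitted here) pairs the
      // per-line flushes with the synchronization above:
      //
      //   cache_wbsync(true);       // pre-sync: emits nothing on x86
      //   for (each cache line in the range)
      //     cache_wb(line);         // clwb / clflushopt / clflush
      //   cache_wbsync(false);      // post-sync: sfence if a weakly-ordered flush was used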
10426 #endif // _LP64
10427 
10428 Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) {
10429   switch (cond) {
10430     // Note some conditions are synonyms for others
10431     case Assembler::zero:         return Assembler::notZero;
10432     case Assembler::notZero:      return Assembler::zero;
10433     case Assembler::less:         return Assembler::greaterEqual;
10434     case Assembler::lessEqual:    return Assembler::greater;
10435     case Assembler::greater:      return Assembler::lessEqual;
10436     case Assembler::greaterEqual: return Assembler::less;
10437     case Assembler::below:        return Assembler::aboveEqual;
10438     case Assembler::belowEqual:   return Assembler::above;
10439     case Assembler::above:        return Assembler::belowEqual;
10440     case Assembler::aboveEqual:   return Assembler::below;
10441     case Assembler::overflow:     return Assembler::noOverflow;
10442     case Assembler::noOverflow:   return Assembler::overflow;
10443     case Assembler::negative:     return Assembler::positive;
10444     case Assembler::positive:     return Assembler::negative;
10445     case Assembler::parity:       return Assembler::noParity;
10446     case Assembler::noParity:     return Assembler::parity;
10447   }
10448   ShouldNotReachHere(); return Assembler::overflow;
10449 }
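
      // SkipIfEqual is a scoped helper: the constructor emits a cmp8/jcc that
      // jumps over whatever code is emitted while the object is live if the
      // byte flag at flag_addr equals value; the destructor binds the jump
      // target.  A minimal usage sketch (hypothetical flag name):
      //
      //   { SkipIfEqual skip(masm, &SomeDiagnosticFlag, false);
      //     // code emitted here executes only when the flag is true
      //   }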
10450 
10451 SkipIfEqual::SkipIfEqual(
10452     MacroAssembler* masm, const bool* flag_addr, bool value) {
10453   _masm = masm;
10454   _masm->cmp8(ExternalAddress((address)flag_addr), value);
10455   _masm->jcc(Assembler::equal, _label);
10456 }
10457 
10458 SkipIfEqual::~SkipIfEqual() {
10459   _masm->bind(_label);
10460 }
10461 
10462 // 32-bit Windows has its own fast-path implementation
10463 // of get_thread
10464 #if !defined(WIN32) || defined(_LP64)
10465 
10466 // This is simply a call to Thread::current(); caller-saved registers are preserved around it
10467 void MacroAssembler::get_thread(Register thread) {
10468   if (thread != rax) {
10469     push(rax);
10470   }
10471   LP64_ONLY(push(rdi);)
10472   LP64_ONLY(push(rsi);)
10473   push(rdx);
10474   push(rcx);
10475 #ifdef _LP64
10476   push(r8);
10477   push(r9);
10478   push(r10);
10479   push(r11);
10480 #endif
10481 
10482   MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, Thread::current), 0);
10483 
10484 #ifdef _LP64
10485   pop(r11);
10486   pop(r10);
10487   pop(r9);
10488   pop(r8);
10489 #endif
10490   pop(rcx);
10491   pop(rdx);
10492   LP64_ONLY(pop(rsi);)
10493   LP64_ONLY(pop(rdi);)
10494   if (thread != rax) {
10495     mov(thread, rax);
10496     pop(rax);
10497   }
10498 }
10499 
10500 #endif